diff --git a/Cargo.toml b/Cargo.toml index 63f540e2..c0c9223b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,9 +15,11 @@ members = [ "influxdb2_client", "iox_http", "iox_query_influxql", + "iox_query_influxql_rewrite", "iox_query", "iox_system_tables", "iox_time", + "iox_v1_query_api", "logfmt", "meta_data_cache", "metric_exporters", @@ -74,26 +76,26 @@ arrow-schema = { version = "55" } bincode = { version = "2", default-features = false, features = ["alloc", "derive"] } # Use DataFusion fork # See https://github.com/influxdata/arrow-datafusion/pull/73 for contents -datafusion = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc" } -datafusion-proto = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc" } +datafusion = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d" } +datafusion-proto = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d" } hashbrown = { version = "0.14.5" } http = { version = "1" } http-body = { version = "1" } http-body-util = { version = "0.1" } hyper = { version = "1" } hyper-util = { version = "0.1" } -object_store = { version = "0.12.3", features = ["aws", "azure", "gcp"] } +object_store = { version = "0.12.4", features = ["aws", "azure", "gcp"] } parquet = { version = "55", features = ["object_store"] } -pbjson = { version = "0.7" } -pbjson-build = { version = "0.7" } +pbjson = { version = "0.8" } +pbjson-build = { version = "0.8" } pbjson-types = { version = "0.7" } proptest = { version = "1", default-features = false, features = ["std"] } prost = { version = "0.13" } prost-build = { version = "0.13" } prost-types = { version = "0.13" } reqwest = { version = "0.12", default-features = false } -rstest = { version = "0.21" } -sqlx = { version = "0.8.6", features = ["sqlite"] } +rstest = { version = "0.26" } +sqlx = { version = "0.8.6" } tower = { version = "0.5" } tracing = { version = "0.1", features = ["log", "max_level_trace"] } tracing-log = { version = "0.2" } diff --git a/arrow_util/Cargo.toml b/arrow_util/Cargo.toml index da4f41ad..ec8759a5 100644 --- a/arrow_util/Cargo.toml +++ b/arrow_util/Cargo.toml @@ -22,7 +22,7 @@ comfy-table = { version = "7.2", default-features = false } hashbrown = { workspace = true } num-traits = "0.2" parquet = { workspace = true } -regex = "1.11.2" +regex = "1.12.2" snafu = "0.8" uuid = "1" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/authz/Cargo.toml b/authz/Cargo.toml index cc8333fd..d59a697d 100644 --- a/authz/Cargo.toml +++ b/authz/Cargo.toml @@ -25,10 +25,10 @@ snafu = "0.8" [dev-dependencies] assert_matches = "1.5.0" -parking_lot = "0.12.4" +parking_lot = "0.12.5" paste = "1.0.15" test_helpers_authz = { path = "../test_helpers_authz" } -tokio = "1.47.1" +tokio = "1.48.0" [features] http = ["dep:http"] diff --git a/backoff/Cargo.toml b/backoff/Cargo.toml index a1e54563..8de749ff 100644 --- a/backoff/Cargo.toml +++ b/backoff/Cargo.toml @@ -9,7 +9,7 @@ license.workspace = true workspace = true [dependencies] -tokio = { version = "1.47", features = ["macros", "time"] } +tokio = { version = "1.48", features = ["macros", "time"] } tracing = { workspace = true } rand = "0.9" snafu = "0.8" diff --git a/catalog_cache/Cargo.toml b/catalog_cache/Cargo.toml index 9bc2783d..eafc31b1 100644 --- a/catalog_cache/Cargo.toml +++ b/catalog_cache/Cargo.toml @@ -19,7 +19,7 @@ 
iox_http_util = { path = "../iox_http_util" } tracing = { workspace = true } reqwest = { workspace = true } snafu = "0.8" -tokio = { version = "1.47", default-features = false, features = [ +tokio = { version = "1.48", default-features = false, features = [ "macros", "rt", ] } diff --git a/catalog_cache/benches/list_encode.rs b/catalog_cache/benches/list_encode.rs index 17a4d009..018d52ed 100644 --- a/catalog_cache/benches/list_encode.rs +++ b/catalog_cache/benches/list_encode.rs @@ -103,12 +103,25 @@ fn encode_partition_snapshot(i: usize) -> Bytes { let partition_key = PartitionKey::from(format!("arbitrary_{i}")); let expected_partition_hash_id = PartitionHashId::new(table_id, &partition_key); let generation = 6; - let parquet_file_defaults = ParquetFile { + + let partition = Partition::new_catalog_only( + partition_id, + table_id, + partition_key.clone(), + Default::default(), + Default::default(), + Default::default(), + Default::default(), + None, // max_time + Default::default(), + ); + // Create associated Parquet file + let parquet_files = vec![ParquetFile { id: ParquetFileId::new(7 + i as i64), namespace_id, table_id, partition_id, - partition_hash_id: Some(expected_partition_hash_id.clone()), + partition_hash_id: expected_partition_hash_id.clone(), object_store_id: ObjectStoreId::from_str("00000000-0000-0001-0000-000000000000").unwrap(), min_time: Timestamp::new(2), max_time: Timestamp::new(3), @@ -120,31 +133,9 @@ fn encode_partition_snapshot(i: usize) -> Bytes { column_set: ColumnSet::empty(), max_l0_created_at: Timestamp::new(6), source: None, - }; + }]; - let partition = Partition::new_catalog_only( - partition_id, - Some(expected_partition_hash_id.clone()), - table_id, - partition_key.clone(), - Default::default(), - Default::default(), - Default::default(), - Default::default(), - None, // max_time - ); - // Create associated Parquet files: - let parquet_files = vec![ - // one addressed by numeric ID, - ParquetFile { - partition_hash_id: None, - ..parquet_file_defaults.clone() - }, - // one addressed by hash ID. - parquet_file_defaults.clone(), - ]; - - // Encode the partition and its Parquet files, + // Encode the partition and its Parquet file let snapshot = PartitionSnapshot::encode( namespace_id, partition, diff --git a/client_util/Cargo.toml b/client_util/Cargo.toml index ff2c1341..aa8698f6 100644 --- a/client_util/Cargo.toml +++ b/client_util/Cargo.toml @@ -16,13 +16,13 @@ reqwest = { workspace = true, features = ["stream", "rustls-tls-native-roots"] } # This direct dependency on rustls can probably be removed when tonic is upgraded to 0.13+. # See for more details. 
rustls = { version = "0.23", default-features = false } -thiserror = "2.0.16" +thiserror = "2.0.17" tonic = { version = "0.12", features = ["gzip", "tls", "tls-native-roots", "zstd"] } tower = { workspace = true } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -tokio = { version = "1.47", features = [ +tokio = { version = "1.48", features = [ "macros", "parking_lot", "rt-multi-thread", diff --git a/data_types/Cargo.toml b/data_types/Cargo.toml index b5f2e5c9..0c4654b2 100644 --- a/data_types/Cargo.toml +++ b/data_types/Cargo.toml @@ -14,7 +14,7 @@ arrow = { workspace = true } arrow-buffer = { workspace = true } bytes = "1.10" chrono = { version = "0.4", default-features = false } -croaring = "2.4.0" +croaring = "2.5.1" influxdb-line-protocol = { path = "../influxdb_line_protocol" } iox_time = { path = "../iox_time" } generated_types = { path = "../generated_types" } @@ -33,7 +33,7 @@ sqlx = { workspace = true, features = [ "postgres", "uuid", ] } -thiserror = "2.0.16" +thiserror = "2.0.17" uuid = { version = "1", features = ["v4"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index 2c44c84b..18d0d3c6 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -810,15 +810,30 @@ pub struct TableSchema { /// the table's columns by their name pub columns: ColumnsByName, + + /// Whether or not iceberg is enabled for this table + pub iceberg_enabled: bool, } impl TableSchema { - /// Initialize new `TableSchema` from the information in the given `Table`. + /// Initialize new [`TableSchema`] from the information in the given [`Table`]. pub fn new_empty_from(table: &Table) -> Self { Self { id: table.id, partition_template: table.partition_template.clone(), columns: ColumnsByName::default(), + iceberg_enabled: table.iceberg_enabled, + } + } + + /// Initialize a new [`TableSchema`] with the given id, no columns, default partition, and + /// iceberg disabled. 
+ pub fn new_with(id: TableId) -> Self { + Self { + id, + partition_template: TablePartitionTemplateOverride::default(), + columns: ColumnsByName::default(), + iceberg_enabled: false, } } @@ -1077,8 +1092,8 @@ pub struct ParquetFile { pub table_id: TableId, /// the partition identifier pub partition_id: PartitionId, - /// the optional partition hash id - pub partition_hash_id: Option, + /// the partition hash id + pub partition_hash_id: PartitionHashId, /// the uuid used in the object store path for this file pub object_store_id: ObjectStoreId, /// the min timestamp of data in this file @@ -1178,11 +1193,7 @@ impl ParquetFile { /// Estimate the memory consumption of this object and its contents pub fn size(&self) -> usize { - let hash_id = self - .partition_hash_id - .as_ref() - .map(|x| x.size()) - .unwrap_or_default(); + let hash_id = self.partition_hash_id.size(); size_of_val(self) + hash_id + self.column_set.size() - size_of_val(&self.column_set) } @@ -1211,7 +1222,7 @@ impl ParquetFile { /// Temporary to aid incremental migration pub fn transition_partition_id(&self) -> TransitionPartitionId { - TransitionPartitionId::from_parts(self.partition_id, self.partition_hash_id.clone()) + TransitionPartitionId::from_parts(self.partition_id, Some(self.partition_hash_id.clone())) } } @@ -1222,10 +1233,7 @@ impl From for catalog_proto::ParquetFile { namespace_id: v.namespace_id.get(), table_id: v.table_id.get(), partition_id: v.partition_id.get(), - partition_hash_id: v - .partition_hash_id - .map(|x| x.as_bytes().to_vec()) - .unwrap_or_default(), + partition_hash_id: v.partition_hash_id.as_bytes().to_vec(), object_store_id: v.object_store_id.to_string(), min_time: v.min_time.get(), max_time: v.max_time.get(), @@ -1266,11 +1274,7 @@ impl TryFrom for ParquetFile { namespace_id: NamespaceId::new(v.namespace_id), table_id: TableId::new(v.table_id), partition_id: PartitionId::new(v.partition_id), - partition_hash_id: if v.partition_hash_id.is_empty() { - None - } else { - Some(v.partition_hash_id[..].try_into()?) - }, + partition_hash_id: v.partition_hash_id[..].try_into()?, object_store_id: ObjectStoreId::from_str(&v.object_store_id)?, min_time: Timestamp::new(v.min_time), max_time: Timestamp::new(v.max_time), @@ -1346,7 +1350,7 @@ pub struct ParquetFileParams { /// the partition identifier pub partition_id: PartitionId, /// the partition hash ID - pub partition_hash_id: Option, + pub partition_hash_id: PartitionHashId, /// the uuid used in the object store path for this file pub object_store_id: ObjectStoreId, /// the min timestamp of data in this file @@ -3329,6 +3333,7 @@ mod tests { id: TableId::new(1), partition_template: Default::default(), columns: ColumnsByName::default(), + iceberg_enabled: false, }; let schema2 = TableSchema { id: TableId::new(2), @@ -3339,6 +3344,7 @@ mod tests { name: String::from("foo"), column_type: ColumnType::Bool, }]), + iceberg_enabled: false, }; assert!(schema1.size() < schema2.size()); } @@ -3361,11 +3367,7 @@ mod tests { id: NamespaceId::new(1), active_tables: BTreeMap::from([( String::from("foo"), - TableSchema { - id: TableId::new(1), - columns: ColumnsByName::default(), - partition_template: Default::default(), - }, + TableSchema::new_with(TableId::new(1)), )]), deleted_tables: BTreeSet::new(), partition_template: Default::default(), @@ -3412,41 +3414,13 @@ mod tests { #[test] fn catalog_service_parquet_file_serde_roundtrip() { - // This part of the test can be removed when all partitions have hash IDs. 
- let old_style_parquet_file = ParquetFile { - id: ParquetFileId::new(3), - namespace_id: NamespaceId::new(4), - table_id: TableId::new(5), - partition_id: PartitionId::new(6), - partition_hash_id: None, // this is the important part for this test - object_store_id: ObjectStoreId::new(), - min_time: Timestamp::new(30), - max_time: Timestamp::new(50), - to_delete: None, - file_size_bytes: 1024, - row_count: 42, - compaction_level: CompactionLevel::Initial, - created_at: Timestamp::new(70), - column_set: ColumnSet::empty(), - max_l0_created_at: Timestamp::new(70), - source: None, - }; - let catalog_proto_old_style_parquet_file = - catalog_proto::ParquetFile::from(old_style_parquet_file.clone()); - let round_trip_old_style_parquet_file = - ParquetFile::try_from(catalog_proto_old_style_parquet_file).unwrap(); - assert_eq!(old_style_parquet_file, round_trip_old_style_parquet_file); - let table_id = TableId::new(5); let parquet_file = ParquetFile { id: ParquetFileId::new(3), namespace_id: NamespaceId::new(4), table_id, partition_id: PartitionId::new(6), - partition_hash_id: Some(PartitionHashId::new( - table_id, - &PartitionKey::from("arbitrary"), - )), + partition_hash_id: PartitionHashId::new(table_id, &PartitionKey::from("arbitrary")), object_store_id: ObjectStoreId::new(), min_time: Timestamp::new(30), max_time: Timestamp::new(50), diff --git a/data_types/src/partition.rs b/data_types/src/partition.rs index 61260e13..5c4f2ea9 100644 --- a/data_types/src/partition.rs +++ b/data_types/src/partition.rs @@ -578,9 +578,8 @@ impl sqlx::postgres::PgHasArrayType for PartitionHashId { pub struct Partition { /// the id of the partition pub id: PartitionId, - /// The unique hash derived from the table ID and partition key, if available. This will become - /// required when partitions without the value have aged out. - hash_id: Option, + /// The unique hash derived from the table ID and partition key. + hash_id: PartitionHashId, /// the table the partition is under pub table_id: TableId, /// the string key of the partition @@ -640,7 +639,6 @@ impl Partition { #[expect(clippy::too_many_arguments)] pub fn new_catalog_only( id: PartitionId, - hash_id: Option, table_id: TableId, partition_key: PartitionKey, sort_key_ids: SortKeyIds, @@ -648,10 +646,11 @@ impl Partition { cold_compact_at: Option, created_at: Option, max_time: Option, + estimated_size_bytes: Option, ) -> Self { Self { id, - hash_id, + hash_id: PartitionHashId::new(table_id, &partition_key), table_id, partition_key, sort_key_ids, @@ -659,19 +658,19 @@ impl Partition { cold_compact_at, created_at, max_time, - estimated_size_bytes: None, + estimated_size_bytes, } } /// If this partition has a `PartitionHashId` stored in the catalog, use that. Otherwise, use /// the database-assigned `PartitionId`. pub fn transition_partition_id(&self) -> TransitionPartitionId { - TransitionPartitionId::from((self.id, self.hash_id.as_ref())) + TransitionPartitionId::from((self.id, Some(&self.hash_id))) } - /// The unique hash derived from the table ID and partition key, if it exists in the catalog. 
- pub fn hash_id(&self) -> Option<&PartitionHashId> { - self.hash_id.as_ref() + /// The unique hash derived from the table ID and partition key + pub fn hash_id(&self) -> &PartitionHashId { + &self.hash_id } /// The sort key IDs, if the sort key has been set diff --git a/data_types/src/snapshot/partition.rs b/data_types/src/snapshot/partition.rs index 23e30de5..fe46200c 100644 --- a/data_types/src/snapshot/partition.rs +++ b/data_types/src/snapshot/partition.rs @@ -64,7 +64,7 @@ pub struct PartitionSnapshot { /// The [`PartitionId`] partition_id: PartitionId, /// The [`PartitionHashId`] - partition_hash_id: Option, + partition_hash_id: PartitionHashId, /// The generation of this snapshot generation: u64, /// The partition key @@ -84,6 +84,9 @@ pub struct PartitionSnapshot { /// The time this Partition was created at, or `None` if this partition was created before this /// field existed. Not the time the snapshot was created. created_at: Option, + /// Estimated size in bytes of all the active files in this partition, or `None` + /// if the partition size has not been computed yet. + estimated_size_bytes: Option, } impl PartitionSnapshot { @@ -122,7 +125,8 @@ impl PartitionSnapshot { max_l0_created_at: file.max_l0_created_at.0, column_mask: Some(mask.finish().into()), source: file.source.map(|i| i as i32).unwrap_or_default(), - use_numeric_partition_id: Some(file.partition_hash_id.is_none()), + #[expect(deprecated)] + use_numeric_partition_id: Some(false), } }) .collect(); @@ -132,7 +136,7 @@ impl PartitionSnapshot { columns, namespace_id, partition_id: partition.id, - partition_hash_id: partition.hash_id().cloned(), + partition_hash_id: partition.hash_id().clone(), key: partition.partition_key.as_bytes().to_vec().into(), files: MessageList::encode(files).context(FileEncodeSnafu)?, sort_key: partition.sort_key_ids().cloned().unwrap_or_default(), @@ -141,15 +145,14 @@ impl PartitionSnapshot { skipped_compaction: skipped_compaction.map(|sc| sc.into()), cold_compact_at: partition.cold_compact_at, created_at: partition.created_at(), + estimated_size_bytes: partition.estimated_size_bytes, }) } /// Create a new [`PartitionSnapshot`] from a `proto` and generation pub fn decode(proto: proto::Partition, generation: u64) -> Self { let table_id = TableId::new(proto.table_id); - let partition_hash_id = proto - .partition_hash_id - .then(|| PartitionHashId::from_raw(table_id, proto.key.as_ref())); + let partition_hash_id = PartitionHashId::from_raw(table_id, proto.key.as_ref()); Self { generation, @@ -165,6 +168,7 @@ impl PartitionSnapshot { skipped_compaction: proto.skipped_compaction, cold_compact_at: proto.cold_compact_at.map(Timestamp::new), created_at: proto.created_at.map(Timestamp::new), + estimated_size_bytes: proto.estimated_size_bytes, } } @@ -179,8 +183,8 @@ impl PartitionSnapshot { } /// Returns the [`PartitionHashId`] if any - pub fn partition_hash_id(&self) -> Option<&PartitionHashId> { - self.partition_hash_id.as_ref() + pub fn partition_hash_id(&self) -> &PartitionHashId { + &self.partition_hash_id } /// Returns the file at index `idx` @@ -204,22 +208,7 @@ impl PartitionSnapshot { namespace_id: self.namespace_id, table_id: self.table_id, partition_id: self.partition_id, - partition_hash_id: match file.use_numeric_partition_id { - // If the Parquet file uses the numeric partition ID, don't set a - // `partition_hash_id`, regardless of whether the Partition uses a `hash_id` - Some(true) => None, - Some(false) => Some(match self.partition_hash_id.clone() { - Some(hash_id) => hash_id, - // If 
the Parquet file uses the hash ID but the Partition doesn't yet, - // compute it - None => self - .key() - .map(|key| PartitionHashId::new(self.table_id, &key))?, - }), - // If the Parquet file doesn't specify whether it uses a hash ID, fall back to - // whatever the Partition uses - None => self.partition_hash_id.clone(), - }, + partition_hash_id: self.partition_hash_id.clone(), object_store_id: ObjectStoreId::from_uuid(uuid.into()), min_time: Timestamp(file.min_time), max_time: Timestamp(file.max_time), @@ -249,7 +238,6 @@ impl PartitionSnapshot { pub fn partition(&self) -> Result { Ok(Partition::new_catalog_only( self.partition_id, - self.partition_hash_id.clone(), self.table_id, self.key()?, self.sort_key.clone(), @@ -257,6 +245,7 @@ impl PartitionSnapshot { self.cold_compact_at, self.created_at, None, // max_time - not stored in snapshot (can be computed from partition key) + self.estimated_size_bytes, )) } @@ -272,6 +261,13 @@ impl PartitionSnapshot { .cloned() .map(|sc| sc.into()) } + + /// Returns the estimated size of the partition in bytes. + pub fn estimated_size_bytes(&self) -> i64 { + // Treat None as 0. Since this is an estimated size, + // it is acceptable to treat partitions with None as having size 0. + self.estimated_size_bytes.unwrap_or(0) + } } impl From for proto::Partition { @@ -282,13 +278,14 @@ impl From for proto::Partition { namespace_id: value.namespace_id.get(), table_id: value.table_id.get(), partition_id: value.partition_id.get(), - partition_hash_id: value.partition_hash_id.is_some(), + partition_hash_id: true, column_ids: value.columns.iter().map(|x| x.get()).collect(), sort_key_ids: value.sort_key.iter().map(|x| x.get()).collect(), new_file_at: value.new_file_at.map(|x| x.get()), skipped_compaction: value.skipped_compaction, cold_compact_at: value.cold_compact_at.map(|x| x.get()), created_at: value.created_at.map(|x| x.get()), + estimated_size_bytes: value.estimated_size_bytes, } } } @@ -296,94 +293,11 @@ impl From for proto::Partition { #[cfg(test)] mod tests { use super::*; - use crate::{CompactionLevel, PartitionKey}; - use std::str::FromStr; - - #[test] - fn partition_hash_id_transition_parquet_files_individually() { - let namespace_id = NamespaceId::new(3); - let table_id = TableId::new(4); - let partition_id = PartitionId::new(5); - let partition_key = PartitionKey::from("arbitrary"); - let expected_partition_hash_id = PartitionHashId::new(table_id, &partition_key); - let generation = 6; - let parquet_file_defaults = ParquetFile { - id: ParquetFileId::new(7), - namespace_id, - table_id, - partition_id, - partition_hash_id: Some(expected_partition_hash_id.clone()), - object_store_id: ObjectStoreId::from_str("00000000-0000-0001-0000-000000000000") - .unwrap(), - min_time: Timestamp::new(2), - max_time: Timestamp::new(3), - to_delete: None, - file_size_bytes: 4, - row_count: 5, - compaction_level: CompactionLevel::Initial, - created_at: Timestamp::new(6), - column_set: ColumnSet::empty(), - max_l0_created_at: Timestamp::new(6), - source: None, - }; - - let encode_and_compare = |use_partition_hash_id: bool| { - // For a partition with or without a hash ID as specified, - let partition = Partition::new_catalog_only( - partition_id, - if use_partition_hash_id { - Some(expected_partition_hash_id.clone()) - } else { - None - }, - table_id, - partition_key.clone(), - Default::default(), - Default::default(), - Default::default(), - Default::default(), - None, // max_time - ); - // Create associated Parquet files: - let parquet_files = vec![ - // one addressed 
by numeric ID, - ParquetFile { - partition_hash_id: None, - ..parquet_file_defaults.clone() - }, - // one addressed by hash ID. - parquet_file_defaults.clone(), - ]; - - // Encode the partition and its Parquet files, - let encoded_partition = PartitionSnapshot::encode( - namespace_id, - partition, - parquet_files.clone(), - None, - generation, - ) - .unwrap(); - - // then ensure accessing each Parquet file returns the same information as was encoded. - assert_eq!( - &encoded_partition.file(0).unwrap(), - &parquet_files[0], - "use_partition_hash_id: {use_partition_hash_id}" - ); - assert_eq!( - &encoded_partition.file(1).unwrap(), - &parquet_files[1], - "use_partition_hash_id: {use_partition_hash_id}" - ); - }; - - // Encoding and accessing Parquet files should work whether their associated Partition - // has a hash ID or not. - encode_and_compare(true); - encode_and_compare(false); - } + use crate::PartitionKey; + // Even though all partitions now have hash IDs, keep this test to ensure we can continue to + // decode and use any cached proto that doesn't use hash IDs. + #[expect(deprecated)] #[test] fn decode_old_cached_proto() { let partition_key = PartitionKey::from("arbitrary"); @@ -448,6 +362,7 @@ mod tests { new_file_at: Default::default(), skipped_compaction: Default::default(), sort_key_ids: Default::default(), + estimated_size_bytes: Default::default(), }; let numeric_id_partition_proto = proto::Partition { partition_hash_id: false, @@ -458,37 +373,46 @@ mod tests { let decoded_hash_id_partition = PartitionSnapshot::decode(hash_id_partition_proto, 1); let decoded_numeric_id_partition = PartitionSnapshot::decode(numeric_id_partition_proto, 1); - // For the Parquet file without `use_numeric_partition_id` set, it should be addressed in - // the same way as its partition is. + // For the Parquet file without `use_numeric_partition_id` set, it should be addressed + // with hash ID because this should be impossible now. let pf0_hash_id_partition = decoded_hash_id_partition.file(0).unwrap(); assert_eq!( pf0_hash_id_partition.partition_hash_id, - Some(decoded_hash_id_partition.partition_hash_id.clone().unwrap()) + decoded_hash_id_partition.partition_hash_id.clone() ); let pf0_numeric_id_partition = decoded_numeric_id_partition.file(0).unwrap(); - assert_eq!(pf0_numeric_id_partition.partition_hash_id, None); + assert_eq!( + pf0_numeric_id_partition.partition_hash_id, + decoded_hash_id_partition.partition_hash_id + ); // For the Parquet file with `use_numeric_partition_id` set to `false`, it should be // addressed with hash ID, regardless of how the partition is addressed. let pf1_hash_id_partition = decoded_hash_id_partition.file(1).unwrap(); assert_eq!( pf1_hash_id_partition.partition_hash_id, - Some(decoded_hash_id_partition.partition_hash_id.clone().unwrap()) + decoded_hash_id_partition.partition_hash_id.clone() ); let pf1_numeric_id_partition = decoded_numeric_id_partition.file(1).unwrap(); assert_eq!( pf1_numeric_id_partition.partition_hash_id, - Some(PartitionHashId::new( + PartitionHashId::new( decoded_numeric_id_partition.table_id, &decoded_numeric_id_partition.key().unwrap() - )) + ) ); // For the Parquet file with `use_numeric_partition_id` set to `true`, it should be - // addressed with numeric ID, regardless of how the partition is addressed. + // addressed with hash ID because this should be impossible now. 
let pf1_hash_id_partition = decoded_hash_id_partition.file(2).unwrap(); - assert_eq!(pf1_hash_id_partition.partition_hash_id, None); + assert_eq!( + pf1_hash_id_partition.partition_hash_id, + decoded_hash_id_partition.partition_hash_id.clone() + ); let pf1_numeric_id_partition = decoded_numeric_id_partition.file(2).unwrap(); - assert_eq!(pf1_numeric_id_partition.partition_hash_id, None); + assert_eq!( + pf1_numeric_id_partition.partition_hash_id, + decoded_hash_id_partition.partition_hash_id.clone() + ); } } diff --git a/datafusion_util/Cargo.toml b/datafusion_util/Cargo.toml index d7951579..892fbf38 100644 --- a/datafusion_util/Cargo.toml +++ b/datafusion_util/Cargo.toml @@ -16,7 +16,7 @@ futures = "0.3" object_store = { workspace = true } pin-project = "1.1" schema = { path = "../schema" } -tokio = { version = "1.47", features = ["parking_lot", "sync"] } +tokio = { version = "1.48", features = ["parking_lot", "sync"] } tokio-stream = "0.1" tracing = { workspace = true } url = "2.5" diff --git a/executor/Cargo.toml b/executor/Cargo.toml index 340c4eca..226d3a4e 100644 --- a/executor/Cargo.toml +++ b/executor/Cargo.toml @@ -13,7 +13,7 @@ futures = "0.3" metric = { path = "../metric" } parking_lot = "0.12" snafu = "0.8" -tokio = { version = "1.47" } +tokio = { version = "1.48" } tokio_metrics_bridge = { path = "../tokio_metrics_bridge" } tokio_watchdog = { path = "../tokio_watchdog" } tracing = { workspace = true } diff --git a/generated_types/protos/influxdata/iox/catalog/v2/service.proto b/generated_types/protos/influxdata/iox/catalog/v2/service.proto index 7984d263..e016db07 100644 --- a/generated_types/protos/influxdata/iox/catalog/v2/service.proto +++ b/generated_types/protos/influxdata/iox/catalog/v2/service.proto @@ -68,7 +68,10 @@ service CatalogService { rpc PartitionNewFileBetween(PartitionNewFileBetweenRequest) returns (stream PartitionNewFileBetweenResponse); rpc PartitionNeedingColdCompact(PartitionNeedingColdCompactRequest) returns (stream PartitionNeedingColdCompactResponse); rpc PartitionUpdateColdCompact(PartitionUpdateColdCompactRequest) returns (PartitionUpdateColdCompactResponse); - rpc PartitionListOldStyle(PartitionListOldStyleRequest) returns (stream PartitionListOldStyleResponse); + rpc PartitionUpdateStorageSize(PartitionUpdateStorageSizeRequest) returns (PartitionUpdateStorageSizeResponse); + rpc PartitionListOldStyle(PartitionListOldStyleRequest) returns (stream PartitionListOldStyleResponse) { + option deprecated = true; + }; rpc PartitionDeleteByRetention(PartitionDeleteByRetentionRequest) returns (stream PartitionDeleteByRetentionResponse); rpc PartitionDeleteBatch(PartitionDeleteBatchRequest) returns (stream PartitionDeleteBatchResponse); rpc PartitionSnapshot(PartitionSnapshotRequest) returns (PartitionSnapshotResponse); @@ -352,13 +355,19 @@ message TableEnableIcebergRequest { int64 table_id = 1; } -message TableEnableIcebergResponse {} +message TableEnableIcebergResponse { + Table table = 1; + int64 router_version = 2; +} message TableDisableIcebergRequest { int64 table_id = 1; } -message TableDisableIcebergResponse {} +message TableDisableIcebergResponse { + Table table = 1; + int64 router_version = 2; +} message TableSoftDeleteRequest { int64 table_id = 1; @@ -568,6 +577,7 @@ message PartitionNewFileBetweenResponse { message PartitionNeedingColdCompactRequest { int64 maximum_time = 1; uint64 n = 2; + optional int64 redo_before_nanos = 3; } message PartitionNeedingColdCompactResponse { @@ -581,6 +591,13 @@ message PartitionUpdateColdCompactRequest { 
message PartitionUpdateColdCompactResponse {} +message PartitionUpdateStorageSizeRequest { + int64 partition_id = 1; + int64 estimated_size_bytes = 2; +} + +message PartitionUpdateStorageSizeResponse {} + message PartitionListOldStyleRequest {} message PartitionListOldStyleResponse { diff --git a/generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto b/generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto index c03ce8d3..5bc99f26 100644 --- a/generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto +++ b/generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto @@ -82,6 +82,10 @@ message Partition { // The time this partition was created, or `None` if this partition was // created before this field existed optional int64 created_at = 13; + + /// Estimated size in bytes of all the active files in this partition, or `None` + /// if the partition size has not been computed yet. + optional int64 estimated_size_bytes = 14; } enum ParquetFileSource { @@ -123,16 +127,8 @@ message PartitionFile { // Which component created this parquet file ParquetFileSource source = 11; - // Present and true if this parquet file's object store path uses the partition's numeric ID - // (as opposed to the partition's hash ID, which is what most files use). - // Exists to enable file-by-file transition to always using the partition's hash ID. - // - // The current code always sets this value when encoding. This value's absence indicates the - // protobuf was cached before the code adding this field was deployed. In that case, the code - // will fall back to the behavior the catalog cache exhibited previously, which was looking at - // the partition to know which ID to use and assuming all Parquet files in a partition were - // addressed with the same ID. - optional bool use_numeric_partition_id = 12; + // No longer needed because all partitions and Parquet files should have partition hash IDs. + optional bool use_numeric_partition_id = 12 [deprecated = true]; } message Table { diff --git a/generated_types/protos/influxdata/iox/catalog_storage/v1/service.proto b/generated_types/protos/influxdata/iox/catalog_storage/v1/service.proto index 4afb4833..4cecaedd 100644 --- a/generated_types/protos/influxdata/iox/catalog_storage/v1/service.proto +++ b/generated_types/protos/influxdata/iox/catalog_storage/v1/service.proto @@ -73,6 +73,10 @@ message GetNamespacesWithStorageRequest { // Filter namespaces by name (case-insensitive partial match) // If provided, only namespaces with names containing this string are returned optional string name_filter = 6; + + // Filter namespaces by ID (partial match) + // If provided, only namespaces with IDs containing this string are returned + optional string id_filter = 7; } // Request to get a specific namespace with storage. @@ -170,6 +174,14 @@ message GetTablesWithStorageRequest { // Filter by soft-deleted status // If not specified, only the active tables are returned optional influxdata.iox.common.v1.SoftDeleted deleted = 6; + + // Filter tables by name (case-insensitive partial match) + // If provided, only tables with names containing this string are returned + optional string name_filter = 7; + + // Filter tables by ID (partial match) + // If provided, only tables with IDs containing this string are returned + optional string id_filter = 8; } // Request to get a specific table with storage. 
diff --git a/generated_types/protos/influxdata/iox/gossip/v1/schema.proto b/generated_types/protos/influxdata/iox/gossip/v1/schema.proto index d5b5a3b5..8d3cbfd8 100644 --- a/generated_types/protos/influxdata/iox/gossip/v1/schema.proto +++ b/generated_types/protos/influxdata/iox/gossip/v1/schema.proto @@ -60,6 +60,9 @@ message TableUpdated { // should follow the same rules about what value to hold (and when) as the // TableDropped.router_version field) int64 router_version = 6; + + // if the version of the table accompanying router_version has the `iceberg_enabled` field set + bool iceberg_enabled = 7; } // Initialisation of a new table occurred. diff --git a/generated_types/protos/influxdata/iox/namespace/v1/service.proto b/generated_types/protos/influxdata/iox/namespace/v1/service.proto index b4186e41..4c8e4fab 100644 --- a/generated_types/protos/influxdata/iox/namespace/v1/service.proto +++ b/generated_types/protos/influxdata/iox/namespace/v1/service.proto @@ -276,6 +276,10 @@ message GetNamespacesWithStorageRequest { // Filter namespaces by name (case-insensitive partial match) // If provided, only namespaces with names containing this string are returned optional string name_filter = 7; + + // Filter namespaces by ID (partial match) + // If provided, only namespaces with IDs containing this string are returned + optional string id_filter = 8; } // Request to get a specific namespace with storage. diff --git a/generated_types/protos/influxdata/iox/table/v1/service.proto b/generated_types/protos/influxdata/iox/table/v1/service.proto index 33cd7e94..9efc7729 100644 --- a/generated_types/protos/influxdata/iox/table/v1/service.proto +++ b/generated_types/protos/influxdata/iox/table/v1/service.proto @@ -181,6 +181,14 @@ message GetTablesWithStorageRequest { // Filter by soft-deleted status // If not specified, only the active tables are returned optional influxdata.iox.common.v1.SoftDeleted deleted = 6; + + // Filter tables by name (case-insensitive partial match) + // If provided, only tables with names containing this string are returned + optional string name_filter = 7; + + // Filter tables by ID (partial match) + // If provided, only tables with IDs containing this string are returned + optional string id_filter = 8; } // Request to get a specific table with storage. @@ -212,6 +220,7 @@ message GetTableWithStorageResponse { // Request to enable iceberg exports for the given table. message EnableIcebergRequest { int64 table_id = 1; + int64 namespace_id = 2; } message EnableIcebergResponse {} @@ -219,6 +228,7 @@ message EnableIcebergResponse {} // Request to disable iceberg exports for the given table. 
message DisableIcebergRequest { int64 table_id = 1; + int64 namespace_id = 2; } message DisableIcebergResponse {} diff --git a/influxdb2_client/Cargo.toml b/influxdb2_client/Cargo.toml index 6c56d339..27da1489 100644 --- a/influxdb2_client/Cargo.toml +++ b/influxdb2_client/Cargo.toml @@ -24,5 +24,5 @@ smallvec = { workspace = true } [dev-dependencies] # In alphabetical order mockito = { version ="1.7", default-features = false } parking_lot = "0.12" -tokio = { version = "1.47", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } test_helpers = { path = "../test_helpers" } diff --git a/influxdb_influxql_parser/Cargo.toml b/influxdb_influxql_parser/Cargo.toml index 0ec2e2eb..229b3490 100644 --- a/influxdb_influxql_parser/Cargo.toml +++ b/influxdb_influxql_parser/Cargo.toml @@ -18,7 +18,7 @@ num-integer = { version = "0.1", default-features = false, features = [ "std", ] } num-traits = "0.2" -thiserror = "2.0.16" +thiserror = "2.0.17" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order diff --git a/influxdb_iox_client/Cargo.toml b/influxdb_iox_client/Cargo.toml index 2d2161c3..b3243da8 100644 --- a/influxdb_iox_client/Cargo.toml +++ b/influxdb_iox_client/Cargo.toml @@ -43,14 +43,15 @@ rand = { version = "0.9.2", optional = true } reqwest = { workspace = true, features = ["stream", "rustls-tls-native-roots"] } schema = { path = "../schema", optional = true } serde_json = { version = "1.0.145", optional = true } -tokio = { version = "1.47", features = [ +tokio = { version = "1.48", features = [ "macros", "parking_lot", "rt-multi-thread", ] } tokio-stream = "0.1.17" -thiserror = "2.0.16" +thiserror = "2.0.17" tonic-reflection = { version = "0.12" } +tracing = { workspace = true } [dev-dependencies] insta = { version = "1" } diff --git a/influxdb_iox_client/src/client.rs b/influxdb_iox_client/src/client.rs index f7f532ed..38067c39 100644 --- a/influxdb_iox_client/src/client.rs +++ b/influxdb_iox_client/src/client.rs @@ -47,3 +47,6 @@ pub mod test; /// Client for write API pub mod write; + +/// Batched write client for efficient bulk writes +pub mod batched_write; diff --git a/influxdb_iox_client/src/client/batched_write.rs b/influxdb_iox_client/src/client/batched_write.rs new file mode 100644 index 00000000..7d8fc48e --- /dev/null +++ b/influxdb_iox_client/src/client/batched_write.rs @@ -0,0 +1,379 @@ +//! Batched write client for efficient bulk writing of line protocol data. +//! +//! This module provides a `BatchedWriteClient` that wraps the standard write client +//! and batches multiple write requests together before sending them. This is particularly +//! useful for high-throughput scenarios like query logging where many small writes +//! can be combined into fewer, larger requests. + +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; + +use tokio::sync::Mutex; +use tokio::task::JoinHandle; +use tracing::error; + +use crate::{ + error::Error, + write::{Client as WriteClient, DatabaseName}, +}; + +/// A wrapper around either a regular or batched write client. +/// +/// This enum allows code to work with either client type transparently, +/// which is particularly useful for optional batching functionality. 
+#[derive(Debug, Clone)]
+pub enum MaybeBatchedWriteClient {
+    /// Regular write client without batching
+    Unbatched(WriteClient),
+    /// Batched write client
+    Batched(Arc<BatchedWriteClient>),
+}
+
+impl MaybeBatchedWriteClient {
+    /// Write line protocol data to the specified database.
+    pub async fn write_lp(
+        &mut self,
+        database: impl Into<DatabaseName> + Send,
+        lp_data: impl Into<String> + Send,
+    ) -> Result<(), Error> {
+        match self {
+            Self::Unbatched(client) => {
+                client.write_lp(database, lp_data).await?;
+                Ok(())
+            }
+            Self::Batched(client) => client.write_lp(database, lp_data).await,
+        }
+    }
+}
+
+/// Default maximum number of line protocol entries to batch before flushing
+const DEFAULT_MAX_BATCH_SIZE: usize = 100;
+
+/// Default flush interval for periodic flushing
+const DEFAULT_FLUSH_INTERVAL: Duration = Duration::from_secs(3);
+
+/// Configuration for the batched write client
+#[derive(Debug, Clone, Copy)]
+pub struct BatchedWriteClientConfig {
+    /// Maximum number of line protocol entries to batch before flushing
+    pub max_batch_size: usize,
+    /// Interval at which to automatically flush pending writes, even if the batch size hasn't been reached
+    pub flush_interval: Duration,
+}
+
+impl Default for BatchedWriteClientConfig {
+    fn default() -> Self {
+        Self {
+            max_batch_size: DEFAULT_MAX_BATCH_SIZE,
+            flush_interval: DEFAULT_FLUSH_INTERVAL,
+        }
+    }
+}
+
+/// A batched write client that accumulates writes and flushes them in batches.
+///
+/// This client wraps a [`WriteClient`] and batches multiple write requests together
+/// before sending them to reduce network overhead. Writes are flushed when:
+/// - The batch reaches `max_batch_size` line protocol entries
+/// - The `flush_interval` timer expires (default: 3 seconds)
+/// - The client is dropped (graceful shutdown)
+/// - `flush()` is explicitly called
+///
+/// # Example
+///
+/// ```no_run
+/// # use influxdb_iox_client::{
+/// #     connection::Builder,
+/// #     write::Client as WriteClient,
+/// #     batched_write::{BatchedWriteClient, BatchedWriteClientConfig},
+/// # };
+/// # #[tokio::main]
+/// # async fn main() {
+/// let connection = Builder::default()
+///     .build("http://127.0.0.1:8080")
+///     .await
+///     .unwrap();
+///
+/// let write_client = WriteClient::new(connection);
+/// let config = BatchedWriteClientConfig::default();
+/// let batched_client = BatchedWriteClient::new(write_client, config);
+///
+/// // Writes are automatically batched
+/// batched_client.write_lp("my_db", "cpu,host=a usage=0.5").await.unwrap();
+/// batched_client.write_lp("my_db", "cpu,host=b usage=0.7").await.unwrap();
+/// # }
+/// ```
+pub struct BatchedWriteClient {
+    /// Internal state protected by a mutex
+    inner: Arc<Mutex<BatchedWriteClientInner>>,
+
+    /// Configuration
+    config: BatchedWriteClientConfig,
+
+    /// Shutdown flag for the background flush task
+    shutdown: Arc<AtomicBool>,
+
+    /// Handle to the background flush task
+    _flush_task: JoinHandle<()>,
+}
+
+/// Internal state for the batched write client
+#[derive(Debug)]
+struct BatchedWriteClientInner {
+    /// The underlying write client
+    client: WriteClient,
+
+    /// Buffer for accumulating writes per database
+    buffer: Vec<(DatabaseName, String)>,
+}
+
+impl BatchedWriteClient {
+    /// Creates a new batched write client with the given configuration.
+    pub fn new(client: WriteClient, config: BatchedWriteClientConfig) -> Self {
+        let inner = BatchedWriteClientInner {
+            client,
+            buffer: Vec::new(),
+        };
+
+        let inner = Arc::new(Mutex::new(inner));
+        let shutdown = Arc::new(AtomicBool::new(false));
+
+        // Spawn background task to periodically flush
+        let flush_task = {
+            let inner = Arc::clone(&inner);
+            let shutdown = Arc::clone(&shutdown);
+            let flush_interval = config.flush_interval;
+
+            tokio::spawn(async move {
+                loop {
+                    tokio::time::sleep(flush_interval).await;
+
+                    if shutdown.load(Ordering::Relaxed) {
+                        break;
+                    }
+
+                    let mut guard = inner.lock().await;
+                    if let Err(e) = flush_buffer_internal(&mut guard).await {
+                        error!("Failed to flush batched writes from timer: {}", e);
+                    }
+                }
+            })
+        };
+
+        Self {
+            inner,
+            config,
+            shutdown,
+            _flush_task: flush_task,
+        }
+    }
+
+    /// Creates a new batched write client with default configuration.
+    pub fn new_with_defaults(client: WriteClient) -> Self {
+        Self::new(client, BatchedWriteClientConfig::default())
+    }
+
+    /// Write line protocol data to the specified database.
+    ///
+    /// The write is buffered internally and will be flushed when the
+    /// configured batch size is reached.
+    pub async fn write_lp(
+        &self,
+        database: impl Into<DatabaseName> + Send,
+        data: impl Into<String> + Send,
+    ) -> Result<(), Error> {
+        let database = database.into();
+        let data = data.into();
+
+        let mut inner = self.inner.lock().await;
+
+        inner.buffer.push((database, data));
+
+        if inner.buffer.len() >= self.config.max_batch_size {
+            flush_buffer_internal(&mut inner).await?;
+        }
+
+        Ok(())
+    }
+
+    /// Explicitly flush all pending writes.
+    ///
+    /// This method blocks until all currently buffered writes have been sent.
+    pub async fn flush(&self) -> Result<(), Error> {
+        let mut inner = self.inner.lock().await;
+        flush_buffer_internal(&mut inner).await
+    }
+}
+
+impl std::fmt::Debug for BatchedWriteClient {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("BatchedWriteClient")
+            .field("inner", &self.inner)
+            .field("config", &self.config)
+            .field("shutdown", &self.shutdown)
+            .field("_flush_task", &"")
+            .finish()
+    }
+}
+
+impl Drop for BatchedWriteClient {
+    fn drop(&mut self) {
+        // Signal the background task to shut down
+        self.shutdown.store(true, Ordering::Relaxed);
+
+        // Try to flush remaining data on drop
+        // We spawn a task since we can't use async in Drop
+        let inner = Arc::clone(&self.inner);
+        tokio::spawn(async move {
+            let mut guard = inner.lock().await;
+            if !guard.buffer.is_empty()
+                && let Err(e) = flush_buffer_internal(&mut guard).await
+            {
+                error!("Failed to flush remaining batched writes on drop: {}", e);
+            }
+        });
+    }
+}
+
+/// Flush the buffer by grouping writes by database and sending them
+async fn flush_buffer_internal(inner: &mut BatchedWriteClientInner) -> Result<(), Error> {
+    if inner.buffer.is_empty() {
+        return Ok(());
+    }
+
+    let mut by_database: std::collections::BTreeMap<DatabaseName, Vec<String>> =
+        std::collections::BTreeMap::new();
+
+    for (db, data) in inner.buffer.drain(..) {
+        by_database.entry(db).or_default().push(data);
+    }
+
+    for (db_name, data_vec) in by_database {
+        let combined = data_vec.join("\n");
+
+        if let Err(e) = inner.client.write_lp(db_name.clone(), combined).await {
+            error!(
+                "Failed to write batched data for database {:?}: {}",
+                db_name, e
+            );
+            return Err(e);
+        }
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::write::RequestMaker;
+    use futures_util::FutureExt;
+    use futures_util::future::BoxFuture;
+    use std::sync::{Arc, Mutex};
+
+    #[derive(Debug)]
+    struct MockRequestMaker {
+        requests: Mutex<Vec<String>>,
+    }
+
+    impl MockRequestMaker {
+        fn new() -> Self {
+            Self {
+                requests: Mutex::new(vec![]),
+            }
+        }
+
+        fn requests(&self) -> Vec<String> {
+            self.requests.lock().unwrap().clone()
+        }
+    }
+
+    impl RequestMaker for MockRequestMaker {
+        fn write_source(
+            &self,
+            _org_id: String,
+            _bucket_id: String,
+            body: String,
+        ) -> BoxFuture<'_, Result<usize, Error>> {
+            let sz = body.len();
+            self.requests.lock().unwrap().push(body);
+            async move { Ok(sz) }.boxed()
+        }
+    }
+
+    #[tokio::test]
+    async fn test_batching_by_size() {
+        let mock = Arc::new(MockRequestMaker::new());
+        let client = WriteClient::new_with_maker(Arc::clone(&mock) as _);
+
+        let config = BatchedWriteClientConfig {
+            max_batch_size: 3,
+            flush_interval: Duration::from_secs(3600), // Long interval to not interfere with test
+        };
+
+        let batched = BatchedWriteClient::new(client, config);
+
+        batched.write_lp("test_db", "m1 f=1").await.unwrap();
+        batched.write_lp("test_db", "m2 f=2").await.unwrap();
+        batched.write_lp("test_db", "m3 f=3").await.unwrap();
+
+        let requests = mock.requests();
+        assert_eq!(requests.len(), 1);
+        assert!(requests[0].contains("m1 f=1"));
+        assert!(requests[0].contains("m2 f=2"));
+        assert!(requests[0].contains("m3 f=3"));
+    }
+
+    #[tokio::test]
+    async fn test_explicit_flush() {
+        let mock = Arc::new(MockRequestMaker::new());
+        let client = WriteClient::new_with_maker(Arc::clone(&mock) as _);
+
+        let config = BatchedWriteClientConfig {
+            max_batch_size: 100,
+            flush_interval: Duration::from_secs(3600), // Long interval to not interfere with test
+        };
+
+        let batched = BatchedWriteClient::new(client, config);
+
+        batched.write_lp("test_db", "m1 f=1").await.unwrap();
+        batched.write_lp("test_db", "m2 f=2").await.unwrap();
+
+        batched.flush().await.unwrap();
+
+        let requests = mock.requests();
+        assert_eq!(requests.len(), 1);
+        assert!(requests[0].contains("m1 f=1"));
+        assert!(requests[0].contains("m2 f=2"));
+    }
+
+    #[tokio::test]
+    async fn test_timer_flush() {
+        let mock = Arc::new(MockRequestMaker::new());
+        let client = WriteClient::new_with_maker(Arc::clone(&mock) as _);
+
+        let config = BatchedWriteClientConfig {
+            max_batch_size: 100, // High batch size so it won't trigger
+            flush_interval: Duration::from_millis(100), // Short interval for testing
+        };
+
+        let batched = BatchedWriteClient::new(client, config);
+
+        // Write some data that won't trigger batch size flush
+        batched.write_lp("test_db", "m1 f=1").await.unwrap();
+        batched.write_lp("test_db", "m2 f=2").await.unwrap();
+
+        // Initially no flush should have happened
+        assert_eq!(mock.requests().len(), 0);
+
+        // Wait for the timer to trigger
+        tokio::time::sleep(Duration::from_millis(150)).await;
+
+        // Now the timer should have flushed the data
+        let requests = mock.requests();
+        assert_eq!(requests.len(), 1);
+        assert!(requests[0].contains("m1 f=1"));
+        assert!(requests[0].contains("m2 f=2"));
+    }
+}
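For reference, a minimal sketch of how a caller might opt into the batching added above. The `enable_batching` flag and the connection URL are illustrative assumptions; only `MaybeBatchedWriteClient`, `BatchedWriteClient`, `BatchedWriteClientConfig`, and the existing `WriteClient`/`Builder` come from this diff.

```rust
// Illustrative sketch only: `enable_batching` and the URL are assumptions, not part of the diff.
use std::sync::Arc;

use influxdb_iox_client::{
    batched_write::{BatchedWriteClient, BatchedWriteClientConfig, MaybeBatchedWriteClient},
    connection::Builder,
    write::Client as WriteClient,
};

async fn make_write_client(enable_batching: bool) -> MaybeBatchedWriteClient {
    // Build a connection and the plain write client as in the doc example above.
    let connection = Builder::default()
        .build("http://127.0.0.1:8080")
        .await
        .expect("failed to connect");
    let write_client = WriteClient::new(connection);

    if enable_batching {
        // Defaults: flush after 100 buffered entries or every 3 seconds.
        let batched =
            BatchedWriteClient::new(write_client, BatchedWriteClientConfig::default());
        MaybeBatchedWriteClient::Batched(Arc::new(batched))
    } else {
        MaybeBatchedWriteClient::Unbatched(write_client)
    }
}
```

Holding a single `MaybeBatchedWriteClient` lets call sites invoke `write_lp` the same way whether batching is enabled or not.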
diff --git a/influxdb_iox_client/src/client/table.rs b/influxdb_iox_client/src/client/table.rs
index 0fef4244..3283d34d 100644
--- a/influxdb_iox_client/src/client/table.rs
+++ b/influxdb_iox_client/src/client/table.rs
@@ -103,19 +103,25 @@ impl Client {
     }

     /// Enable iceberg exports for a table
-    pub async fn enable_iceberg(&mut self, table_id: i64) -> Result<(), Error> {
+    pub async fn enable_iceberg(&mut self, table_id: i64, namespace_id: i64) -> Result<(), Error> {
         let _ = self
             .inner
-            .enable_iceberg(EnableIcebergRequest { table_id })
+            .enable_iceberg(EnableIcebergRequest {
+                table_id,
+                namespace_id,
+            })
             .await?;
         Ok(())
     }

     /// Disable iceberg exports for a table
-    pub async fn disable_iceberg(&mut self, table_id: i64) -> Result<(), Error> {
+    pub async fn disable_iceberg(&mut self, table_id: i64, namespace_id: i64) -> Result<(), Error> {
         let _ = self
             .inner
-            .disable_iceberg(DisableIcebergRequest { table_id })
+            .disable_iceberg(DisableIcebergRequest {
+                table_id,
+                namespace_id,
+            })
             .await?;
         Ok(())
     }
diff --git a/influxdb_iox_client/src/client/write.rs b/influxdb_iox_client/src/client/write.rs
index 718b793e..5d321491 100644
--- a/influxdb_iox_client/src/client/write.rs
+++ b/influxdb_iox_client/src/client/write.rs
@@ -13,7 +13,7 @@ use reqwest::{Body, Method};
 pub const DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES: Option<usize> = Some(1024 * 1024);

 /// Name of a database.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct DatabaseName {
     /// The database name.
     database: String,
@@ -53,7 +53,7 @@ impl DatabaseName {
     /// Internally, we speak the v2 protocol which has an "org" parameter. Single tenant instances of InfluxDB
     /// will tolerate the presence of an "org" parameter provided it's an empty string.
-    fn get_org_bucket(&self) -> (String, String) {
+    pub fn get_org_bucket(&self) -> (String, String) {
         let name = self.clone();
         (name.org.unwrap_or_default(), name.database)
     }
@@ -123,7 +123,7 @@ impl Client {
     }

     /// Creates a new client with the provided request maker
-    fn new_with_maker(inner: Arc<dyn RequestMaker>) -> Self {
+    pub fn new_with_maker(inner: Arc<dyn RequestMaker>) -> Self {
         Self {
             inner,
             max_request_payload_size_bytes: DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES,
         }
     }
@@ -225,7 +225,7 @@ impl Client {

 /// Something that knows how to send http data.
Exists so it can be /// mocked out for testing -trait RequestMaker: Debug + Send + Sync { +pub trait RequestMaker: Debug + Send + Sync { /// Write the body data to the specified org, bucket, and /// returning the number of bytes written /// diff --git a/iox_query/Cargo.toml b/iox_query/Cargo.toml index ad969a4e..013dc63b 100644 --- a/iox_query/Cargo.toml +++ b/iox_query/Cargo.toml @@ -29,7 +29,7 @@ datafusion_util = { path = "../datafusion_util" } executor = { path = "../executor" } futures = "0.3" hashbrown = { workspace = true } -indexmap = { version = "2.11", features = ["std"] } +indexmap = { version = "2.12", features = ["std"] } influxdb-line-protocol = { path = "../influxdb_line_protocol" } itertools = "0.13.0" iox_query_params = { path = "../iox_query_params" } @@ -46,7 +46,7 @@ parquet_file = { path = "../parquet_file" } query_functions = { path = "../query_functions" } schema = { path = "../schema" } snafu = "0.8" -tokio = { version = "1.47", features = ["macros", "parking_lot"] } +tokio = { version = "1.48", features = ["macros", "parking_lot"] } tokio-stream = "0.1" trace = { path = "../trace" } tracker = { path = "../tracker" } diff --git a/iox_query/src/analyzer/handle_gapfill.rs b/iox_query/src/analyzer/handle_gapfill.rs index 5012cb28..b5c2f20b 100644 --- a/iox_query/src/analyzer/handle_gapfill.rs +++ b/iox_query/src/analyzer/handle_gapfill.rs @@ -4,7 +4,7 @@ pub mod range_predicate; mod virtual_function; -use crate::exec::gapfill::{FillStrategy, GapFill, GapFillParams}; +use crate::exec::gapfill::{FillExpr, FillStrategy, GapFill}; use datafusion::common::{ DFSchema, ExprSchema, internal_datafusion_err, plan_datafusion_err, plan_err, }; @@ -150,7 +150,7 @@ fn build_gapfill_node( new_aggr_plan: LogicalPlan, date_bin_gapfill_index: usize, date_bin_gapfill_args: Vec, - date_bin_udf: Arc, + date_bin_udf: Arc, ) -> Result { match date_bin_gapfill_args.len() { 2 | 3 => (), @@ -159,7 +159,7 @@ fn build_gapfill_node( "DATE_BIN_GAPFILL expects 2 or 3 arguments, got {nargs}", ))); } - } + }; let mut args_iter = date_bin_gapfill_args.into_iter(); @@ -206,6 +206,15 @@ fn build_gapfill_node( .schema() .qualified_field(date_bin_gapfill_index), )); + let time_column_alias = time_column.name_for_alias()?; + + let time_expr = date_bin_udf + .call(if let Some(origin) = origin { + vec![stride, time_column, origin] + } else { + vec![stride, time_column] + }) + .alias(time_column_alias); let LogicalPlan::Aggregate(aggr) = &new_aggr_plan else { return Err(DataFusionError::Internal(format!( @@ -213,26 +222,19 @@ fn build_gapfill_node( new_aggr_plan.display() ))); }; - let mut new_group_expr: Vec<_> = aggr - .schema - .iter() - .map(|(qualifier, field)| { + + let mut col_it = aggr.schema.iter(); + let series_expr = (&mut col_it) + .take(aggr.group_expr.len()) + .enumerate() + .filter(|(idx, _)| *idx != date_bin_gapfill_index) + .map(|(_, (qualifier, field))| { Expr::Column(datafusion::common::Column::from(( qualifier, field.as_ref(), ))) }) .collect(); - let aggr_expr = new_group_expr.split_off(aggr.group_expr.len()); - - match (aggr_expr.len(), aggr.aggr_expr.len()) { - (f, e) if f != e => { - return Err(internal_datafusion_err!( - "The number of aggregate expressions has gotten lost; expected {e}, found {f}. This is a bug, please report it." - )); - } - _ => (), - } // this schema is used for the `FillStrategy::Default` checks below. 
It also represents the // schema of the projection of `aggr`, meaning that it shows the columns/fields as they exist @@ -241,9 +243,13 @@ fn build_gapfill_node( // value of those types according to the AggregateFunction below, it all works out. let schema = &aggr.schema; - let fill_behavior = aggr_expr - .iter() - .cloned() + let fill_expr = col_it + .map(|(qualifier, field)| { + Expr::Column(datafusion::common::Column::from(( + qualifier, + field.as_ref(), + ))) + }) // `aggr_expr` and `aggr.aggr_expr` should line up in the sense that `aggr.aggr_expr[n]` // represents a transformation that was done to produce `aggr_expr[n]`, so we can zip them // together like this to determine the correct fill type for the produced expression @@ -257,25 +263,32 @@ fn build_gapfill_node( // `col_expr` should be the 'computed'/'transformed' representation of `aggr_expr`, we // `aggr_expr`, we need to make sure that it's a column or else this doesn't really // matter to calculate. - default_return_value_for_aggr_fn(aggr_expr, schema, col_expr.try_as_col()) - .map(|rt| (col_expr, FillStrategy::Default(rt))) + default_return_value_for_aggr_fn(aggr_expr, schema, col_expr.try_as_col()).map(|rt| { + FillExpr { + expr: col_expr, + strategy: FillStrategy::Default(rt), + } + }) }) - .collect::>()?; + .collect::>>()?; + + match (fill_expr.len(), aggr.aggr_expr.len()) { + (f, e) if f != e => { + return Err(internal_datafusion_err!( + "The number of aggregate expressions has gotten lost; expected {e}, found {f}. This is a bug, please report it." + )); + } + _ => (), + } Ok(LogicalPlan::Extension(Extension { node: Arc::new( GapFill::try_new( Arc::new(new_aggr_plan), - new_group_expr, - aggr_expr, - GapFillParams { - date_bin_udf, - stride, - time_column, - origin, - time_range, - fill_strategy: fill_behavior, - }, + series_expr, + time_expr, + fill_expr, + time_range, ) .map_err(|e| e.context("GapFill::try_new"))?, ), @@ -329,7 +342,7 @@ enum RewriteInfo { // The arguments to the call to DATE_BIN_GAPFILL. date_bin_gapfill_args: Vec, // The name of the UDF that provides the DATE_BIN like functionality. 
- date_bin_udf: Arc, + date_bin_udf: Arc, }, } @@ -375,7 +388,7 @@ fn replace_date_bin_gapfill(aggr: Aggregate) -> Result { } }; - let date_bin_udf = Arc::from(date_bin.name()); + let date_bin_udf = Arc::clone(&date_bin); let mut rewriter = DateBinGapfillRewriter { args: None, date_bin, @@ -985,7 +998,7 @@ mod test { insta::assert_yaml_snapshot!( format_analyzed_plan(plan)?, @r#" - - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[avg(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - "GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[avg(temps.temp)], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1014,7 +1027,7 @@ mod test { insta::assert_yaml_snapshot!( format_analyzed_plan(plan)?, @r#" - - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None))], aggr=[[avg(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None)), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - "GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None)), TimestampNanosecond(7, None)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None)), fill=[avg(temps.temp)], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time, TimestampNanosecond(7, None)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None))]], aggr=[[avg(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1043,7 +1056,7 @@ mod test { insta::assert_yaml_snapshot!( format_analyzed_plan(plan)?, @r#" - - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), temps.loc], aggr=[[avg(temps.temp)]], 
time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - "GapFill: series=[temps.loc], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[avg(temps.temp)], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), temps.loc]], aggr=[[avg(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1094,7 +1107,7 @@ mod test { format_analyzed_plan(plan)?, @r#" - "Projection: date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), avg(temps.temp)" - - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[avg(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[avg(temps.temp)], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1128,7 +1141,7 @@ mod test { format_analyzed_plan(plan)?, @r#" - "Projection: date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), avg(temps.temp) AS locf(avg(temps.temp)), min(temps.temp) AS locf(min(temps.temp))" - - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[LOCF(avg(temps.temp)), LOCF(min(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), 
date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[LOCF(avg(temps.temp)), LOCF(min(temps.temp))], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp), min(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1161,7 +1174,7 @@ mod test { format_analyzed_plan(plan)?, @r#" - "Projection: date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), min(temps.temp) AS locf(min(temps.temp)) AS locf_min_temp" - - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[avg(temps.temp), LOCF(min(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[avg(temps.temp), LOCF(min(temps.temp))], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp), min(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1195,7 +1208,7 @@ mod test { format_analyzed_plan(plan)?, @r#" - "Projection: date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), avg(temps.temp) AS interpolate(avg(temps.temp)), min(temps.temp) AS interpolate(min(temps.temp))" - - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[INTERPOLATE(avg(temps.temp)), INTERPOLATE(min(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[INTERPOLATE(avg(temps.temp)), INTERPOLATE(min(temps.temp))], 
range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp), min(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1231,7 +1244,7 @@ mod test { insta::assert_yaml_snapshot!( format_analyzed_plan(plan).unwrap(), @r#" - - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - "GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[]]" - " TableScan: temps projection=[time], full_filters=[temps.time >= TimestampNanosecond(1000, None), temps.time < TimestampNanosecond(2000, None), temps.loc = Utf8(\"foo\")]" "#); diff --git a/iox_query/src/exec.rs b/iox_query/src/exec.rs index c1abe928..e65376cf 100644 --- a/iox_query/src/exec.rs +++ b/iox_query/src/exec.rs @@ -5,6 +5,7 @@ pub(crate) mod context; pub mod gapfill; mod metrics; pub mod query_tracing; +pub mod series_limit; pub mod sleep; pub(crate) mod split; use datafusion_util::config::register_iox_object_store; diff --git a/iox_query/src/exec/context.rs b/iox_query/src/exec/context.rs index a0f69bcf..98ee0a6f 100644 --- a/iox_query/src/exec/context.rs +++ b/iox_query/src/exec/context.rs @@ -4,6 +4,7 @@ use super::{ cross_rt_stream::CrossRtStream, gapfill::{GapFill, plan_gap_fill}, + series_limit::{SeriesLimit, plan_series_limit}, sleep::SleepNode, split::StreamSplitNode, }; @@ -131,6 +132,10 @@ impl ExtensionPlanner for IOxExtensionPlanner { let gap_fill_exec = plan_gap_fill(session_state, gap_fill, logical_inputs, physical_inputs)?; Some(Arc::new(gap_fill_exec)) + } else if let Some(series_limit) = any.downcast_ref::() { + let series_limit_exec = + plan_series_limit(session_state, series_limit, logical_inputs, physical_inputs)?; + Some(Arc::new(series_limit_exec)) } else if let Some(sleep) = any.downcast_ref::() { let sleep = sleep.plan(planner, logical_inputs, physical_inputs, session_state)?; Some(Arc::new(sleep)) diff --git a/iox_query/src/exec/gapfill/algo.rs b/iox_query/src/exec/gapfill/algo.rs index 1c6e283f..b9829931 100644 --- a/iox_query/src/exec/gapfill/algo.rs +++ b/iox_query/src/exec/gapfill/algo.rs @@ -132,17 +132,17 @@ impl GapFiller { pub fn build_gapfilled_output( &mut self, schema: SchemaRef, - input_time_array: (usize, 
&TimestampNanosecondArray), - group_arrays: &[(usize, ArrayRef)], - aggr_arrays: &[(usize, ArrayRef)], + input_time_array: &TimestampNanosecondArray, + series_arrays: &[ArrayRef], + fill_arrays: &[(usize, ArrayRef)], ) -> Result { - let series_ends = self.plan_output_batch(input_time_array.1, group_arrays)?; + let series_ends = self.plan_output_batch(input_time_array, series_arrays)?; self.cursor.remaining_output_batch_size = self.batch_size; self.build_output( schema, input_time_array, - group_arrays, - aggr_arrays, + series_arrays, + fill_arrays, &series_ends, ) } @@ -167,18 +167,15 @@ impl GapFiller { fn plan_output_batch( &mut self, input_time_array: &TimestampNanosecondArray, - group_arr: &[(usize, ArrayRef)], + series_arr: &[ArrayRef], ) -> Result> { - if group_arr.is_empty() { + if series_arr.is_empty() { // there are no group columns, so the output // will be just one big series. return Ok(vec![input_time_array.len()]); } - let sort_columns = group_arr - .iter() - .map(|(_, arr)| Arc::clone(arr)) - .collect::>(); + let sort_columns = series_arr.to_vec(); let mut ranges = partition(&sort_columns)?.ranges().into_iter(); @@ -218,32 +215,27 @@ impl GapFiller { fn build_output( &mut self, schema: SchemaRef, - input_time_array: (usize, &TimestampNanosecondArray), - group_arr: &[(usize, ArrayRef)], - aggr_arr: &[(usize, ArrayRef)], + input_time_array: &TimestampNanosecondArray, + series_arr: &[ArrayRef], + fill_arr: &[(usize, ArrayRef)], series_ends: &[usize], ) -> Result { - let mut output_arrays: Vec<(usize, ArrayRef)> = - Vec::with_capacity(group_arr.len() + aggr_arr.len() + 1); // plus one for time column + let mut output_arrays: Vec = + Vec::with_capacity(series_arr.len() + fill_arr.len() + 1); // plus one for time column // build the time column let mut cursor = self.cursor.clone_for_aggr_col(None)?; - let (time_idx, input_time_array) = input_time_array; let time_vec = cursor.build_time_vec(&self.params, series_ends, input_time_array)?; let output_time_len = time_vec.len(); - output_arrays.push(( - time_idx, - Arc::new( - TimestampNanosecondArray::from(time_vec) - .with_timezone_opt(input_time_array.timezone()), - ), - )); + let time_arr = Arc::new( + TimestampNanosecondArray::from(time_vec).with_timezone_opt(input_time_array.timezone()), + ); // There may not be any aggregate or group columns, so use this cursor state as the new // GapFiller cursor once this output batch is complete. 
let mut final_cursor = cursor; // build the other group columns - for (idx, ga) in group_arr { + for ga in series_arr { let mut cursor = self.cursor.clone_for_aggr_col(None)?; let take_vec = cursor.build_group_take_vec(&self.params, series_ends, input_time_array)?; @@ -255,11 +247,12 @@ impl GapFiller { ))); } let take_arr = UInt64Array::from(take_vec); - output_arrays.push((*idx, take::take(ga, &take_arr, None)?)); + output_arrays.push(take::take(ga, &take_arr, None)?); } + output_arrays.push(time_arr); // Build the aggregate columns - for (idx, aa) in aggr_arr { + for (idx, aa) in fill_arr { let mut cursor = self.cursor.clone_for_aggr_col(Some(*idx))?; let output_array = cursor.build_aggr_col(&self.params, series_ends, input_time_array, aa)?; @@ -270,14 +263,13 @@ impl GapFiller { output_time_len ))); } - output_arrays.push((*idx, output_array)); + output_arrays.push(output_array); final_cursor.merge_aggr_col_cursor(cursor); } - output_arrays.sort_by(|(a, _), (b, _)| a.cmp(b)); - let output_arrays: Vec<_> = output_arrays.into_iter().map(|(_, arr)| arr).collect(); - let batch = RecordBatch::try_new(Arc::clone(&schema), output_arrays) - .map_err(|err| DataFusionError::ArrowError(Box::new(err), None))?; + let batch = RecordBatch::try_new(Arc::clone(&schema), output_arrays).map_err(|err| { + DataFusionError::ArrowError(Box::new(err), None).context("build_output") + })?; self.cursor = final_cursor; Ok(batch) diff --git a/iox_query/src/exec/gapfill/buffered_input.rs b/iox_query/src/exec/gapfill/buffered_input.rs index 82a17a3f..8911dadb 100644 --- a/iox_query/src/exec/gapfill/buffered_input.rs +++ b/iox_query/src/exec/gapfill/buffered_input.rs @@ -26,8 +26,9 @@ use super::{FillStrategy, params::GapFillParams}; /// [`FillStrategy::LinearInterpolate`]: super::FillStrategy::LinearInterpolate /// [`GapFillStream`]: super::stream::GapFillStream pub(super) struct BufferedInput { - /// Indexes of group columns in the schema (not including time). - group_cols: Vec, + /// Indexes of series columns in the schema. These are the columns + /// that will have consistent values for all rows in a time series. + series_cols: Vec, /// Indexes of aggregate columns filled via interpolation. interpolate_cols: Vec, /// Buffered records from the input stream. 
@@ -42,7 +43,7 @@ pub(super) struct BufferedInput { } impl BufferedInput { - pub(super) fn new(params: &GapFillParams, group_cols: Vec) -> Self { + pub(super) fn new(params: &GapFillParams, series_cols: Vec) -> Self { let interpolate_cols = params .fill_strategy .iter() @@ -51,7 +52,7 @@ impl BufferedInput { }) .collect::>(); Self { - group_cols, + series_cols, interpolate_cols, batches: vec![], row_converter: None, @@ -170,7 +171,7 @@ impl BufferedInput { /// /// [`arrow::row`]: https://docs.rs/arrow-row/36.0.0/arrow_row/index.html fn group_columns_changed(&mut self, last_output_row_idx: (usize, usize)) -> Result { - if self.group_cols.is_empty() { + if self.series_cols.is_empty() { return Ok(false); } @@ -193,7 +194,7 @@ impl BufferedInput { if self.row_converter.is_none() { let batch = self.batches.first().expect("at least one batch"); let sort_fields = self - .group_cols + .series_cols .iter() .map(|c| SortField::new(batch.column(*c).data_type().clone())) .collect(); @@ -208,7 +209,7 @@ impl BufferedInput { fn convert_row(&mut self, row_idxs: (usize, usize)) -> Result { let batch = &self.batches[row_idxs.0]; let columns: Vec = self - .group_cols + .series_cols .iter() .map(|col_idx| batch.column(*col_idx).slice(row_idxs.1, 1)) .collect(); diff --git a/iox_query/src/exec/gapfill/exec_tests.rs b/iox_query/src/exec/gapfill/exec_tests.rs index bd5d5dfc..995e12f1 100644 --- a/iox_query/src/exec/gapfill/exec_tests.rs +++ b/iox_query/src/exec/gapfill/exec_tests.rs @@ -15,10 +15,9 @@ use arrow_util::test_util::batches_to_lines; use datafusion::{ error::Result, execution::runtime_env::RuntimeEnvBuilder, - functions::datetime::date_bin::DateBinFunc, physical_plan::{ collect, - expressions::{col as phys_col, lit as phys_lit}, + expressions::{Column, col as phys_col, lit as phys_lit}, test::exec::MockExec, }, prelude::{SessionConfig, SessionContext}, @@ -42,11 +41,13 @@ fn test_gapfill_simple() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; // For this simple test case, also test that // memory is tracked correctly, which is done by @@ -84,11 +85,13 @@ fn test_gapfill_simple_tz() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; // For this simple test case, also test that // memory is tracked correctly, which is done by @@ -129,11 +132,13 @@ fn test_gapfill_simple_no_group_no_aggr() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, Some(975), 1_125); + let time_expr= get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -169,11 +174,13 @@ fn test_gapfill_multi_group_simple() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + 
time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -215,11 +222,13 @@ fn test_gapfill_multi_group_simple_origin() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_origin_fill_strategy(&records, 25, Some(975), 1_125, Some(3), None); + let time_expr = get_date_bin_expr(&records, 25, Some(3)); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -286,11 +295,13 @@ fn test_gapfill_multi_group_with_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -370,11 +381,13 @@ fn test_gapfill_multi_group_cols_with_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -431,11 +444,13 @@ fn test_gapfill_multi_group_cols_with_more_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_025); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_025), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -519,11 +534,13 @@ fn test_gapfill_multi_aggr_cols_with_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -568,11 +585,13 @@ fn test_gapfill_simple_no_lower_bound() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, None, 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(None, 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -637,11 +656,13 @@ fn test_gapfill_fill_prev() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, Some(FillStrategy::PrevNullAsIntentional)); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::PrevNullAsIntentional), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -716,11 +737,13 @@ fn 
test_gapfill_fill_prev_null_as_missing() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, Some(FillStrategy::PrevNullAsMissing)); + let time_expr = get_date_bin_expr(&records, 25, None ); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::PrevNullAsMissing), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -816,11 +839,13 @@ fn test_gapfill_fill_prev_null_as_missing_many_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, Some(FillStrategy::PrevNullAsMissing)); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::PrevNullAsMissing), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -926,17 +951,13 @@ fn test_gapfill_fill_interpolate() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_fill_strategy( - &records, - 25, - Some(975), - 1_125, - Some(FillStrategy::LinearInterpolate) - ); + let time_expr = get_date_bin_expr(&records,25,None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::LinearInterpolate), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -1024,11 +1045,13 @@ fn test_gapfill_simple_no_lower_bound_with_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, None, 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(None, 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -1074,11 +1097,13 @@ fn test_gapfill_oom() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let result = tc.run_with_memory_limit(1); assert_error!(result, DataFusionError::ResourcesExhausted(_)); @@ -1145,17 +1170,13 @@ fn test_gapfill_interpolate_struct() { ]], input_batch_size, }; - let params = get_params_ms_with_fill_strategy( - &records, - 25, - Some(975), - 1_125, - Some(FillStrategy::LinearInterpolate) - ); + let time_expr= get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::LinearInterpolate), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -1251,17 +1272,13 @@ fn test_gapfill_interpolate_struct_additional_data() { ]], input_batch_size, }; - let params = get_params_ms_with_fill_strategy( - &records, - 25, - Some(975), - 1_125, - Some(FillStrategy::LinearInterpolate) - ); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + 
fill_strategy: Some(FillStrategy::LinearInterpolate), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -1391,7 +1408,7 @@ impl TestRecords { for i in 0..self.schema().fields().len() { match i.cmp(&ngroup_cols) { Ordering::Less => group_expr.push(Arc::new(Column::new(&format!("g{i}"), i))), - Ordering::Equal => group_expr.push(Arc::new(Column::new("t", i))), + Ordering::Equal => continue, Ordering::Greater => { let idx = i - ngroup_cols + 1; aggr_expr.push(Arc::new(Column::new(&format!("a{idx}"), i))); @@ -1464,7 +1481,9 @@ impl TryFrom for Vec { struct TestCase { test_records: TestRecords, output_batch_size: usize, - params: GapFillExecParams, + time_expr: Arc, + time_range: Range>>, + fill_strategy: Option, } impl TestCase { @@ -1502,8 +1521,8 @@ impl TestCase { fn plan(self) -> Result> { let schema = self.test_records.schema(); - let (group_expr, aggr_expr) = self.test_records.exprs()?; - + let (series_expr, _) = self.test_records.exprs()?; + let fill_expr = phys_fill_expr(&self.test_records, self.fill_strategy)?; let input_batch_size = self.test_records.input_batch_size; let num_records = self.test_records.len(); @@ -1522,11 +1541,13 @@ impl TestCase { MockExec::new(batches.into_iter().map(Ok).collect(), Arc::clone(&schema)) .with_use_task(false), ); + let plan = Arc::new(GapFillExec::try_new( input, - group_expr, - aggr_expr, - self.params.clone(), + series_expr, + Arc::clone(&self.time_expr), + fill_expr, + self.time_range.clone(), )?); Ok(plan) } @@ -1548,75 +1569,62 @@ fn bound_included_from_option(o: Option) -> Bound { } } -fn phys_fill_strategies( +fn phys_fill_expr( records: &TestRecords, fill_strategy: Option, -) -> Result, FillStrategy)>> { +) -> Result> { let start = records.group_cols.len() + 1; // 1 is for time col let end = start + records.agg_cols.len() + records.struct_cols.len(); let mut v = Vec::with_capacity(records.agg_cols.len()); for f in &records.schema().fields()[start..end] { - v.push(( - phys_col(f.name(), &records.schema())?, - match fill_strategy { + v.push(PhysicalFillExpr { + expr: phys_col(f.name(), &records.schema())?, + strategy: match fill_strategy { Some(ref fs) => fs.clone(), None => FillStrategy::Default(f.data_type().try_into()?), }, - )); + }); } Ok(v) } -fn get_params_ms_with_fill_strategy( - batch: &TestRecords, - stride_ms: i64, - start: Option, - end: i64, - fill_strategy: Option, -) -> GapFillExecParams { - get_params_ms_with_origin_fill_strategy(batch, stride_ms, start, end, None, fill_strategy) -} - -fn get_params_ms_with_origin_fill_strategy( +fn get_date_bin_expr( batch: &TestRecords, stride_ms: i64, - start: Option, - end: i64, origin_ms: Option, - fill_strategy: Option, -) -> GapFillExecParams { - // stride is in ms - let stride = ScalarValue::new_interval_mdn(0, 0, stride_ms * 1_000_000); - let origin = - origin_ms.map(|o| phys_lit(ScalarValue::TimestampNanosecond(Some(o * 1_000_000), None))); +) -> Arc { + let mut args = vec![ + phys_lit(ScalarValue::new_interval_mdn(0, 0, stride_ms * 1_000_000)), + Arc::new(Column::new("t", batch.group_cols.len())), + ]; + args.extend( + origin_ms + .iter() + .map(|ms| phys_lit(ScalarValue::TimestampNanosecond(Some(ms * 1_000_000), None))), + ); + Arc::new(ScalarFunctionExpr::new( + "time", + datafusion::functions::datetime::date_bin(), + args, + Arc::new(Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, batch.timezone.clone()), + true, + )), + )) +} - GapFillExecParams { - date_bin_udf: Arc::new(ScalarUDF::new_from_impl(DateBinFunc::new())), - stride: 
phys_lit(stride), - time_column: Column::new("t", batch.group_cols.len()), - origin, - // timestamps are nanos, so scale them accordingly - time_range: Range { - start: bound_included_from_option(start.map(|start| { - phys_lit(ScalarValue::TimestampNanosecond( - Some(start * 1_000_000), - None, - )) - })), - end: Bound::Included(phys_lit(ScalarValue::TimestampNanosecond( - Some(end * 1_000_000), +fn get_time_range(start: Option, end: i64) -> Range>> { + Range { + start: bound_included_from_option(start.map(|start| { + phys_lit(ScalarValue::TimestampNanosecond( + Some(start * 1_000_000), None, - ))), - }, - fill_strategy: phys_fill_strategies(batch, fill_strategy).unwrap(), + )) + })), + end: Bound::Included(phys_lit(ScalarValue::TimestampNanosecond( + Some(end * 1_000_000), + None, + ))), } } - -fn get_params_ms( - batch: &TestRecords, - stride: i64, - start: Option, - end: i64, -) -> GapFillExecParams { - get_params_ms_with_fill_strategy(batch, stride, start, end, None) -} diff --git a/iox_query/src/exec/gapfill/mod.rs b/iox_query/src/exec/gapfill/mod.rs index 6ccf28fa..3e23043f 100644 --- a/iox_query/src/exec/gapfill/mod.rs +++ b/iox_query/src/exec/gapfill/mod.rs @@ -12,9 +12,11 @@ mod params; mod stream; use self::stream::GapFillStream; +use arrow::datatypes::Schema; use arrow::{compute::SortOptions, datatypes::SchemaRef}; -use datafusion::common::plan_datafusion_err; -use datafusion::physical_expr::{LexOrdering, OrderingRequirements}; +use datafusion::common::{DFSchema, plan_datafusion_err}; +use datafusion::logical_expr::ExprSchemable; +use datafusion::physical_expr::{LexOrdering, OrderingRequirements, ScalarFunctionExpr}; use datafusion::physical_plan::metrics::MetricsSet; use datafusion::{ common::DFSchemaRef, @@ -23,12 +25,11 @@ use datafusion::{ context::{SessionState, TaskContext}, memory_pool::MemoryConsumer, }, - logical_expr::{LogicalPlan, ScalarUDF, UserDefinedLogicalNodeCore}, + logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}, physical_expr::{EquivalenceProperties, PhysicalSortExpr}, physical_plan::{ DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, - Partitioning, PhysicalExpr, PlanProperties, SendableRecordBatchStream, Statistics, - expressions::Column, + PhysicalExpr, PlanProperties, SendableRecordBatchStream, Statistics, metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, }, prelude::Expr, @@ -49,13 +50,19 @@ use std::{ pub struct GapFill { /// The incoming logical plan pub input: Arc, - /// Grouping expressions - pub group_expr: Vec, - /// Aggregate expressions - pub aggr_expr: Vec, - /// Parameters to configure the behavior of the - /// gap-filling operation - pub params: GapFillParams, + /// Series expressions + pub series_expr: Vec, + /// Time binning expr + pub time_expr: Expr, + /// Filling expressions + pub fill_expr: Vec, + /// The time range of the time column inferred from predicates + /// in the overall query. The lower bound may be [`Bound::Unbounded`] + /// which implies that gap-filling should just start from the + /// first point in each series. 
+ pub time_range: Range>, + /// The schema after the gap-fill operation + pub schema: DFSchemaRef, } // Manual impl because GapFillParams has a Range and is not PartialOrd @@ -64,31 +71,21 @@ impl PartialOrd for GapFill { other .input .partial_cmp(&self.input) - .then_with_opt(|| self.group_expr.partial_cmp(&other.group_expr)) - .then_with_opt(|| self.aggr_expr.partial_cmp(&other.aggr_expr)) + .then_with_opt(|| self.series_expr.partial_cmp(&other.series_expr)) + .then_with_opt(|| self.fill_expr.partial_cmp(&other.fill_expr)) } } -/// Parameters to the GapFill operation -#[derive(Clone, Debug, Hash, PartialEq, Eq)] -pub struct GapFillParams { - /// The name of the UDF that provides the DATE_BIN like functionality. - pub date_bin_udf: Arc, - /// The stride argument from the call to DATE_BIN_GAPFILL - pub stride: Expr, - /// The source time column - pub time_column: Expr, - /// The origin argument from the call to DATE_BIN_GAPFILL - pub origin: Option, - /// The time range of the time column inferred from predicates - /// in the overall query. The lower bound may be [`Bound::Unbounded`] - /// which implies that gap-filling should just start from the - /// first point in each series. - pub time_range: Range>, - /// What to do when filling aggregate columns. - /// The first item in the tuple will be the column - /// reference for the aggregate column. - pub fill_strategy: Vec<(Expr, FillStrategy)>, +#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd)] +pub struct FillExpr { + pub expr: Expr, + pub strategy: FillStrategy, +} + +impl std::fmt::Display for FillExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.strategy.display_with_expr(&self.expr)) + } } /// Describes how to fill gaps in an aggregate column. @@ -116,85 +113,8 @@ impl FillStrategy { Self::PrevNullAsMissing => format!("LOCF({expr})"), Self::LinearInterpolate => format!("INTERPOLATE({expr})"), Self::Default(scalar) if scalar.is_null() => expr.to_string(), - Self::Default(val) => format!("COALESCE({val}, {expr})"), - } - } -} - -impl GapFillParams { - // Extract the expressions so they can be optimized. 
- fn expressions(&self) -> Vec { - let mut exprs = vec![self.stride.clone(), self.time_column.clone()]; - if let Some(e) = self.origin.as_ref() { - exprs.push(e.clone()) - } - if let Some(start) = bound_extract(&self.time_range.start) { - exprs.push(start.clone()); - } - exprs.push( - bound_extract(&self.time_range.end) - .unwrap_or_else(|| panic!("upper time bound is required")) - .clone(), - ); - exprs - } - - #[expect(clippy::wrong_self_convention)] // follows convention of UserDefinedLogicalNode - fn from_template(&self, exprs: &[Expr], aggr_expr: &[Expr]) -> Self { - let mut e_iter = exprs.iter().cloned(); - - // we only need the third item in the iter if `Some(_) == self.origin` so that's why we - // match against `None | Some(Some(_))` - that ensures either origin is None, or origin is - // Some and e_iter.next() is Some - let (Some(stride), Some(time_column), origin @ (None | Some(Some(_)))) = ( - e_iter.next(), - e_iter.next(), - self.origin.as_ref().map(|_| e_iter.next()), - ) else { - panic!("`exprs` should contain at least a stride, source, and origin"); - }; - - let origin = origin.flatten(); - - let time_range = match try_map_range(&self.time_range, |b| { - try_map_bound(b.as_ref(), |_| { - Ok::<_, Infallible>(e_iter.next().expect("expr count should match template")) - }) - }) { - Ok(tr) => tr, - Err(infallible) => match infallible {}, - }; - - let fill_strategy = aggr_expr - .iter() - .cloned() - .zip( - self.fill_strategy - .iter() - .map(|(_expr, fill_strategy)| fill_strategy.clone()), - ) - .collect(); - - Self { - date_bin_udf: Arc::clone(&self.date_bin_udf), - stride, - time_column, - origin, - time_range, - fill_strategy, - } - } - - // Find the expression that matches `e` and replace its fill strategy. - // If such an expression is found, return the old strategy, and `None` otherwise. - fn replace_fill_strategy(&mut self, e: &Expr, mut fs: FillStrategy) -> Option { - for expr_fs in &mut self.fill_strategy { - if &expr_fs.0 == e { - std::mem::swap(&mut fs, &mut expr_fs.1); - return Some(fs); - } + Self::Default(val) => format!("COALESCE({expr}, {val})"), } - None } } @@ -202,21 +122,61 @@ impl GapFill { /// Create a new gap-filling operator. 
pub fn try_new( input: Arc, - group_expr: Vec, - aggr_expr: Vec, - params: GapFillParams, + series_expr: Vec, + time_expr: Expr, + fill_expr: Vec, + time_range: Range>, ) -> Result { - if params.time_range.end == Bound::Unbounded { + let (time_alias, time_col) = { + let (time_alias, time_expr) = if let Expr::Alias(alias) = &time_expr { + (Some(alias.name.clone()), alias.expr.as_ref()) + } else { + (None, &time_expr) + }; + let Expr::ScalarFunction(time_func) = time_expr else { + return Err(DataFusionError::Internal( + "GapFill time expression must be a ScalarFunctionExpr".to_string(), + )); + }; + let time_col = time_func.args.get(1).ok_or_else(|| { + DataFusionError::Internal( + "GapFill time expression must have at least two arguments".to_string(), + ) + })?; + (time_alias, time_col.clone()) + }; + + if time_range.end == Bound::Unbounded { return Err(DataFusionError::Internal( "missing upper bound in GapFill time range".to_string(), )); } + let time_schema_expr = if let Some(alias) = &time_alias { + time_col.alias(alias) + } else { + time_col + }; + + let fields = series_expr + .iter() + .chain(std::iter::once(&time_schema_expr)) + .chain(fill_expr.iter().map(|fe| &fe.expr)) + .map(|expr| expr.to_field(input.schema().as_ref())) + .collect::>>()?; + + let schema = Arc::new(DFSchema::new_with_metadata( + fields, + input.schema().metadata().clone(), + )?); + Ok(Self { input, - group_expr, - aggr_expr, - params, + series_expr, + time_expr, + fill_expr, + time_range, + schema, }) } @@ -225,9 +185,15 @@ impl GapFill { pub(crate) fn replace_fill_strategy( &mut self, e: &Expr, - fs: FillStrategy, + mut fs: FillStrategy, ) -> Option { - self.params.replace_fill_strategy(e, fs) + for fe in &mut self.fill_expr { + if &fe.expr == e { + std::mem::swap(&mut fe.strategy, &mut fs); + return Some(fs); + } + } + None } } @@ -241,29 +207,39 @@ impl UserDefinedLogicalNodeCore for GapFill { } fn schema(&self) -> &DFSchemaRef { - self.input.schema() + &self.schema } fn expressions(&self) -> Vec { - self.group_expr - .iter() - .chain(&self.aggr_expr) - .chain(&self.params.expressions()) - .cloned() - .collect() + let mut exprs = Vec::with_capacity(self.series_expr.len() + 1 + self.fill_expr.len() + 2); + for e in &self.series_expr { + exprs.push(e.clone()); + } + exprs.push(self.time_expr.clone()); + for fe in &self.fill_expr { + exprs.push(fe.expr.clone()); + } + if let Some(start) = bound_extract(&self.time_range.start) { + exprs.push(start.clone()); + } + exprs.push( + bound_extract(&self.time_range.end) + .unwrap_or_else(|| panic!("upper time bound is required")) + .clone(), + ); + exprs } fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let aggr_expr: String = self - .params - .fill_strategy + let fill_expr: String = self + .fill_expr .iter() - .map(|(e, fs)| fs.display_with_expr(e)) + .map(|e| e.to_string()) .collect::>() .join(", "); - let group_expr = self - .group_expr + let series_expr = self + .series_expr .iter() .map(|e| e.to_string()) .collect::>() @@ -271,24 +247,48 @@ impl UserDefinedLogicalNodeCore for GapFill { write!( f, - "{}: groupBy=[{group_expr}], aggr=[[{aggr_expr}]], time_column={}, stride={}, range={:?}", + "{}: series=[{series_expr}], time={}, fill=[{fill_expr}], range={:?}", self.name(), - self.params.time_column, - self.params.stride, - self.params.time_range, + self.time_expr, + self.time_range, ) } fn with_exprs_and_inputs( &self, - mut group_expr: Vec, + mut series_expr: Vec, inputs: Vec, ) -> Result { let plan = inputs[0].clone(); - let mut 
aggr_expr = group_expr.split_off(self.group_expr.len()); - let param_expr = aggr_expr.split_off(self.aggr_expr.len()); - let params = self.params.from_template(¶m_expr, &aggr_expr); - Self::try_new(Arc::new(plan), group_expr, aggr_expr, params) + let mut fill_expr = series_expr.split_off(self.series_expr.len() + 1); + let time_expr = series_expr + .pop() + .expect("there should be at least one series expr (the time expr)"); + let mut e_iter = fill_expr.split_off(self.fill_expr.len()).into_iter(); + let time_range = match try_map_range(&self.time_range, |b| { + try_map_bound(b.as_ref(), |_| { + Ok::<_, Infallible>(e_iter.next().expect("expr count should match template")) + }) + }) { + Ok(tr) => tr, + Err(infallible) => match infallible {}, + }; + + let fill_expr = fill_expr + .into_iter() + .zip(self.fill_expr.iter().map(|fe| fe.strategy.clone())) + .map(|(e, fs)| FillExpr { + expr: e, + strategy: fs, + }) + .collect(); + Self::try_new( + Arc::new(plan), + series_expr, + time_expr, + fill_expr, + time_range, + ) } /// Projection pushdown is an optmization that pushes a `Projection` node further down @@ -353,79 +353,37 @@ pub(crate) fn plan_gap_fill( } }; - let input_schema = phys_input.schema(); - let input_schema = input_schema.as_ref(); - - let group_expr: Result> = gap_fill - .group_expr - .iter() - .map(|e| session_state.create_physical_expr(e.clone(), input_dfschema)) - .collect(); - let group_expr = group_expr?; - - let aggr_expr: Result> = gap_fill - .aggr_expr + let series_expr = gap_fill + .series_expr .iter() - .map(|e| session_state.create_physical_expr(e.clone(), input_dfschema)) - .collect(); - let aggr_expr = aggr_expr?; - - let Some(logical_time_column) = gap_fill.params.time_column.try_as_col() else { - return Err(DataFusionError::Internal( - "GapFillExec: time column must be a `Column` expression".to_string(), - )); - }; - let time_column = Column::new_with_schema(&logical_time_column.name, input_schema)?; - - let stride = - session_state.create_physical_expr(gap_fill.params.stride.clone(), input_dfschema)?; - - let time_range = &gap_fill.params.time_range; - let time_range = try_map_range(time_range, |b| { + .map(|expr| session_state.create_physical_expr(expr.clone(), input_dfschema)) + .collect::>>()?; + let time_expr = + session_state.create_physical_expr(gap_fill.time_expr.clone(), input_dfschema)?; + let time_range = try_map_range(&gap_fill.time_range, |b| { try_map_bound(b.as_ref(), |e| { session_state.create_physical_expr(e.clone(), input_dfschema) }) })?; - let origin = gap_fill - .params - .origin - .as_ref() - .map(|e| session_state.create_physical_expr(e.clone(), input_dfschema)) - .transpose()?; - - let fill_strategy = gap_fill - .params - .fill_strategy + let fill_expr = gap_fill + .fill_expr .iter() - .map(|(e, fs)| { - Ok(( - session_state.create_physical_expr(e.clone(), input_dfschema)?, - fs.clone(), - )) + .map(|fe| { + Ok(PhysicalFillExpr { + expr: session_state.create_physical_expr(fe.expr.clone(), input_dfschema)?, + strategy: fe.strategy.clone(), + }) }) - .collect::, FillStrategy)>>>()?; - - let date_bin_udf = session_state - .scalar_functions() - .get(gap_fill.params.date_bin_udf.as_ref()) - .cloned() - .ok_or_else(|| { - DataFusionError::Execution(format!( - "ScalarUDF {} not found", - gap_fill.params.date_bin_udf - )) - })?; - - let params = GapFillExecParams { - date_bin_udf, - stride, - time_column, - origin, + .collect::>>()?; + + GapFillExec::try_new( + Arc::clone(phys_input), + series_expr, + time_expr, + fill_expr, time_range, - 
fill_strategy, - }; - GapFillExec::try_new(Arc::clone(phys_input), group_expr, aggr_expr, params) + ) } fn try_map_range(tr: &Range, mut f: F) -> Result, E> @@ -459,96 +417,97 @@ fn bound_extract(b: &Bound) -> Option<&T> { /// A physical node for the gap-fill operation. pub struct GapFillExec { input: Arc, - // The group by expressions from the original aggregation node. - group_expr: Vec>, - // The aggregate expressions from the original aggregation node. - aggr_expr: Vec>, + // Expressions which separate the time-series that are being filled. + series_expr: Vec>, + /// The time expression within the series. + time_expr: Arc, + /// Expressions that describe how values are filled. + fill_expr: Vec, + /// The output schema. + schema: SchemaRef, // The sort expressions for the required sort order of the input: // all of the group expressions, with the time column being last. sort_expr: LexOrdering, - // Parameters (besides streaming data) to gap filling - params: GapFillExecParams, + /// The time range of source input to DATE_BIN_GAPFILL. + /// Inferred from predicates in the overall query. + time_range: Range>>, /// Metrics reporting behavior during execution. metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning, output ordering etc. cache: PlanProperties, } -#[derive(Clone, Debug)] -struct GapFillExecParams { - /// The scalar function used to bin the timestamps. - date_bin_udf: Arc, - /// The uniform interval of incoming timestamps - stride: Arc, - /// The timestamp column produced by date_bin - time_column: Column, - /// The origin argument from the all to DATE_BIN_GAPFILL - origin: Option>, - /// The time range of source input to DATE_BIN_GAPFILL. - /// Inferred from predicates in the overall query. - time_range: Range>>, - /// What to do when filling aggregate columns. - /// The 0th element in each tuple is the aggregate column. - fill_strategy: Vec<(Arc, FillStrategy)>, -} - impl GapFillExec { fn try_new( input: Arc, - group_expr: Vec>, - aggr_expr: Vec>, - params: GapFillExecParams, + series_expr: Vec>, + time_expr: Arc, + fill_expr: Vec, + time_range: Range>>, ) -> Result { + let time_col = { + let Some(time_func) = time_expr.as_any().downcast_ref::() else { + return Err(DataFusionError::Internal(format!( + "GapFill time expression must be a ScalarFunctionExpr: {}", + time_expr + ))); + }; + let Some(time_col) = time_func.args().get(1) else { + return Err(DataFusionError::Internal(format!( + "GapFill time expression must have at least two arguments: {}", + time_expr + ))); + }; + + Arc::clone(time_col) + }; let sort_expr = { - let mut sort_expr: Vec<_> = group_expr + let sort_expr: Vec<_> = series_expr .iter() .map(|expr| PhysicalSortExpr { expr: Arc::clone(expr), options: SortOptions::default(), }) + // Add the time input as the lowest priority sort key. + .chain(std::iter::once(PhysicalSortExpr { + expr: Arc::clone(&time_col), + options: SortOptions::default(), + })) .collect(); - // Ensure that the time column is the last component in the sort - // expressions.
- let time_idx = group_expr - .iter() - .enumerate() - .find(|(_i, e)| { - e.as_any() - .downcast_ref::() - .is_some_and(|c| c.index() == params.time_column.index()) - }) - .map(|(i, _)| i); - - if let Some(time_idx) = time_idx { - let last_elem = sort_expr.len() - 1; - sort_expr.swap(time_idx, last_elem); - } else { - return Err(DataFusionError::Internal( - "could not find time column for GapFillExec".to_string(), - )); - } - LexOrdering::new(sort_expr) .ok_or_else(|| plan_datafusion_err!("GapFill sort key empty"))? }; - let cache = Self::compute_properties(&input); + let input_schema = input.schema(); + let fields = series_expr + .iter() + .chain(std::iter::once(&time_col)) + .chain(fill_expr.iter().map(|fe| &fe.expr)) + .map(|expr| expr.return_field(&input_schema)) + .collect::>>()?; + let schema = Arc::new(Schema::new_with_metadata( + fields, + input_schema.metadata().clone(), + )); + + let cache = Self::compute_properties(&input, Arc::clone(&schema)); Ok(Self { input, - group_expr, - aggr_expr, + series_expr, + time_expr, + fill_expr, + schema, sort_expr, - params, + time_range, metrics: ExecutionPlanMetricsSet::new(), cache, }) } /// This function creates the cache object that stores the plan properties such as equivalence properties, partitioning, ordering, etc. - fn compute_properties(input: &Arc) -> PlanProperties { - let schema = input.schema(); + fn compute_properties(input: &Arc, schema: SchemaRef) -> PlanProperties { let eq_properties = match input.properties().output_ordering() { None => EquivalenceProperties::new(schema), Some(output_ordering) => EquivalenceProperties::new_with_orderings( @@ -557,11 +516,9 @@ impl GapFillExec { ), }; - let output_partitioning = Partitioning::UnknownPartitioning(1); - PlanProperties::new( eq_properties, - output_partitioning, + input.properties().output_partitioning().clone(), input.pipeline_behavior(), input.boundedness(), ) @@ -584,7 +541,7 @@ impl ExecutionPlan for GapFillExec { } fn schema(&self) -> SchemaRef { - self.input.schema() + Arc::clone(&self.schema) } fn properties(&self) -> &PlanProperties { @@ -592,9 +549,14 @@ impl ExecutionPlan for GapFillExec { } fn required_input_distribution(&self) -> Vec { - // It seems like it could be possible to partition on all the - // group keys except for the time expression. For now, keep it simple. - vec![Distribution::SinglePartition] + vec![if self.series_expr.is_empty() { + // If there are no series expressions then the input is a + // single time series. There is no advantage to partitioning + // in that case. 
+ Distribution::SinglePartition + } else { + Distribution::HashPartitioned(self.series_expr.clone()) + }] } fn required_input_ordering(&self) -> Vec> { @@ -618,9 +580,10 @@ impl ExecutionPlan for GapFillExec { match children.as_slice() { [child] => Ok(Arc::new(Self::try_new( Arc::clone(child), - self.group_expr.clone(), - self.aggr_expr.clone(), - self.params.clone(), + self.series_expr.clone(), + Arc::clone(&self.time_expr), + self.fill_expr.clone(), + self.time_range.clone(), )?)), _ => Err(DataFusionError::Internal(format!( "GapFillExec wrong number of children: expected 1, found {}", @@ -634,9 +597,15 @@ impl ExecutionPlan for GapFillExec { partition: usize, context: Arc, ) -> Result { - if partition != 0 { + if partition + >= self + .input + .properties() + .output_partitioning() + .partition_count() + { return Err(DataFusionError::Internal(format!( - "GapFillExec invalid partition {partition}, there can be only one partition" + "GapFillExec invalid partition {partition}" ))); } @@ -669,15 +638,19 @@ impl DisplayAs for GapFillExec { DisplayFormatType::Default | DisplayFormatType::Verbose | DisplayFormatType::TreeRender => { - let group_expr: Vec<_> = self.group_expr.iter().map(|e| e.to_string()).collect(); - let aggr_expr: Vec<_> = self - .params - .fill_strategy + let series_expr: Vec<_> = self.series_expr.iter().map(|e| e.to_string()).collect(); + let fill_expr: Vec<_> = self + .fill_expr .iter() - .map(|(e, fs)| fs.display_with_expr(e)) + .map( + |PhysicalFillExpr { + expr: e, + strategy: fs, + }| fs.display_with_expr(e), + ) .collect(); - let time_range = match try_map_range(&self.params.time_range, |b| { + let time_range = match try_map_range(&self.time_range, |b| { try_map_bound(b.as_ref(), |e| Ok::<_, Infallible>(e.to_string())) }) { Ok(tr) => tr, @@ -686,10 +659,10 @@ impl DisplayAs for GapFillExec { write!( f, - "GapFillExec: group_expr=[{}], aggr_expr=[{}], stride={}, time_range={:?}", - group_expr.join(", "), - aggr_expr.join(", "), - self.params.stride, + "GapFillExec: series_expr=[{}], time_expr={}, fill_expr=[{}], time_range={:?}", + series_expr.join(", "), + self.time_expr, + fill_expr.join(", "), time_range ) } @@ -697,6 +670,19 @@ impl DisplayAs for GapFillExec { } } +/// A physical expression that represents a fill operation. 
+#[derive(Debug, Clone)] +pub struct PhysicalFillExpr { + pub expr: Arc, + pub strategy: FillStrategy, +} + +impl std::fmt::Display for PhysicalFillExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.strategy.display_with_expr(&self.expr)) + } +} + #[cfg(test)] mod test { use std::ops::{Bound, Range}; @@ -712,8 +698,8 @@ mod test { common::DFSchema, datasource::empty::EmptyTable, error::Result, - logical_expr::{ExprSchemable, Extension, UserDefinedLogicalNode, logical_plan}, - prelude::{col, lit}, + logical_expr::{ExprSchemable, Extension, logical_plan}, + prelude::{col, date_bin, lit}, scalar::ScalarValue, }; use datafusion_util::lit_timestamptz_nano; @@ -737,12 +723,15 @@ mod test { logical_plan::table_scan(Some("temps"), &schema, None)?.build() } - fn fill_strategy_null(cols: Vec, schema: &DFSchema) -> Vec<(Expr, FillStrategy)> { + fn fill_strategy_null(cols: Vec, schema: &DFSchema) -> Vec { cols.into_iter() .map(|e| { e.get_type(schema) .and_then(|dt| dt.try_into()) - .map(|null| (e, FillStrategy::Default(null))) + .map(|null| FillExpr { + expr: e, + strategy: FillStrategy::Default(null), + }) }) .collect::>>() .unwrap() @@ -754,113 +743,22 @@ mod test { let schema = Arc::clone(scan.schema()); let result = GapFill::try_new( Arc::new(scan), - vec![col("loc"), col("time")], - vec![col("temp")], - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: None, - time_range: Range { - start: Bound::Included(lit_timestamptz_nano(1000)), - end: Bound::Unbounded, - }, - fill_strategy: fill_strategy_null(vec![col("temp")], schema.as_ref()), + vec![col("loc")], + date_bin( + lit(ScalarValue::new_interval_dt(0, 60_000)), + col("time"), + lit_timestamptz_nano(0), + ), + fill_strategy_null(vec![col("temp")], schema.as_ref()), + Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Unbounded, }, ); assert_error!(result, DataFusionError::Internal(ref msg) if msg == "missing upper bound in GapFill time range"); } - fn assert_gapfill_from_template_roundtrip(gapfill: &GapFill) { - let gapfill_as_node: &dyn UserDefinedLogicalNode = gapfill; - let scan = table_scan().unwrap(); - let exprs = gapfill_as_node.expressions(); - let want_exprs = gapfill.group_expr.len() - + gapfill.aggr_expr.len() - + 2 // stride, time - + gapfill.params.origin.iter().count() - + bound_extract(&gapfill.params.time_range.start).iter().count() - + bound_extract(&gapfill.params.time_range.end).iter().count(); - assert_eq!(want_exprs, exprs.len()); - let gapfill_ft = gapfill_as_node - .with_exprs_and_inputs(exprs, vec![scan]) - .expect("should be able to create a new `UserDefinedLogicalNode` node"); - let gapfill_ft = gapfill_ft - .as_any() - .downcast_ref::() - .expect("should be a GapFill"); - assert_eq!(gapfill.group_expr, gapfill_ft.group_expr); - assert_eq!(gapfill.aggr_expr, gapfill_ft.aggr_expr); - assert_eq!(gapfill.params, gapfill_ft.params); - } - - #[test] - fn test_from_template() { - let schema = schema().try_into().unwrap(); - - for params in vec![ - // no origin, no start bound - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: None, - time_range: Range { - start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), - }, - // no origin, yes start bound - GapFillParams { - 
date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: None, - time_range: Range { - start: Bound::Included(lit_timestamptz_nano(1000)), - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), - }, - // yes origin, no start bound - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: Some(lit_timestamptz_nano(1_000_000_000)), - time_range: Range { - start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), - }, - // yes origin, yes start bound - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: Some(lit_timestamptz_nano(1_000_000_000)), - time_range: Range { - start: Bound::Included(lit_timestamptz_nano(1000)), - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), - }, - ] { - let scan = table_scan().unwrap(); - let gapfill = GapFill::try_new( - Arc::new(scan.clone()), - vec![col("loc"), col("time")], - vec![col("temp")], - params, - ) - .unwrap(); - assert_gapfill_from_template_roundtrip(&gapfill); - } - } - #[test] fn fmt_logical_plan() -> Result<()> { // This test case does not make much sense but @@ -870,18 +768,16 @@ mod test { let schema = Arc::clone(scan.schema()); let gapfill = GapFill::try_new( Arc::new(scan), - vec![col("loc"), col("time")], - vec![col("temp")], - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: None, - time_range: Range { - start: Bound::Included(lit_timestamptz_nano(1000)), - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), + vec![col("loc")], + date_bin( + lit(ScalarValue::new_interval_dt(0, 60_000)), + col("time"), + lit_timestamptz_nano(0), + ), + fill_strategy_null(vec![col("temp")], &schema), + Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, )?; let plan = LogicalPlan::Extension(Extension { @@ -891,7 +787,7 @@ mod test { insta::assert_yaml_snapshot!( format_logical_plan(&plan), @r#" - - " GapFill: groupBy=[loc, time], aggr=[[temp]], time_column=time, stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[loc], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), time, TimestampNanosecond(0, None)), fill=[temp], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " TableScan: temps" "# ); @@ -922,7 +818,7 @@ mod test { explain, @r#" - " ProjectionExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as minute, avg(temps.temp)@1 as avg(temps.temp)]" - - " GapFillExec: group_expr=[date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0], 
aggr_expr=[avg(temps.temp)@1], stride=IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" + - " GapFillExec: series_expr=[], time_expr=date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0, 0), fill_expr=[avg(temps.temp)@1], time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" - " SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC], preserve_partitioning=[false]" - " AggregateExec: mode=Single, gby=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[avg(temps.temp)]" - " EmptyExec" @@ -949,8 +845,8 @@ mod test { insta::assert_yaml_snapshot!( explain, @r#" - - " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as minute, concat(Utf8(\"zz\"),temps.loc)@2 as loczz, avg(temps.temp)@3 as avg(temps.temp)]" - - " GapFillExec: group_expr=[loc@0, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, concat(Utf8(\"zz\"),temps.loc)@2], aggr_expr=[avg(temps.temp)@3], stride=IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" + - " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@2 as minute, concat(Utf8(\"zz\"),temps.loc)@1 as loczz, avg(temps.temp)@3 as avg(temps.temp)]" + - " GapFillExec: series_expr=[loc@0, concat(Utf8(\"zz\"),temps.loc)@2], time_expr=date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, 0), fill_expr=[avg(temps.temp)@3], time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" - " SortExec: expr=[loc@0 ASC, concat(Utf8(\"zz\"),temps.loc)@2 ASC, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC], preserve_partitioning=[false]" - " AggregateExec: mode=Single, gby=[loc@1 as loc, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[avg(temps.temp)]" - " EmptyExec" diff --git a/iox_query/src/exec/gapfill/params.rs b/iox_query/src/exec/gapfill/params.rs index b8a9ff88..e53df364 100644 --- a/iox_query/src/exec/gapfill/params.rs +++ b/iox_query/src/exec/gapfill/params.rs @@ -1,5 +1,5 @@ //! 
Evaluate the parameters to be used for gap filling. -use std::ops::Bound; +use std::ops::{Bound, Range}; use std::sync::Arc; use arrow::{ @@ -7,20 +7,23 @@ use arrow::{ record_batch::RecordBatch, }; use chrono::Duration; +use datafusion::physical_plan::expressions::Column; use datafusion::{ common::exec_err, error::{DataFusionError, Result}, functions::datetime::date_bin::DateBinFunc, logical_expr::ScalarFunctionArgs, - physical_expr::PhysicalExpr, - physical_plan::{ColumnarValue, expressions::Column}, + physical_expr::{PhysicalExpr, ScalarFunctionExpr}, + physical_plan::ColumnarValue, scalar::ScalarValue, }; use hashbrown::HashMap; use query_functions::date_bin_wallclock::DateBinWallclockUDF; +use crate::exec::gapfill::PhysicalFillExpr; + use super::{ - FillStrategy, GapExpander, GapFillExecParams, date_bin_gap_expander::DateBinGapExpander, + FillStrategy, GapExpander, date_bin_gap_expander::DateBinGapExpander, date_bin_wallclock_gap_expander::DateBinWallclockGapExpander, try_map_bound, try_map_range, }; @@ -47,22 +50,40 @@ pub(crate) struct GapFillParams { impl GapFillParams { /// Create a new [GapFillParams] by figuring out the actual values (as native i64) for the stride, /// first and last timestamp for gap filling. - pub(super) fn try_new(schema: SchemaRef, params: &GapFillExecParams) -> Result { - let time_data_type = params.time_column.data_type(schema.as_ref())?; + pub(super) fn try_new( + schema: SchemaRef, + time_expr: &Arc, + fill_expr: &[PhysicalFillExpr], + time_range: &Range>>, + ) -> Result { + let Some(time_func) = time_expr.as_any().downcast_ref::() else { + return Err(DataFusionError::Internal(format!( + "time_expr was not a function call: {time_expr}" + ))); + }; + + let time_data_type = time_func.data_type(schema.as_ref())?; let DataType::Timestamp(_, tz) = time_data_type else { return exec_err!("invalid data type for time column: {time_data_type}"); }; let batch = RecordBatch::new_empty(schema); - let stride = params.stride.evaluate(&batch)?; - let origin = params - .origin - .as_ref() - .map(|e| e.evaluate(&batch)) - .transpose()?; + let (stride, origin) = match time_func.args() { + [stride, _] => (Arc::clone(stride), None), + [stride, _, origin] => (Arc::clone(stride), Some(Arc::clone(origin))), + _ => { + return Err(DataFusionError::Internal(format!( + "unexpected arguments to time_expr: {:?}", + time_func.args() + ))); + } + }; + + let stride = stride.evaluate(&batch)?; + let origin = origin.as_ref().map(|e| e.evaluate(&batch)).transpose()?; // Evaluate the upper and lower bounds of the time range - let range = try_map_range(¶ms.time_range, |b| { + let range = try_map_range(time_range, |b| { try_map_bound(b.as_ref(), |pe| { extract_timestamp_nanos(&pe.evaluate(&batch)?) }) @@ -103,55 +124,48 @@ impl GapFillParams { )); let first_ts = first_ts .map(|_| { - extract_timestamp_nanos(¶ms.date_bin_udf.invoke_with_args( - ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields(&args), - number_rows: 1, - return_field: Arc::clone(&return_field), - }, - )?) + extract_timestamp_nanos(&time_func.fun().invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields(&args), + number_rows: 1, + return_field: Arc::clone(&return_field), + })?) 
}) .transpose()?; args[1] = i64_to_columnar_ts(Some(last_ts), &tz); - let last_ts = extract_timestamp_nanos(¶ms.date_bin_udf.invoke_with_args( - ScalarFunctionArgs { + let last_ts = + extract_timestamp_nanos(&time_func.fun().invoke_with_args(ScalarFunctionArgs { args: args.clone(), arg_fields: arg_fields(&args), number_rows: 1, return_field: Arc::clone(&return_field), - }, - )?)?; + })?)?; let gap_expander: Arc = - if params.date_bin_udf.inner().as_any().is::() { + if time_func.fun().inner().as_any().is::() { Arc::new(DateBinGapExpander::new(stride_nanos)) - } else if params - .date_bin_udf - .inner() - .as_any() - .is::() - { + } else if time_func.fun().inner().as_any().is::() { Arc::new(DateBinWallclockGapExpander::try_from_df_args(&args)?) } else { return Err(DataFusionError::Execution(format!( "gap filling not supported for {}", - params.date_bin_udf.name() + time_func.fun().name() ))); }; - let fill_strategy = params - .fill_strategy + let fill_strategy = fill_expr .iter() - .map(|(e, fs)| { - let idx = e + .map(|pfe| { + let idx = pfe + .expr .as_any() .downcast_ref::() .ok_or(DataFusionError::Internal(format!( - "fill strategy aggr expr was not a column: {e:?}", + "fill strategy aggr expr was not a column: {:?}", + pfe.expr )))? .index(); - Ok((idx, fs.clone())) + Ok((idx, pfe.strategy.clone())) }) .collect::>>()?; @@ -237,7 +251,7 @@ mod tests { use crate::exec::{ Executor, - gapfill::{FillStrategy, GapFillExec, GapFillExecParams}, + gapfill::{FillStrategy, GapFillExec}, }; #[tokio::test] @@ -367,23 +381,29 @@ mod tests { #[test] fn test_params_no_start() { - let exec_params = GapFillExecParams { - date_bin_udf: Arc::new(ScalarUDF::new_from_impl(DateBinFunc::new())), - stride: interval(1_000_000_000), - time_column: Column::new("time", 0), - origin: None, - time_range: Range { - start: Bound::Unbounded, - end: Bound::Excluded(timestamp(20_000_000_000)), - }, - fill_strategy: std::iter::once(( - Arc::new(Column::new("a0", 1)) as Arc, - FillStrategy::Default(ScalarValue::Null), - )) - .collect(), + let time_range = Range { + start: Bound::Unbounded, + end: Bound::Excluded(timestamp(20_000_000_000)), }; - let params = GapFillParams::try_new(schema().into(), &exec_params).unwrap(); + let time_expr: Arc = Arc::new(ScalarFunctionExpr::new( + "time", + Arc::new(ScalarUDF::new_from_impl(DateBinFunc::new())), + vec![interval(1_000_000_000), Arc::new(Column::new("time", 0))], + Arc::new(Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + )); + + let fill_expr = vec![PhysicalFillExpr { + expr: Arc::new(Column::new("a0", 1)), + strategy: FillStrategy::Default(ScalarValue::Null), + }]; + + let params = + GapFillParams::try_new(schema().into(), &time_expr, &fill_expr, &time_range).unwrap(); assert_eq!( params.gap_expander.to_string(), "DateBinGapExpander [stride=PT1S]" @@ -419,9 +439,11 @@ mod tests { let physical_plan = context.sql_to_physical_plan(sql).await?; let gapfill_node = &physical_plan.children()[0]; let gapfill_node = gapfill_node.as_any().downcast_ref::().unwrap(); - let exec_params = &gapfill_node.params; + let time_expr = &gapfill_node.time_expr; + let fill_expr = &gapfill_node.fill_expr; + let time_range = &gapfill_node.time_range; let schema = schema(); - GapFillParams::try_new(schema.into(), exec_params) + GapFillParams::try_new(schema.into(), time_expr, fill_expr, time_range) } fn simple_fill_strategy() -> HashMap { diff --git a/iox_query/src/exec/gapfill/stream.rs b/iox_query/src/exec/gapfill/stream.rs index 214e7271..fda86e07 100644 --- 
a/iox_query/src/exec/gapfill/stream.rs +++ b/iox_query/src/exec/gapfill/stream.rs @@ -14,6 +14,7 @@ use arrow_util::optimize::optimize_dictionaries; use datafusion::{ error::{DataFusionError, Result}, execution::memory_pool::MemoryReservation, + physical_expr::ScalarFunctionExpr, physical_plan::{ ExecutionPlan, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, expressions::Column, @@ -35,15 +36,14 @@ use super::{GapFillExec, algo::GapFiller, buffered_input::BufferedInput, params: pub(super) struct GapFillStream { /// The schema of the input and output. schema: SchemaRef, + /// The columns that define the time series that a value belongs to. + series_expr: Vec>, /// The column from the input that contains the timestamps for each row. /// This column has already had `date_bin` applied to it by a previous `Aggregate` /// operator. time_expr: Arc, - /// The other columns from the input that appeared in the GROUP BY clause of the - /// original query. - group_expr: Vec>, /// The aggregate columns from the select list of the original query. - aggr_expr: Vec>, + fill_expr: Vec>, /// The producer of the input record batches. input: SendableRecordBatchStream, /// Input that has been read from the input stream. @@ -69,35 +69,40 @@ impl GapFillStream { ) -> Result { let schema = exec.schema(); let GapFillExec { - sort_expr, - aggr_expr, - params, + series_expr, + time_expr, + fill_expr, + time_range, .. } = exec; - if sort_expr.is_empty() { + let series_cols = series_expr.iter().map(expr_to_index).collect::>(); + let params = GapFillParams::try_new(Arc::clone(&schema), time_expr, fill_expr, time_range)?; + let buffered_input = BufferedInput::new(¶ms, series_cols); + + let time_expr = if let Some(func) = time_expr.as_any().downcast_ref::() + { + // The time_expr has already been determined to be a + // date_bin call. Thie input time column is the second + // argument. 
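+            // (`date_bin`-style calls take `(stride, source[, origin])` arguments;
+            // using the source argument directly gives the stream a plain column
+            // reference to the already-binned values rather than re-evaluating
+            // `date_bin` for every input batch.)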
+ Arc::clone(&func.args()[1]) + } else { return Err(DataFusionError::Internal( - "empty sort_expr vector for gap filling; should have at least a time expression" - .to_string(), + "time_expr must be a ScalarFunctionExpr".to_string(), )); - } - let mut group_expr = sort_expr + }; + + let fill_expr = fill_expr .iter() - .map(|se| Arc::clone(&se.expr)) + .map(|pfe| Arc::clone(&pfe.expr)) .collect::>(); - let aggr_expr = aggr_expr.to_owned(); - let time_expr = group_expr.split_off(group_expr.len() - 1).pop().unwrap(); - - let group_cols = group_expr.iter().map(expr_to_index).collect::>(); - let params = GapFillParams::try_new(Arc::clone(&schema), params)?; - let buffered_input = BufferedInput::new(¶ms, group_cols); let gap_filler = GapFiller::new(params, batch_size); Ok(Self { schema, + series_expr: series_expr.clone(), time_expr, - group_expr, - aggr_expr, + fill_expr, input, buffered_input, gap_filler, @@ -180,7 +185,7 @@ impl GapFillStream { let old_size = batches.iter().map(|rb| rb.get_array_memory_size()).sum(); - let mut batch = arrow::compute::concat_batches(&self.schema, &batches) + let mut batch = arrow::compute::concat_batches(&batches[0].schema(), &batches) .map_err(|err| DataFusionError::ArrowError(Box::new(err), None))?; self.reservation.try_grow(batch.get_array_memory_size())?; @@ -212,10 +217,9 @@ impl GapFillStream { .ok_or(DataFusionError::Internal( "time array must be a TimestampNanosecondArray".to_string(), ))?; - let input_time_array = (expr_to_index(&self.time_expr), input_time_array); - let group_arrays = self.group_arrays(&input_batch)?; - let aggr_arrays = self.aggr_arrays(&input_batch)?; + let series_arrays = self.series_arrays(&input_batch)?; + let fill_arrays = self.fill_arrays(&input_batch)?; let timer = elapsed_compute.timer(); let output_batch = self @@ -223,8 +227,8 @@ impl GapFillStream { .build_gapfilled_output( Arc::clone(&self.schema), input_time_array, - &group_arrays, - &aggr_arrays, + &series_arrays, + &fill_arrays, ) .record_output(&self.baseline_metrics)?; timer.done(); @@ -241,23 +245,17 @@ impl GapFillStream { /// Produces the arrays for the group columns in the input. /// The first item in the 2-tuple is the arrays offset in the schema. - fn group_arrays(&self, input_batch: &RecordBatch) -> Result> { - self.group_expr + fn series_arrays(&self, input_batch: &RecordBatch) -> Result> { + self.series_expr .iter() - .map(|e| { - Ok(( - expr_to_index(e), - e.evaluate(input_batch)? - .into_array(input_batch.num_rows())?, - )) - }) + .map(|e| e.evaluate(input_batch)?.into_array(input_batch.num_rows())) .collect::>>() } /// Produces the arrays for the aggregate columns in the input. /// The first item in the 2-tuple is the arrays offset in the schema. - fn aggr_arrays(&self, input_batch: &RecordBatch) -> Result> { - self.aggr_expr + fn fill_arrays(&self, input_batch: &RecordBatch) -> Result> { + self.fill_expr .iter() .map(|e| { Ok(( diff --git a/iox_query/src/exec/series_limit/logical.rs b/iox_query/src/exec/series_limit/logical.rs new file mode 100644 index 00000000..30a0be2f --- /dev/null +++ b/iox_query/src/exec/series_limit/logical.rs @@ -0,0 +1,1252 @@ +//! Logical plan node for the SeriesLimit operation. 
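+//!
+//! As a rough sketch (column names here are illustrative and the default
+//! value must match the limited column's type), an InfluxQL-style
+//! `SELECT temp FROM weather GROUP BY loc LIMIT 3 OFFSET 1` could be
+//! represented as:
+//!
+//! ```ignore
+//! let node = SeriesLimit::try_new(
+//!     input,                                 // Arc<LogicalPlan> producing loc, temp, time
+//!     vec![col("loc")],                      // series key columns (GROUP BY tags)
+//!     vec![SortExpr { expr: col("time"), asc: true, nulls_first: false }],
+//!     vec![LimitExpr {
+//!         expr: col("temp"),
+//!         null_treatment: NullTreatment::RespectNulls,
+//!         default_value: lit(ScalarValue::Float64(None)), // NULL of the column's type
+//!     }],
+//!     Some(Box::new(lit(1_i64))),            // OFFSET
+//!     Some(Box::new(lit(3_i64))),            // LIMIT
+//! )?;
+//! ```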
+ +use arrow::datatypes::{DataType, Field}; +use datafusion::{ + common::{ + DFSchema, DFSchemaRef, ExprSchema, Result, TableReference, internal_err, + tree_node::{Transformed, TreeNodeContainer, TreeNodeRecursion}, + }, + error::DataFusionError, + logical_expr::{Expr, ExprSchemable, LogicalPlan, SortExpr, UserDefinedLogicalNodeCore}, + sql::sqlparser::ast::NullTreatment, +}; +use std::{collections::BTreeMap, sync::Arc}; + +/// Expression type that describes a time-series column to which per-series +/// `LIMIT` and `OFFSET` operations should be applied. +/// +/// This type represents a single value column in a time series that will have +/// limiting applied independently per series group. It encapsulates not just the +/// expression to evaluate, but also how NULL values should be handled and what +/// default value to use when a row falls outside the limit range. +/// +/// # Purpose +/// +/// `LimitExpr` is used as part of the logical planning phase for InfluxQL queries +/// that apply LIMIT/OFFSET on a per-series basis. Each `LimitExpr` corresponds to +/// one value column in the SELECT clause that needs series-based limiting. +/// +/// # NULL Treatment Modes +/// +/// The `null_treatment` field controls how NULL values are counted: +/// +/// - **`RespectNulls`**: NULL values count toward the row limit and are included +/// in row numbering. This is the default SQL behavior. +/// +/// - **`IgnoreNulls`**: NULL values are skipped and don't count toward the limit. +/// Only non-NULL values contribute to the row count. +/// +/// # Default Values +/// +/// The `default_value` field specifies what value to output when a row is filtered +/// out due to LIMIT/OFFSET constraints, but the timestamp exists in another series. +/// This enables time-aligned output across multiple series even when some series +/// have fewer points. +/// +/// # Examples +/// +/// ## Basic Usage +/// +/// ```text +/// Query: SELECT temperature FROM weather GROUP BY location LIMIT 3 +/// +/// LimitExpr { +/// expr: Column("temperature"), +/// null_treatment: RespectNulls, +/// default_value: Literal(NULL), +/// } +/// ``` +/// +/// ## With Default Values (FILL) +/// +/// ```text +/// Query: SELECT FILL(0, temperature) FROM weather GROUP BY location LIMIT 3 +/// +/// LimitExpr { +/// expr: Column("temperature"), +/// null_treatment: RespectNulls, +/// default_value: Literal(0), +/// } +/// ``` +/// +/// ## Ignoring NULLs +/// +/// ```text +/// Query: SELECT temperature IGNORE NULLS FROM weather GROUP BY location LIMIT 3 +/// +/// LimitExpr { +/// expr: Column("temperature"), +/// null_treatment: IgnoreNulls, +/// default_value: Literal(NULL), +/// } +/// ``` +/// +/// # Type Safety +/// +/// The `expr` and `default_value` must have the same data type. This is validated +/// by the `get_type()` method and enforced during logical plan construction. Type +/// mismatches result in an error during query planning. +/// +/// # Relation to Physical Plan +/// +/// During physical planning, each `LimitExpr` is converted to a [`PhysicalLimitExpr`] +/// which performs the actual row numbering and filtering during query execution. +/// +/// [`PhysicalLimitExpr`]: crate::exec::series_limit::physical::PhysicalLimitExpr +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub struct LimitExpr { + /// The expression for the values to which the limit will be applied. + /// This must reference exactly one column in the input which will + /// be replaced with the limited version. 
+ pub expr: Expr, + + /// How nulls in the series should be treated. + pub null_treatment: NullTreatment, + + /// The default value that should be output if a point in time is + /// outside of the limits for (or not present in) this series, but + /// the time is present in another series. + pub default_value: Expr, +} + +impl std::fmt::Display for LimitExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} {} (default: {})", + self.expr, self.null_treatment, self.default_value + ) + } +} + +impl LimitExpr { + /// Returns the data type of this expression when evaluated against the given schema. + /// + /// This method ensures that both the expression and default value have the same type, + /// returning an error if they differ. + pub fn get_type(&self, schema: &dyn ExprSchema) -> Result { + let expr_dt = self.expr.get_type(schema)?; + let default_value_dt = self.default_value.get_type(schema)?; + if expr_dt != default_value_dt { + return internal_err!( + "LimitExpr expr and default_value must have the same type, got expr: {expr_dt:?}, default_value: {default_value_dt:?}" + ); + } + Ok(expr_dt) + } + + /// Returns whether this expression is nullable when evaluated against the given schema. + /// + /// The result is nullable if either the expression or the default value is nullable, + /// since either could contribute to the final result. + pub fn nullable(&self, input_schema: &dyn ExprSchema) -> Result { + let expr_nullable = self.expr.nullable(input_schema)?; + let default_nullable = self.default_value.nullable(input_schema)?; + Ok(match self.null_treatment { + // If ignoring nulls, the expression's nullability does not affect the result + NullTreatment::IgnoreNulls => default_nullable, + // If respecting nulls, both expression and default value nullability matter + NullTreatment::RespectNulls => expr_nullable || default_nullable, + }) + } + + /// Returns both the data type and nullability of this expression. + /// + /// This is a convenience method that combines `get_type` and `nullable`. + pub fn data_type_and_nullable(&self, schema: &dyn ExprSchema) -> Result<(DataType, bool)> { + let data_type = self.get_type(schema)?; + let nullable = self.nullable(schema)?; + Ok((data_type, nullable)) + } + + /// Returns a field representation of this expression with its name, data type, and nullability. + /// + /// The field is derived from the underlying expression but with potentially updated + /// nullability based on both the expression and default value. 
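+    ///
+    /// For example, limiting a non-nullable column with a NULL literal as the
+    /// `default_value` (under `RespectNulls`) yields a nullable field, because
+    /// rows outside the limit window are filled with that default.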
+ pub fn to_field( + &self, + input_schema: &dyn ExprSchema, + ) -> Result<(Option, Arc)> { + let (qualifier, field) = self.expr.to_field(input_schema)?; + + // Get the data type and nullability, which may differ from the base expression + let data_type = self.get_type(input_schema)?; + let nullable = self.nullable(input_schema)?; + + // Create a new field with the potentially updated type and nullability + let new_field = + Field::new(field.name(), data_type, nullable).with_metadata(field.metadata().clone()); + + Ok((qualifier, Arc::new(new_field))) + } +} + +impl<'a> TreeNodeContainer<'a, Expr> for LimitExpr { + fn apply_elements Result>( + &'a self, + mut f: F, + ) -> Result { + // Apply to the series expression + let recursion = f(&self.expr)?; + if recursion == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + + // Apply to the default value expression + f(&self.default_value) + } + + fn map_elements Result>>( + self, + mut f: F, + ) -> Result> { + // Transform the series expression + let expr_result = f(self.expr)?; + let mut transformed = expr_result.transformed; + let mut tnr = expr_result.tnr; + let expr = expr_result.data; + + // Transform the default value expression (if we should continue) + let default_value = match tnr { + TreeNodeRecursion::Continue | TreeNodeRecursion::Jump => { + let default_value_result = f(self.default_value)?; + transformed |= default_value_result.transformed; + tnr = default_value_result.tnr; + default_value_result.data + } + TreeNodeRecursion::Stop => self.default_value, + }; + + Ok(Transformed { + data: Self { + expr, + null_treatment: self.null_treatment, + default_value, + }, + transformed, + tnr, + }) + } +} + +/// Logical plan node for per-series LIMIT and OFFSET operations. +/// +/// This logical plan node represents the InfluxQL-style series limiting operation, +/// which applies LIMIT and OFFSET constraints independently to each time series +/// rather than globally across all results. This is a key semantic difference +/// from standard SQL LIMIT/OFFSET. +/// +/// # Purpose +/// +/// `SeriesLimit` is a custom logical plan node used during query planning for +/// InfluxQL queries. It captures the intent to limit rows on a per-series basis +/// before being converted to a physical execution plan ([`SeriesLimitExec`]). +/// +/// # InfluxQL vs SQL Semantics +/// +/// ## Standard SQL LIMIT +/// ```sql +/// SELECT value FROM measurements LIMIT 10 +/// ``` +/// Returns 10 rows total across all series. +/// +/// ## InfluxQL Series-based LIMIT +/// ```sql +/// SELECT value1 FROM measurements GROUP BY location LIMIT 10 +/// ``` +/// Returns up to 10 rows **per location** (per series), potentially returning +/// many more than 10 rows total. +/// +/// ```sql +/// SELECT value1, value2 FROM measurements GROUP BY location LIMIT 5 OFFSET 2 +/// ``` +/// Returns up to 5 rows **per location**, skipping the first 2 rows in each +/// series. Where the series do not have values with matching timestamps the +/// default_value is used to fill in the gaps where required. 
+/// +/// # Important Note: SLIMIT vs LIMIT +/// +/// **This operation does NOT implement InfluxQL's `SLIMIT` or `SOFFSET` clauses.** +/// +/// - `SLIMIT`/`SOFFSET`: Limit the number of *series* loaded from storage +/// - `LIMIT`/`OFFSET` (this operation): Limit the number of *rows per series* +/// +/// For example: +/// - `SLIMIT 5` → Load at most 5 different series from storage +/// - `LIMIT 10` → Return at most 10 rows from each series +/// +/// The implementation of `SLIMIT`/`SOFFSET` is tracked by +/// [issue 6940](https://github.com/influxdata/influxdb_iox/issues/6940). +/// +/// # Query Structure +/// +/// A typical InfluxQL query using series limiting looks like: +/// +/// ```text +/// SELECT ... +/// FROM +/// WHERE +/// GROUP BY ... +/// ORDER BY time [ASC|DESC] +/// LIMIT OFFSET +/// ``` +/// +/// This translates to a `SeriesLimit` node with: +/// - `series_expr`: The GROUP BY columns that define series boundaries +/// - `order_expr`: The time column and sort direction (from ORDER BY) +/// - `limit_expr`: The value columns from SELECT clause +/// - `skip`: The OFFSET value (number of rows to skip per series) +/// - `fetch`: The LIMIT value (max rows to return per series) +/// +/// # Components +/// +/// ## Series Expressions (`series_expr`) +/// +/// Define what constitutes a unique time series. Typically these are tag columns. +/// Rows with identical values for all series expressions belong to the same series. +/// +/// Example: `GROUP BY location, sensor_id` → `series_expr = [Column("location"), Column("sensor_id")]` +/// +/// ## Order Expressions (`order_expr`) +/// +/// Defines the sort ordering within each series. This will always include the time +/// column, but may also include additional columns. Each series is independently +/// sorted by this expression before applying LIMIT/OFFSET. +/// +/// Example: `ORDER BY time DESC` → `order_expr = [SortExpr { expr: Column("time"), asc: false, ... }]` +/// +/// ## Limit Expressions (`limit_expr`) +/// +/// The value columns that should be included in the output. Each has associated +/// NULL handling and default value semantics. See [`LimitExpr`] for details. +/// +/// ## Skip and Fetch +/// +/// - `skip`: Number of rows to skip at the start of each series (OFFSET) +/// - `fetch`: Maximum number of rows to return from each series (LIMIT) +/// +/// Both are optional `Expr` types wrapped in `Box` to allow for dynamic values +/// or literals. If `skip` is `None`, no rows are skipped. If `fetch` is `None`, +/// all remaining rows (after skip) are returned. +/// +/// # Examples +/// +/// ## Example 1: Basic Per-Series LIMIT +/// +/// ```text +/// Query: SELECT temperature FROM weather GROUP BY location LIMIT 3 +/// +/// SeriesLimit { +/// input: , +/// series_expr: [Column("location")], +/// order_expr: [SortExpr { expr: Column("time"), asc: true, ... }], +/// limit_expr: [LimitExpr { +/// expr: Column("temperature"), +/// null_treatment: RespectNulls, +/// default_value: Literal(NULL), +/// }], +/// skip: None, +/// fetch: Some(Box::new(Literal(3))), +/// } +/// +/// Result: Up to 3 temperature readings per location +/// ``` +/// +/// ## Example 2: LIMIT with OFFSET +/// +/// ```text +/// Query: SELECT value FROM sensors GROUP BY sensor_id LIMIT 10 OFFSET 5 +/// +/// SeriesLimit { +/// input: , +/// series_expr: [Column("sensor_id")], +/// order_expr: [SortExpr { expr: Column("time"), asc: true, ... }], +/// limit_expr: [LimitExpr { expr: Column("value"), ... 
}], +/// skip: Some(Box::new(Literal(5))), +/// fetch: Some(Box::new(Literal(10))), +/// } +/// +/// Result: Rows 6-15 from each sensor (skip first 5, take next 10) +/// ``` +/// +/// ## Example 3: Multiple Series Keys and Value Columns +/// +/// ```text +/// Query: SELECT temp, humidity FROM weather +/// GROUP BY location, elevation +/// LIMIT 5 +/// +/// SeriesLimit { +/// series_expr: [Column("location"), Column("elevation")], +/// limit_expr: [ +/// LimitExpr { expr: Column("temp"), ... }, +/// LimitExpr { expr: Column("humidity"), ... }, +/// ], +/// fetch: Some(Box::new(Literal(5))), +/// ... +/// } +/// +/// Result: Up to 5 rows each for temp and humidity per (location, elevation) combination, +/// if temp and humidity have different timestamps this could result it up to 10 output +/// rows per (location, elevation). +/// ``` +/// +/// # See Also +/// +/// - [`LimitExpr`]: The expression type for individual value columns +/// - [`SeriesLimitExec`]: The physical execution plan that implements this operation +/// +/// [`SeriesLimitExec`]: crate::exec::series_limit::physical::SeriesLimitExec +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub struct SeriesLimit { + /// The input for this operation. + pub input: Arc, + + /// The expressions that definge which series a particular row is + /// part of. + pub series_expr: Vec, + + /// The expression that defines the ordering of the rows within a + /// series. + pub order_expr: Vec, + + /// The expressions that define the values of each time series that + /// needs to be processed. Each expression must reference exactly + /// one column in the input which will be replaced with the output + /// with the limited version of that column. No two limit + /// expressions may reference the same input column. + pub limit_expr: Vec, + + /// The number of rows to skip (OFFSET) in each time series. + pub skip: Option>, + + /// The maximum number of rows (LIMIT) to include in each time + /// series. + pub fetch: Option>, + + /// The schema of the output of this operation. + pub schema: DFSchemaRef, +} + +impl SeriesLimit { + pub fn try_new( + input: Arc, + series_expr: Vec, + order_expr: Vec, + limit_expr: Vec, + skip: Option>, + fetch: Option>, + ) -> Result { + // Validate that the expressions are all valid against the input schema + let input_schema = input.schema(); + + let mut limited_fields = BTreeMap::new(); + for le in &limit_expr { + let cols = le.expr.column_refs(); + if cols.len() != 1 { + return internal_err!( + "LimitExpr expr must reference exactly one column, found {} columns", + cols.len() + ); + } + let col = cols.into_iter().next().unwrap(); + let idx = input_schema.index_of_column(col)?; + if limited_fields + .insert(idx, le.to_field(input_schema)?) + .is_some() + { + return internal_err!("LimitExpr contains duplicate column reference: {}", col); + } + } + + // The schema is the same as the input with the limited fields + // potentially modified. 
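+        // For example, an input of (loc, temp, time) with a single LimitExpr
+        // over `temp` keeps `loc` and `time` unchanged and replaces only the
+        // `temp` field, whose nullability may widen if the default value is
+        // nullable.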
+ let qualified_fields = input_schema + .iter() + .enumerate() + .map(|(idx, (qualifier, field))| { + if let Some((qualifier, field)) = limited_fields.remove(&idx) { + (qualifier, field) + } else { + (qualifier.cloned(), Arc::clone(field)) + } + }) + .collect::>(); + + let schema = Arc::new(DFSchema::new_with_metadata( + qualified_fields, + std::collections::HashMap::new(), + )?); + + Ok(Self { + input, + series_expr, + order_expr, + limit_expr, + skip, + fetch, + schema, + }) + } + + pub fn apply_expressions Result>( + &self, + mut f: F, + ) -> Result { + if self.series_expr.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + if self.order_expr.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + if self.limit_expr.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + if self.skip.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + if self.fetch.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + Ok(TreeNodeRecursion::Continue) + } +} + +// Manual impl because DFSchemaRef doesn't implement PartialOrd +impl PartialOrd for SeriesLimit { + fn partial_cmp(&self, other: &Self) -> Option { + use std::cmp::Ordering; + + // Compare inputs + match self.input.partial_cmp(&other.input) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare series expressions + match self.series_expr.partial_cmp(&other.series_expr) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare order expressions + match self.order_expr.partial_cmp(&other.order_expr) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare limit expressions + match self.limit_expr.partial_cmp(&other.limit_expr) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare skip + match self.skip.partial_cmp(&other.skip) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare fetch (skip schema since it doesn't implement PartialOrd) + self.fetch.partial_cmp(&other.fetch) + } +} + +impl UserDefinedLogicalNodeCore for SeriesLimit { + fn name(&self) -> &str { + "SeriesLimit" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![self.input.as_ref()] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + let mut exprs = Vec::with_capacity( + self.series_expr.len() + self.order_expr.len() + 2 * self.limit_expr.len() + 2, + ); + + self.apply_expressions(|expr| { + exprs.push(expr.clone()); + Ok(TreeNodeRecursion::Continue) + }) + .expect("cannot error"); + + exprs + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let series_expr = self + .series_expr + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + let order_expr = self + .order_expr + .iter() + .map(|se| se.to_string()) + .collect::>() + .join(", "); + + let limit_expr = self + .limit_expr + .iter() + .map(|le| le.to_string()) + .collect::>() + .join(", "); + + write!( + f, + "{}: series=[{}], order=[{}], limit_expr=[{}]", + self.name(), + series_expr, + order_expr, + limit_expr, + )?; + + if let Some(skip) = &self.skip { + write!(f, ", skip={}", skip)?; + } + + if let Some(fetch) = &self.fetch { + write!(f, ", fetch={}", fetch)?; + } + + Ok(()) + } + + fn with_exprs_and_inputs(&self, exprs: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("SeriesLimit 
expects exactly 1 input, got {}", inputs.len()); + } + + let input = Arc::new(inputs.into_iter().next().unwrap()); + + fn map_exprs<'a, C: TreeNodeContainer<'a, Expr>>( + c: C, + exprs: &mut impl Iterator, + ) -> Result { + c.map_elements(|old| { + exprs + .next() + .map(|new| { + let transformed = new == old; + Transformed::new(new, transformed, TreeNodeRecursion::Continue) + }) + .ok_or(DataFusionError::Internal(String::from( + "not enough input expressions for SeriesLimit", + ))) + }) + .map(|t| t.data) + } + + let Self { + series_expr, + order_expr, + limit_expr, + skip, + fetch, + .. + } = self; + + let mut exprs = exprs.into_iter(); + + let series_expr = map_exprs(series_expr.clone(), &mut exprs)?; + let order_expr = map_exprs(order_expr.clone(), &mut exprs)?; + let limit_expr = map_exprs(limit_expr.clone(), &mut exprs)?; + let skip = map_exprs(skip.clone(), &mut exprs)?; + let fetch = map_exprs(fetch.clone(), &mut exprs)?; + + if exprs.next().is_some() { + return internal_err!("too many input expressions for SeriesLimit"); + } + + Self::try_new(input, series_expr, order_expr, limit_expr, skip, fetch) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::{ + logical_expr::{col, lit}, + prelude::SessionContext, + scalar::ScalarValue, + }; + use insta::assert_snapshot; + + /// Helper function to create a simple test schema + fn test_schema() -> DFSchemaRef { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let ctx = SessionContext::new(); + Arc::clone( + ctx.sql("SELECT 1 as a, 2 as b, 3 as time") + .await + .unwrap() + .into_optimized_plan() + .unwrap() + .schema(), + ) + }) + } + + /// Helper function to create a simple LogicalPlan for testing + fn test_plan() -> Arc { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let ctx = SessionContext::new(); + Arc::new( + ctx.sql("SELECT 1 as a, 2 as b, 3 as time") + .await + .unwrap() + .into_optimized_plan() + .unwrap(), + ) + }) + } + + fn test_plan2() -> Arc { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let ctx = SessionContext::new(); + Arc::new( + ctx.sql("SELECT 3 as a, 2 as b, 1 as time") + .await + .unwrap() + .into_optimized_plan() + .unwrap(), + ) + }) + } + + /// Helper function to create a LogicalPlan with more columns for testing + fn test_plan_multi_column() -> Arc { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let ctx = SessionContext::new(); + Arc::new( + ctx.sql("SELECT 1 as a, 2 as b, 3 as c, 4 as time") + .await + .unwrap() + .into_optimized_plan() + .unwrap(), + ) + }) + } + + mod limit_expr_tests { + use super::*; + + #[test] + fn test_display() { + let limit_expr = LimitExpr { + expr: col("temperature"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(0), + }; + + assert_snapshot!(limit_expr.to_string(), @r#"temperature RESPECT NULLS (default: Int32(0))"#); + } + + #[test] + fn test_get_type_matching_types() { + let schema = test_schema(); + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(42))), + }; + + assert_eq!( + limit_expr.get_type(schema.as_ref()).unwrap(), + DataType::Int64 + ); + } + + #[test] + fn test_nullable() { + let schema = test_schema(); + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(42))), + }; + + assert!(!limit_expr.nullable(schema.as_ref()).unwrap()); + } + + #[test] 
+ fn test_data_type_and_nullable() { + let schema = test_schema(); + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(42))), + }; + + assert_eq!( + limit_expr.data_type_and_nullable(schema.as_ref()).unwrap(), + (DataType::Int64, false) + ); + } + + #[test] + fn test_to_field() { + let schema = test_schema(); + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(42))), + }; + + assert_eq!( + limit_expr.to_field(schema.as_ref()).unwrap(), + (None, Arc::new(Field::new("a", DataType::Int64, false))) + ); + } + + #[test] + fn test_tree_node_container_apply_elements() { + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(42), + }; + + let mut count = 0; + let result = limit_expr.apply_elements(|_expr| { + count += 1; + Ok(TreeNodeRecursion::Continue) + }); + + assert_eq!(result.unwrap(), TreeNodeRecursion::Continue); + assert_eq!(count, 2); // Should visit both expr and default_value + } + + #[test] + fn test_tree_node_container_apply_elements_stop() { + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(42), + }; + + let mut count = 0; + let result = limit_expr.apply_elements(|_expr| { + count += 1; + Ok(TreeNodeRecursion::Stop) + }); + + assert!(result.is_ok()); + assert_eq!(result.unwrap(), TreeNodeRecursion::Stop); + assert_eq!(count, 1); // Should stop after first expression + } + + #[test] + fn test_tree_node_container_map_elements() { + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(42), + }; + + let result = limit_expr + .clone() + .map_elements(|expr| Ok(Transformed::no(expr))); + + assert_eq!(result.unwrap(), Transformed::no(limit_expr)); + } + + #[test] + fn test_tree_node_container_map_elements_with_transform() { + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(42), + }; + + let result = limit_expr + .clone() + .map_elements(|expr| Ok(Transformed::yes(expr))); + + assert_eq!(result.unwrap(), Transformed::yes(limit_expr)); + } + } + + mod series_limit_tests { + use super::*; + use arrow::datatypes::Fields; + use datafusion::logical_expr::{Extension, SortExpr}; + + fn create_test_series_limit() -> SeriesLimit { + let input = test_plan(); + let series_expr = vec![col("a")]; + let order_expr = vec![SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }]; + let limit_expr = vec![LimitExpr { + expr: col("b"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None).unwrap() + } + + #[test] + fn test_try_new() { + let input = test_plan(); + let series_expr = vec![col("a")]; + let order_expr = vec![SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }]; + let limit_expr = vec![LimitExpr { + expr: col("b"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let result = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None); + assert!(result.is_ok()); + } + + #[test] + fn test_try_new_with_skip_and_fetch() { + let input = test_plan(); + let series_expr = vec![col("a")]; + let order_expr = vec![SortExpr { + expr: 
col("time"), + asc: true, + nulls_first: false, + }]; + let limit_expr = vec![LimitExpr { + expr: col("b"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + let skip = Some(Box::new(lit(10))); + let fetch = Some(Box::new(lit(100))); + + let result = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, skip, fetch); + assert!(result.is_ok()); + + let series_limit = result.unwrap(); + assert!(series_limit.skip.is_some()); + assert!(series_limit.fetch.is_some()); + } + + #[test] + fn test_name() { + let series_limit = create_test_series_limit(); + assert_eq!(series_limit.name(), "SeriesLimit"); + } + + #[test] + fn test_inputs() { + let series_limit = create_test_series_limit(); + let inputs = series_limit.inputs(); + assert_eq!(inputs.len(), 1); + } + + #[test] + fn test_schema() { + let series_limit = create_test_series_limit(); + let schema = series_limit.schema(); + assert_eq!( + schema.fields(), + &Fields::from(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Int64, false), + Field::new("time", DataType::Int64, false), + ]) + ); + } + + #[test] + fn test_expressions() { + let series_limit = create_test_series_limit(); + assert_eq!( + series_limit.expressions(), + vec![col("a"), col("time"), col("b"), lit(0_i64)] + ); + } + + #[test] + fn test_expressions_with_skip_and_fetch() { + let input = test_plan(); + let series_expr = vec![col("a")]; + let order_expr = vec![SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }]; + let limit_expr = vec![LimitExpr { + expr: col("b"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + let skip = Some(Box::new(lit(10))); + let fetch = Some(Box::new(lit(100))); + + let series_limit = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, skip, fetch) + .unwrap(); + + assert_eq!( + series_limit.expressions(), + vec![ + col("a"), + col("time"), + col("b"), + lit(0_i64), + lit(10), + lit(100) + ] + ); + } + + #[test] + fn test_fmt_for_explain() { + let series_limit = create_test_series_limit(); + assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST], limit_expr=[b RESPECT NULLS (default: Int64(0))] + Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS time + EmptyRelation + ") + } + + #[test] + fn test_with_exprs_and_inputs() { + let series_limit = create_test_series_limit(); + let original_exprs = series_limit.expressions(); + + // Create new input + let new_input = test_plan2(); + + let series_limit = series_limit + .with_exprs_and_inputs(original_exprs, vec![(*new_input).clone()]) + .unwrap(); + + assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST], limit_expr=[b RESPECT NULLS (default: Int64(0))] + Projection: Int64(3) AS a, Int64(2) AS b, Int64(1) AS time + EmptyRelation + ") + } + + #[test] + fn test_with_exprs_and_inputs_wrong_input_count() { + let series_limit = create_test_series_limit(); + let original_exprs = series_limit.expressions(); + + // Try with wrong number of inputs (0 instead of 1) + let result = series_limit.with_exprs_and_inputs(original_exprs, vec![]); + + assert!(result.is_err()); + let err_str = result.unwrap_err().to_string(); + assert!(err_str.contains("expects exactly 1 input")); + } + + #[test] + fn test_partial_ord() { + let series_limit1 = 
create_test_series_limit(); + let series_limit2 = create_test_series_limit(); + + // Should be equal + assert_eq!( + series_limit1.partial_cmp(&series_limit2), + Some(std::cmp::Ordering::Equal) + ); + } + + #[test] + fn test_eq() { + let series_limit1 = create_test_series_limit(); + let series_limit2 = create_test_series_limit(); + + assert_eq!(series_limit1, series_limit2); + } + + #[test] + fn test_hash() { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let series_limit1 = create_test_series_limit(); + let series_limit2 = create_test_series_limit(); + + let mut hasher1 = DefaultHasher::new(); + series_limit1.hash(&mut hasher1); + let hash1 = hasher1.finish(); + + let mut hasher2 = DefaultHasher::new(); + series_limit2.hash(&mut hasher2); + let hash2 = hasher2.finish(); + + assert_eq!(hash1, hash2); + } + + #[test] + fn test_multiple_order_expressions() { + // Test with multiple order expressions (e.g., ORDER BY time, b) + // Schema has: a, b, c, time + let input = test_plan_multi_column(); + let series_expr = vec![col("a")]; + let order_expr = vec![ + SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }, + SortExpr { + expr: col("b"), + asc: false, + nulls_first: true, + }, + ]; + let limit_expr = vec![LimitExpr { + expr: col("c"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let series_limit = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None) + .unwrap(); + + assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST, b DESC NULLS FIRST], limit_expr=[c RESPECT NULLS (default: Int64(0))] + Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c, Int64(4) AS time + EmptyRelation + ") + } + + #[test] + fn test_expressions_with_multiple_order() { + // Test that expressions() includes all order expressions + let input = test_plan_multi_column(); + let series_expr = vec![col("a")]; + let order_expr = vec![ + SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }, + SortExpr { + expr: col("b"), + asc: false, + nulls_first: true, + }, + ]; + let limit_expr = vec![LimitExpr { + expr: col("c"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let series_limit = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None) + .unwrap(); + + let exprs = series_limit.expressions(); + assert_eq!( + exprs, + vec![col("a"), col("time"), col("b"), col("c"), lit(0_i64)] + ); + } + + #[test] + fn test_with_exprs_and_inputs_multiple_order() { + // Test with_exprs_and_inputs preserves multiple order expressions + let input = test_plan_multi_column(); + let series_expr = vec![col("a")]; + let order_expr = vec![ + SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }, + SortExpr { + expr: col("b"), + asc: false, + nulls_first: true, + }, + ]; + let limit_expr = vec![LimitExpr { + expr: col("c"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let series_limit = SeriesLimit::try_new( + Arc::clone(&input), + series_expr, + order_expr, + limit_expr, + None, + None, + ) + .unwrap(); + + let original_exprs = series_limit.expressions(); + let new_input = test_plan_multi_column(); + + let series_limit = series_limit + .with_exprs_and_inputs(original_exprs, vec![(*new_input).clone()]) + .unwrap(); + + 
assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST, b DESC NULLS FIRST], limit_expr=[c RESPECT NULLS (default: Int64(0))] + Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c, Int64(4) AS time + EmptyRelation + ") + } + + #[test] + fn test_fmt_for_explain_multiple_order() { + // Test that fmt_for_explain includes all order expressions + let input = test_plan_multi_column(); + let series_expr = vec![col("a")]; + let order_expr = vec![ + SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }, + SortExpr { + expr: col("b"), + asc: false, + nulls_first: true, + }, + ]; + let limit_expr = vec![LimitExpr { + expr: col("c"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let series_limit = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None) + .unwrap(); + + assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST, b DESC NULLS FIRST], limit_expr=[c RESPECT NULLS (default: Int64(0))] + Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c, Int64(4) AS time + EmptyRelation + ") + } + } +} diff --git a/iox_query/src/exec/series_limit/mod.rs b/iox_query/src/exec/series_limit/mod.rs new file mode 100644 index 00000000..5a66dd39 --- /dev/null +++ b/iox_query/src/exec/series_limit/mod.rs @@ -0,0 +1,158 @@ +//! Handling of InfluxQL style `LIMIT` and `OFFSET` clauses. +//! +//! This module provides functionality to apply `LIMIT` and `OFFSET` +//! clauses individually to time series data. It is designed to be +//! compatible with older version of InfluxDB that applied `LIMIT` and +//! `OFFSET` conditions in each iterator before they are combined. + +use arrow::compute::SortOptions; +use datafusion::{ + common::{Result, internal_err}, + execution::context::SessionState, + logical_expr::{Expr, LogicalPlan}, + physical_plan::{ExecutionPlan, expressions::PhysicalSortExpr}, + scalar::ScalarValue, + sql::sqlparser::ast::NullTreatment, +}; +use std::sync::Arc; + +mod logical; +mod physical; + +pub use logical::{LimitExpr, SeriesLimit}; +pub use physical::{PhysicalLimitExpr, SeriesLimitExec}; + +/// Plan a SeriesLimit logical node into a physical SeriesLimitExec. +/// +/// This function converts the logical representation of per-series limiting +/// into a physical execution plan that can be executed by DataFusion. +/// +/// # Arguments +/// +/// * `session_state` - The DataFusion session state for creating physical expressions +/// * `series_limit` - The logical SeriesLimit node to plan +/// * `logical_inputs` - The logical input plans (must be exactly 1) +/// * `physical_inputs` - The physical input plans (must be exactly 1) +/// +/// # Returns +/// +/// Returns a `SeriesLimitExec` physical execution plan on success, or an error +/// if the inputs are invalid or expression conversion fails. 
+/// +/// # Errors +/// +/// This function returns an error if: +/// - The number of logical or physical inputs is not exactly 1 +/// - Skip or fetch expressions cannot be evaluated to usize values +/// - Expression conversion from logical to physical fails +/// - Default value conversion fails +pub(crate) fn plan_series_limit( + session_state: &SessionState, + series_limit: &SeriesLimit, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], +) -> Result { + // Validate inputs + let input_dfschema = match logical_inputs { + [input] => input.schema().as_ref(), + _ => { + return internal_err!( + "SeriesLimitExec: wrong number of logical inputs; expected 1, found {}", + logical_inputs.len() + ); + } + }; + + let phys_input = match physical_inputs { + [input] => Arc::clone(input), + _ => { + return internal_err!( + "SeriesLimitExec: wrong number of physical inputs; expected 1, found {}", + physical_inputs.len() + ); + } + }; + + // Convert series expressions to physical + let series_expr = series_limit + .series_expr + .iter() + .map(|expr| session_state.create_physical_expr(expr.clone(), input_dfschema)) + .collect::>>()?; + + // Convert order expressions to physical + let order_expr = series_limit + .order_expr + .iter() + .map(|sort_expr| { + Ok(PhysicalSortExpr { + expr: session_state.create_physical_expr(sort_expr.expr.clone(), input_dfschema)?, + options: SortOptions { + descending: !sort_expr.asc, + nulls_first: sort_expr.nulls_first, + }, + }) + }) + .collect::>>()?; + + // Convert limit expressions to physical + let limit_expr = series_limit + .limit_expr + .iter() + .map(|le| { + let expr = session_state.create_physical_expr(le.expr.clone(), input_dfschema)?; + let ignore_nulls = matches!(le.null_treatment, NullTreatment::IgnoreNulls); + + // Evaluate the default value expression to get a ScalarValue + let default_value = if let Expr::Literal(scalar, _) = &le.default_value { + scalar.clone() + } else { + return internal_err!( + "SeriesLimit default_value must be a literal, got: {:?}", + le.default_value + ); + }; + + Ok(PhysicalLimitExpr::new(expr, ignore_nulls, default_value)) + }) + .collect::>>()?; + + // Evaluate skip and fetch expressions + let skip = if let Some(skip_expr) = &series_limit.skip { + if let Expr::Literal(ScalarValue::UInt64(Some(skip_val)), _) = skip_expr.as_ref() { + *skip_val as usize + } else if let Expr::Literal(ScalarValue::Int64(Some(skip_val)), _) = skip_expr.as_ref() { + if *skip_val < 0 { + return internal_err!("SeriesLimit skip must be non-negative, got: {}", skip_val); + } + *skip_val as usize + } else { + return internal_err!( + "SeriesLimit skip must be a non-negative integer literal, got: {:?}", + skip_expr + ); + } + } else { + 0 + }; + + let fetch = if let Some(fetch_expr) = &series_limit.fetch { + if let Expr::Literal(ScalarValue::UInt64(Some(fetch_val)), _) = fetch_expr.as_ref() { + Some(*fetch_val as usize) + } else if let Expr::Literal(ScalarValue::Int64(Some(fetch_val)), _) = fetch_expr.as_ref() { + if *fetch_val < 0 { + return internal_err!("SeriesLimit fetch must be non-negative, got: {}", fetch_val); + } + Some(*fetch_val as usize) + } else { + return internal_err!( + "SeriesLimit fetch must be a non-negative integer literal, got: {:?}", + fetch_expr + ); + } + } else { + None + }; + + SeriesLimitExec::try_new(phys_input, series_expr, order_expr, limit_expr, skip, fetch) +} diff --git a/iox_query/src/exec/series_limit/physical.rs b/iox_query/src/exec/series_limit/physical.rs new file mode 100644 index 00000000..528e6ea1 --- 
/dev/null +++ b/iox_query/src/exec/series_limit/physical.rs @@ -0,0 +1,2594 @@ +//! Physical executor for the series limit operation. + +use std::{ + collections::BTreeMap, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use arrow::{ + array::{ + Array, ArrayRef, BooleanArray, Datum, PrimitiveArray, RecordBatch, Scalar, UInt64Builder, + new_null_array, + }, + compute::{Partitions, partition}, + datatypes::{SchemaRef, UInt64Type}, + error::ArrowError, +}; +use datafusion::{ + common::tree_node::{TreeNode, TreeNodeRecursion}, + error::{DataFusionError, Result}, + execution::{ + RecordBatchStream, SendableRecordBatchStream, + context::TaskContext, + memory_pool::{MemoryConsumer, MemoryReservation}, + }, + physical_expr::{ + EquivalenceProperties, LexOrdering, LexRequirement, OrderingRequirements, PhysicalExprRef, + PhysicalSortExpr, PhysicalSortRequirement, + }, + physical_plan::{ + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, + PlanProperties, SendableRecordBatchStream as SendableStream, Statistics, + expressions::Column, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, + }, + scalar::ScalarValue, +}; +use futures::{Stream, StreamExt, ready}; + +#[derive(Debug, Clone)] +pub struct PhysicalLimitExpr { + /// The expression to evaluate for the limit. This must be a Column. + expr: PhysicalExprRef, + + /// Whether to ignore null values in the limit calculation. + ignore_nulls: bool, + + /// The default value to use when a row is filtered out of a time + /// series, but required in an output batch. Typically this is NULL + /// of the same type as the expression, however it could be 0 or another + /// value. When using with InfluxQL this should be the value specified by + /// the FILL clause mapped to an appropriate type for the column. + default_value: ScalarValue, +} + +impl PhysicalLimitExpr { + /// Create a new PhysicalLimitExpr. + pub fn new(expr: PhysicalExprRef, ignore_nulls: bool, default_value: ScalarValue) -> Self { + Self { + expr, + ignore_nulls, + default_value, + } + } +} + +impl std::fmt::Display for PhysicalLimitExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} {}NULLS (default: {})", + self.expr, + if self.ignore_nulls { + "IGNORE " + } else { + "RESPECT " + }, + self.default_value + ) + } +} + +/// Physical execution plan for per-series LIMIT and OFFSET operations. +/// +/// This operator implements InfluxQL-style series limiting, which applies +/// LIMIT and OFFSET constraints independently to each time series. Unlike +/// standard SQL LIMIT/OFFSET which apply globally to the result set, this +/// operator applies them separately to each group of rows sharing the same +/// series key (typically tag values). +/// +/// # Purpose +/// +/// This execution plan is designed to support InfluxQL queries where LIMIT +/// and OFFSET need to apply per-series rather than globally. This is a key +/// semantic difference from SQL that makes InfluxQL suitable for time series +/// data analysis where users want to limit the number of points returned +/// from each individual series independently. +/// +/// # Query Semantics +/// +/// For a query like: +/// ```sql +/// SELECT value FROM measurement WHERE time > now() - 1h GROUP BY tag LIMIT 10 OFFSET 5 +/// ``` +/// +/// Standard SQL would return 10 rows total across all series. This operator +/// returns up to 10 rows **per series** (after skipping the first 5 rows of +/// each series). +/// +/// # Execution Flow +/// +/// 1. 
**Input Requirements**: The input must be pre-sorted by series expressions +/// followed by order expressions (enforced via `required_input_ordering`). +/// +/// 2. **Partitioning**: If series expressions are present, the operator requires +/// hash partitioning on those expressions to ensure all rows for a given series +/// are processed by the same partition. +/// +/// 3. **Stream Processing**: Each partition creates a [`SeriesLimitStream`] that: +/// - Identifies series boundaries in the sorted input +/// - Assigns row numbers within each series +/// - Filters rows based on skip/fetch values +/// - Maintains state across batch boundaries +/// +/// 4. **Output**: Produces a stream of record batches containing only the rows +/// that fall within the LIMIT/OFFSET window for each series. +/// +/// # NULL Handling +/// +/// The `limit_expr` field supports both RESPECT NULLS and IGNORE NULLS modes: +/// +/// - **RESPECT NULLS**: NULL values count toward the row limit +/// - **IGNORE NULLS**: NULL values are skipped and don't count toward the limit +/// +/// This is controlled by the `ignore_nulls` field in [`PhysicalLimitExpr`]. +/// +/// # Default Values +/// +/// Each limited expression can specify a default value (in [`PhysicalLimitExpr`]) +/// that is used when a row is filtered out. This supports queries where all +/// series need to be time-aligned even when some series have no data for certain +/// timestamps. +/// +/// # Examples +/// +/// ## Basic LIMIT +/// ```text +/// Input (2 series, 4 rows each): +/// tag | time | value +/// ----|------|------ +/// a | 1 | 10 +/// a | 2 | 20 +/// a | 3 | 30 +/// a | 4 | 40 +/// b | 1 | 50 +/// b | 2 | 60 +/// b | 3 | 70 +/// b | 4 | 80 +/// +/// With LIMIT 2 (skip=0, fetch=Some(2)): +/// Output: +/// tag | time | value +/// ----|------|------ +/// a | 1 | 10 +/// a | 2 | 20 +/// b | 1 | 50 +/// b | 2 | 60 +/// ``` +/// +/// ## LIMIT with OFFSET +/// ```text +/// Same input as above. +/// +/// With LIMIT 2 OFFSET 1 (skip=1, fetch=Some(2)): +/// Output: +/// tag | time | value +/// ----|------|------ +/// a | 2 | 20 +/// a | 3 | 30 +/// b | 2 | 60 +/// b | 3 | 70 +/// ``` +/// +/// ## Only OFFSET +/// ```text +/// Same input as above. 
+///
+/// With OFFSET 2 (skip=2, fetch=None):
+/// Output:
+/// tag | time | value
+/// ----|------|------
+/// a   | 3    | 30
+/// a   | 4    | 40
+/// b   | 3    | 70
+/// b   | 4    | 80
+/// ```
+///
+/// ## Respecting NULLs
+/// ```text
+/// Input (2 series, 4 rows each):
+/// tag | time | value1 | value2
+/// ----|------|--------|-------
+/// a   | 1    | 10     |
+/// a   | 2    |        | 20
+/// a   | 3    | 30     | 30
+/// a   | 4    | 40     | 40
+/// b   | 1    |        |
+/// b   | 2    |        | 60
+/// b   | 3    | 70     |
+/// b   | 4    | 80     | 80
+///
+/// With LIMIT 2 OFFSET 1 on both value1 and value2, respecting NULLs, default value NULL:
+/// Output:
+/// tag | time | value1 | value2
+/// ----|------|--------|-------
+/// a   | 2    |        | 20
+/// a   | 3    | 30     | 30
+/// b   | 2    |        | 60
+/// b   | 3    | 70     |
+/// ```
+///
+/// ## Ignoring NULLs
+/// ```text
+/// Input (2 series):
+/// tag | time | value1 | value2
+/// ----|------|--------|-------
+/// a   | 1    | 10     |
+/// a   | 2    |        | 20
+/// a   | 3    | 30     | 30
+/// a   | 4    | 40     | 40
+/// b   | 1    |        |
+/// b   | 2    |        | 60
+/// b   | 3    | 70     | 70
+/// b   | 4    | 80     | 80
+/// b   | 5    | 90     | 90
+///
+/// With LIMIT 2 OFFSET 1 on both value1 and value2, ignoring NULLs, default value NULL:
+/// Output:
+/// tag | time | value1 | value2
+/// ----|------|--------|-------
+/// a   | 3    | 30     | 30
+/// a   | 4    | 40     | 40
+/// b   | 3    |        | 70
+/// b   | 4    | 80     | 80
+/// b   | 5    | 90     |
+/// ```
+///
+/// # Performance Considerations
+///
+/// - **Memory**: Maintains minimal state (current series key + row counts)
+///   across batches, making it suitable for large datasets.
+///
+/// - **Streaming**: Processes data in a streaming fashion without materializing
+///   entire series in memory.
+///
+/// - **Early Termination**: Once a series exceeds its limit, subsequent rows
+///   for that series can be efficiently filtered without full evaluation.
+pub struct SeriesLimitExec {
+    /// The input execution plan to apply series limiting to.
+    input: Arc<dyn ExecutionPlan>,
+
+    /// Expressions that define the series grouping.
+    ///
+    /// Rows with the same values for these expressions belong to the same series.
+    /// Typically these are tag columns in InfluxQL queries.
+    series_expr: Vec<PhysicalExprRef>,
+
+    /// The expressions used for sorting within each series.
+    ///
+    /// Each series is sorted by these expressions (typically ascending timestamp)
+    /// before applying LIMIT and OFFSET operations.
+    order_expr: Vec<PhysicalSortExpr>,
+
+    /// The limited expressions, together with their null handling and default values.
+    ///
+    /// Each entry identifies a value column to which the per-series LIMIT and
+    /// OFFSET are applied.
+    limit_expr: Vec<PhysicalLimitExpr>,
+
+    /// Number of rows to skip at the beginning of each series (OFFSET).
+    ///
+    /// A value of 0 means no rows are skipped.
+    skip: usize,
+
+    /// Maximum number of rows to return from each series (LIMIT).
+    ///
+    /// `None` means no limit is applied (return all remaining rows after skip).
+    /// `Some(n)` limits each series to at most `n` rows.
+    fetch: Option<usize>,
+
+    /// `limit_expr` modified for use when processing batches.
+    limited: Arc<BTreeMap<usize, LimitParams>>,
+
+    /// Metrics tracking execution statistics for this plan node.
+    ///
+    /// Collects metrics like elapsed time, number of output rows, etc.
+    metrics: ExecutionPlanMetricsSet,
+
+    /// Cached plan properties for efficient access.
+    ///
+    /// Contains schema, partitioning, execution mode, and sort order information
+    /// that are computed once and reused across multiple accesses.
+    cache: PlanProperties,
+
+    /// The required ordering for the input to this plan.
+    required_ordering: Option<OrderingRequirements>,
+}
+
+impl SeriesLimitExec {
+    /// Create a new SeriesLimitExec.
+    pub fn try_new(
+        input: Arc<dyn ExecutionPlan>,
+        series_expr: Vec<PhysicalExprRef>,
+        order_expr: Vec<PhysicalSortExpr>,
+        limit_expr: Vec<PhysicalLimitExpr>,
+        skip: usize,
+        fetch: Option<usize>,
+    ) -> Result<Self> {
+        let input_schema = input.schema();
+
+        let mut limited = BTreeMap::new();
+        for le in &limit_expr {
+            let mut index = None;
+            le.expr.apply(|pe| {
+                if let Some(column) = pe.as_any().downcast_ref::<Column>() {
+                    match index {
+                        None => index = Some(column.index()),
+                        Some(idx) if idx == column.index() => {}
+                        Some(_) => {
+                            return Err(DataFusionError::Plan(
+                                "PhysicalLimitExpr requires a single Column expression".to_string(),
+                            ));
+                        }
+                    }
+                }
+                Ok(TreeNodeRecursion::Continue)
+            })?;
+            let index = index.ok_or(DataFusionError::Plan(
+                "PhysicalLimitExpr requires a Column expression".to_string(),
+            ))?;
+            if limited.insert(index, LimitParams::try_from(le)?).is_some() {
+                return Err(DataFusionError::Plan(
+                    "SeriesLimitExec limit expressions must refer to distinct columns".to_string(),
+                ));
+            }
+        }
+
+        // The output schema is the same as the input schema except for columns
+        // referenced in limit_expr; these are potentially renamed and may have
+        // their nullability changed.
+        let fields = input_schema
+            .fields()
+            .iter()
+            .enumerate()
+            .map(|(idx, field)| match limited.get(&idx) {
+                Some(params) => {
+                    let field = params.expr.return_field(input_schema.as_ref())?;
+                    let nullable = field.is_nullable();
+                    Ok(Arc::new(
+                        Arc::unwrap_or_clone(field).with_nullable(params.is_nullable(nullable)),
+                    ))
+                }
+                None => Ok(Arc::clone(field)),
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        let schema = Arc::new(arrow::datatypes::Schema::new_with_metadata(
+            fields,
+            input_schema.metadata().clone(),
+        ));
+
+        let limited = Arc::new(limited);
+        let required_ordering = Self::compute_ordering(&series_expr, &order_expr);
+        let cache = Self::compute_properties(&input, schema, &limited);
+
+        Ok(Self {
+            input,
+            series_expr,
+            order_expr,
+            limit_expr,
+            skip,
+            fetch,
+            limited,
+            metrics: ExecutionPlanMetricsSet::new(),
+            cache,
+            required_ordering,
+        })
+    }
+
+    /// This function creates the cache object that stores the plan properties
+    /// such as equivalence properties, partitioning, ordering, etc.
+    fn compute_properties(
+        input: &Arc<dyn ExecutionPlan>,
+        schema: SchemaRef,
+        limited: &BTreeMap<usize, LimitParams>,
+    ) -> PlanProperties {
+        // The output ordering is the same as the input ordering so long as
+        // it does not depend on any of the limited columns. Iterate through the
+        // input ordering, stopping at the first ordering expression that depends on a
+        // limited column.
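+        // For example (illustrative): if the input is ordered by [tag, time, value]
+        // and `value` is a limited column, the retained output ordering is
+        // [tag, time], since values in limited columns may be replaced by defaults.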
+ let ordering = input.output_ordering().and_then(|ordering| { + LexOrdering::new( + ordering + .iter() + .take_while(|pse| { + !pse.expr + .exists(|pe| { + Ok(if let Some(col) = pe.as_any().downcast_ref::() { + limited.contains_key(&col.index()) + } else { + false + }) + }) + .expect("cannot error") + }) + .cloned(), + ) + }); + + let eq_properties = if let Some(ordering) = ordering { + EquivalenceProperties::new_with_orderings(schema, std::iter::once(ordering)) + } else { + EquivalenceProperties::new(Arc::clone(&schema)) + }; + + PlanProperties::new( + eq_properties, + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + ) + } + + fn compute_ordering( + series_expr: &[PhysicalExprRef], + order_expr: &[PhysicalSortExpr], + ) -> Option { + let sort_requirements = series_expr + .iter() + .map(|expr| PhysicalSortRequirement { + expr: Arc::clone(expr), + options: None, + }) + .chain(order_expr.iter().map(|se| PhysicalSortRequirement { + expr: Arc::clone(&se.expr), + options: Some(se.options), + })); + + LexRequirement::new(sort_requirements).map(OrderingRequirements::new) + } +} + +impl std::fmt::Debug for SeriesLimitExec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SeriesLimitExec") + .field("series_expr", &self.series_expr) + .field("order_expr", &self.order_expr) + .field("limit_expr", &self.limit_expr) + .field("skip", &self.skip) + .field("fetch", &self.fetch) + .finish_non_exhaustive() + } +} + +impl ExecutionPlan for SeriesLimitExec { + fn name(&self) -> &str { + Self::static_name() + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &PlanProperties { + &self.cache + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + match children.as_slice() { + [child] => Ok(Arc::new(Self::try_new( + Arc::clone(child), + self.series_expr.clone(), + self.order_expr.clone(), + self.limit_expr.clone(), + self.skip, + self.fetch, + )?)), + _ => Err(DataFusionError::Internal(format!( + "SeriesLimitExec wrong number of children: expected 1, found {}", + children.len() + ))), + } + } + + fn execute(&self, partition: usize, context: Arc) -> Result { + if partition + >= self + .input + .properties() + .output_partitioning() + .partition_count() + { + return Err(DataFusionError::Internal(format!( + "SeriesLimitExec invalid partition {partition}" + ))); + } + + let input_stream = self.input.execute(partition, Arc::clone(&context))?; + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + let reservation = MemoryConsumer::new(format!("SeriesLimitExec[{partition}]")) + .register(context.memory_pool()); + + let series_expr = self.series_expr.clone(); + let limited = Arc::clone(&self.limited); + + let stream = SeriesLimitStream::try_new( + input_stream, + self.schema(), + baseline_metrics, + reservation, + series_expr, + limited, + self.skip as u64, + self.fetch.map(|f| f as u64), + )?; + + Ok(Box::pin(stream)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn required_input_distribution(&self) -> Vec { + vec![if self.series_expr.is_empty() { + Distribution::UnspecifiedDistribution + } else { + Distribution::HashPartitioned(self.series_expr.iter().map(Arc::clone).collect()) + }] + } + + fn required_input_ordering(&self) -> Vec> { + vec![self.required_ordering.clone()] + } 
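+
+    // Note: the operator only filters rows within each series; it never
+    // reorders them, which is why `maintains_input_order` below reports `true`.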
+ + fn maintains_input_order(&self) -> Vec { + vec![true] + } +} + +impl DisplayAs for SeriesLimitExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + let series_expr = self + .series_expr + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + let order_expr = self + .order_expr + .iter() + .map(|se| se.to_string()) + .collect::>() + .join(", "); + + let limit_expr = self + .limit_expr + .iter() + .map(|le| le.to_string()) + .collect::>() + .join(", "); + + write!( + f, + "SeriesLimitExec: series=[{}], order=[{}], limit_expr=[{}]", + series_expr, order_expr, limit_expr + )?; + + if self.skip > 0 { + write!(f, ", skip={}", self.skip)?; + } + + if let Some(fetch) = self.fetch { + write!(f, ", fetch={}", fetch)?; + } + + Ok(()) + } + } + } +} + +/// A streaming implementation of per-series LIMIT and OFFSET operations. +/// +/// This stream processes incoming record batches and applies LIMIT and OFFSET +/// constraints independently to each time series (group of rows with the same +/// series key values). It maintains state across batches to correctly handle +/// series that span multiple batches. +/// +/// # Behavior +/// +/// For each incoming batch, the stream: +/// 1. Evaluates series expressions to determine series boundaries +/// 2. Detects series changes and resets row counters accordingly +/// 3. Assigns row numbers within each series using [`row_number`] +/// 4. Filters rows based on LIMIT (fetch) and OFFSET (skip) constraints +/// 5. Replaces filtered-out values with default values +/// 6. Tracks state for series that continue into subsequent batches +/// +/// # Series Continuation +/// +/// When a series spans multiple batches, the stream maintains: +/// - The current series key values in `current_series` +/// - Row counts for each limited expression in `counts` +/// +/// This allows row numbering to continue correctly across batch boundaries. +/// For example, if a series has 100 rows split across 3 batches, LIMIT 10 OFFSET 5 +/// will correctly skip the first 5 rows (even if they're in the first batch) and +/// return the next 10 rows (even if they span multiple batches). +/// +/// # Memory Management +/// +/// The stream tracks memory usage via `reservation` and grows/shrinks it as the +/// `current_series` state is updated. This ensures proper memory accounting in +/// DataFusion's memory pool system. +/// +/// # Example +/// +/// Given input with two series (tag='a' and tag='b'), each with 4 rows: +/// ```text +/// Input: +/// tag | time | value +/// ----|------|------ +/// a | 1 | 10 +/// a | 2 | 20 +/// a | 3 | 30 +/// a | 4 | 40 +/// b | 1 | 50 +/// b | 2 | 60 +/// b | 3 | 70 +/// b | 4 | 80 +/// +/// With LIMIT 2 OFFSET 1: +/// Output: +/// tag | time | value +/// ----|------|------ +/// a | 2 | 20 (skipped row 1, included rows 2-3) +/// a | 3 | 30 +/// b | 2 | 60 (skipped row 1, included rows 2-3) +/// b | 3 | 70 +/// ``` +/// +/// # Default Values +/// +/// For rows that are filtered out but whose timestamps appear in other series, +/// the stream can emit default values (typically NULL or 0) to maintain time +/// alignment across series. This is controlled by the `limited` field's default +/// value component. +struct SeriesLimitStream { + /// The stream of input batches. + input: SendableRecordBatchStream, + + /// The schema of the output batches. 
+    schema: SchemaRef,
+
+    /// Metrics for tracking execution statistics.
+    metrics: BaselineMetrics,
+
+    /// Memory reservation for this stream.
+    reservation: MemoryReservation,
+
+    /// Physical expressions that define the series grouping.
+    ///
+    /// Rows with the same values for these expressions belong to the same series.
+    /// Typically these are tag columns in InfluxQL queries.
+    series_expr: Vec<PhysicalExprRef>,
+
+    /// Limited expressions with their null handling and default values.
+    ///
+    /// Each entry maps the index of a limited column to its [`LimitParams`]:
+    /// the expression to evaluate (typically a value column), whether to
+    /// ignore nulls (IGNORE NULLS vs RESPECT NULLS), and the default value to
+    /// use for filtered-out rows.
+    limited: Arc<BTreeMap<usize, LimitParams>>,
+
+    /// Range of row numbers to allow through the filter. This is a
+    /// half-open interval of the form (lower, upper]. Rows are numbered
+    /// from 1 so a lower bound of 0 will allow all rows up to upper. An
+    /// upper value of u64::MAX is used to mean there is effectively no
+    /// limit.
+    lower: Scalar<PrimitiveArray<UInt64Type>>,
+    upper: Scalar<PrimitiveArray<UInt64Type>>,
+
+    /// The current series key being processed.
+    current_series: Vec<Scalar<ArrayRef>>,
+
+    /// Row counts for each limited expression in the current series.
+    counts: BTreeMap<usize, u64>,
+}
+
+impl SeriesLimitStream {
+    #[expect(clippy::too_many_arguments)]
+    fn try_new(
+        input: SendableRecordBatchStream,
+        schema: SchemaRef,
+        metrics: BaselineMetrics,
+        mut reservation: MemoryReservation,
+        series_expr: Vec<PhysicalExprRef>,
+        limited: Arc<BTreeMap<usize, LimitParams>>,
+        skip: u64,
+        fetch: Option<u64>,
+    ) -> Result<Self> {
+        // Set the initial series to be all nulls.
+        let current_series = series_expr
+            .iter()
+            .map(|expr| expr.data_type(input.schema().as_ref()))
+            .map(|data_type| data_type.map(|data_type| new_null_array(&data_type, 1)))
+            .collect::<Result<Vec<_>>>()?;
+        // Set the initial memory size.
+        reservation.resize(
+            current_series
+                .iter()
+                .map(|arr| arr.get_array_memory_size())
+                .sum::<usize>(),
+        );
+        let current_series = current_series.into_iter().map(Scalar::new).collect();
+        let counts = limited.keys().map(|idx| (*idx, 0u64)).collect();
+        let lower = PrimitiveArray::<UInt64Type>::new_scalar(skip);
+        let upper =
+            PrimitiveArray::<UInt64Type>::new_scalar(fetch.map(|n| n + skip).unwrap_or(u64::MAX));
+        Ok(Self {
+            input,
+            schema,
+            metrics,
+            reservation,
+            series_expr,
+            limited,
+            lower,
+            upper,
+            current_series,
+            counts,
+        })
+    }
+
+    fn process_batch(&mut self, batch: RecordBatch) -> Result<RecordBatch> {
+        let num_rows = batch.num_rows();
+        if num_rows == 0 {
+            return Ok(RecordBatch::new_empty(Arc::clone(&self.schema)));
+        }
+
+        let series_arrs = self
+            .series_expr
+            .iter()
+            .map(|pe| pe.evaluate(&batch))
+            .map(|res| res.and_then(|cv| cv.to_array(num_rows)))
+            .collect::<Result<Vec<_>>>()?;
+
+        // Check if the series has changed compared to the current series.
+        // Short-circuit on first mismatch to avoid unnecessary comparisons.
+        let mut series_changed = false;
+        for (arr, current) in series_arrs.iter().zip(self.current_series.iter()) {
+            let first_value = Scalar::new(arr.slice(0, 1));
+
+            if !arrow::compute::kernels::cmp::eq(&first_value, current)?.value(0) {
+                series_changed = true;
+                break;
+            }
+        }
+
+        if series_changed {
+            // Series has changed; reset counts.
+            for count in &mut self.counts.values_mut() {
+                *count = 0;
+            }
+        }
+
+        // Partition the series.
+        let partitions = partition(&series_arrs)?;
+
+        // All columns that have ignore_nulls as false will produce the
+        // same filter; remember it to avoid recomputing.
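+        // For example, with skip=1 and fetch=2 we have lower=1 and upper=3, so
+        // only rows whose per-series row number n satisfies 1 < n <= 3 (rows 2
+        // and 3 of each series) pass the filter.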
+ let mut respect_nulls_cache: Option<(Arc, u64)> = None; + let mut limited_arrs: BTreeMap = BTreeMap::default(); + let mut filters = Vec::with_capacity(self.limited.len()); + + for (idx, params) in self.limited.iter() { + let LimitParams { + expr, + ignore_nulls, + default_value, + } = params; + let arr = expr.evaluate(&batch)?.into_array(num_rows)?; + + let (filter, count) = match (*ignore_nulls, &respect_nulls_cache) { + (true, _) => { + let (arr, count) = row_number(&arr, self.counts[idx], true, &partitions); + let filter = arrow::compute::and( + &arrow::compute::kernels::cmp::gt(&arr, &self.lower)?, + &arrow::compute::kernels::cmp::lt_eq(&arr, &self.upper)?, + )?; + (Arc::new(filter), count) + } + (false, Some((filter, count))) => (Arc::clone(filter), *count), + (false, None) => { + let (arr, count) = row_number(&arr, self.counts[idx], false, &partitions); + let filter = Arc::new(arrow::compute::and( + &arrow::compute::kernels::cmp::gt(&arr, &self.lower)?, + &arrow::compute::kernels::cmp::lt_eq(&arr, &self.upper)?, + )?); + respect_nulls_cache = Some((Arc::clone(&filter), count)); + (filter, count) + } + }; + limited_arrs.insert( + *idx, + arrow::compute::kernels::zip::zip(&filter, &arr, default_value)?, + ); + filters.push(filter); + self.counts.insert(*idx, count); + } + + // Compute the batch filter efficiently by building it in one pass. + // Instead of folding with or_kleene (which creates N-1 intermediate arrays), + // we build the result directly by checking if any filter is true at each position. + let batch_filter = if filters.is_empty() { + BooleanArray::new_null(num_rows) + } else if filters.len() == 1 { + // Fast path: single filter, no need to combine + Arc::unwrap_or_clone(Arc::clone(&filters[0])) + } else { + // Multiple filters: combine them efficiently + let mut batch_filter_builder = arrow::array::BooleanBuilder::with_capacity(num_rows); + + for row_idx in 0..num_rows { + // Check if any filter is true for this row + let any_true = filters.iter().any(|filter| filter.value(row_idx)); + batch_filter_builder.append_value(any_true); + } + + batch_filter_builder.finish() + }; + + let output_arrs = batch + .into_parts() + .1 + .iter() + .enumerate() + .map(|(idx, arr)| { + if let Some(limited_arr) = limited_arrs.get(&idx) { + limited_arr + } else { + arr + } + }) + .map(|arr| arrow::compute::filter(arr, &batch_filter)) + .collect::, ArrowError>>()?; + + // Store the current series. Tracking the memory use. + for (idx, arr) in series_arrs.iter().enumerate() { + let arr = arr.slice(num_rows - 1, 1); + self.reservation.try_grow(arr.get_array_memory_size())?; + let mut value = Scalar::new(arr); + std::mem::swap(&mut self.current_series[idx], &mut value); + let arr = value.into_inner(); + self.reservation.shrink(arr.get_array_memory_size()); + } + + Ok(RecordBatch::try_new(Arc::clone(&self.schema), output_arrs)?) 
+    }
+}
+
+impl RecordBatchStream for SeriesLimitStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+impl Stream for SeriesLimitStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // Poll the input stream for the next batch
+        match ready!(self.input.poll_next_unpin(cx)) {
+            Some(Ok(batch)) => {
+                // Process the batch through our series limiting logic
+                let elapsed_compute = self.metrics.elapsed_compute().clone();
+                let result = {
+                    let _timer = elapsed_compute.timer();
+                    self.process_batch(batch)
+                };
+                match result {
+                    Ok(output_batch) => {
+                        // Record the number of output rows
+                        self.metrics.record_output(output_batch.num_rows());
+                        Poll::Ready(Some(Ok(output_batch)))
+                    }
+                    Err(e) => Poll::Ready(Some(Err(e))),
+                }
+            }
+            Some(Err(e)) => Poll::Ready(Some(Err(e))),
+            None => Poll::Ready(None),
+        }
+    }
+}
+
+/// Parameters defining how to process a limited column.
+///
+/// `LimitParams` encapsulates the processing rules for a single value column that
+/// has per-series LIMIT/OFFSET constraints applied to it. During query execution,
+/// these parameters control how rows are numbered, filtered, and replaced with
+/// default values.
+struct LimitParams {
+    /// The expression for the limited column.
+    expr: PhysicalExprRef,
+
+    /// Whether to ignore nulls in the limit calculation.
+    ignore_nulls: bool,
+
+    /// The default value to use for filtered-out rows.
+    default_value: Scalar<ArrayRef>,
+}
+
+impl LimitParams {
+    /// Determine if the limited column can be nullable in the output.
+    fn is_nullable(&self, input_nullable: bool) -> bool {
+        let default_nullable = self.default_value.get().0.is_nullable();
+        if self.ignore_nulls {
+            // Any nulls will be replaced by the default value
+            default_nullable
+        } else {
+            // Respect nulls, so nullable if input is nullable or
+            // default is nullable
+            input_nullable || default_nullable
+        }
+    }
+}
+
+impl TryFrom<&PhysicalLimitExpr> for LimitParams {
+    type Error = DataFusionError;
+
+    fn try_from(value: &PhysicalLimitExpr) -> Result<Self, Self::Error> {
+        let default_value = value.default_value.to_scalar().map_err(|e| {
+            DataFusionError::Plan(format!(
+                "PhysicalLimitExpr failed to convert default value to scalar: {}",
+                e
+            ))
+        })?;
+        Ok(Self {
+            expr: Arc::clone(&value.expr),
+            ignore_nulls: value.ignore_nulls,
+            default_value,
+        })
+    }
+}
+
+/// Assigns row numbers to elements in an array, respecting partition boundaries.
+///
+/// This function generates sequential row numbers for each element in the input array,
+/// with special handling for partitions and null values. Row numbers restart at 1 for
+/// each new partition.
+///
+/// # Arguments
+///
+/// * `arr` - The input array for which to generate row numbers. This is typically a
+///   value column, and is used only to check for null values when `ignore_nulls` is true.
+/// * `start` - The starting row number for the first partition. Subsequent partitions
+///   always start at 1. This allows continuing numbering across multiple batches within
+///   the same partition.
+/// * `ignore_nulls` - Controls null handling behavior:
+///   - `false` (RESPECT NULLS): Null values receive row numbers like any other value
+///   - `true` (IGNORE NULLS): Null values are skipped and assigned null row numbers
+/// * `partitions` - Defines the partition boundaries within the array. Each partition
+///   represents a distinct group (e.g., time series) where row numbering should restart.
+/// +/// # Returns +/// +/// Returns a tuple of: +/// * `PrimitiveArray` - An array of row numbers corresponding to each element +/// in the input array. Elements may be null if `ignore_nulls` is true and the corresponding +/// input element is null. +/// * `u64` - The final row number assigned in the last partition. This can be used as the +/// `start` value for subsequent calls to continue numbering within the same partition. +/// +/// # Examples +/// +/// ```text +/// // Single partition, no nulls, starting from 0: +/// arr = [10, 20, 30] +/// partitions = single partition covering all elements +/// result = ([1, 2, 3], 3) +/// +/// // Single partition with RESPECT NULLS: +/// arr = [10, null, 30] +/// ignore_nulls = false +/// result = ([1, 2, 3], 3) +/// +/// // Single partition with IGNORE NULLS: +/// arr = [10, null, 30] +/// ignore_nulls = true +/// result = ([1, null, 2], 2) +/// +/// // Multiple partitions (e.g., two different series): +/// arr = [1, 2, 3, 4, 5, 6] +/// partitions = [0..3, 3..6] (two partitions) +/// result = ([1, 2, 3, 1, 2, 3], 3) +/// // Note: row numbering resets for second partition +/// +/// // Continuing numbering across batches: +/// // Batch 1: +/// arr1 = [10, 20] +/// (result1, last1) = row_number(arr1, 0, false, single_partition) +/// // result1 = [1, 2], last1 = 2 +/// +/// // Batch 2 (same partition continues): +/// arr2 = [30, 40] +/// (result2, last2) = row_number(arr2, last1, false, single_partition) +/// // result2 = [3, 4], last2 = 4 +/// ``` +fn row_number( + arr: &ArrayRef, + start: u64, + ignore_nulls: bool, + partitions: &Partitions, +) -> (PrimitiveArray, u64) { + let mut builder = UInt64Builder::with_capacity(arr.len()); + let mut row_number = start; + + for (idx, range) in partitions.ranges().iter().enumerate() { + if idx > 0 { + row_number = 0; + } + for idx in range.start..range.end { + if ignore_nulls && arr.is_null(idx) { + builder.append_null(); + continue; + } + row_number += 1; + builder.append_value(row_number); + } + } + + (builder.finish(), row_number) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{Int64Array, StringArray}; + use datafusion::physical_expr::expressions::Column; + use insta::assert_snapshot; + + mod physical_limit_expr_tests { + use super::*; + + #[test] + fn test_new() { + let expr = Arc::new(Column::new("value", 2)) as PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(0.0)); + let limit_expr = PhysicalLimitExpr::new(expr, true, default_value.clone()); + + assert!(limit_expr.ignore_nulls); + assert_eq!(limit_expr.default_value, default_value); + } + + #[test] + fn test_display_ignore_nulls() { + let expr = Arc::new(Column::new("value", 2)) as PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(0.0)); + let limit_expr = PhysicalLimitExpr::new(expr, true, default_value); + + let display_str = format!("{}", limit_expr); + assert!(display_str.contains("IGNORE NULLS")); + assert!(display_str.contains("default:")); + assert!(display_str.contains("0")); + } + + #[test] + fn test_display_respect_nulls() { + let expr = Arc::new(Column::new("value", 2)) as PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(99.9)); + let limit_expr = PhysicalLimitExpr::new(expr, false, default_value); + + let display_str = format!("{}", limit_expr); + assert!(display_str.contains("RESPECT NULLS")); + assert!(display_str.contains("default:")); + assert!(display_str.contains("99.9")); + } + + #[test] + fn test_clone() { + let expr = Arc::new(Column::new("value", 2)) as 
PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(0.0)); + let limit_expr = PhysicalLimitExpr::new(expr, true, default_value.clone()); + + let cloned = limit_expr.clone(); + assert_eq!(cloned.ignore_nulls, limit_expr.ignore_nulls); + assert_eq!(cloned.default_value, limit_expr.default_value); + } + + #[test] + fn test_debug() { + let expr = Arc::new(Column::new("value", 2)) as PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(0.0)); + let limit_expr = PhysicalLimitExpr::new(expr, true, default_value); + + let debug_str = format!("{:?}", limit_expr); + assert!(!debug_str.is_empty()); + assert!(debug_str.contains("PhysicalLimitExpr")); + } + } + + mod series_limit_exec_tests { + use super::*; + use arrow::array::Float64Array; + use arrow::compute::SortOptions; + use arrow::datatypes::{Field, Schema}; + use datafusion::common::test_util::batches_to_string; + use datafusion::physical_expr::LexOrdering; + use datafusion::physical_plan::display::DisplayableExecutionPlan; + use datafusion::{ + datasource::{memory::MemorySourceConfig, source::DataSourceExec}, + execution::context::SessionContext, + physical_plan::sorts::sort::SortExec, + }; + use futures::StreamExt; + + fn string_array(vals: I) -> ArrayRef + where + I: IntoIterator, + O: Into>, + S: AsRef, + { + Arc::new(StringArray::from_iter(vals.into_iter().map(|v| v.into()))) + } + + fn int_array(vals: impl IntoIterator>>) -> ArrayRef { + Arc::new(Int64Array::from_iter(vals.into_iter().map(|v| v.into()))) + } + + fn float_array(vals: impl IntoIterator>>) -> ArrayRef { + Arc::new(Float64Array::from_iter(vals.into_iter().map(|v| v.into()))) + } + + fn input_plan( + arrs: impl IntoIterator, impl Into)>, + sort: impl IntoIterator, impl Into>)>, + ) -> Arc { + let columns: Vec<(String, ArrayRef)> = arrs + .into_iter() + .map(|(name, arr)| (name.into(), arr.into())) + .collect(); + let fields = columns + .iter() + .map(|(name, arr)| Field::new(name, arr.data_type().clone(), arr.null_count() > 0)) + .collect::>(); + let schema = Arc::new(Schema::new(fields)); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + columns.into_iter().map(|(_, arr)| arr).collect(), + ) + .unwrap(); + let empty = batch.num_rows() == 0; + let mut plan: Arc = Arc::new(DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(&[vec![batch]], schema, None).unwrap(), + ))); + + let mut sort_it = sort.into_iter().peekable(); + if !empty && sort_it.peek().is_some() { + let sort_expr = LexOrdering::new(sort_it.map(|(name, opts)| { + let name = name.into(); + PhysicalSortExpr::new( + Arc::new(Column::new(&name, plan.schema().index_of(&name).unwrap())), + opts.into().unwrap_or_default(), + ) + })) + .unwrap(); + plan = Arc::new(SortExec::new(sort_expr, plan)); + } + + plan + } + + fn test_input_plan( + tag: I, + time: impl IntoIterator>>, + value: impl IntoIterator>>, + ) -> Arc + where + I: IntoIterator, + O: Into>, + S: AsRef, + { + input_plan( + [ + ("tag", string_array(tag)), + ("time", int_array(time)), + ("value", float_array(value)), + ], + [("tag", None), ("time", None)], + ) + } + + /// Helper to collect all batches from a stream + async fn collect_stream( + mut stream: Pin>, + ) -> Result> { + let mut batches = vec![]; + while let Some(batch) = stream.next().await { + batches.push(batch?); + } + Ok(batches) + } + + #[tokio::test] + async fn test_basic_limit() { + // Test basic LIMIT functionality - limit 2 rows per series + let input = test_input_plan( + ["a", "a", "a", "a", "b", "b", "b", "b"], + [1, 2, 3, 4, 1, 2, 3, 4], + [1.0, 
2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 1 | 1.0 | + | a | 2 | 2.0 | + | b | 1 | 5.0 | + | b | 2 | 6.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_basic_offset() { + // Test basic OFFSET functionality - skip first 2 rows per series + let input = test_input_plan(["a", "a", "a", "a"], [1, 2, 3, 4], [1.0, 2.0, 3.0, 4.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 2, None) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 3 | 3.0 | + | a | 4 | 4.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_limit_and_offset() { + // Test combined LIMIT and OFFSET + let input = test_input_plan( + ["a", "a", "a", "a", "a", "a"], + [1, 2, 3, 4, 5, 6], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + // Skip 2, take 2 + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 2, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 3 | 3.0 | + | a | 4 | 4.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_multiple_series() { + // Test that limits apply independently to each series + let input = test_input_plan( + ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + [1, 2, 3, 1, 2, 3, 1, 2, 3], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = 
vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + // Limit 1 row per series + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(1)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 1 | 1.0 | + | b | 1 | 4.0 | + | c | 1 | 7.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_empty_batch() { + // Test handling of empty batches + let input = test_input_plan(Vec::::new(), Vec::::new(), Vec::::new()); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_with_nulls() { + // Test handling of null values with ignore_nulls = false + let input = test_input_plan( + ["a", "a", "a", "a"], + [1, 2, 3, 4], + [Some(1.0), None, Some(3.0), Some(4.0)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, // RESPECT NULLS + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 1 | 1.0 | + | a | 2 | | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_with_nulls_ignore() { + // Test handling of null values with ignore_nulls = true + let input = test_input_plan( + ["a", "a", "a", "a", "a"], + [1, 2, 3, 4, 5], + [Some(1.0), None, Some(3.0), None, Some(5.0)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + true, // 
IGNORE NULLS + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 1 | 1.0 | + | a | 3 | 3.0 | + +-----+------+-------+ + "#); + } + + #[test] + fn test_execute_invalid_partition() { + // Test that execute returns an error for invalid partition number + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, None) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + + // Try to execute with invalid partition number (only partition 0 exists) + let result = exec.execute(999, task_ctx); + assert!(result.is_err()); + let err_string = result.err().unwrap().to_string(); + assert!(err_string.contains("invalid partition")); + } + + #[test] + fn test_with_new_children_wrong_count_zero() { + // Test with_new_children with 0 children + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = Arc::new( + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, None) + .unwrap(), + ); + + // Try with 0 children + let result = exec.with_new_children(vec![]); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("wrong number of children") + ); + } + + #[test] + fn test_with_new_children_wrong_count_two() { + // Test with_new_children with 2 children + let input1 = test_input_plan(["a"], [1], [1.0]); + let input2 = Arc::clone(&input1); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = Arc::new( + SeriesLimitExec::try_new(input1, series_expr, order_expr, limit_expr, 0, None) + .unwrap(), + ); + + let input3 = + test_input_plan(Vec::::new(), Vec::::new(), Vec::::new()); + // Try with 2 children + let result = exec.with_new_children(vec![input2, input3]); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("wrong number of children") + ); + } + + #[tokio::test] + async fn test_preserve_schema() { + // Test that limits apply independently to each series + let input = input_plan( + [ + ( 
+ "value1", + float_array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]), + ), + ( + "tag1", + string_array(["a", "a", "a", "b", "b", "b", "c", "c", "c"]), + ), + ( + "value2", + float_array([10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0]), + ), + ("time", int_array([1, 2, 3, 1, 2, 3, 1, 2, 3])), + ( + "tag2", + string_array(["A", "A", "B", "B", "B", "C", "C", "C", "D"]), + ), + ], + [("tag1", None), ("tag2", None), ("time", None)], + ); + + let series_expr = vec![ + Arc::new(Column::new("tag1", 1)) as PhysicalExprRef, + Arc::new(Column::new("tag2", 4)) as PhysicalExprRef, + ]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 3)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 0)) as PhysicalExprRef, + false, + ScalarValue::Float64(None), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(None), + ), + ]; + + // Limit 1 row per series + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(1)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +--------+------+--------+------+------+ + | value1 | tag1 | value2 | time | tag2 | + +--------+------+--------+------+------+ + | 1.0 | a | 10.0 | 1 | A | + | 3.0 | a | 30.0 | 3 | B | + | 4.0 | b | 40.0 | 1 | B | + | 6.0 | b | 60.0 | 3 | C | + | 7.0 | c | 70.0 | 1 | C | + | 9.0 | c | 90.0 | 3 | D | + +--------+------+--------+------+------+ + "); + } + + #[test] + fn test_display_as() { + // Test DisplayAs formatting + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 5, Some(10)) + .unwrap(); + + assert_snapshot!(DisplayableExecutionPlan::new(&exec).indent(false), @r" + SeriesLimitExec: series=[tag@0], order=[time@1 ASC], limit_expr=[value@2 RESPECT NULLS (default: 0)], skip=5, fetch=10 + SortExec: expr=[tag@0 ASC, time@1 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } + + #[test] + fn test_empty_series_expr_distribution() { + // Test UnspecifiedDistribution when series_expr is empty + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![]; // Empty! 
+ let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + let distributions = exec.required_input_distribution(); + assert_eq!(distributions.len(), 1); + matches!(distributions[0], Distribution::UnspecifiedDistribution); + } + + #[tokio::test] + async fn test_descending_time_ordering() { + // Test with descending time ordering + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a"])), + ("time", int_array([3, 2, 1])), // Descending time + ("value", float_array([30.0, 20.0, 10.0])), + ], + [ + ("tag", None), + ( + "time", + Some(SortOptions { + descending: true, + nulls_first: true, + }), + ), + ], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: true, + }, + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 3 | 30.0 | + | a | 2 | 20.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_multiple_limited_expressions() { + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a"])), + ("time", int_array([1, 2, 3, 4])), + ("value1", float_array([1.0, 2.0, 3.0, 4.0])), + ("value2", float_array([10.0, 20.0, 30.0, 40.0])), + ], + [("tag", None), ("time", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + // TWO limited expressions + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + ), + ]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+--------+--------+ + | tag | time | value1 | value2 | + +-----+------+--------+--------+ + | a | 1 | 1.0 | 10.0 | + | a | 2 | 2.0 | 20.0 | + +-----+------+--------+--------+ + "#); + } + + #[tokio::test] + async fn test_multiple_limited_expressions_ignore_nulls() { + // Test with multiple limited expressions ignoring nulls + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a"])), 
+ ("time", int_array([1, 2, 3, 4])), + ( + "value1", + float_array([Some(1.0), None, Some(3.0), Some(4.0)]), + ), + ( + "value2", + float_array([Some(10.0), Some(20.0), None, Some(40.0)]), + ), + ], + [("tag", None), ("time", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + // TWO limited expressions + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 2)) as PhysicalExprRef, + true, + ScalarValue::Float64(Some(0.0)), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + true, + ScalarValue::Float64(Some(0.0)), + ), + ]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+--------+--------+ + | tag | time | value1 | value2 | + +-----+------+--------+--------+ + | a | 1 | 1.0 | 10.0 | + | a | 2 | 0.0 | 20.0 | + | a | 3 | 3.0 | 0.0 | + +-----+------+--------+--------+ + "); + } + + #[tokio::test] + async fn test_multiple_limited_expressions_ignore_nulls_with_offset() { + // Test with multiple limited expressions ignoring nulls and with offset + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a"])), + ("time", int_array([1, 2, 3, 4])), + ( + "value1", + float_array([Some(1.0), None, Some(3.0), Some(4.0)]), + ), + ( + "value2", + float_array([Some(10.0), Some(20.0), None, Some(40.0)]), + ), + ], + [("tag", None), ("time", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + // TWO limited expressions + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 2)) as PhysicalExprRef, + true, + ScalarValue::Float64(Some(0.0)), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + true, + ScalarValue::Float64(Some(0.0)), + ), + ]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 1, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+--------+--------+ + | tag | time | value1 | value2 | + +-----+------+--------+--------+ + | a | 2 | 0.0 | 20.0 | + | a | 3 | 3.0 | 0.0 | + | a | 4 | 4.0 | 40.0 | + +-----+------+--------+--------+ + "); + } + + #[tokio::test] + async fn test_multiple_order_expressions() { + // Test with multiple order expressions (e.g., ORDER BY time, value) + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a", "a", "a"])), + ("time", int_array([1, 1, 1, 2, 2, 2])), // Same time values to test secondary ordering + ("tag2", string_array(["c", "c", "c", "c", "c", "c"])), + ("value", float_array([30.0, 20.0, 10.0, 60.0, 50.0, 40.0])), + ], + [ + ("tag", None), + ("time", None), + ("tag2", None), + ( + "value", + Some(SortOptions { + descending: true, + nulls_first: false, + 
}), + ), + ], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("tag2", 2)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+------+-------+ + | tag | time | tag2 | value | + +-----+------+------+-------+ + | a | 1 | c | 30.0 | + | a | 1 | c | 20.0 | + +-----+------+------+-------+ + "); + } + + #[tokio::test] + async fn test_multiple_order_with_multiple_series() { + // Test multiple order expressions with multiple series + let input = input_plan( + [ + ( + "tag", + string_array(["a", "a", "a", "a", "b", "b", "b", "b"]), + ), + ("time", int_array([1, 1, 2, 2, 1, 1, 2, 2])), + ( + "tag2", + string_array(["c", "c", "c", "c", "c", "c", "c", "c"]), + ), + ( + "value", + float_array([20.0, 10.0, 40.0, 30.0, 70.0, 60.0, 90.0, 80.0]), + ), + ], + [("tag", None), ("time", None), ("tag2", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("tag2", 2)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + // Limit to 3 rows per series + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(3)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+------+-------+ + | tag | time | tag2 | value | + +-----+------+------+-------+ + | a | 1 | c | 20.0 | + | a | 1 | c | 10.0 | + | a | 2 | c | 40.0 | + | b | 1 | c | 70.0 | + | b | 1 | c | 60.0 | + | b | 2 | c | 90.0 | + +-----+------+------+-------+ + "); + } + + #[tokio::test] + async fn test_multiple_order_with_offset() { + // Test multiple order expressions with OFFSET + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a", "a"])), + ("time", int_array([1, 1, 2, 2, 3])), + ("tag2", string_array(["c", "c", "c", "c", "c"])), + ("value", float_array([10.0, 20.0, 30.0, 40.0, 50.0])), + ], + [("tag", None), ("time", None), ("tag2", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("tag2", 2)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + 
let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + // Skip 2, take 2 + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 2, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+------+-------+ + | tag | time | tag2 | value | + +-----+------+------+-------+ + | a | 2 | c | 30.0 | + | a | 2 | c | 40.0 | + +-----+------+------+-------+ + "); + } + + #[test] + fn test_required_ordering_multiple_order() { + // Test that required_input_ordering includes all order expressions + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value", 2)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + let expect = OrderingRequirements::new( + LexRequirement::new(vec![ + PhysicalSortRequirement { + expr: Arc::new(Column::new("tag", 0)), + options: None, + }, + PhysicalSortRequirement { + expr: Arc::new(Column::new("time", 1)), + options: Some(SortOptions::default()), + }, + PhysicalSortRequirement { + expr: Arc::new(Column::new("value", 2)), + options: Some(SortOptions { + descending: true, + nulls_first: false, + }), + }, + ]) + .unwrap(), + ); + + let required_ordering = exec.required_input_ordering(); + assert_eq!(required_ordering, vec![Some(expect)]); + } + + #[test] + fn test_display_as_multiple_order() { + // Test DisplayAs formatting with multiple order expressions + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value", 2)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 5, Some(10)) + .unwrap(); + + assert_snapshot!(DisplayableExecutionPlan::new(&exec).indent(false), @r" + SeriesLimitExec: series=[tag@0], order=[time@1 ASC, value@2 DESC NULLS LAST], limit_expr=[value@2 RESPECT NULLS (default: 0)], skip=5, fetch=10 + SortExec: expr=[tag@0 ASC, time@1 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } + + #[test] + fn test_output_ordering_preserved_when_limited_column_not_in_ordering() { + // Test that output ordering is fully preserved when the limited column + // doesn't appear in the input ordering + let input = input_plan( + [ + ("tag", 
string_array(["a"])), + ("time", int_array([1])), + ("value", float_array([1.0])), + ("other", float_array([2.0])), + ], + [("tag", None), ("time", None), ("other", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("other", 3)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + // Limit "value" column which is NOT in the input ordering + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should be fully preserved: tag, time, other + assert_snapshot!( + exec.properties().output_ordering().unwrap(), + @"tag@0 ASC, time@1 ASC, other@3 ASC", + ); + } + + #[test] + fn test_output_ordering_truncated_when_limited_column_in_ordering() { + // Test that output ordering is truncated when a limited column appears + // in the input ordering + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value", float_array([1.0])), + ], + [("tag", None), ("time", None), ("value", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value", 2)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + // Limit "value" column which IS in the input ordering + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should be truncated before "value": only tag, time + assert_snapshot!( + exec.properties().output_ordering().unwrap(), + @"tag@0 ASC, time@1 ASC", + ); + } + + #[test] + fn test_output_ordering_empty_when_first_column_limited() { + // Test that output ordering becomes empty when the first order column + // is a limited column + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value", float_array([1.0])), + ], + [("tag", None), ("value", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("value", 2)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + // Limit "value" column which is the FIRST in the input ordering + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should only have tag (series expr), not value + assert_snapshot!( + exec.properties().output_ordering().unwrap(), + @"tag@0 ASC", + ); + } + + #[test] + fn test_output_ordering_with_no_input_ordering() { + // Test behavior when input has no ordering + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value", 
float_array([1.0])), + ], + Vec::<(&str, Option)>::new(), // No sorting + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should be None since input has no ordering + assert!(exec.properties().output_ordering().is_none()); + } + + #[test] + fn test_output_ordering_with_multiple_limited_columns() { + // Test ordering when multiple columns are limited + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value1", float_array([1.0])), + ("value2", float_array([2.0])), + ("value3", float_array([3.0])), + ], + [ + ("tag", None), + ("time", None), + ("value2", None), + ("value3", None), + ], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value3", 4)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + // Limit multiple columns + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + ), + ]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should be truncated right before value2 (first limited column in ordering) + // Should include: tag, time (but not value2 or value3) + assert_snapshot!( + exec.properties().output_ordering().unwrap(), + @"tag@0 ASC, time@1 ASC", + ); + } + + #[test] + fn test_output_ordering_preserves_sort_options() { + // Test that sort options (descending, nulls_first) are preserved + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value", float_array([1.0])), + ("other", float_array([2.0])), + ], + [ + ( + "tag", + Some(SortOptions { + descending: false, + nulls_first: true, + }), + ), + ( + "time", + Some(SortOptions { + descending: true, + nulls_first: false, + }), + ), + ( + "other", + Some(SortOptions { + descending: false, + nulls_first: false, + }), + ), + ], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("other", 3)) as PhysicalExprRef, + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + 
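// --- Editor's aside (illustrative sketch; not part of this patch) ---
// The output-ordering tests in this module all exercise one rule: the output
// ordering of `SeriesLimitExec` is the input ordering truncated right before
// the first column that is being limited, because nothing after a limited
// column can still be guaranteed sorted. A stand-alone model of that
// truncation over column names; `truncate_ordering` is a hypothetical helper,
// not an IOx API.
fn truncate_ordering<'a>(input_ordering: &[&'a str], limited: &[&str]) -> Vec<&'a str> {
    input_ordering
        .iter()
        .copied()
        // Keep the ordering prefix up to (but excluding) the first limited column.
        .take_while(|col| !limited.contains(col))
        .collect()
}

fn main() {
    // Limited column not in the ordering: ordering fully preserved.
    assert_eq!(
        truncate_ordering(&["tag", "time", "other"], &["value"]),
        vec!["tag", "time", "other"]
    );
    // Limited column inside the ordering: truncated right before it.
    assert_eq!(
        truncate_ordering(&["tag", "time", "value"], &["value"]),
        vec!["tag", "time"]
    );
    // Limited column immediately after the series column: only the prefix remains.
    assert_eq!(truncate_ordering(&["tag", "value"], &["value"]), vec!["tag"]);
}
// --- End editor's aside ---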
// Output ordering should preserve sort options + assert_snapshot!(exec.properties().output_ordering().unwrap(), + @"tag@0 ASC, time@1 DESC NULLS LAST, other@3 ASC NULLS LAST", + ); + } + } + + mod row_number_tests { + use super::*; + + #[test] + fn test_row_number_simple() { + // Single partition - all rows in same group + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, false, &partitions); + + assert_eq!(result, PrimitiveArray::from_iter_values([1_u64, 2, 3])); + assert_eq!(final_count, 3); + } + + #[test] + fn test_row_number_with_start() { + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 10, false, &partitions); + + assert_eq!(result, PrimitiveArray::from_iter_values([11_u64, 12, 13])); + assert_eq!(final_count, 13); + } + + #[test] + fn test_row_number_with_nulls_respect() { + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, false, &partitions); + + // Respect nulls means nulls still get row numbers + assert_eq!(result, PrimitiveArray::from_iter_values([1_u64, 2, 3])); + assert_eq!(final_count, 3); + } + + #[test] + fn test_row_number_with_nulls_ignore() { + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, true, &partitions); + + // Ignore nulls means nulls are skipped in numbering + assert_eq!( + result, + PrimitiveArray::from_iter(vec![Some(1u64), None, Some(2u64)]) + ); + assert_eq!(final_count, 2); + } + + #[test] + fn test_row_number_multiple_partitions() { + // Two partitions - rows grouped by "a" and "b" + let group_arr = + Arc::new(StringArray::from(vec!["a", "a", "a", "b", "b", "b"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, false, &partitions); + + // Numbers start at one for each partition + assert_eq!( + result, + PrimitiveArray::from_iter_values([1_u64, 2, 3, 1, 2, 3]) + ); + assert_eq!(final_count, 3); + } + + #[test] + fn test_row_number_empty() { + let group_arr = Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(Vec::::new())) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, false, &partitions); + + assert_eq!(result, PrimitiveArray::::from_iter_values([])); + assert_eq!(final_count, 0); + } + + #[test] + fn test_row_number_single_partition_with_start_and_nulls() { + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![ + Some(1), + None, + Some(3), + None, + Some(5), + ])) as ArrayRef; + let partitions = 
partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 5, true, &partitions); + + assert_eq!( + result, + PrimitiveArray::from_iter([Some(6_u64), None, Some(7), None, Some(8)]) + ); + assert_eq!(final_count, 8); + } + } +} diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs index 4e860d23..9bb66dcb 100644 --- a/iox_query/src/lib.rs +++ b/iox_query/src/lib.rs @@ -6,8 +6,9 @@ use arrow::{ record_batch::RecordBatch, }; use async_trait::async_trait; -use data_types::{ChunkId, ChunkOrder, TransitionPartitionId}; +use data_types::{ChunkId, ChunkOrder, Namespace, TransitionPartitionId}; use datafusion::{ + common::not_impl_err, error::DataFusionError, physical_plan::{SendableRecordBatchStream, Statistics}, prelude::SessionContext, @@ -167,6 +168,14 @@ pub trait QueryDatabase: Debug + Send + Sync + 'static { include_debug_info_tables: bool, ) -> Result>, DataFusionError>; + /// List all namespaces + async fn list_namespaces( + &self, + _span: Option, + ) -> Result, DataFusionError> { + not_impl_err!("QueryDatabase::list_namespaces is only used in InfluxDB 3 Core/Enterprise") + } + /// Acquire concurrency-limiting semapahore async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit; diff --git a/iox_query/src/physical_optimizer/dedup/split.rs b/iox_query/src/physical_optimizer/dedup/split.rs index 55aa9c58..607f28ac 100644 --- a/iox_query/src/physical_optimizer/dedup/split.rs +++ b/iox_query/src/physical_optimizer/dedup/split.rs @@ -212,7 +212,6 @@ mod tests { test::TestChunk, util::arrow_sort_key_exprs, }; - use data_types::{PartitionHashId, PartitionId, TransitionPartitionId}; use datafusion::{ physical_plan::{expressions::Literal, filter::FilterExec}, scalar::ScalarValue, @@ -335,59 +334,6 @@ mod tests { ); } - #[test] - fn test_different_partitions_with_and_without_hash_ids() { - // Partition without hash ID in the catalog - let legacy_partition_id = 1; - let legacy_transition_partition_id = - TransitionPartitionId::Catalog(PartitionId::new(legacy_partition_id)); - - // Partition with hash ID in the catalog - let transition_partition_id = - TransitionPartitionId::Hash(PartitionHashId::arbitrary_for_testing()); - - let chunk1 = chunk(1).with_partition_id(legacy_transition_partition_id.clone()); - let chunk2 = chunk(2).with_partition_id(transition_partition_id.clone()); - - let chunk3 = chunk(3) - .with_dummy_parquet_file() - .with_partition_id(legacy_transition_partition_id.clone()); - let chunk4 = chunk(4) - .with_dummy_parquet_file() - .with_partition_id(transition_partition_id.clone()); - let chunk5 = chunk(5) - .with_dummy_parquet_file() - .with_partition_id(legacy_transition_partition_id.clone()); - let chunk6 = chunk(6) - .with_dummy_parquet_file() - .with_partition_id(legacy_transition_partition_id.clone()); - let schema = chunk1.schema().clone(); - let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3, chunk4, chunk5, chunk6]); - let mut config = ConfigOptions::default(); - config.execution.target_partitions = 2; - insta::assert_yaml_snapshot!( - OptimizationTest::new_with_config(plan, SplitDedup, &config), - @r#" - input: - - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " UnionExec" - - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - - " DataSourceExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time], file_type=parquet" - output: - Ok: - - " UnionExec" - - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " 
UnionExec" - - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - - " DataSourceExec: file_groups={2 groups: [[3.parquet, 6.parquet], [5.parquet]]}, projection=[field, tag1, tag2, time], file_type=parquet" - - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " UnionExec" - - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - - " DataSourceExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time], file_type=parquet" - "# - ); - } - #[test] fn test_max_split() { let chunk1 = chunk(1) diff --git a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs index 8d934af7..df77132e 100644 --- a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs +++ b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs @@ -1,11 +1,14 @@ use std::sync::Arc; use datafusion::{ - common::tree_node::{Transformed, TreeNode}, + common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}, config::ConfigOptions, error::Result, physical_optimizer::PhysicalOptimizerRule, - physical_plan::{ExecutionPlan, sorts::sort_preserving_merge::SortPreservingMergeExec}, + physical_plan::{ + ExecutionPlan, Partitioning, repartition::RepartitionExec, + sorts::sort_preserving_merge::SortPreservingMergeExec, + }, }; use itertools::Itertools; @@ -146,21 +149,26 @@ fn swap_spm_for_progeval( ) -> Result>> { let ordering_req = original_spm.expr(); - // Step 1: Split and regroup partitioned file scans. Also re-orders the scan partitions. - // This step maximizes our chances of getting a disjoint, nonoverlapping lexical ranges. + // Step 1: Remove any RoundRobin repartition nodes that may interfere with optimization let input = Arc::clone(original_spm.input()) + .transform_down(remove_rr_repartition_if_exists) + .map(|t| t.data)?; + + // Step 2: Split and regroup partitioned file scans. Also re-orders the scan partitions. + // This step maximizes our chances of getting a disjoint, nonoverlapping lexical ranges. + let input = input .transform_down(|plan| split_and_regroup_parquet_files(plan, ordering_req)) .map(|t| t.data)?; - // Step 2: compensate for previous redistribution (for parallelized sorting) passes. + // Step 3: compensate for previous redistribution (for parallelized sorting) passes. let input = merge_partitions_after_parallelized_sorting(input, ordering_req)?; - // Step 3: try to extract the lexical ranges for the input partitions + // Step 4: try to extract the lexical ranges for the input partitions let Some(lexical_ranges) = extract_disjoint_ranges_from_plan(ordering_req, &input)? else { return Ok(Transformed::no(return_unaltered_plan)); }; - // Step 4: if needed, re-order the partitions + // Step 5: if needed, re-order the partitions let ordered_input = if lexical_ranges.indices().is_sorted() { input } else { @@ -171,7 +179,7 @@ fn swap_spm_for_progeval( )?) as Arc }; - // Step 5: Replace SortPreservingMergeExec with ProgressiveEvalExec + // Step 6: Replace SortPreservingMergeExec with ProgressiveEvalExec let progresive_eval_exec = Arc::new(ProgressiveEvalExec::new( ordered_input, Some(lexical_ranges.ordered_ranges().cloned().collect_vec()), @@ -181,6 +189,34 @@ fn swap_spm_for_progeval( Ok(Transformed::yes(progresive_eval_exec)) } +/// Remove any RoundRobin repartition nodes that may interfere with optimization. +/// +/// If the current node is a RepartitionExec with Partitioning::RoundRobinBatch, +/// then remove that node and return its child. 
+fn remove_rr_repartition_if_exists( + plan: Arc, +) -> Result>> { + if let Some(repartition_exec) = plan.as_any().downcast_ref::() + && matches!( + repartition_exec.partitioning(), + Partitioning::RoundRobinBatch(_) + ) + { + // Remove the RoundRobin repartition node and return its child + Ok(Transformed::new( + Arc::clone(repartition_exec.input()), + true, + TreeNodeRecursion::Continue, + )) + } else if plan.as_any().is::() { + // halt at the next SPM. + // that will be considered separately at the root PhysicalOptimizer::optimize(), as it checks per SPM found + Ok(Transformed::new(plan, false, TreeNodeRecursion::Jump)) + } else { + Ok(Transformed::no(plan)) + } +} + #[cfg(test)] mod test { use std::sync::Arc; @@ -424,7 +460,7 @@ mod test { ); } - // No limit & but the input is in the right sort preserving merge struct --> optimize + // No limit & the input is in the right sort preserving merge struct --> optimize #[test] fn test_spm_time_desc() { test_helpers::maybe_start_logging(); @@ -488,6 +524,74 @@ mod test { ); } + // No limit & the input is in the right sort preserving merge struct + // has a rr repartitoning --> should remove + // then --> optimize + #[test] + fn test_spm_time_desc_rr_repartition() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let sort_exprs = [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ]; + + let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); + let plan_parquet2 = PlanBuilder::data_source_exec_parquet(&schema, 2001, 3000); + let plan_batches = PlanBuilder::record_batches_exec(2, 2500, 3500); + + let plan_sort1 = plan_batches.sort(sort_exprs); + let plan_union_1 = plan_sort1.union(plan_parquet2); + let plan_spm_for_dedupe = plan_union_1.sort_preserving_merge(sort_exprs); + let plan_dedupe = plan_spm_for_dedupe.deduplicate(sort_exprs, false); + + let sort_exprs = [("time", SortOp::Desc)]; + let plan_sort1 = plan_parquet.sort(sort_exprs); + let plan_sort2 = plan_dedupe.sort(sort_exprs); + + let plan_union_2 = plan_sort1.union(plan_sort2); + let repartioned = plan_union_2.round_robin_repartition(4); + + let plan_spm = repartioned.sort_preserving_merge(sort_exprs); + + // Output plan: rr Repartition will be removed + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm.build(), opt), + @r#" + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(2001)->(3500), 
(1000)->(2000)]" + - " ReorderPartitionsExec: mapped_partition_indices=[1, 0]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + "# + ); + } + // No limit & but the input is in the right sort preserving merge struct --> optimize #[test] fn test_spm_non_time_desc() { @@ -1206,13 +1310,9 @@ mod test { ); } - // ------------------------------------------------------------------ - // Negative tests: the right structure not found -> nothing optimized - // ------------------------------------------------------------------ - - // Right stucture but sort on 2 columns --> plan stays the same + // Right stucture and sort on 2 columns --> optimize #[test] - fn test_negative_spm_2_column_sort_desc() { + fn test_spm_2_column_sort_desc() { test_helpers::maybe_start_logging(); // plan: @@ -1272,104 +1372,10 @@ mod test { ); } - // No limit & random plan --> plan stay the same - #[test] - fn test_negative_no_limit() { - test_helpers::maybe_start_logging(); - - let schema = schema(); - let sort_exprs = [ - ("col2", SortOp::Asc), - ("col1", SortOp::Asc), - ("time", SortOp::Asc), - ]; - - let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); - let plan_batches = PlanBuilder::record_batches_exec(2, 1500, 2500); - - let plan = plan_batches - .union(plan_parquet) - .round_robin_repartition(8) - .hash_repartition(vec!["col2", "col1", "time"], 8) - .sort(sort_exprs) - .deduplicate(sort_exprs, true); - - // input and output are the same - let opt = OrderUnionSortedInputs; - insta::assert_yaml_snapshot!( - OptimizationTest::new(plan.build(), opt), - @r#" - input: - - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" - - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" - - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" - - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" - - " UnionExec" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - output: - Ok: - - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" - - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" - - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" - - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" - - " 
UnionExec" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - "# - ); - } - - // has limit but no sort preserving merge --> plan stay the same - #[test] - fn test_negative_limit_no_preserving_merge() { - test_helpers::maybe_start_logging(); - - let plan_batches1 = PlanBuilder::record_batches_exec(1, 1000, 2000); - let plan_batches2 = PlanBuilder::record_batches_exec(3, 2001, 3000); - let plan_batches3 = PlanBuilder::record_batches_exec(2, 2500, 3500); - - let plan_union_1 = plan_batches2.union(plan_batches3); - - let sort_exprs = [("time", SortOp::Desc)]; - let plan_sort1 = plan_batches1.sort(sort_exprs); - let plan_sort2 = plan_union_1.sort(sort_exprs); - - let plan_union_2 = plan_sort1.union(plan_sort2); - - let plan_limit = plan_union_2.limit(0, Some(1)); - - // input and output are the same - let opt = OrderUnionSortedInputs; - insta::assert_yaml_snapshot!( - OptimizationTest::new(plan_limit.build(), opt), - @r#" - input: - - " GlobalLimitExec: skip=0, fetch=1" - - " UnionExec" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " UnionExec" - - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - output: - Ok: - - " GlobalLimitExec: skip=0, fetch=1" - - " UnionExec" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " UnionExec" - - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - "# - ); - } - - // right structure and same sort order but inputs of uion overlap --> plan stay the same + // right structure and same sort order + // inputs of union touch, but do not overlap --> optimize #[test] - fn test_negative_overlap() { + fn test_touching_ranges() { test_helpers::maybe_start_logging(); // Input plan: @@ -1442,64 +1448,10 @@ mod test { ); } - // No limit & but the input is in the right union struct --> plan stay the same + // Projection expression (field + field) + // but the sort order is not on field, only time ==> optimize #[test] - fn test_negative_no_sortpreservingmerge_input_union() { - test_helpers::maybe_start_logging(); - - // plan: - // UnionExec - // SortExec: expr=[time@2 DESC] - // DataSourceExec - // SortExec: expr=[time@2 DESC] - // UnionExec - // RecordBatchesExec - // DataSourceExec - - let schema = schema(); - - let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); - let plan_parquet2 = PlanBuilder::data_source_exec_parquet(&schema, 2001, 3000); - let plan_batches = PlanBuilder::record_batches_exec(2, 2500, 3500); - - let plan_union_1 = plan_batches.union(plan_parquet2); - - let sort_exprs = [("time", SortOp::Desc)]; - - let plan_sort1 = plan_parquet.sort(sort_exprs); - let plan_sort2 = plan_union_1.sort(sort_exprs); - - let plan_union_2 = plan_sort1.union(plan_sort2); - - // input and output are the 
same - let opt = OrderUnionSortedInputs; - insta::assert_yaml_snapshot!( - OptimizationTest::new(plan_union_2.build(), opt), - @r#" - input: - - " UnionExec" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " UnionExec" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - output: - Ok: - - " UnionExec" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " UnionExec" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - "# - ); - } - - // Projection expression (field + field) ==> not optimze. Plan stays the same - #[test] - fn test_negative_spm_time_desc_with_dedupe_and_proj_on_expr() { + fn test_spm_time_desc_with_dedupe_and_proj_on_expr() { test_helpers::maybe_start_logging(); // plan: @@ -1629,6 +1581,164 @@ mod test { ); } + // ------------------------------------------------------------------ + // Negative tests: the right structure not found -> nothing optimized + // ------------------------------------------------------------------ + + // No limit & random plan --> plan stay the same + #[test] + fn test_negative_no_limit() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let sort_exprs = [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ]; + + let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); + let plan_batches = PlanBuilder::record_batches_exec(2, 1500, 2500); + + let plan = plan_batches + .union(plan_parquet) + .round_robin_repartition(8) + .hash_repartition(vec!["col2", "col1", "time"], 8) + .sort(sort_exprs) + .deduplicate(sort_exprs, true); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan.build(), opt), + @r#" + input: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS 
LAST], preserve_partitioning=[false]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + "# + ); + } + + // has limit but no sort preserving merge --> plan stay the same + #[test] + fn test_negative_limit_no_preserving_merge() { + test_helpers::maybe_start_logging(); + + let plan_batches1 = PlanBuilder::record_batches_exec(1, 1000, 2000); + let plan_batches2 = PlanBuilder::record_batches_exec(3, 2001, 3000); + let plan_batches3 = PlanBuilder::record_batches_exec(2, 2500, 3500); + + let plan_union_1 = plan_batches2.union(plan_batches3); + + let sort_exprs = [("time", SortOp::Desc)]; + let plan_sort1 = plan_batches1.sort(sort_exprs); + let plan_sort2 = plan_union_1.sort(sort_exprs); + + let plan_union_2 = plan_sort1.union(plan_sort2); + + let plan_limit = plan_union_2.limit(0, Some(1)); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit.build(), opt), + @r#" + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "# + ); + } + + // No limit & but the input is in the right union struct --> plan stay the same + #[test] + fn test_negative_no_sortpreservingmerge_input_union() { + test_helpers::maybe_start_logging(); + + // plan: + // UnionExec + // SortExec: expr=[time@2 DESC] + // DataSourceExec + // SortExec: expr=[time@2 DESC] + // UnionExec + // RecordBatchesExec + // DataSourceExec + + let schema = schema(); + + let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); + let plan_parquet2 = PlanBuilder::data_source_exec_parquet(&schema, 2001, 3000); + let plan_batches = PlanBuilder::record_batches_exec(2, 2500, 3500); + + let plan_union_1 = plan_batches.union(plan_parquet2); + + let sort_exprs = [("time", SortOp::Desc)]; + + let plan_sort1 = plan_parquet.sort(sort_exprs); + let plan_sort2 = plan_union_1.sort(sort_exprs); + + let plan_union_2 = plan_sort1.union(plan_sort2); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_union_2.build(), opt), + @r#" + input: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " 
DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + output: + Ok: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + "# + ); + } + + // ------------------------------------------------------------------ + // Many partitioned files tests + // ------------------------------------------------------------------ + // Reproduce of https://github.com/influxdata/influxdb_iox/issues/12461#issuecomment-2430196754 // The reproducer needs big non-overlapped files so its first physical plan will have DataSourceExec with multiple // file groups, each file group has multiple partitioned files. diff --git a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs_for_constants.rs b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs_for_constants.rs index 1ece2ce9..5237b4da 100644 --- a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs_for_constants.rs +++ b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs_for_constants.rs @@ -327,6 +327,52 @@ mod test { ); } + // Under sort preserving merge is not UnionExec, + // although the new optimizer can handle it. 
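// --- Editor's aside (illustrative sketch; not part of this patch) ---
// The positive tests around here rely on the same precondition: a
// SortPreservingMergeExec can be swapped for ProgressiveEvalExec only when its
// input partitions cover non-overlapping value ranges that are already emitted
// in order, so "merging" degenerates into concatenating the inputs. The real
// optimizer extracts lexical ranges over the full (possibly descending,
// multi-column) sort key and may reorder partitions first; this is a
// simplified ascending, single-column model, and
// `ranges_allow_progressive_eval` is a hypothetical helper, not an IOx API.
fn ranges_allow_progressive_eval(ranges: &[(i64, i64)]) -> bool {
    // Every adjacent pair must be disjoint and already in ascending order.
    // A single input (as in the test directly below) trivially qualifies.
    ranges.windows(2).all(|w| {
        let (_, prev_max) = w[0];
        let (next_min, _) = w[1];
        prev_max < next_min
    })
}

fn main() {
    // Touching but non-overlapping inputs can be concatenated.
    assert!(ranges_allow_progressive_eval(&[(1000, 2000), (2001, 3000)]));
    // Overlapping inputs still need a real merge.
    assert!(!ranges_allow_progressive_eval(&[(1000, 2500), (2000, 3000)]));
}
// --- End editor's aside ---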
+ #[test] + fn test_replace_spm_with_no_union_under_spm() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let sort_order = sort_order_for_sort(); + + // First sort on parquet file + let plan_parquet = data_source_exec_parquet_with_value_range(&schema, 1000, 2000); + let plan_projection_1 = Arc::new( + ProjectionExec::try_new( + projection_expr_with_2_constants("m1", "tag0", &schema), + plan_parquet, + ) + .unwrap(), + ); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_projection_1)); + + // add sort preserving merge on top + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order_for_sort_preserving_merge(), + plan_sort1, + )); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r#" + input: + - " SortPreservingMergeExec: [iox::measurement@0 ASC NULLS LAST, key@1 ASC NULLS LAST, value@2 ASC NULLS LAST]" + - " SortExec: expr=[value@2 ASC NULLS LAST], preserve_partitioning=[false]" + - " ProjectionExec: expr=[m1 as iox::measurement, tag0 as key, tag0@1 as value]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[tag2, tag0, tag1, field1, time, __chunk_order], output_ordering=[__chunk_order@5 ASC], file_type=parquet" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(m1,tag0)->(m1,tag0)]" + - " SortExec: expr=[value@2 ASC NULLS LAST], preserve_partitioning=[false]" + - " ProjectionExec: expr=[m1 as iox::measurement, tag0 as key, tag0@1 as value]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[tag2, tag0, tag1, field1, time, __chunk_order], output_ordering=[__chunk_order@5 ASC], file_type=parquet" + "# + ); + } + // ------------------------------------------------------------------ // Negative tests: wrong structure -> not optimized // ------------------------------------------------------------------ @@ -395,52 +441,6 @@ mod test { ); } - // Under sort preserving merge is not UnionExec, - // altho the new optimizer can handle it. 
- #[test] - fn test_replace_spm_with_no_union_under_spm() { - test_helpers::maybe_start_logging(); - - let schema = schema(); - let sort_order = sort_order_for_sort(); - - // First sort on parquet file - let plan_parquet = data_source_exec_parquet_with_value_range(&schema, 1000, 2000); - let plan_projection_1 = Arc::new( - ProjectionExec::try_new( - projection_expr_with_2_constants("m1", "tag0", &schema), - plan_parquet, - ) - .unwrap(), - ); - let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_projection_1)); - - // add sort preserving merge on top - let plan_spm = Arc::new(SortPreservingMergeExec::new( - sort_order_for_sort_preserving_merge(), - plan_sort1, - )); - - // input and output are the same - let opt = OrderUnionSortedInputs; - insta::assert_yaml_snapshot!( - OptimizationTest::new(plan_spm, opt), - @r#" - input: - - " SortPreservingMergeExec: [iox::measurement@0 ASC NULLS LAST, key@1 ASC NULLS LAST, value@2 ASC NULLS LAST]" - - " SortExec: expr=[value@2 ASC NULLS LAST], preserve_partitioning=[false]" - - " ProjectionExec: expr=[m1 as iox::measurement, tag0 as key, tag0@1 as value]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[tag2, tag0, tag1, field1, time, __chunk_order], output_ordering=[__chunk_order@5 ASC], file_type=parquet" - output: - Ok: - - " ProgressiveEvalExec: input_ranges=[(m1,tag0)->(m1,tag0)]" - - " SortExec: expr=[value@2 ASC NULLS LAST], preserve_partitioning=[false]" - - " ProjectionExec: expr=[m1 as iox::measurement, tag0 as key, tag0@1 as value]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[tag2, tag0, tag1, field1, time, __chunk_order], output_ordering=[__chunk_order@5 ASC], file_type=parquet" - "# - ); - } - // Under Union is not all SortExec #[test] fn test_negative_not_all_sorts_under_union() { diff --git a/iox_query/src/query_log.rs b/iox_query/src/query_log.rs index 929b26c3..cfd24b35 100644 --- a/iox_query/src/query_log.rs +++ b/iox_query/src/query_log.rs @@ -8,7 +8,7 @@ use datafusion::physical_plan::{ ExecutionPlan, metrics::{MetricValue, MetricsSet}, }; -use influxdb_iox_client::write::Client as WriteClient; +use influxdb_iox_client::batched_write::MaybeBatchedWriteClient as WriteClient; use influxdb_line_protocol::LineProtocolBuilder; use iox_query_params::StatementParams; use iox_time::{Time, TimeProvider}; @@ -230,6 +230,7 @@ impl QueryLogEntryState { let mut lp = builder .measurement(measurement_name) + .tag("id", &self.id.to_string()) .tag("namespace_id", &self.namespace_id.get().to_string()) .tag("namespace_name", &self.namespace_name) .tag("query_type", self.query_type) @@ -1477,7 +1478,7 @@ mod test_super { insta::assert_snapshot!( format_line_protocol(&lp), - @r#"query_log_test,namespace_id=1,namespace_name=ns,query_type=sql,phase=cancel running="false",success="false",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i,end_to_end_duration_ns=0u 1000000000000000000"# + @r#"query_log_test,id=00000000-0000-0000-0000-000000000001,namespace_id=1,namespace_name=ns,query_type=sql,phase=cancel running="false",success="false",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i,end_to_end_duration_ns=0u 1000000000000000000"# ); } @@ -1510,7 +1511,7 @@ mod test_super { let lp = lp_builder.build(); insta::assert_snapshot!( format_line_protocol(&lp), - @r#"query_log_test,namespace_id=1,namespace_name=ns,query_type=sql,phase=success running="false",success="true",query_text="SELECT 1",query_params="Params { 
}",query_issue_time_ns=100000000i,partition_count=0u,parquet_file_count=0u,permit_duration_ns=2000000u,plan_duration_ns=1000000u,execute_duration_ns=5000000u,end_to_end_duration_ns=8000000u,compute_duration_ns=1337000000u,max_memory_bytes=0i,ingester_latency_to_plan_ns=0u,ingester_latency_to_full_data_ns=0u,ingester_response_row_count=0u,ingester_response_size_bytes=0u,ingester_partition_count=0u 1000000000000000000"#); + @r#"query_log_test,id=00000000-0000-0000-0000-000000000001,namespace_id=1,namespace_name=ns,query_type=sql,phase=success running="false",success="true",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i,partition_count=0u,parquet_file_count=0u,permit_duration_ns=2000000u,plan_duration_ns=1000000u,execute_duration_ns=5000000u,end_to_end_duration_ns=8000000u,compute_duration_ns=1337000000u,max_memory_bytes=0i,ingester_latency_to_plan_ns=0u,ingester_latency_to_full_data_ns=0u,ingester_response_row_count=0u,ingester_response_size_bytes=0u,ingester_partition_count=0u 1000000000000000000"#); } #[test] @@ -1550,7 +1551,7 @@ mod test_super { insta::assert_snapshot!( format_line_protocol(&lp), - @r#"query_log_test,namespace_id=1,namespace_name=ns,query_type=sql,phase=received,auth_id=user123,trace_id=42 running="true",success="false",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i 1000000000000000000"# + @r#"query_log_test,id=00000000-0000-0000-0000-000000000001,namespace_id=1,namespace_name=ns,query_type=sql,phase=received,auth_id=user123,trace_id=42 running="true",success="false",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i 1000000000000000000"# ); } diff --git a/iox_query/src/statistics/partition_statistics/mod.rs b/iox_query/src/statistics/partition_statistics/mod.rs index 92371a0b..a2369506 100644 --- a/iox_query/src/statistics/partition_statistics/mod.rs +++ b/iox_query/src/statistics/partition_statistics/mod.rs @@ -11,6 +11,7 @@ use datafusion::{ coalesce_partitions::CoalescePartitionsExec, coop::CooperativeExec, empty::EmptyExec, + expressions::Column, filter::FilterExec, limit::{GlobalLimitExec, LocalLimitExec}, placeholder_row::PlaceholderRowExec, @@ -157,16 +158,19 @@ impl PartitionStatistics for ProjectionExec { |mut acc, child| { let child_stats = statistics_by_partition(child.as_ref())?; - let child_stats_with_project_exec_projected = - child_stats.into_iter().map(|stats| { - proj_exec_stats( - Arc::unwrap_or_clone(stats), - self.expr().iter(), - &self.schema(), - ) - }); - - acc.extend(child_stats_with_project_exec_projected); + let child_stats_with_project_exec_projected: Result, DataFusionError> = + child_stats + .into_iter() + .map(|stats| { + proj_exec_stats( + Arc::unwrap_or_clone(stats), + self.expr().iter(), + &self.schema(), + ) + }) + .collect(); + + acc.extend(child_stats_with_project_exec_projected?); Ok::(acc) }, )?; @@ -270,27 +274,52 @@ impl PartitionStatistics for AggregateExec { fn statistics_by_partition(&self) -> Result { if self.aggr_expr().is_empty() { let inner_stats_per_partition = statistics_by_partition(self.input.as_ref())?; + let input_schema = self.input.schema(); - Ok(inner_stats_per_partition + inner_stats_per_partition .iter() .map(|stats| { - // only retain the min/max per column - // whereas the remaining stats can be changed by the grouping - Arc::new(Statistics { + // Create column statistics for each output GROUP BY expression + let column_statistics: Result, DataFusionError> = self + .output_group_expr() + .iter() + .map(|group_expr| { + 
// Check if this group expression corresponds to an input column + if let Some(input_col_idx) = group_expr + .as_any() + .downcast_ref::() + .and_then(|col| input_schema.index_of(col.name()).ok()) + { + // This is a direct column reference, use existing statistics + if input_col_idx < stats.column_statistics.len() { + let col_stats = &stats.column_statistics[input_col_idx]; + Ok(ColumnStatistics { + min_value: col_stats.min_value.clone(), + max_value: col_stats.max_value.clone(), + ..Default::default() + }) + } else { + // Input column index out of bounds - this should not happen + Err(internal_datafusion_err!( + "Column index {input_col_idx} out of bounds in partition statistics (available columns: {}, column found in schema)", + stats.column_statistics.len() + )) + } + } else { + // This is a computed expression (like date_part), return unknown stats + Ok(ColumnStatistics::default()) + } + }) + .collect(); + + let column_statistics = column_statistics?; + Ok(Arc::new(Statistics { num_rows: Precision::Absent, total_byte_size: Precision::Absent, - column_statistics: stats - .column_statistics - .iter() - .map(|col_stats| ColumnStatistics { - min_value: col_stats.min_value.clone(), - max_value: col_stats.max_value.clone(), - ..Default::default() - }) - .collect(), - }) + column_statistics, + })) }) - .collect()) + .collect() } else { // if aggr expr is not empty, then the projected values (per column) could be different Ok(unknown_statistics_by_partition(self)) diff --git a/iox_query/src/statistics/partition_statistics/project_schema.rs b/iox_query/src/statistics/partition_statistics/project_schema.rs index 22d199d6..d235b7d5 100644 --- a/iox_query/src/statistics/partition_statistics/project_schema.rs +++ b/iox_query/src/statistics/partition_statistics/project_schema.rs @@ -232,14 +232,26 @@ pub(super) fn proj_exec_stats<'a>( mut stats: Statistics, exprs: impl Iterator, String)>, projexec_schema: &SchemaRef, -) -> Arc { +) -> Result> { let mut primitive_row_size = 0; let mut primitive_row_size_possible = true; let mut column_statistics = vec![]; for (expr, _) in exprs { let col_stats = if let Some(col) = expr.as_any().downcast_ref::() { // handle columns in schema - stats.column_statistics[col.index()].clone() + let col_index = col.index(); + if col_index >= stats.column_statistics.len() { + return Err(internal_datafusion_err!( + "Column index {} out of bounds in partition statistics projection \ + (available columns: {}, column name: '{}'). 
\ + This indicates a schema mismatch between projection expressions and input statistics.", + col_index, + stats.column_statistics.len(), + col.name() + )); + } else { + stats.column_statistics[col_index].clone() + } } else if let Some(lit_expr) = expr.as_any().downcast_ref::() { // handle constants match lit_expr.value() { @@ -277,7 +289,7 @@ pub(super) fn proj_exec_stats<'a>( stats.total_byte_size = Precision::Exact(primitive_row_size).multiply(&stats.num_rows); } stats.column_statistics = column_statistics; - Arc::new(stats) + Ok(Arc::new(stats)) } #[cfg(test)] @@ -387,7 +399,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to project all columns" @@ -457,7 +470,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to remove and re-order columns" @@ -549,7 +563,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to handle schema with aliases" @@ -717,7 +732,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to handle schema with same-named fields" @@ -789,7 +805,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to handle schema with same-named fields, reversed ordering" @@ -816,7 +833,8 @@ mod tests { Arc::unwrap_or_clone(Arc::clone(&src_stats)), exprs.iter(), &src_schema, - ); + ) + .unwrap(); assert_eq!( actual, src_stats, "proj_exec_stats should extract the proper columns from the physical exprs" @@ -831,7 +849,8 @@ mod tests { Arc::unwrap_or_clone(Arc::clone(&src_stats)), exprs.iter(), &src_schema, - ); + ) + .unwrap(); // min/max are the constants assert_eq!( actual.column_statistics[0].min_value.get_value(), @@ -881,7 +900,8 @@ mod tests { (lit(ScalarValue::Null), "col_a".to_string()), (lit(ScalarValue::Null), "col_b".to_string()), ]; - let actual = proj_exec_stats(Arc::unwrap_or_clone(src_stats), exprs.iter(), &src_schema); + let actual = + proj_exec_stats(Arc::unwrap_or_clone(src_stats), exprs.iter(), &src_schema).unwrap(); // min/max are the constants assert_eq!( actual.column_statistics[0].min_value.get_value(), diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs index fa516714..5b140500 100644 --- a/iox_query/src/test.rs +++ b/iox_query/src/test.rs @@ -19,7 +19,9 @@ use arrow::{ record_batch::RecordBatch, }; use async_trait::async_trait; -use data_types::{ChunkId, ChunkOrder, NamespaceId, PartitionKey, TableId, TransitionPartitionId}; +use data_types::{ + ChunkId, ChunkOrder, Namespace, NamespaceId, PartitionKey, TableId, TransitionPartitionId, +}; use datafusion::error::DataFusionError; use datafusion::logical_expr::Expr; use datafusion::physical_plan::ExecutionPlan; @@ -114,6 +116,28 @@ impl QueryDatabase for TestDatabaseStore { Ok(databases.get(name).cloned().map(|ns| ns as _)) } + async fn list_namespaces( + &self, + _span: Option, + ) -> Result, DataFusionError> { + Ok(self + .databases + .lock() + .iter() + .enumerate() + .map(|(i, (name, db))| Namespace { + id: NamespaceId::new(i as i64), + name: name.to_owned(), + retention_period_ns: db.retention_time_ns, + max_tables: Default::default(), + max_columns_per_table: Default::default(), + deleted_at: 
Default::default(), + partition_template: Default::default(), + router_version: Default::default(), + }) + .collect()) + } + async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit { Arc::clone(&self.query_semaphore) .acquire_owned(span) diff --git a/iox_query_influxql/Cargo.toml b/iox_query_influxql/Cargo.toml index b24ede58..faf44fdf 100644 --- a/iox_query_influxql/Cargo.toml +++ b/iox_query_influxql/Cargo.toml @@ -11,6 +11,7 @@ workspace = true [dependencies] arrow = { workspace = true } assert_matches = "1" +async-trait = { version = "0.1.89", default-features = false } chrono-tz = { version = "0.10" } datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } diff --git a/iox_query_influxql/src/lib.rs b/iox_query_influxql/src/lib.rs index 0d0d8fc8..478f4b5b 100644 --- a/iox_query_influxql/src/lib.rs +++ b/iox_query_influxql/src/lib.rs @@ -11,6 +11,8 @@ mod aggregate; mod error; pub mod frontend; pub mod plan; +pub mod show_databases; +pub mod show_retention_policies; mod window; /// A list of the numeric types supported by InfluxQL that can be be used diff --git a/iox_query_influxql/src/plan/planner.rs b/iox_query_influxql/src/plan/planner.rs index 0d37e757..f95dfdd4 100644 --- a/iox_query_influxql/src/plan/planner.rs +++ b/iox_query_influxql/src/plan/planner.rs @@ -100,7 +100,7 @@ use iox_query::analyzer::default_return_value_for_aggr_fn; use iox_query::analyzer::range_predicate::find_time_range; use iox_query::config::{IoxConfigExt, MetadataCutoff}; use iox_query::exec::IOxSessionContext; -use iox_query::exec::gapfill::{FillStrategy, GapFill, GapFillParams}; +use iox_query::exec::gapfill::{FillExpr, FillStrategy, GapFill}; use iox_query_params::StatementParams; use itertools::Itertools; use query_functions::date_bin_wallclock::DateBinWallclockUDF; @@ -1580,7 +1580,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { ctx.group_by.and_then(|gb| gb.time_dimension()), fill_strategy, ) { - build_gap_fill_node(plan, time_column, fill_strategy, &ctx.projection_type)? + build_gap_fill_node(plan, fill_strategy, &ctx.projection_type)? } else { plan }; @@ -2291,106 +2291,10 @@ impl<'a> InfluxQLToLogicalPlan<'a> { iql: &IQLExpr, schema: &IQLSchema<'_>, ) -> Result { - let df_schema = &schema.df_schema; match iql { // rewriter is expected to expand wildcard expressions IQLExpr::Wildcard(_) => error::internal("unexpected wildcard in projection"), - IQLExpr::VarRef(VarRef { - name, - data_type: opt_dst_type, - }) => { - Ok(match (scope, name.as_str()) { - // Per the Go implementation, the time column is case-insensitive in the - // `WHERE` clause and disregards any postfix type cast operator. 
- // - // See: https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L5751-L5753 - (ExprScope::Where, name) if name.eq_ignore_ascii_case("time") => { - "time".as_expr() - } - (ExprScope::Projection, "time") => "time".as_expr(), - (_, name) => match df_schema - .fields_with_unqualified_name(name) - .first() - .map(|f| f.data_type().clone()) - { - Some(src_type) => { - let column = name.as_expr(); - - match opt_dst_type.and_then(var_ref_data_type_to_data_type) { - Some(dst_type) => { - fn is_numeric(dt: &DataType) -> bool { - matches!( - dt, - DataType::Int64 | DataType::Float64 | DataType::UInt64 - ) - } - - if src_type == dst_type { - column - } else if is_numeric(&src_type) && is_numeric(&dst_type) { - // InfluxQL only allows casting between numeric types, - // and it is safe to unconditionally unwrap, as the - // `is_numeric_type` call guarantees it can be mapped to - // an Arrow DataType - column.cast_to(&dst_type, &schema.df_schema)? - } else { - // If the cast is incompatible, evaluates to NULL - Expr::Literal(ScalarValue::Null, None) - } - } - None => column, - } - } - _ => { - // For non-existent columns, we need to check if the user specified a gap-filling value. - // See [`VirtualColumnFillConfig`] for more details. - match fill_config { - Some(VirtualColumnFillConfig { - fill_clause: Some(FillClause::Value(n)), - data_type, - }) => { - // The user specified a gap-filling value - match data_type { - Some(InfluxColumnType::Field(InfluxFieldType::Integer)) => { - Expr::Literal( - number_to_scalar(n, &DataType::Int64)?, - None, - ) - } - Some(InfluxColumnType::Field(InfluxFieldType::Float)) => { - Expr::Literal( - number_to_scalar(n, &DataType::Float64)?, - None, - ) - } - Some(InfluxColumnType::Tag) => { - // Do not gap-fill tags - Expr::Literal(ScalarValue::Null, None) - } - _ => { - match n { - // Default to the data type of the gap-filling value - Number::Integer(_) => Expr::Literal( - number_to_scalar(n, &DataType::Int64)?, - None, - ), - Number::Float(_) => Expr::Literal( - number_to_scalar(n, &DataType::Float64)?, - None, - ), - } - } - } - } - _ => { - // No gap-filling config or value, return NULL - Expr::Literal(ScalarValue::Null, None) - } - } - } - }, - }) - } + IQLExpr::VarRef(varref) => self.varref_to_df_expr(fill_config, scope, varref, schema), IQLExpr::BindParameter(id) => { let err = BindParameterError::NotDefined(id.to_string()); error::params(err.to_string()) @@ -2425,6 +2329,101 @@ impl<'a> InfluxQLToLogicalPlan<'a> { } } + /// Map an InfluxQL variable reference to a DataFusion expression. + fn varref_to_df_expr( + &self, + fill_config: &Option, + scope: ExprScope, + varref: &VarRef, + schema: &IQLSchema<'_>, + ) -> Result { + let df_schema = &schema.df_schema; + let VarRef { + name, + data_type: opt_dst_type, + } = varref; + Ok(match (scope, name.as_str()) { + // Per the Go implementation, the time column is case-insensitive in the + // `WHERE` clause and disregards any postfix type cast operator. 
+ // + // See: https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L5751-L5753 + (ExprScope::Where, name) if name.eq_ignore_ascii_case("time") => "time".as_expr(), + (ExprScope::Projection, "time") => "time".as_expr(), + (_, name) => match df_schema + .fields_with_unqualified_name(name) + .first() + .map(|f| f.data_type().clone()) + { + Some(src_type) => { + let column = name.as_expr(); + + match opt_dst_type.and_then(var_ref_data_type_to_data_type) { + Some(dst_type) => { + fn is_numeric(dt: &DataType) -> bool { + matches!(dt, DataType::Int64 | DataType::Float64 | DataType::UInt64) + } + + if src_type == dst_type { + column + } else if is_numeric(&src_type) && is_numeric(&dst_type) { + // InfluxQL only allows casting between numeric types, + // and it is safe to unconditionally unwrap, as the + // `is_numeric_type` call guarantees it can be mapped to + // an Arrow DataType + column.cast_to(&dst_type, &schema.df_schema)? + } else { + // If the cast is incompatible, evaluates to NULL + Expr::Literal(ScalarValue::Null, None) + } + } + None => column, + } + } + _ => { + // For non-existent columns, we need to check if the user specified a gap-filling value. + // See [`VirtualColumnFillConfig`] for more details. + match fill_config { + Some(VirtualColumnFillConfig { + fill_clause: Some(FillClause::Value(n)), + data_type, + }) => { + // The user specified a gap-filling value + match data_type { + Some(InfluxColumnType::Field(InfluxFieldType::Integer)) => { + Expr::Literal(number_to_scalar(n, &DataType::Int64)?, None) + } + Some(InfluxColumnType::Field(InfluxFieldType::Float)) => { + Expr::Literal(number_to_scalar(n, &DataType::Float64)?, None) + } + Some(InfluxColumnType::Tag) => { + // Do not gap-fill tags + Expr::Literal(ScalarValue::Null, None) + } + _ => { + match n { + // Default to the data type of the gap-filling value + Number::Integer(_) => Expr::Literal( + number_to_scalar(n, &DataType::Int64)?, + None, + ), + Number::Float(_) => Expr::Literal( + number_to_scalar(n, &DataType::Float64)?, + None, + ), + } + } + } + } + _ => { + // No gap-filling config or value, return NULL + Expr::Literal(ScalarValue::Null, None) + } + } + } + }, + }) + } + /// Map an InfluxQL function call to a DataFusion expression. /// /// A full list of supported functions available via the [InfluxQL documentation][docs]. @@ -3891,37 +3890,64 @@ impl<'a> InfluxQLToLogicalPlan<'a> { /// /// # Arguments /// -/// * `input` - An aggregate plan which requires gap-filling. -/// * `time_column` - The `date_bin` expression. -/// * `fill_strategy` - The strategy used to fill gaps in the data. Should be equal in length to -/// `input.aggr_exprs`, where fill_strategy\[n\] is the strategy for aggr_exprs\[n\] +/// * `input` - A plan which requires gap-filling, it is required that +/// the input plan includes an Aggregate node. +/// * `fill_strategy` - The strategy used to fill gaps in the data. +/// Should be equal in length to `input.aggr_exprs`, where +/// fill_strategy\[n\] is the strategy for aggr_exprs\[n\]. +/// * `projection_type` - The type of projection being performed. 
fn build_gap_fill_node( input: LogicalPlan, - time_column: &Expr, fill_strategy: Vec<FillStrategy>, projection_type: &ProjectionType, ) -> Result<LogicalPlan> { - let (expr, alias) = match time_column { - Expr::Alias(Alias { - expr, - relation: None, - name: alias, - metadata: _, - }) => (expr.as_ref(), alias), - _ => return error::internal("expected time column to have an alias function"), + let mut aggr = None; + input.apply(|expr| { + if let LogicalPlan::Aggregate(a) = expr { + aggr = Some(a.clone()); + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + })?; + let Some(aggr) = aggr else { + return error::internal("GapFill requires an Aggregate ancestor"); }; - let (date_bin_udf, date_bin_args) = match expr { - Expr::ScalarFunction(ScalarFunction { func: udf, args }) - if udf.inner().as_any().is::() - || udf.inner().as_any().is::() => - { - (Arc::<str>::from(udf.name()), args) - } + let group_expr = aggr.group_expr; + + // Extract the DATE_BIN expression from the aggregate's group + // expressions. + let (time_column_idx, time_column_alias, date_bin_udf, date_bin_args) = match group_expr + .iter() + .enumerate() + .filter_map(|(idx, expr)| match expr { + Expr::Alias(alias) => { + if let Expr::ScalarFunction(fun) = alias.expr.as_ref() { + if fun.func.inner().as_any().is::() + || fun.func.inner().as_any().is::() + { + Some(( + idx, + alias.name.clone(), + Arc::clone(&fun.func), + fun.args.clone(), + )) + } else { + None + } + } else { + None + } + } + _ => None, + }) + .collect::<Vec<_>>() + .as_slice() + { + [(idx, alias, udf, args)] => (*idx, alias.to_owned(), Arc::clone(udf), args.to_owned()), + _ => { - // The InfluxQL planner adds the `date_bin` function, - // so this condition represents an internal failure. - return error::internal("expected exactly one DATE_BIN in Aggregate group expressions"); } }; @@ -3995,26 +4021,43 @@ fn build_gap_fill_node( let aggr_expr = new_group_expr.split_off(aggr.group_expr.len()); // The fill strategy for InfluxQL is specified at the query level - let fill_strategy = aggr_expr.iter().cloned().zip(fill_strategy).collect(); + let fill_expr = aggr_expr + .iter() + .cloned() + .zip(fill_strategy) + .map(|(e, s)| FillExpr { + expr: e, + strategy: s, + }) + .collect(); - let time_column = col(input - .schema() - .qualified_field_with_unqualified_name(alias) - .map(Column::from)?); + let series_expr = group_expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + if i != time_column_idx { + Some(e.clone()) + } else { + None + } + }) + .collect(); + + let time_expr = Expr::ScalarFunction(ScalarFunction { + func: date_bin_udf, + args: vec![stride.clone(), col(time_column_alias)] + .into_iter() + .chain(origin.clone()) + .collect(), + }); Ok(LogicalPlan::Extension(Extension { node: Arc::new(GapFill::try_new( Arc::new(input), - new_group_expr, - aggr_expr, - GapFillParams { - date_bin_udf, - stride: stride.clone(), - time_column, - origin, - time_range, - fill_strategy, - }, + series_expr, + time_expr, + fill_expr, + time_range, )?), })) } @@ -5296,7 +5339,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, difference(avg(cpu.usage_idle)) AS difference [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N] Filter: difference(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, difference(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[difference(avg(cpu.usage_idle)) ORDER BY [time ASC
NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS difference(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, difference(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5322,12 +5365,25 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_difference(avg(cpu.usage_idle)) AS non_negative_difference [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_difference:Float64;N] Filter: non_negative_difference(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, non_negative_difference(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[non_negative_difference(avg(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_difference(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, non_negative_difference(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, 
None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "#); + + // aggregate SUM regex + assert_snapshot!(plan("SELECT NON_NEGATIVE_DIFFERENCE(SUM(/usage_.*/)) FROM cpu GROUP BY time(10s)"), @r#" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_difference_usage_idle:Float64;N, non_negative_difference_usage_system:Float64;N, non_negative_difference_usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_difference(sum(cpu.usage_idle)) AS non_negative_difference_usage_idle, non_negative_difference(sum(cpu.usage_system)) AS non_negative_difference_usage_system, non_negative_difference(sum(cpu.usage_user)) AS non_negative_difference_usage_user [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_difference_usage_idle:Float64;N, non_negative_difference_usage_system:Float64;N, non_negative_difference_usage_user:Float64;N] + Filter: non_negative_difference(sum(cpu.usage_idle)) IS NOT NULL OR non_negative_difference(sum(cpu.usage_system)) IS NOT NULL OR non_negative_difference(sum(cpu.usage_user)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, sum(cpu.usage_idle):Float64;N, sum(cpu.usage_system):Float64;N, sum(cpu.usage_user):Float64;N, non_negative_difference(sum(cpu.usage_idle)):Float64;N, non_negative_difference(sum(cpu.usage_system)):Float64;N, non_negative_difference(sum(cpu.usage_user)):Float64;N] + WindowAggr: windowExpr=[[non_negative_difference(sum(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_difference(sum(cpu.usage_idle)), non_negative_difference(sum(cpu.usage_system)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_difference(sum(cpu.usage_system)), non_negative_difference(sum(cpu.usage_user)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_difference(sum(cpu.usage_user))]] [time:Timestamp(Nanosecond, None);N, sum(cpu.usage_idle):Float64;N, sum(cpu.usage_system):Float64;N, sum(cpu.usage_user):Float64;N, non_negative_difference(sum(cpu.usage_idle)):Float64;N, non_negative_difference(sum(cpu.usage_system)):Float64;N, non_negative_difference(sum(cpu.usage_user)):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[sum(cpu.usage_idle), sum(cpu.usage_system), sum(cpu.usage_user)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, sum(cpu.usage_idle):Float64;N, sum(cpu.usage_system):Float64;N, sum(cpu.usage_user):Float64;N] + Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 
10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[sum(cpu.usage_idle), sum(cpu.usage_system), sum(cpu.usage_user)]] [time:Timestamp(Nanosecond, None);N, sum(cpu.usage_idle):Float64;N, sum(cpu.usage_system):Float64;N, sum(cpu.usage_user):Float64;N] + Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: cpu.usage_idle IS NOT NULL OR cpu.usage_system IS NOT NULL OR cpu.usage_user IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + "#); } #[test] @@ -5348,7 +5404,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, moving_average(avg(cpu.usage_idle),Int64(3)) AS moving_average [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, moving_average:Float64;N] Filter: moving_average(avg(cpu.usage_idle),Int64(3)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, moving_average(avg(cpu.usage_idle),Int64(3)):Float64;N] WindowAggr: windowExpr=[[moving_average(avg(cpu.usage_idle), Int64(3)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS moving_average(avg(cpu.usage_idle),Int64(3))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, moving_average(avg(cpu.usage_idle),Int64(3)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5383,7 +5439,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, derivative(avg(cpu.usage_idle)) AS derivative [iox::measurement:Dictionary(Int32, Utf8), 
time:Timestamp(Nanosecond, None);N, derivative:Float64;N] Filter: derivative(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, derivative(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[derivative(avg(cpu.usage_idle), IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS derivative(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, derivative(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5409,7 +5465,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_derivative(avg(cpu.usage_idle)) AS non_negative_derivative [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_derivative:Float64;N] Filter: non_negative_derivative(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, non_negative_derivative(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[non_negative_derivative(avg(cpu.usage_idle), IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_derivative(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, non_negative_derivative(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], 
range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5422,7 +5478,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]) AS non_negative_derivative [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_derivative:Float64;N] Filter: non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]) IS NOT NULL [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N, non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]):Float64;N] WindowAggr: windowExpr=[[non_negative_derivative(get_field(selector_last(cpu.usage_idle,cpu.time), Utf8("value")), IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value])]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N, non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]):Float64;N] - GapFill: groupBy=[time], aggr=[[selector_last(cpu.usage_idle,cpu.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_last(cpu.usage_idle,cpu.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, 
selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, cpu.time)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5448,7 +5504,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, cumulative_sum(avg(cpu.usage_idle)) AS cumulative_sum [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cumulative_sum:Float64;N] Filter: cumulative_sum(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, cumulative_sum(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[cumumlative_sum(avg(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS cumulative_sum(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, cumulative_sum(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, 
usage_system:Float64;N, usage_user:Float64;N] @@ -5462,7 +5518,7 @@ mod tests { Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N, mean:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, difference(avg(cpu.usage_idle)) AS difference, avg(cpu.usage_idle) AS mean [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N, mean:Float64;N] WindowAggr: windowExpr=[[difference(avg(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS difference(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, difference(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5598,7 +5654,7 @@ mod tests { assert_snapshot!(plan("SELECT LAST(usage_idle) FROM cpu GROUP BY TIME(5s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, get_field(selector_last(cpu.usage_idle,cpu.time), Utf8("value")) AS last [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_last(cpu.usage_idle,cpu.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), time, TimestampNanosecond(0, None)), 
fill=[selector_last(cpu.usage_idle,cpu.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, cpu.time)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5611,7 +5667,7 @@ mod tests { assert_snapshot!(plan("SELECT FIRST(usage_idle) FROM cpu GROUP BY TIME(5s) FILL(0)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, get_field(coalesce_struct(selector_first(cpu.usage_idle,cpu.time), Struct({value:0.0,time:1970-01-01T00:00:00})), Utf8("value")) AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] - GapFill: groupBy=[time], aggr=[[COALESCE({value:0.0,time:1970-01-01T00:00:00}, selector_first(cpu.usage_idle,cpu.time))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(selector_first(cpu.usage_idle,cpu.time), {value:0.0,time:1970-01-01T00:00:00})], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 
5000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_first(cpu.usage_idle, cpu.time)]] [time:Timestamp(Nanosecond, None);N, selector_first(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5658,7 +5714,7 @@ mod tests { assert_snapshot!(plan("SELECT LAST(usage_idle), usage_system FROM cpu GROUP BY TIME(5s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N, usage_system:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, get_field(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), Utf8("value")) AS last, get_field(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), Utf8("other_1")) AS usage_system [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N, usage_system:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_last(cpu.usage_idle,cpu.time,cpu.usage_system)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_last(cpu.usage_idle,cpu.time,cpu.usage_system)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, cpu.time, cpu.usage_system)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL OR cpu.usage_system IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5670,7 +5726,7 @@ mod tests { assert_snapshot!(plan("SELECT LAST(usage_idle), usage_system FROM cpu GROUP BY TIME(5s) FILL(0)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N, usage_system:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, get_field(coalesce_struct(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), Struct({value:0.0,time:1970-01-01T00:00:00,other_1:0.0})), Utf8("value")) AS last, get_field(coalesce_struct(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), Struct({value:0.0,time:1970-01-01T00:00:00,other_1:0.0})), Utf8("other_1")) AS usage_system [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N, usage_system:Float64;N] - GapFill: groupBy=[time], aggr=[[COALESCE({value:0.0,time:1970-01-01T00:00:00,other_1:0.0}, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), {value:0.0,time:1970-01-01T00:00:00,other_1:0.0})], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, 
cpu.time, cpu.usage_system)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL OR cpu.usage_system IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5787,7 +5843,7 @@ mod tests { assert_snapshot!(plan("SELECT percentile(usage_idle,50), percentile(usage_idle,90) FROM cpu WHERE time >= 0 AND time < 60000000000 GROUP BY time(10s), cpu"), @r#" Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, percentile:Float64;N, percentile_1:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, cpu.cpu AS cpu, percentile(cpu.usage_idle,Int64(50)) AS percentile, percentile(cpu.usage_idle,Int64(90)) AS percentile_1 [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, percentile:Float64;N, percentile_1:Float64;N] - GapFill: groupBy=[time, cpu.cpu], aggr=[[percentile(cpu.usage_idle,Int64(50)), percentile(cpu.usage_idle,Int64(90))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Included(Literal(TimestampNanosecond(0, None), None))..Included(Literal(TimestampNanosecond(59999999999, None), None)) [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, percentile(cpu.usage_idle,Int64(50)):Float64;N, percentile(cpu.usage_idle,Int64(90)):Float64;N] + GapFill: series=[cpu.cpu], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[percentile(cpu.usage_idle,Int64(50)), percentile(cpu.usage_idle,Int64(90))], range=Included(Literal(TimestampNanosecond(0, None), None))..Included(Literal(TimestampNanosecond(59999999999, None), None)) [cpu:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None);N, percentile(cpu.usage_idle,Int64(50)):Float64;N, percentile(cpu.usage_idle,Int64(90)):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time, cpu.cpu]], aggr=[[percentile(cpu.usage_idle, Int64(50)), percentile(cpu.usage_idle, Int64(90))]] [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, percentile(cpu.usage_idle,Int64(50)):Float64;N, percentile(cpu.usage_idle,Int64(90)):Float64;N] Filter: cpu.time >= TimestampNanosecond(0, None) AND cpu.time <= TimestampNanosecond(59999999999, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), 
usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -6681,7 +6737,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6695,7 +6751,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data WHERE time < '2022-10-31T02:02:00Z' GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1667181719999999999, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1667181719999999999, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: 
groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1667181719999999999, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6709,7 +6765,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data WHERE time >= '2022-10-31T02:00:00Z' GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Included(Literal(TimestampNanosecond(1667181600000000000, None), None))..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Included(Literal(TimestampNanosecond(1667181600000000000, None), None))..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time >= TimestampNanosecond(1667181600000000000, None) AND data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6723,7 +6779,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), 
time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Included(Literal(TimestampNanosecond(1667181600000000000, None), None))..Included(Literal(TimestampNanosecond(1667181719999999999, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Included(Literal(TimestampNanosecond(1667181600000000000, None), None))..Included(Literal(TimestampNanosecond(1667181719999999999, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time >= TimestampNanosecond(1667181600000000000, None) AND data.time <= TimestampNanosecond(1667181719999999999, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6736,7 +6792,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with 
space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6749,7 +6805,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(null)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6762,7 +6818,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(previous)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[LOCF(count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[LOCF(count(data.f64_field))], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { 
months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6775,7 +6831,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(0)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(count(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6788,7 +6844,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(linear)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[INTERPOLATE(count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) 
[time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[INTERPOLATE(count(data.f64_field))], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6804,7 +6860,7 @@ mod tests { assert_snapshot!(plan("SELECT first(f64_field) FROM data GROUP BY TIME(10s) FILL(null)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, get_field(selector_first(data.f64_field,data.time), Utf8("value")) AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_first(data.f64_field,data.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_first(data.f64_field,data.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_first(data.f64_field, data.time)]] [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, 
None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6814,7 +6870,7 @@ mod tests { assert_snapshot!(plan("SELECT first(f64_field) * 1 FROM data GROUP BY TIME(10s) FILL(null)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, get_field(selector_first(data.f64_field,data.time), Utf8("value")) * Int64(1) AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_first(data.f64_field,data.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_first(data.f64_field,data.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_first(data.f64_field, data.time)]] [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with 
space:Float64;N] @@ -6824,7 +6880,7 @@ mod tests { assert_snapshot!(plan("SELECT first(f64_field) / 1 FROM data GROUP BY TIME(10s) FILL(null)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, CASE WHEN Int64(1) = Float64(0) AND get_field(selector_first(data.f64_field,data.time), Utf8("value")) IS NOT NULL THEN Float64(0) ELSE get_field(selector_first(data.f64_field,data.time), Utf8("value")) / Int64(1) END AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_first(data.f64_field,data.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_first(data.f64_field,data.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_first(data.f64_field, data.time)]] [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6838,7 +6894,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(3.2)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(count(data.f64_field), Int64(3)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N] - 
GapFill: groupBy=[time], aggr=[[COALESCE(3, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 3)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6852,7 +6908,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) + MEAN(f64_field) FROM data GROUP BY TIME(10s) FILL(3.2)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count_mean:Float64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(count(data.f64_field), Int64(3)) + coalesce_struct(avg(data.f64_field), Float64(3.2)) AS count_mean [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count_mean:Float64;N] - GapFill: groupBy=[time], aggr=[[COALESCE(3, count(data.f64_field)), COALESCE(3.2, avg(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64, avg(data.f64_field):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 3), COALESCE(avg(data.f64_field), 3.2)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64, avg(data.f64_field):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field), avg(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64, avg(data.f64_field):Float64;N] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, 
mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] diff --git a/iox_query_influxql/src/plan/rewriter.rs b/iox_query_influxql/src/plan/rewriter.rs index f34d3d08..db57ef4b 100644 --- a/iox_query_influxql/src/plan/rewriter.rs +++ b/iox_query_influxql/src/plan/rewriter.rs @@ -730,18 +730,16 @@ fn fields_expand_wildcards( } Expr::Call(Call { name, args }) => { - let mut name = name; - let mut args = args; + let mut stack = vec![(name, args)]; // Search for the call with a wildcard by continuously descending until // we no longer have a call. while let Some(Expr::Call(Call { name: inner_name, args: inner_args, - })) = args.first() + })) = stack.last().unwrap().1.first() { - name = inner_name; - args = inner_args; + stack.push((inner_name, inner_args)); } // a list of supported types that may be selected from the var_refs @@ -753,7 +751,7 @@ fn fields_expand_wildcards( ]); // Modify the supported types for certain functions. - match name.as_str() { + match stack.last().unwrap().0.as_str() { "count" | "first" | "last" | "distinct" | "elapsed" | "mode" | "sample" => { supported_types .extend([Some(VarRefDataType::String), Some(VarRefDataType::Boolean)]); @@ -767,21 +765,38 @@ fn fields_expand_wildcards( _ => {} } + // Reverse the stack so that new fields can be added by + // applying the required function calls starting with + // the inner-most. + stack.reverse(); let add_field = |v: &VarRef| { - let mut args = args.clone(); - args[0] = Expr::VarRef(v.clone()); + let var_ref_name = v.name.clone(); + let mut e = Expr::VarRef(v.clone()); + for (name, args) in &stack { + e = Expr::Call(Call { + name: (*name).clone(), + // The first argument is always e as it is + // either the new field reference or the + // next inner function call. Any remaining + // arguments are appended. 
+ args: vec![e] + .into_iter() + .chain(args.iter().skip(1).cloned()) + .collect(), + }) + } new_fields.push(influxdb_influxql_parser::select::Field { - expr: Expr::Call(Call { - name: name.clone(), - args, - }), - alias: Some(format!("{}_{}", field_name(&f), v.name).into()), + expr: e, + alias: Some(format!("{}_{}", field_name(&f), var_ref_name).into()), }) }; - match args.first() { + match stack.first().unwrap().1.first() { Some(Expr::Wildcard(Some(WildcardType::Tag))) => { - return error::query(format!("unable to use tag as wildcard in {name}()")); + return error::query(format!( + "unable to use tag as wildcard in {}()", + stack.first().unwrap().0 + )); } Some(Expr::Wildcard(_)) => { var_refs @@ -1670,16 +1685,19 @@ fn select_statement_info( #[cfg(test)] mod test { - use super::Result; + use super::{Result, VarRef}; use crate::plan::ir::{Field, Select}; use crate::plan::rewriter::{ - ProjectionType, SelectStatementInfo, find_table_names, has_wildcards, rewrite_select, - rewrite_statement, + ProjectionType, SelectStatementInfo, fields_expand_wildcards, find_table_names, + has_wildcards, rewrite_select, rewrite_statement, }; use crate::plan::test_utils::{MockSchemaProvider, parse_select}; use assert_matches::assert_matches; use datafusion::error::DataFusionError; + use influxdb_influxql_parser::expression::VarRefDataType; + use influxdb_influxql_parser::identifier::Identifier; use influxdb_influxql_parser::select::SelectStatement; + use influxdb_influxql_parser::string::Regex; use test_helpers::{assert_contains, assert_error}; #[test] @@ -2812,4 +2830,232 @@ mod test { assert!(!res.0); assert!(!res.1); } + + #[test] + fn test_nested_function_wildcard_expansion() { + // Test that wildcards in nested functions are properly expanded + // This tests the fix for expanding regular expressions in nested functions + + let _namespace = MockSchemaProvider::default(); + + // Create var_refs for cpu table (based on database::schemas()) + // Tags: host, region, cpu + // Fields: usage_user, usage_system, usage_idle (all Float) + let var_refs = vec![ + VarRef { + name: Identifier::new("host".to_string()), + data_type: Some(VarRefDataType::Tag), + }, + VarRef { + name: Identifier::new("region".to_string()), + data_type: Some(VarRefDataType::Tag), + }, + VarRef { + name: Identifier::new("cpu".to_string()), + data_type: Some(VarRefDataType::Tag), + }, + VarRef { + name: Identifier::new("usage_user".to_string()), + data_type: Some(VarRefDataType::Float), + }, + VarRef { + name: Identifier::new("usage_system".to_string()), + data_type: Some(VarRefDataType::Float), + }, + VarRef { + name: Identifier::new("usage_idle".to_string()), + data_type: Some(VarRefDataType::Float), + }, + ]; + + // Test difference(sum(*)) - a nested function combination with wildcard + let fields = vec![influxdb_influxql_parser::select::Field { + expr: influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "difference".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "sum".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Wildcard(None)], + }, + )], + }, + ), + alias: None, + }]; + + // Expand wildcards + let expanded_fields = fields_expand_wildcards(fields, var_refs.clone()).unwrap(); + + // Check that wildcards were expanded to actual fields (only numeric fields) + assert!( + expanded_fields.len() == 3, + "Expected 3 numeric fields after expansion" + ); + + // Verify each field has 
proper nested structure difference(sum(field)) + for field in &expanded_fields { + // Should have an alias like difference_ (outermost function name + field) + assert!(field.alias.is_some(), "Field should have an alias"); + let alias = field.alias.as_ref().unwrap(); + assert!( + alias.starts_with("difference_"), + "Alias should start with difference_" + ); + + // Verify it's a difference(sum(field)) structure + match &field.expr { + influxdb_influxql_parser::expression::Expr::Call(outer_call) => { + assert_eq!(outer_call.name, "difference"); + assert_eq!(outer_call.args.len(), 1); + + match &outer_call.args[0] { + influxdb_influxql_parser::expression::Expr::Call(inner_call) => { + assert_eq!(inner_call.name, "sum"); + assert_eq!(inner_call.args.len(), 1); + + // Should be a VarRef to an actual field + assert_matches!( + &inner_call.args[0], + influxdb_influxql_parser::expression::Expr::VarRef(_) + ); + } + _ => panic!("Expected inner call to be sum()"), + } + } + _ => panic!("Expected outer call to be difference()"), + } + } + + // Test with regex pattern in nested function - difference(sum(/usage.*/)) + let fields_regex = vec![influxdb_influxql_parser::select::Field { + expr: influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "difference".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "sum".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Literal( + influxdb_influxql_parser::literal::Literal::Regex(Regex::new( + "usage.*".to_string(), + )), + )], + }, + )], + }, + ), + alias: None, + }]; + + // Expand regex pattern + let expanded_regex = fields_expand_wildcards(fields_regex, var_refs.clone()).unwrap(); + + // Should expand to fields matching the pattern + assert_eq!( + expanded_regex.len(), + 3, + "Expected exactly 3 fields matching 'usage.*'" + ); + + for field in &expanded_regex { + assert!(field.alias.is_some()); + let alias = field.alias.as_ref().unwrap(); + assert!( + alias.starts_with("difference_usage"), + "Expanded field should start with 'difference_usage'" + ); + + // Verify it's a difference(sum(field)) structure + match &field.expr { + influxdb_influxql_parser::expression::Expr::Call(outer_call) => { + assert_eq!(outer_call.name, "difference"); + match &outer_call.args[0] { + influxdb_influxql_parser::expression::Expr::Call(inner_call) => { + assert_eq!(inner_call.name, "sum"); + } + _ => panic!("Expected inner call to be sum()"), + } + } + _ => panic!("Expected outer call to be difference()"), + } + } + + // Test that the stack-based traversal correctly handles deeply nested functions + // This is the core of the fix - ensuring we properly rebuild the nested structure + let deep_nested = vec![influxdb_influxql_parser::select::Field { + expr: influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "non_negative_difference".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "mean".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "sum".to_string(), + args: vec![ + influxdb_influxql_parser::expression::Expr::Wildcard(None), + ], + }, + )], + }, + )], + }, + ), + alias: None, + }]; + + // This should expand the wildcard while preserving the full nested structure + let expanded_deep = 
fields_expand_wildcards(deep_nested, var_refs).unwrap(); + assert_eq!( + expanded_deep.len(), + 3, + "Deep nested functions should expand to 3 numeric fields" + ); + + // Verify the structure is preserved: non_negative_difference(mean(sum(field))) + for field in &expanded_deep { + assert!(field.alias.is_some()); + let alias = field.alias.as_ref().unwrap(); + assert!( + alias.starts_with("non_negative_difference_usage"), + "Deep nested alias should start with non_negative_difference_usage" + ); + + match &field.expr { + influxdb_influxql_parser::expression::Expr::Call(outer) => { + assert_eq!(outer.name, "non_negative_difference"); + assert_eq!( + outer.args.len(), + 1, + "non_negative_difference should have 1 arg" + ); + + // First arg should be mean(sum(field)) + match &outer.args[0] { + influxdb_influxql_parser::expression::Expr::Call(middle) => { + assert_eq!(middle.name, "mean"); + assert_eq!(middle.args.len(), 1); + + // Inner should be sum(field) + match &middle.args[0] { + influxdb_influxql_parser::expression::Expr::Call(inner) => { + assert_eq!(inner.name, "sum"); + assert_eq!(inner.args.len(), 1); + + // Should be a VarRef + assert_matches!( + &inner.args[0], + influxdb_influxql_parser::expression::Expr::VarRef(_) + ); + } + _ => panic!("Expected innermost call to be sum()"), + } + } + _ => panic!("Expected middle call to be mean()"), + } + } + _ => panic!("Expected outer call to be non_negative_difference()"), + } + } + } } diff --git a/iox_query_influxql/src/show_databases.rs b/iox_query_influxql/src/show_databases.rs new file mode 100644 index 00000000..66531f39 --- /dev/null +++ b/iox_query_influxql/src/show_databases.rs @@ -0,0 +1,99 @@ +use std::collections::HashMap; +use std::fmt::Debug; + +use arrow::datatypes::SchemaRef; +use datafusion::common::Result; +use datafusion::execution::SendableRecordBatchStream; +use generated_types::influxdata::iox::querier::v1::InfluxQlMetadata; +use schema::INFLUXQL_METADATA_KEY; + +/// Trait for handling the `SHOW DATABASES` query +/// +/// This allows for optional `SHOW DATABASES` handling for systems such as +/// InfluxDB3 Enterprise, without requiring it to be implemented on all systems +#[async_trait::async_trait] +pub trait InfluxQlShowDatabases: Debug + Send + Sync + 'static { + /// Produce the Arrow schema for the `SHOW DATABASES` InfluxQL query + fn schema(&self) -> SchemaRef; + /// Produce a record batch stream containing the results for the `SHOW DATABASES` query + /// + /// Accepts `database_names` which represents the list of databases the requestor is + /// authorized to read. The underlying implementation should only produce the databases listed + /// in the resulting record batch stream. + async fn show_databases( + &self, + database_names: Vec, + ) -> Result; +} + +/// Generate the default InfluxQL metadata map for producing a `Schema` for the `SHOW DATABASES` +/// query. 
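// [Editorial sketch, not part of the diff] A minimal illustration of the
// `InfluxQlShowDatabases` contract described above, using the `MockShowDatabases`
// defined later in this file: `database_names` is the list the requestor is
// authorized to read, and only those databases appear in the returned stream.
// Assumes a tokio test runtime and the `futures` crate are available as dev
// dependencies; the test name and database names here are hypothetical.
#[tokio::test]
async fn show_databases_filters_to_authorized_names() {
    use futures::TryStreamExt;

    let mock = mock::MockShowDatabases::new(["foo", "bar", "baz"]);
    // The requestor is authorized to read only "foo" and "baz".
    let stream = mock
        .show_databases(vec!["foo".to_string(), "baz".to_string()])
        .await
        .unwrap();
    let batches: Vec<_> = stream.try_collect().await.unwrap();
    // One record batch with two rows ("foo" and "baz") is expected; "bar" is
    // dropped because it was not in the authorized list.
    assert_eq!(batches.len(), 1);
    assert_eq!(batches[0].num_rows(), 2);
}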
+pub fn generate_metadata(measurement_column_index: u32) -> HashMap { + let md = serde_json::to_string(&InfluxQlMetadata { + measurement_column_index, + tag_key_columns: vec![], + }) + .expect("metadata should serialize as JSON"); + [(INFLUXQL_METADATA_KEY.to_string(), md)] + .into_iter() + .collect() +} + +pub mod mock { + use std::sync::Arc; + + use arrow::{ + array::{Array, RecordBatch, StringArray}, + datatypes::{DataType, Field, Schema}, + }; + use datafusion_util::MemoryStream; + use schema::INFLUXQL_MEASUREMENT_COLUMN_NAME; + + use super::*; + + #[derive(Debug)] + pub struct MockShowDatabases { + database_names: Vec, + } + + impl MockShowDatabases { + pub fn new(database_names: impl IntoIterator>) -> Self { + Self { + database_names: database_names.into_iter().map(Into::into).collect(), + } + } + } + + #[async_trait::async_trait] + impl InfluxQlShowDatabases for MockShowDatabases { + fn schema(&self) -> SchemaRef { + Arc::new( + Schema::new(vec![ + Field::new(INFLUXQL_MEASUREMENT_COLUMN_NAME, DataType::Utf8, false), + Field::new("name", arrow::datatypes::DataType::Utf8, false), + ]) + .with_metadata(generate_metadata(0)), + ) + } + + async fn show_databases( + &self, + database_names: Vec, + ) -> Result { + let names = self + .database_names + .iter() + .filter(|n| database_names.contains(*n)) + .map(String::as_str) + .collect::>(); + let measurement_array: StringArray = vec!["databases"; names.len()].into(); + let names_array: StringArray = names.into(); + let arrays = vec![ + Arc::new(measurement_array) as Arc, + Arc::new(names_array) as Arc, + ]; + let batch = RecordBatch::try_new(self.schema(), arrays)?; + Ok(Box::pin(MemoryStream::new(vec![batch]))) + } + } +} diff --git a/iox_query_influxql/src/show_retention_policies.rs b/iox_query_influxql/src/show_retention_policies.rs new file mode 100644 index 00000000..2097dc8a --- /dev/null +++ b/iox_query_influxql/src/show_retention_policies.rs @@ -0,0 +1,227 @@ +use std::collections::HashMap; +use std::fmt::Debug; + +use arrow::datatypes::SchemaRef; +use datafusion::common::Result; +use datafusion::execution::SendableRecordBatchStream; +use generated_types::influxdata::iox::querier::v1::InfluxQlMetadata; +use schema::INFLUXQL_METADATA_KEY; + +/// Trait for handling the `SHOW RETENTION POLICIES` query +/// +/// This allows for optional `SHOW RETENTION POLICIES` handling for systems such as +/// InfluxDB3 Enterprise, without requiring it to be implemented on all systems +#[async_trait::async_trait] +pub trait InfluxQlShowRetentionPolicies: Debug + Send + Sync + 'static { + /// Produce the Arrow schema for the `SHOW RETENTION POLICIES` InfluxQL query + fn schema(&self) -> SchemaRef; + /// Produce a record batch stream containing the results for the `SHOW RETENTION POLICIES` query + async fn show_retention_policies(&self, db_name: String) -> Result; +} + +/// Generate the default InfluxQL metadata map for producing a `Schema` for the +/// `SHOW RETENTION POLICIES` query. 
+pub fn generate_metadata(measurement_column_index: u32) -> HashMap { + let md = serde_json::to_string(&InfluxQlMetadata { + measurement_column_index, + tag_key_columns: vec![], + }) + .expect("metadata should serialize as JSON"); + [(INFLUXQL_METADATA_KEY.to_string(), md)] + .into_iter() + .collect() +} + +pub mod mock { + use std::{collections::BTreeMap, sync::Arc, time::Duration}; + + use arrow::{ + array::{Array, BooleanArray, RecordBatch, StringArray, UInt64Array}, + datatypes::{DataType, Field, Schema}, + }; + use datafusion_util::MemoryStream; + use schema::INFLUXQL_MEASUREMENT_COLUMN_NAME; + + use super::*; + + #[derive(Debug)] + pub struct MockRetentionPolicy { + name: String, + duration: Duration, + shard_group_duration: Duration, + replica_n: u64, + future_write_limit: Duration, + past_write_limit: Duration, + default: bool, + } + + impl Default for MockRetentionPolicy { + fn default() -> Self { + Self { + name: "autogen".to_string(), + duration: Duration::ZERO, + shard_group_duration: Duration::from_secs(7 * 60 * 60 * 24), // default is 7 days + replica_n: 1, + future_write_limit: Duration::ZERO, + past_write_limit: Duration::ZERO, + default: true, + } + } + } + + impl MockRetentionPolicy { + /// Create a named policy that is not the default policy + pub fn new(name: impl Into) -> Self { + Self::default().with_name(name).with_default(false) + } + + fn with_name(mut self, name: impl Into) -> Self { + self.name = name.into(); + self + } + + fn with_default(mut self, default: bool) -> Self { + self.default = default; + self + } + + pub fn with_duration(mut self, duration: Duration) -> Self { + self.duration = duration; + self + } + } + + #[derive(Debug, Default)] + pub struct MockShowRetentionPolicies { + retention_policies: BTreeMap>, + } + + impl MockShowRetentionPolicies { + pub fn new() -> Self { + Self::default() + } + + pub fn with_default_retention_policy(mut self, db_name: impl Into) -> Self { + self.retention_policies + .entry(db_name.into()) + .or_insert_with(|| vec![MockRetentionPolicy::default()]); + self + } + + pub fn with_retention_policy( + mut self, + db_name: impl Into, + policy: MockRetentionPolicy, + ) -> Self { + self.retention_policies + .entry(db_name.into()) + .or_default() + .push(policy); + self + } + } + + /// The implementation of this follows that of InfluxDB v1.12.2's /query API with respect to + /// the field names provided in the `SHOW RETENTION POLICIES` response schema. As such, this + /// would be a good reference point for implementing this interface in production. + /// + /// One distinction with the v1.12.2 response is that durations are reported there using a more + /// human-friendly format. For example, 1 hour and 30 minutes would be displayed as "1h30m", + /// whereas in this implementation, which uses `std::time::Duration`'s pretty formatting, the + /// same would be displayed as "5400s". 
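// [Editorial sketch, not part of the diff] Building the mock described above:
// one default "autogen" policy plus a named, non-default policy for the same
// database. The database name "weather", the policy name "short_lived", and the
// helper function name are hypothetical. Per the doc comment, durations are
// rendered with `Duration`'s Debug formatting, so the 90-minute duration below
// is reported as "5400s" rather than the "1h30m" style used by InfluxDB v1.12.2.
fn example_mock_retention_policies() -> MockShowRetentionPolicies {
    use std::time::Duration;

    MockShowRetentionPolicies::new()
        .with_default_retention_policy("weather")
        .with_retention_policy(
            "weather",
            MockRetentionPolicy::new("short_lived").with_duration(Duration::from_secs(90 * 60)),
        )
}
// Calling `show_retention_policies("weather".to_string())` on the result streams
// a two-row batch (one row per policy); an unknown database name instead yields
// the "database not found" plan error, as implemented below.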
+ #[async_trait::async_trait] + impl InfluxQlShowRetentionPolicies for MockShowRetentionPolicies { + fn schema(&self) -> SchemaRef { + Arc::new( + Schema::new(vec![ + Field::new(INFLUXQL_MEASUREMENT_COLUMN_NAME, DataType::Utf8, false), + Field::new("name", arrow::datatypes::DataType::Utf8, false), + Field::new("duration", arrow::datatypes::DataType::Utf8, false), + Field::new( + "shardGroupDuration", + arrow::datatypes::DataType::Utf8, + false, + ), + Field::new("replicaN", arrow::datatypes::DataType::UInt64, false), + Field::new("futureWriteLimit", arrow::datatypes::DataType::Utf8, false), + Field::new("pastWriteLimit", arrow::datatypes::DataType::Utf8, false), + Field::new("default", arrow::datatypes::DataType::Boolean, false), + ]) + .with_metadata(generate_metadata(0)), + ) + } + + async fn show_retention_policies( + &self, + db_name: String, + ) -> Result { + let Some(db) = self.retention_policies.get(&db_name) else { + return Err(datafusion::error::DataFusionError::Plan(format!( + "database not found: {db_name}" + ))); + }; + let measurement_array: StringArray = vec!["retention_policies"; db.len()].into(); + let names_array: StringArray = db + .iter() + .map(|p| p.name.as_str()) + .collect::>() + .into(); + let durations_array: StringArray = db + .iter() + .map(|MockRetentionPolicy { duration, .. }| format!("{duration:#?}")) + .collect::>() + .into(); + let shard_group_durations_array: StringArray = db + .iter() + .map( + |MockRetentionPolicy { + shard_group_duration, + .. + }| format!("{shard_group_duration:#?}"), + ) + .collect::>() + .into(); + let replica_n_array: UInt64Array = db + .iter() + .map(|MockRetentionPolicy { replica_n, .. }| *replica_n) + .collect::>() + .into(); + let future_write_limit_array: StringArray = db + .iter() + .map( + |MockRetentionPolicy { + future_write_limit, .. + }| format!("{future_write_limit:#?}"), + ) + .collect::>() + .into(); + let past_write_limit_array: StringArray = db + .iter() + .map( + |MockRetentionPolicy { + past_write_limit, .. + }| format!("{past_write_limit:#?}"), + ) + .collect::>() + .into(); + let default_array: BooleanArray = db + .iter() + .map(|MockRetentionPolicy { default, .. 
}| *default) + .collect::>() + .into(); + + let arrays = vec![ + Arc::new(measurement_array) as Arc, + Arc::new(names_array) as Arc, + Arc::new(durations_array) as Arc, + Arc::new(shard_group_durations_array) as Arc, + Arc::new(replica_n_array) as Arc, + Arc::new(future_write_limit_array) as Arc, + Arc::new(past_write_limit_array) as Arc, + Arc::new(default_array) as Arc, + ]; + let batch = RecordBatch::try_new(self.schema(), arrays)?; + Ok(Box::pin(MemoryStream::new(vec![batch]))) + } + } +} diff --git a/iox_query_influxql_rewrite/Cargo.toml b/iox_query_influxql_rewrite/Cargo.toml new file mode 100644 index 00000000..3e39fddf --- /dev/null +++ b/iox_query_influxql_rewrite/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "iox_query_influxql_rewrite" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# workspace dependencies: +influxdb_influxql_parser = { path = "../influxdb_influxql_parser" } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +# crates.io dependencies: +thiserror = "2.0" + +[lints] +workspace = true diff --git a/iox_query_influxql_rewrite/src/lib.rs b/iox_query_influxql_rewrite/src/lib.rs new file mode 100644 index 00000000..731468c4 --- /dev/null +++ b/iox_query_influxql_rewrite/src/lib.rs @@ -0,0 +1,572 @@ +use workspace_hack as _; + +use std::collections::HashSet; + +use influxdb_influxql_parser::{ + common::ParseError, + explain::ExplainStatement, + identifier::Identifier, + parse_statements as parse_internal, + select::{MeasurementSelection, SelectStatement}, + show_measurements::ExtendedOnClause, + statement::Statement, +}; + +/// Rewritten takes a Statement and attempts to derive the db/rp +/// from the body of the statement. 
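// [Editorial sketch, not part of the diff] Intended use of the rewrite, mirroring
// the "SELECT * FROM foo.bar.cpu" case in the tests at the end of this file and
// using the crate's public `parse_statements` entry point: the fully-qualified
// measurement is reduced to its bare name, while the database and retention
// policy are captured on the `Rewritten` wrapper. The function name is
// hypothetical.
fn example_rewrite() {
    let mut statements =
        iox_query_influxql_rewrite::parse_statements("SELECT * FROM foo.bar.cpu").unwrap();
    let rewritten = statements.pop().unwrap();

    assert_eq!(rewritten.database().map(|db| db.as_str()), Some("foo"));
    assert_eq!(rewritten.retention_policy().map(|rp| rp.as_str()), Some("bar"));
    // "bar" is neither "autogen" nor "default", so both parts appear in the dbrp.
    assert_eq!(rewritten.resolve_dbrp(), Some("foo/bar".to_string()));
    assert_eq!(rewritten.to_statement().to_string(), "SELECT * FROM cpu");
}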
+#[derive(Debug)] +pub struct Rewritten { + database: Option, + retention_policy: Option, + statement: S, +} + +impl Rewritten { + fn new(statement: S) -> Self { + Self { + database: None, + retention_policy: None, + statement, + } + } + + fn with_database(mut self, db: Option) -> Self { + self.database = db; + self + } + + fn with_retention_policy(mut self, rp: Option) -> Self { + self.retention_policy = rp; + self + } + + pub fn set_retention_policy(&mut self, rp: String) { + self.retention_policy = Some(Identifier::from(rp.as_str())); + } + + pub fn database(&self) -> Option<&Identifier> { + self.database.as_ref() + } + + pub fn retention_policy(&self) -> Option<&Identifier> { + self.retention_policy.as_ref() + } + + pub fn statement(&self) -> &S { + &self.statement + } + + pub fn to_statement(self) -> S { + self.statement + } + + pub fn resolve_dbrp(&self) -> Option { + // We use `as_str().to_owned()` to avoid the + // quoting logic that is part of Identifier's `Display` implementation + match (&self.database, &self.retention_policy) { + (None, None) | (None, Some(_)) => None, + (Some(db), None) => Some(db.as_str().to_owned()), + (Some(db), Some(rp)) => { + if rp.as_str() != "autogen" && rp.as_str() != "default" { + Some(format!("{}/{rp}", db.as_str())) + } else { + Some(db.as_str().to_owned()) + } + } + } + } +} + +impl From> for Statement { + fn from(r: Rewritten) -> Self { + r.to_statement() + } +} + +impl TryFrom for Rewritten { + type Error = Error; + + fn try_from(statement: Statement) -> Result { + match statement { + Statement::ShowMeasurements(mut s) => { + if let Some(on) = s.on.take() { + let (db, rp) = match on { + ExtendedOnClause::Database(db) => (Some(db), None), + ExtendedOnClause::DatabaseRetentionPolicy(db, rp) => (Some(db), Some(rp)), + ExtendedOnClause::AllDatabases + | ExtendedOnClause::AllDatabasesAndRetentionPolicies => { + return Err(Error::MultiDatabase); + } + }; + Ok(Self::new(Statement::ShowMeasurements(s)) + .with_database(db) + .with_retention_policy(rp)) + } else { + Ok(Self::new(Statement::ShowMeasurements(s))) + } + } + Statement::ShowRetentionPolicies(mut s) => { + let identifier = s.database.take().map(Into::into); + Ok(Self::new(Statement::ShowRetentionPolicies(s)).with_database(identifier)) + } + Statement::ShowTagKeys(mut s) => { + let identifier = s.database.take().map(Into::into); + Ok(Self::new(Statement::ShowTagKeys(s)).with_database(identifier)) + } + Statement::ShowTagValues(mut s) => { + let identifier = s.database.take().map(Into::into); + Ok(Self::new(Statement::ShowTagValues(s)).with_database(identifier)) + } + Statement::ShowFieldKeys(mut s) => { + let identifier = s.database.take().map(Into::into); + Ok(Self::new(Statement::ShowFieldKeys(s)).with_database(identifier)) + } + Statement::Select(s) => { + let ss = Rewritten::::try_from(*s)?; + let db = ss.database.to_owned(); + let rp = ss.retention_policy.to_owned(); + Ok(Self::new(Statement::Select(Box::new(ss.to_statement()))) + .with_database(db) + .with_retention_policy(rp)) + } + Statement::Explain(mut s) => { + let options = s.options.take(); + let s = Self::try_from(*s.statement)?; + let db = s.database.to_owned(); + let rp = s.retention_policy.to_owned(); + Ok(Self::new(Statement::Explain(Box::new(ExplainStatement { + options, + statement: Box::new(s.to_statement()), + }))) + .with_database(db) + .with_retention_policy(rp)) + } + // For all other statements, we just pass them through. 
Explicitly + // do not use a catch-all match arm here in the event that new variants + // are added to the Statement enum, we want the compiler to direct us + // here to handle, if relevant. + Statement::CreateDatabase(_) + | Statement::Delete(_) + | Statement::DropMeasurement(_) + | Statement::ShowDatabases(_) => Ok(Self::new(statement)), + } + } +} + +impl TryFrom for Rewritten { + type Error = Error; + + fn try_from(mut select_statement: SelectStatement) -> Result { + let mut db_rp_set = HashSet::new(); + let from_clause = select_statement + .from + .take() + .into_iter() + .map(|ms| { + let (db, rp, ms) = match ms { + MeasurementSelection::Name(mut qn) => { + let db = qn.database.take(); + let rp = qn.retention_policy.take(); + (db, rp, MeasurementSelection::Name(qn)) + } + // Recursively call try_from on nested sub-queries, and compare their + // resulting db/rp to the same at this level. Sub-queries that have + // multiple db/rp in them will throw the MultiDatabase error. + MeasurementSelection::Subquery(s) => { + let ss = Self::try_from(*s)?; + ( + ss.database.to_owned(), + ss.retention_policy.to_owned(), + MeasurementSelection::Subquery(Box::new(ss.to_statement())), + ) + } + }; + if db_rp_set.insert((db, rp)) && db_rp_set.len() > 1 { + Err(Error::MultiDatabase) + } else { + Ok(ms) + } + }) + .collect::, Error>>()?; + select_statement.from.replace(from_clause); + let mut result = Self::new(select_statement); + if let Some((db, rp)) = db_rp_set.into_iter().next() { + result = result.with_database(db).with_retention_policy(rp); + } + Ok(result) + } +} + +#[derive(Debug, thiserror::Error, Clone, PartialEq, Eq)] +pub enum Error { + #[error("can only perform queries on a single database")] + MultiDatabase, + #[error("parsing error: {0}")] + Parse(ParseError), +} + +pub fn parse_statements(input: &str) -> Result>, Error> { + parse_internal(input) + .map_err(Error::Parse)? 
+ .into_iter() + .map(Rewritten::::try_from) + .collect::>, Error>>() +} + +#[cfg(test)] +mod tests { + use influxdb_influxql_parser::statement::Statement; + + use crate::{Error, Rewritten, parse_statements}; + + fn parse_single(input: &str) -> Rewritten { + parse_statements(input).unwrap().pop().unwrap() + } + + fn parse_single_failure(input: &str) -> Error { + parse_statements(input).unwrap_err() + } + + struct TestCase { + input: &'static str, + expected: &'static str, + db: Option<&'static str>, + rp: Option<&'static str>, + } + + impl TestCase { + fn assert(&self) { + let s = parse_single(self.input); + assert_eq!(s.database().map(|db| db.as_str()), self.db); + assert_eq!(s.retention_policy().map(|rp| rp.as_str()), self.rp); + assert_eq!(self.expected, s.to_statement().to_string()); + } + } + + struct TestFailure { + input: &'static str, + expected: Error, + } + + impl TestFailure { + fn assert(&self) { + let e = parse_single_failure(self.input); + assert_eq!(self.expected, e, "input: {}", self.input); + } + } + + #[test] + fn show_measurements() { + TestCase { + input: "SHOW MEASUREMENTS", + expected: "SHOW MEASUREMENTS", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW MEASUREMENTS ON foo", + expected: "SHOW MEASUREMENTS", + db: Some("foo"), + rp: None, + } + .assert(); + TestCase { + input: "SHOW MEASUREMENTS ON foo.bar", + expected: "SHOW MEASUREMENTS", + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + } + + #[test] + fn show_measurements_failure_modes() { + TestFailure { + input: "SHOW MEASUREMENTS ON *.*", + expected: Error::MultiDatabase, + } + .assert(); + TestFailure { + input: r#"SHOW MEASUREMENTS ON *"#, + expected: Error::MultiDatabase, + } + .assert(); + } + + #[test] + fn show_retention_policies() { + TestCase { + input: "SHOW RETENTION POLICIES", + expected: "SHOW RETENTION POLICIES", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW RETENTION POLICIES ON foo", + expected: "SHOW RETENTION POLICIES", + db: Some("foo"), + rp: None, + } + .assert(); + } + + #[test] + fn show_tag_keys() { + TestCase { + input: "SHOW TAG KEYS", + expected: "SHOW TAG KEYS", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG KEYS FROM cpu", + expected: "SHOW TAG KEYS FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG KEYS ON foo", + expected: "SHOW TAG KEYS", + db: Some("foo"), + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG KEYS ON foo FROM cpu", + expected: "SHOW TAG KEYS FROM cpu", + db: Some("foo"), + rp: None, + } + .assert(); + } + + #[test] + fn show_tag_values() { + TestCase { + input: "SHOW TAG VALUES WITH KEY = host", + expected: "SHOW TAG VALUES WITH KEY = host", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG VALUES FROM cpu WITH KEY = host", + expected: "SHOW TAG VALUES FROM cpu WITH KEY = host", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG VALUES ON foo WITH KEY = host", + expected: "SHOW TAG VALUES WITH KEY = host", + db: Some("foo"), + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG VALUES ON foo FROM cpu WITH KEY = host", + expected: "SHOW TAG VALUES FROM cpu WITH KEY = host", + db: Some("foo"), + rp: None, + } + .assert(); + } + + #[test] + fn show_field_keys() { + TestCase { + input: "SHOW FIELD KEYS", + expected: "SHOW FIELD KEYS", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW FIELD KEYS FROM cpu", + expected: "SHOW FIELD KEYS FROM cpu", + db: None, + rp: None, + } + 
.assert(); + TestCase { + input: "SHOW FIELD KEYS ON foo", + expected: "SHOW FIELD KEYS", + db: Some("foo"), + rp: None, + } + .assert(); + TestCase { + input: "SHOW FIELD KEYS ON foo FROM cpu", + expected: "SHOW FIELD KEYS FROM cpu", + db: Some("foo"), + rp: None, + } + .assert(); + } + + #[test] + fn select() { + TestCase { + input: "SELECT * FROM cpu", + expected: "SELECT * FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SELECT * FROM bar.cpu", + expected: "SELECT * FROM cpu", + db: None, + rp: Some("bar"), + } + .assert(); + TestCase { + input: "SELECT * FROM foo.bar.cpu", + expected: "SELECT * FROM cpu", + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + TestCase { + input: r#"SELECT * FROM (SELECT * FROM cpu)"#, + expected: r#"SELECT * FROM (SELECT * FROM cpu)"#, + db: None, + rp: None, + } + .assert(); + TestCase { + input: r#"SELECT * FROM (SELECT * FROM bar.cpu), bar.mem"#, + expected: r#"SELECT * FROM (SELECT * FROM cpu), mem"#, + db: None, + rp: Some("bar"), + } + .assert(); + TestCase { + input: r#"SELECT * FROM (SELECT * FROM foo.bar.cpu), foo.bar.mem"#, + expected: r#"SELECT * FROM (SELECT * FROM cpu), mem"#, + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + TestCase { + input: "SELECT * FROM \"5318725357728643_8729387113858758\".bar.cpu", + expected: "SELECT * FROM cpu", + db: Some("5318725357728643_8729387113858758"), + rp: Some("bar"), + } + .assert(); + } + + #[test] + fn select_failure_modes() { + TestFailure { + input: r#"SELECT * FROM foo.bar.cpu, baz.bop.cpu"#, + expected: Error::MultiDatabase, + } + .assert(); + TestFailure { + input: r#"SELECT * FROM cpu, baz.bop.cpu"#, + expected: Error::MultiDatabase, + } + .assert(); + TestFailure { + input: r#"SELECT * FROM bar.cpu, baz.bop.cpu"#, + expected: Error::MultiDatabase, + } + .assert(); + TestFailure { + input: r#"SELECT * FROM foo.bar.cpu, (SELECT * FROM mem)"#, + expected: Error::MultiDatabase, + } + .assert(); + } + + #[test] + fn explain() { + TestCase { + input: "EXPLAIN SELECT * FROM cpu", + expected: "EXPLAIN SELECT * FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "EXPLAIN SELECT * FROM bar.cpu", + expected: "EXPLAIN SELECT * FROM cpu", + db: None, + rp: Some("bar"), + } + .assert(); + TestCase { + input: "EXPLAIN SELECT * FROM foo.bar.cpu", + expected: "EXPLAIN SELECT * FROM cpu", + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + TestCase { + input: r#"EXPLAIN SELECT * FROM (SELECT * FROM cpu)"#, + expected: r#"EXPLAIN SELECT * FROM (SELECT * FROM cpu)"#, + db: None, + rp: None, + } + .assert(); + TestCase { + input: r#"EXPLAIN SELECT * FROM (SELECT * FROM bar.cpu), bar.mem"#, + expected: r#"EXPLAIN SELECT * FROM (SELECT * FROM cpu), mem"#, + db: None, + rp: Some("bar"), + } + .assert(); + TestCase { + input: r#"EXPLAIN SELECT * FROM (SELECT * FROM foo.bar.cpu), foo.bar.mem"#, + expected: r#"EXPLAIN SELECT * FROM (SELECT * FROM cpu), mem"#, + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + } + + #[test] + fn noop_rewrites() { + TestCase { + input: "CREATE DATABASE foo", + expected: "CREATE DATABASE foo", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "DELETE FROM cpu", + expected: "DELETE FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "DROP MEASUREMENT cpu", + expected: "DROP MEASUREMENT cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "EXPLAIN SELECT * FROM cpu", + expected: "EXPLAIN SELECT * FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: 
"SHOW DATABASES", + expected: "SHOW DATABASES", + db: None, + rp: None, + } + .assert(); + } +} diff --git a/iox_time/Cargo.toml b/iox_time/Cargo.toml index 70a240c2..8d946cfa 100644 --- a/iox_time/Cargo.toml +++ b/iox_time/Cargo.toml @@ -12,7 +12,7 @@ workspace = true [dependencies] chrono = { version = "0.4.42", default-features = false, features = ["clock", "std"] } parking_lot = "0.12" -tokio = { version = "1.47", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/iox_v1_query_api/Cargo.toml b/iox_v1_query_api/Cargo.toml new file mode 100644 index 00000000..3425b5f0 --- /dev/null +++ b/iox_v1_query_api/Cargo.toml @@ -0,0 +1,55 @@ +[package] +name = "iox_v1_query_api" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] # In alphabetical order +anyhow = "1.0.99" +arrow = { workspace = true } +authz = { path = "../authz", features = ["http"] } +bytes = "1.10.1" +chrono = "0.4.42" +datafusion = { workspace = true } +futures = "0.3" +generated_types = { path = "../generated_types" } +http = { workspace = true } +http-body-util = { workspace = true } +iox_http_util = { path = "../iox_http_util" } +iox_query = { path = "../iox_query" } +iox_query_params = { path = "../iox_query_params" } +iox_query_influxql = { path = "../iox_query_influxql" } +iox_query_influxql_rewrite = { path = "../iox_query_influxql_rewrite" } +mime = "0.3.16" +multer = "3.1.0" +trace_http = { path = "../trace_http" } +tracing = { workspace = true } +rmp-serde = "1.3.0" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0.145" +serde_urlencoded = "0.7.0" +schema = { path = "../schema" } +thiserror = "2.0.16" +trace = { path = "../trace" } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] # In alphabetical order +async-trait = "0.1" +data_types = { path = "../data_types" } +datafusion_util = { path = "../datafusion_util" } +insta = { version = "1.43.2", features = ["json", "redactions"] } +iox_time = { path = "../iox_time" } +metric = { path = "../metric" } +tokio = { version = "1.48", features = [ + "macros", + "net", + "parking_lot", + "rt-multi-thread", + "signal", + "sync", + "time", +] } diff --git a/iox_v1_query_api/src/error.rs b/iox_v1_query_api/src/error.rs new file mode 100644 index 00000000..916c1cd8 --- /dev/null +++ b/iox_v1_query_api/src/error.rs @@ -0,0 +1,122 @@ +use std::fmt::Debug; + +use datafusion::error::DataFusionError; +use iox_query_influxql_rewrite as rewrite; +use thiserror::Error; + +/// Error type for the v1 API +/// +/// This is used to catch errors that occur during the streaming process. +/// [`anyhow::Error`] is used as a catch-all because if anything fails during +/// that process it will result in a 500 INTERNAL ERROR. +#[derive(Debug, thiserror::Error)] +#[error("unexpected query error: {0}")] +pub struct QueryError(#[from] pub anyhow::Error); + +#[derive(Debug, Error)] +pub enum Error { + /// The requested path has no registered handler. 
+ #[error("not found: {0}")] + NoHandler(String), + + #[error("authorization failure: {0}")] + AuthorizationFailure(String), + + #[error("invalid mime type ({0})")] + InvalidMimeType(String), + + /// Missing parameters for query + #[error("missing query parameters 'db' and 'q'")] + MissingQueryParams, + + #[error("error decoding multipart file upload: {0}")] + MultipartFile(String), + + #[error("Invalid UTF8: {message} {error}")] + Utf8 { + message: &'static str, + error: String, + }, + + /// Serde decode error + #[error("error decoding params from url: {0}")] + SerdeUrlDecoding(#[from] serde_urlencoded::de::Error), + + // SerdeJsonError + #[error("error decoding query body: {0}")] + SerdeJson(#[from] serde_json::Error), + + #[error("datafusion error: {0}")] + Datafusion(#[from] DataFusionError), + + #[error("error in InfluxQL statement: {0}")] + InfluxqlRewrite(#[from] rewrite::Error), + + #[error("must provide only one InfluxQl statement per query")] + InfluxqlSingleStatement, + + #[error("must specify a 'db' parameter, or provide the database in the InfluxQL query")] + InfluxqlNoDatabase, + + #[error( + "provided a database in both the parameters ({param_db}) and \ + query string ({query_db}) that do not match, if providing a query \ + that specifies the database, you can omit the 'database' parameter \ + from your request" + )] + InfluxqlDatabaseMismatch { param_db: String, query_db: String }, + + #[error( + "provided a retention policy in both the parameters ({param_rp}) and \ + query string ({query_rp}) that do not match, if providing a query \ + that specifies the retention_policy, you can omit the 'rp' parameter \ + from your request" + )] + InfluxqlRetentionPolicyMismatch { param_rp: String, query_rp: String }, + + #[error("error reading field from body: {name} -- {error}")] + FieldRead { name: &'static str, error: String }, + + #[error("Cannot retrieve database: {0}")] + Database(DataFusionError), + + #[error("Database {0} not found")] + DatabaseNotFound(String), + + #[error("v1 query API error: {0}")] + V1Query(#[from] QueryError), +} + +#[derive(Debug, Clone)] +pub enum HttpError { + NotFound(String), + Unauthorized(String), + Invalid(String), + InternalError(String), +} + +impl From for HttpError { + fn from(e: Error) -> Self { + use Error::*; + use HttpError::*; + match e { + NoHandler(_) => NotFound(e.to_string()), + InvalidMimeType(_) + | MissingQueryParams + | InfluxqlSingleStatement + | InfluxqlNoDatabase + | Database(_) + | DatabaseNotFound(_) + | InfluxqlDatabaseMismatch { .. } + | InfluxqlRetentionPolicyMismatch { .. } + | MultipartFile(_) + | SerdeUrlDecoding(_) + | SerdeJson(_) + | Utf8 { .. } + | FieldRead { .. 
} + | InfluxqlRewrite(_) => Invalid(e.to_string()), + Datafusion(_) | V1Query(_) => InternalError(e.to_string()), + AuthorizationFailure(_) => Unauthorized(e.to_string()), + } + } +} diff --git a/iox_v1_query_api/src/handler.rs b/iox_v1_query_api/src/handler.rs new file mode 100644 index 00000000..0afd9857 --- /dev/null +++ b/iox_v1_query_api/src/handler.rs @@ -0,0 +1,963 @@ +use std::{collections::HashMap, sync::Arc}; + +use arrow::datatypes::Schema; +use authz::{Authorization, Authorizer, Permission, http::AuthorizationHeaderExtension}; +use bytes::Bytes; +use datafusion::{ + execution::SendableRecordBatchStream, parquet::data_type::AsBytes, physical_plan::ExecutionPlan, +}; +use futures::{StreamExt, stream::BoxStream}; +use http::{ + HeaderValue, Method, + header::{ACCEPT, CONTENT_TYPE}, + status::StatusCode, +}; +use http_body_util::BodyExt; +use iox_http_util::{ + Request, Response, ResponseBuilder, empty_response_body, stream_bytes_to_response_body, +}; +use iox_query::{ + QueryDatabase, + exec::IOxSessionContext, + query_log::{PermitAndToken, QueryCompletedToken, StatePlanned}, +}; +use iox_query_influxql::{ + frontend::planner::InfluxQLQueryPlanner, show_databases::InfluxQlShowDatabases, + show_retention_policies::InfluxQlShowRetentionPolicies, +}; +use iox_query_influxql_rewrite::{self as rewrite}; +use iox_query_params::StatementParams; +use mime::Mime; +use multer::Multipart; +use serde::Deserialize; +use serde_json::ser::{CompactFormatter, PrettyFormatter}; +use trace::{TraceCollector, ctx::SpanContext, span::SpanExt}; +use trace_http::{ + ctx::{RequestLogContext, RequestLogContextExt}, + query_variant::QueryVariant, +}; +use tracing::{info, warn}; + +use super::{ + DEFAULT_CHUNK_SIZE, Error, QueryFormat, QueryParams, Result, StatementFuture, types::Precision, +}; +use crate::{ + HttpError, + response::{ + buffered::BufferedResponseStream, + chunked::ChunkedResponseStream, + csv::CsvStream, + json::{BufferedJsonStream, ChunkedJsonStream}, + msgpack::{BufferedMessagePackStream, ChunkedMessagePackStream}, + }, + types::Statement, +}; + +#[derive(Debug)] +struct QueryPlan { + physical_plan: Arc, + schema: Arc, + query_completed_token: QueryCompletedToken, + context: IOxSessionContext, +} + +#[derive(Debug, Clone)] +pub struct V1HttpHandler { + database: Arc, + authz: Option>, + trace_collector: Option>, + iox_version: String, + show_databases: Option>, + show_retention_policies: Option>, +} + +impl V1HttpHandler { + pub fn new( + database: Arc, + authz: Option>, + trace_collector: Option>, + iox_version: String, + ) -> Self { + Self { + database, + authz, + trace_collector, + iox_version, + show_databases: None, + show_retention_policies: None, + } + } + + /// Add a `InfluxQlShowDatabases` to the handler + /// + /// This allows the implemention of `InfluxQlShowDatabases` to be added optionally so that + /// systems that support `SHOW DATABASES` queries (i.e., Core and Enterprise) can opt-in to that + /// functonality when constructing the `V1HttpHandler`. + pub fn with_show_databases(mut self, show_databases: Arc) -> Self { + self.show_databases = Some(show_databases); + self + } + + /// Add a `InfluxQlShowRetentionPolicies` to the handler + /// + /// This allows the implemention of `InfluxQlShowRetentionPolicies` to be added optionally so + /// that systems that support `SHOW RETENTION POLICIES` queries (i.e., Core and Enterprise) can + /// opt-in to that functonality when constructing the `V1HttpHandler`. 
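    ///
    /// A minimal construction sketch (illustrative only; the mock types come from the
    /// tests at the bottom of this file, and `db` is assumed to be an
    /// `Arc<dyn QueryDatabase>` such as the test database store used there):
    ///
    /// ```ignore
    /// let handler = V1HttpHandler::new(db, None, None, "test".to_string())
    ///     .with_show_databases(Arc::new(MockShowDatabases::new(["foo", "bar"])))
    ///     .with_show_retention_policies(Arc::new(MockShowRetentionPolicies::new()));
    /// ```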
+ pub fn with_show_retention_policies( + mut self, + show_retention_policies: Arc, + ) -> Self { + self.show_retention_policies = Some(show_retention_policies); + self + } + + pub async fn route_request(&self, req: Request) -> Result { + match (req.method(), req.uri().path()) { + (&Method::GET | &Method::POST, "/query") => self + .handle_parameterized_query(req) + .await + .inspect_err(|e| warn!("error encountered while handling /query: {:?}", e)), + (&Method::GET | &Method::HEAD, "/ping") => self.ping(req).await, + _ => Err(HttpError::NotFound(req.uri().path().to_owned())), + } + } + + async fn ping(&self, _req: Request) -> Result { + ResponseBuilder::new() + .status(StatusCode::NO_CONTENT) + // This is important for backwards compat with one of the clients + .header("X-Influxdb-Build", "cloud2") + .header("X-Influxdb-Version", self.iox_version.clone()) + .body(empty_response_body()) + .map_err(|e| HttpError::InternalError(e.to_string())) + } + + async fn handle_parameterized_query(&self, mut req: Request) -> Result { + let span_ctx = Some(SpanContext::new_with_optional_collector( + self.trace_collector.as_ref().map(Arc::clone), + )); + + // Go ahead and get the token before we consume the body, + // but we can't use it until later once we know the database. + let token = self.get_token_from_request(&mut req)?; + + let (params, format) = extract_request(req).await?; + + let QueryParams { + chunk_size, + chunked, + database, + retention_policy, + epoch, + pretty: _, + query, + params, + } = params; + let chunk_size = + chunked.and_then(|chunked| chunked.then(|| chunk_size.unwrap_or(DEFAULT_CHUNK_SIZE))); + + // Make a provided but empty db param None for better error messaging. + let database = if let Some("") = database.as_deref() { + None + } else { + database + }; + + if query.is_none() { + return Err(HttpError::Invalid( + "expected a query to be provided in the query string or body".to_owned(), + )); + } + + let query = query.unwrap(); + + let sp: StatementParams = params + .map(|s| serde_json::from_str(&s)) + .transpose() + .map_err(Error::from)? + .unwrap_or_default(); + + let statements = rewrite::parse_statements(query.as_str()).map_err(Error::from); + + let statements = match statements { + Ok(statements) => statements, + Err(e) => { + let statement = error_statement(e); + let response = statements_to_response(vec![statement], chunk_size, epoch, format); + + return ResponseBuilder::new() + .status(200) + .header(CONTENT_TYPE, format.as_content_type()) + .body(stream_bytes_to_response_body(response)) + .map_err(|e| HttpError::InternalError(e.to_string())); + } + }; + + let resolve_db = |request_db: Option, query_db: Option| { + match (request_db, query_db) { + (None, None) => None, + (None, Some(db)) | (Some(db), None) => Some(db), + (Some(_), Some(q)) => { + // Influxqlbridge prioritizes the embedded dp/rp in the query + // over the params if both are specified. 
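                    // For reference, the full resolution table implemented by this match:
                    //   (request db, query db)  -> resolved db
                    //   (None,       None)      -> None (reported later as InfluxqlNoDatabase)
                    //   (Some(p),    None)      -> Some(p)
                    //   (None,       Some(q))   -> Some(q)
                    //   (Some(_),    Some(q))   -> Some(q), the query's database wins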
+ Some(q) + } + } + }; + + let executing_statements = statements + .into_iter() + .map(|mut statement| { + let fut = async { + if statement.statement().is_show_databases() + && let Some(sd) = self.show_databases.as_ref() + { + let namespaces = match self + .database + .list_namespaces(span_ctx.child_span("list_namespaces")) + .await + .map_err(Error::Datafusion) + { + Ok(n) => n, + Err(e) => return Ok::<_, Error>(error_statement(e)), + }; + let permissions = namespaces + .into_iter() + .map(|n| { + Permission::ResourceAction( + authz::Resource::Database(authz::Target::ResourceName(n.name)), + authz::Action::Read, + ) + }) + .collect::>(); + + let authorized = self + .authz + .authorize(token.clone(), &permissions) + .await + .map_err(|error| Error::AuthorizationFailure(error.to_string()))?; + + let db_names = authorized + .permissions() + .iter() + .filter_map(|p| match p { + Permission::ResourceAction( + authz::Resource::Database(authz::Target::ResourceName(db_name)), + _, + ) => Some(db_name.to_owned()), + _ => None, + }) + .collect::>(); + + match sd.show_databases(db_names).await { + Ok(stream) => Ok(get_executing_statement_from_stream(stream)), + Err(error) => Ok(error_statement(error.into())), + } + } else if statement.statement().is_show_retention_policies() + && let Some(srp) = self.show_retention_policies.as_ref() + { + // Resolve database + let Some(database) = resolve_db(database.clone(), statement.resolve_dbrp()) + else { + return Ok::<_, Error>(error_statement(Error::InfluxqlNoDatabase)); + }; + + self.authz + .authorize( + token.clone(), + &[Permission::ResourceAction( + authz::Resource::Database(authz::Target::ResourceName( + database.clone(), + )), + authz::Action::Read, + )], + ) + .await + .map_err(|error| Error::AuthorizationFailure(error.to_string()))?; + + match srp.show_retention_policies(database).await { + Ok(stream) => Ok(get_executing_statement_from_stream(stream)), + Err(error) => Ok(error_statement(error.into())), + } + } else { + // Handle retention policy + match (retention_policy.clone(), statement.retention_policy()) { + (None, None) | (None, Some(_)) => {} + (Some(rp), None) => { + statement.set_retention_policy(rp); + } + (Some(_), Some(_)) => { + // Influxqlbridge prioritizes the embedded dp/rp in the query + // over the params if both are specified. + } + }; + + // Resolve database + let Some(database) = resolve_db(database.clone(), statement.resolve_dbrp()) + else { + return Ok::<_, Error>(error_statement(Error::InfluxqlNoDatabase)); + }; + + // Authorize request + let authz = self.authorize_request(token.clone(), &database).await?; + + // Generate the query + let query = statement.to_statement().to_string(); + + // Plan the query + let sp_clone = sp.clone(); + let span_ctx = span_ctx.clone(); + let query_plan = self + .plan_query( + query, + database, + sp_clone, + authz.into_subject(), + span_ctx, + None, + ) + .await; + + match query_plan { + Ok(query_plan) => { + // Get executing statement + let query_statement_result_stream = + get_executing_statement_from_plan( + query_plan, + Arc::clone(&self.database), + ); + + Ok(query_statement_result_stream) + } + Err(err) => Ok(error_statement(err)), + } + } + }; + + Ok::<_, Error>(fut) + }) + .collect::, _>>()?; + + // Execute these futures + let executing_statements = futures::future::try_join_all(executing_statements) + .await? 
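            // Failures inside an individual statement (planning, catalog lookups) were
            // converted into `error_statement` futures above, so the only `Err` that can
            // surface from `try_join_all` is an authorization failure, which aborts the
            // entire request.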
+ .into_iter() + .collect::>(); + + let response = statements_to_response(executing_statements, chunk_size, epoch, format); + + ResponseBuilder::new() + .status(200) + .header(CONTENT_TYPE, format.as_content_type()) + .body(stream_bytes_to_response_body(response)) + .map_err(|e| HttpError::InternalError(e.to_string())) + } + + async fn plan_query( + &self, + query: String, + database: String, + params: StatementParams, + authz_id: Option, + span_ctx: Option, + external_span_ctx: Option, + ) -> Result { + let namespace: Arc = database.into(); + let namespace_name = Arc::clone(&namespace); + let namespace_name = namespace_name.as_ref(); + + let db = self + .database + .namespace(namespace_name, span_ctx.child_span("get_namespace"), false) + .await + .map_err(Error::Database)? + .ok_or(Error::DatabaseNotFound(namespace_name.to_string()))?; + + let query_completed_token = db.record_query( + external_span_ctx.as_ref().map(RequestLogContext::ctx), + QueryVariant::InfluxQl.str(), + Box::new(query.to_string()), + params.clone(), + authz_id, + ); + + // Log after we acquire the permit and are about to start execution + info!( + %namespace_name, + %query, + trace=external_span_ctx.format_jaeger().as_str(), + variant=QueryVariant::InfluxQl.str(), + request_protocol="v1_http_query", + "InfluxQL request planning", + ); + + let context = db.new_query_context(span_ctx, None); + + let planner_ctx = context.child_ctx("v1 query planner"); + // Run planner on a separate threadpool, rather than the IO pool that is servicing this request + let physical_plan_res = + context + .run(async move { + InfluxQLQueryPlanner::query(query.as_ref(), params, &planner_ctx).await + }) + .await; + + let (physical_plan, query_completed_token) = match physical_plan_res { + Ok(physical_plan) => { + let query_completed_token = + query_completed_token.planned(&context, Arc::clone(&physical_plan)); + (physical_plan, query_completed_token) + } + Err(e) => { + query_completed_token.fail(); + Err(Error::from(e))? + } + }; + + let schema = Arc::clone(&physical_plan.schema()); + Ok(QueryPlan { + physical_plan, + schema, + query_completed_token, + context, + }) + } + + fn get_token_from_request(&self, req: &mut Request) -> Result>, Error> { + let token = if let Some(p) = extract_v1_auth_token(req) { + Some(p) + } else { + let auth_header = req.extensions().get::(); + auth_header + .and_then(|auth_header| { + let header_value = &**auth_header; + header_value.as_ref().map(validate_auth_header) + }) + .transpose()? 
+ }; + + Ok(token) + } + + async fn authorize_request( + &self, + token: Option>, + database: &str, + ) -> Result { + let required_permission = authz::Permission::ResourceAction( + authz::Resource::Database(authz::Target::ResourceName(database.to_string())), + authz::Action::Read, + ); + + self.authz + .authorize(token, &[required_permission]) + .await + .map_err(|e| Error::AuthorizationFailure(e.to_string())) + } +} + +fn statements_to_response( + executing_statements: Vec, + chunk_size: Option, + epoch: Option, + format: QueryFormat, +) -> BoxStream<'static, Bytes> { + match format { + QueryFormat::Csv => CsvStream::new(executing_statements) + .with_epoch(epoch) + .boxed(), + QueryFormat::Json => match chunk_size { + Some(chunk_size) => { + let response_stream = ChunkedResponseStream::new(executing_statements, chunk_size); + ChunkedJsonStream::new(response_stream, || CompactFormatter, epoch).boxed() + } + None => { + let response_stream = BufferedResponseStream::new(executing_statements); + BufferedJsonStream::new(response_stream, || CompactFormatter, epoch).boxed() + } + }, + QueryFormat::JsonPretty => match chunk_size { + Some(chunk_size) => { + let response_stream = ChunkedResponseStream::new(executing_statements, chunk_size); + ChunkedJsonStream::new(response_stream, PrettyFormatter::new, epoch).boxed() + } + None => { + let response_stream = BufferedResponseStream::new(executing_statements); + BufferedJsonStream::new(response_stream, PrettyFormatter::new, epoch).boxed() + } + }, + QueryFormat::MsgPack => match chunk_size { + Some(chunk_size) => { + let response_stream = ChunkedResponseStream::new(executing_statements, chunk_size); + ChunkedMessagePackStream::new(response_stream, epoch).boxed() + } + None => { + let response_stream = BufferedResponseStream::new(executing_statements); + BufferedMessagePackStream::new(response_stream, epoch).boxed() + } + }, + } +} + +fn get_executing_statement_from_stream(stream: SendableRecordBatchStream) -> StatementFuture { + Box::new(async move { Ok(Statement::new(stream.schema(), None, stream)) }) +} + +fn get_executing_statement_from_plan( + query_plan: QueryPlan, + database: Arc, +) -> StatementFuture { + let QueryPlan { + physical_plan, + schema, + query_completed_token, + context, + } = query_plan; + + let fut = async move { + let permit_span = context.child_span("query_rate_limit_semaphore"); + let permit = database.acquire_semaphore(permit_span).await; + let query_completed_token: iox_query::query_log::QueryCompletedToken< + iox_query::query_log::StatePermit, + > = query_completed_token.permit(); + + context + .execute_stream(physical_plan) + .await + .map(|stream| { + Statement::new( + Arc::clone(&schema), + Some(PermitAndToken { + permit, + query_completed_token, + }), + stream, + ) + }) + .map_err(Error::from) + }; + + Box::new(fut) +} + +fn error_statement(error: Error) -> StatementFuture { + Box::new(futures::future::err(error)) +} + +#[derive(Debug, Deserialize)] +struct V1AuthParameters { + #[serde(rename = "p")] + password: Option, +} + +fn extract_v1_auth_token(req: &mut Request) -> Option> { + req.uri() + .path_and_query() + .and_then(|pq| match pq.path() { + "/query" => pq.query(), + _ => None, + }) + .map(serde_urlencoded::from_str::) + .transpose() + .ok() + .flatten() + .and_then(|params| params.password) + .map(String::into_bytes) +} + +fn validate_auth_header(header: &HeaderValue) -> Result> { + let header = header.to_str().map_err(|e| Error::Utf8 { + message: "auth header", + error: e.to_string(), + })?; + 
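    // If no token can be extracted from the header, the request is rejected as an
    // authorization failure rather than being treated as anonymous.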
authz::extract_token(Some(header)).ok_or(Error::AuthorizationFailure( + "failed to extract token from header".to_owned(), + )) +} + +enum SupportedContentType { + ApplicationInfluxql, + FormUrlEncoded, + MultipartFormData, +} + +impl SupportedContentType { + fn from_request(req: &Request) -> Result { + if let Some(ct) = req.headers().get("Content-Type") { + let ct = std::str::from_utf8(ct.as_bytes()).map_err(|e| Error::Utf8 { + message: "mime type", + error: e.to_string(), + })?; + let mime: Mime = ct + .parse() + .map_err(|x: mime::FromStrError| Error::InvalidMimeType(x.to_string()))?; + + match (mime.type_(), mime.subtype()) { + (mime::APPLICATION, mime::WWW_FORM_URLENCODED) => Ok(Self::FormUrlEncoded), + (mime::APPLICATION, subtype) if subtype.as_str() == "vnd.influxql" => { + Ok(Self::ApplicationInfluxql) + } + (mime::MULTIPART, mime::FORM_DATA) => Ok(Self::MultipartFormData), + _ => Err(Error::InvalidMimeType(mime.to_string())), + } + } else { + // Default to assuming an influxql POST body + Ok(Self::ApplicationInfluxql) + } + } +} + +async fn influxql_body(req: Request) -> Result { + let mut params = QueryParams::from_request_query_string(&req)?; + // We support a "q" query string for POST too. + // If empty, check the content-type and parse the body appropriately. + if params.query.as_ref().is_none_or(|x| x.is_empty()) { + let bytes = req + .into_body() + .collect() + .await + .map_err(|_| { + HttpError::Invalid("Error retrieving bytes from response body".to_owned()) + })? + .to_bytes(); + params.query = Some(String::from_utf8(bytes.to_vec()).map_err(|_| { + HttpError::Invalid("Error retrieving query from request body".to_owned()) + })?); + }; + + Ok(params) +} + +async fn form_urlencoded(req: Request) -> Result { + let (body_params, _) = form_urlencoded_inner(req).await?; + Ok(body_params) +} + +async fn form_urlencoded_inner(req: Request) -> Result<(QueryParams, Bytes), HttpError> { + // The 1.x implementation uses [FormValue](https://pkg.go.dev/net/http#Request.FormValue) + // which relies on [ParseForm](https://pkg.go.dev/net/http#Request.ParseForm). + // + // This will always parse the URL query string as well as parsing the form body when required. + // Request body parameters take precedence over URL query string values. + + // It is okay to swallow the error here, since a query string is not mandatory. + let query_string_params = QueryParams::from_request_query_string(&req).unwrap_or_default(); + + let bytes = req + .into_body() + .collect() + .await + .map_err(|e| Error::FieldRead { + name: "body", + error: e.to_string(), + })? 
+ .to_bytes(); + let mut body_params = QueryParams::from_bytes_form_urlencoded(&bytes)?; + + body_params.merge(query_string_params); + + Ok((body_params, bytes)) +} + +async fn multipart_upload(req: Request) -> Result { + let boundary = req + .headers() + .get(CONTENT_TYPE) + .and_then(|ct| ct.to_str().ok()) + .and_then(|ct| multer::parse_boundary(ct).ok()); + + if boundary.is_none() { + return Err(HttpError::Invalid( + "A boundary header is required for multipart upload".to_owned(), + )); + } + + let (lower_precedence_params, bytes) = form_urlencoded_inner(req).await?; + + let mut fields: HashMap = HashMap::new(); + // all the fields are strings, + // then parse in a method on QueryParams + let mut multipart = Multipart::new( + futures::stream::once(async { Ok::<_, Error>(bytes) }), + boundary.unwrap(), // safe due to is_none check above + ); + while let Some(mut field) = multipart + .next_field() + .await + .map_err(|e| Error::MultipartFile(e.to_string()))? + { + if let Some(name) = field.name() { + let name = name.to_owned(); + let mut value = Vec::new(); + while let Some(field_chunk) = field + .chunk() + .await + .map_err(|e| Error::MultipartFile(e.to_string()))? + { + value.extend_from_slice(field_chunk.as_bytes()); + } + + let value = String::from_utf8(value).map_err(|e| Error::Utf8 { + message: "multipart field", + error: e.to_string(), + })?; + fields.insert(name, value); + } + } + + let mut body_params = QueryParams::from_hashmap_multipart(fields)?; + body_params.merge(lower_precedence_params); + + Ok(body_params) +} + +async fn extract_request(req: Request) -> Result<(QueryParams, QueryFormat), HttpError> { + // Pull the mime_type out before we consume the body in the match + let accept = req.headers().get(ACCEPT).cloned(); + let mime_type = accept.as_ref().map(HeaderValue::as_bytes); + + let qp = match *req.method() { + Method::GET => Ok(QueryParams::from_request_query_string(&req)?), + Method::POST => { + let content_type = SupportedContentType::from_request(&req)?; + match content_type { + SupportedContentType::ApplicationInfluxql => influxql_body(req).await, + SupportedContentType::FormUrlEncoded => form_urlencoded(req).await, + SupportedContentType::MultipartFormData => multipart_upload(req).await, + } + } + _ => Err(HttpError::Invalid("Invalid request method".to_owned())), + }?; + + let qf = QueryFormat::from_bytes(mime_type, qp.pretty.unwrap_or_default())?; + Ok((qp, qf)) +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use authz::{Action, Authorization, Authorizer, Error, Permission, Target}; + use iox_http_util::{RequestBuilder, empty_request_body, read_body_bytes_for_tests}; + use iox_query::{QueryDatabase, test::TestDatabaseStore}; + use iox_query_influxql::{ + show_databases::mock::MockShowDatabases, + show_retention_policies::mock::{MockRetentionPolicy, MockShowRetentionPolicies}, + }; + + use crate::V1HttpHandler; + + #[derive(Debug)] + struct MockAuthorizer { + authorized_databases: Vec, + } + + impl MockAuthorizer { + fn new(databases: impl IntoIterator>) -> Self { + Self { + authorized_databases: databases.into_iter().map(Into::into).collect(), + } + } + } + + #[async_trait::async_trait] + impl Authorizer for MockAuthorizer { + async fn authorize( + &self, + _token: Option>, + _perms: &[Permission], + ) -> Result { + let permissions = self + .authorized_databases + .iter() + .map(|n| { + Permission::ResourceAction( + authz::Resource::Database(Target::ResourceName(n.to_string())), + Action::Read, + ) + }) + .collect::>(); + 
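            // The mock ignores the permissions that were actually requested and simply
            // grants read access to every database it was configured with.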
Ok(Authorization::new(None, permissions)) + } + } + + #[tokio::test] + async fn test_show_databases() { + let db = Arc::new(TestDatabaseStore::default()); + db.db_or_create("foo").await; + db.db_or_create("bar").await; + let show_databases = Arc::new(MockShowDatabases::new(["foo", "bar"])); + let handler = V1HttpHandler::new(db, None, None, "test".to_string()) + .with_show_databases(show_databases); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?q=show%20databases") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW DATABASES -- on handler with SHOW DATABASES enabled", + },{ + insta::assert_snapshot!(res); + }); + } + + #[tokio::test] + async fn test_show_databases_with_no_impl() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let handler = V1HttpHandler::new(db, None, None, "test".to_string()); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?q=show%20databases") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW DATABASES -- on handler with SHOW DATABASES _not_ enabled", + },{ + insta::assert_snapshot!(res); + }); + } + + #[tokio::test] + async fn test_show_databases_with_authz() { + let db = Arc::new(TestDatabaseStore::default()); + db.db_or_create("foo").await; + db.db_or_create("bar").await; + db.db_or_create("mop").await; + { + let authz = Arc::new(MockAuthorizer::new(["foo", "bar"])); + // The show databases has databases foo, bar, and mop, but only foo and bar will be returned + // due to the mock authorizer... + let show_databases = Arc::new(MockShowDatabases::new(["foo", "bar", "mop"])); + let handler = + V1HttpHandler::new(Arc::clone(&db) as _, Some(authz), None, "test".to_string()) + .with_show_databases(show_databases); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?q=show%20databases") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW DATABASES -- should not return mop database due to authz", + },{ + insta::assert_snapshot!(res); + }); + } + { + let authz = Arc::new(MockAuthorizer::new(["foo", "bar", "mop"])); + // The show databases has databases foo, bar, and mop, but only foo and bar will be returned + // due to the mock authorizer... 
+ let show_databases = Arc::new(MockShowDatabases::new(["foo", "bar", "mop"])); + let handler = V1HttpHandler::new(db, Some(authz), None, "test".to_string()) + .with_show_databases(show_databases); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?q=show%20databases") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW DATABASES -- should return mop database after adding to authz", + },{ + insta::assert_snapshot!(res); + }); + } + } + + #[tokio::test] + async fn test_show_retention_policies() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let show_retention_policies = Arc::new( + MockShowRetentionPolicies::new() + .with_default_retention_policy("foo") + .with_default_retention_policy("bar") + .with_retention_policy( + "bar", + MockRetentionPolicy::new("short").with_duration(Duration::from_secs(100)), + ), + ); + let handler = V1HttpHandler::new(db, None, None, "test".to_string()) + .with_show_retention_policies(show_retention_policies); + // on foo db: + { + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?db=foo&q=show%20retention%20policies") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW RETENTION POLICIES -- on `foo` database which contains one \ + default policy", + },{ + insta::assert_snapshot!(res); + }); + } + // on bar db: + { + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?db=bar&q=show%20retention%20policies") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW RETENTION POLICIES -- on `bar` database which contains one \ + default policy and one non-default policy", + },{ + insta::assert_snapshot!(res); + }); + } + // on non-existent db: + { + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?db=frodo&q=show%20retention%20policies") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW RETENTION POLICIES -- on `frodo` database which does not \ + exist", + },{ + insta::assert_snapshot!(res); + }); + } + } + + #[tokio::test] + async fn test_show_retention_policies_with_no_impl() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let handler = V1HttpHandler::new(db, None, None, "test".to_string()); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?db=foo&q=show%20retention%20policies") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW RETENTION POLICIES -- on handler with SHOW RETENTION POLICIES \ + _not_ enabled", + },{ + insta::assert_snapshot!(res); + 
}); + } +} diff --git a/iox_v1_query_api/src/lib.rs b/iox_v1_query_api/src/lib.rs new file mode 100644 index 00000000..b099c875 --- /dev/null +++ b/iox_v1_query_api/src/lib.rs @@ -0,0 +1,216 @@ +use workspace_hack as _; + +// A large majority of the code in this file was copied from the Monolith +// project and then adapted to fit our format/needs. +use std::{collections::HashMap, fmt::Debug, num::ParseIntError, str::ParseBoolError}; + +use bytes::Bytes; +use error::Error; +use iox_http_util::Request; +use serde::Deserialize; +use types::Precision; +use types::Statement; + +mod error; +pub use error::HttpError; +mod handler; +pub use handler::V1HttpHandler; +mod response; +mod types; +mod value; + +const DEFAULT_CHUNK_SIZE: usize = 10_000; + +type Result = std::result::Result; +type StatementFuture = Box> + Send>; +/// Enum representing the query format for the v1/query API. +/// +/// The original API supports CSV, JSON, and "pretty" JSON formats. +#[derive(Debug, Default, Deserialize, Clone, Copy, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum QueryFormat { + Csv, + #[default] + Json, + JsonPretty, + MsgPack, +} + +impl QueryFormat { + /// Returns the content type as a string slice for the query format. + /// + /// Maps the `QueryFormat` variants to their corresponding MIME types as strings. + /// This is useful for setting the `Content-Type` header in HTTP responses. + pub fn as_content_type(&self) -> &str { + match self { + Self::Csv => "application/csv", + Self::Json | Self::JsonPretty => "application/json", + Self::MsgPack => "application/x-msgpack", + } + } + + /// Extracts the [`QueryFormat`] from an HTTP [`Request`]. + /// + /// Parses the HTTP request to determine the desired query format. The `pretty` + /// parameter indicates if the pretty format is requested via a query parameter. + /// The function inspects the `Accept` header of the request to determine the + /// format, defaulting to JSON if no specific format is requested. If the format + /// is invalid or non-UTF8, an error is returned. + pub fn from_bytes(mime_type: Option<&[u8]>, pretty: bool) -> Result { + match mime_type { + Some(b"application/csv" | b"text/csv") => Ok(Self::Csv), + Some(b"application/x-msgpack") => Ok(Self::MsgPack), + Some(b"application/json" | b"*/*") | None => { + // If no specific format is requested via the Accept header, + // and the 'pretty' parameter is true, use the pretty JSON format. + // Otherwise, default to the regular JSON format. + if pretty { + Ok(Self::JsonPretty) + } else { + Ok(Self::Json) + } + } + Some(mime_type) => match std::str::from_utf8(mime_type) { + Ok(s) => Err(Error::InvalidMimeType(s.to_owned())), + Err(e) => Err(Error::Utf8 { + message: "mime type", + error: e.to_string(), + }), + }, + } + } +} + +/// Query parameters for the v1/query API +/// +/// The original API supports a `u` parameter, for "username", as well as a `p`, +/// for "password". The password is extracted upstream, and username is ignored. 
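///
/// A hedged example of how a request's query string maps onto these fields
/// (values are illustrative only):
///
/// ```ignore
/// // GET /query?db=mydb&rp=autogen&q=SELECT%20*%20FROM%20cpu&chunked=true&chunk_size=500
/// //   database:         Some("mydb")
/// //   retention_policy: Some("autogen")
/// //   query:            Some("SELECT * FROM cpu")
/// //   chunked:          Some(true)
/// //   chunk_size:       Some(500)
/// ```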
+#[derive(Debug, Default, Deserialize)] +pub struct QueryParams { + /// Chunk the response into chunks of size `chunk_size`, or 10,000, or by series + pub chunked: Option, + /// Define the number of records that will go into a chunk + pub chunk_size: Option, + /// Database to perform the query against + /// + /// This is optional because the query string may specify the database + #[serde(rename = "db")] + pub database: Option, + /// Retention Policy to perform the query against + /// + /// This is optional because the query string may specify the rp + #[serde(rename = "rp")] + pub retention_policy: Option, + /// Map timestamps to UNIX epoch time, with the given precision + pub epoch: Option, + /// Format the JSON outputted in pretty format + pub pretty: Option, + /// The InfluxQL query string + #[serde(rename = "q")] + pub query: Option, + /// Params for parameterized queries + pub params: Option, +} + +impl QueryParams { + /// Extract [`QueryParams`] from an HTTP [`Request`] + pub fn from_request_query_string(req: &Request) -> Result { + let query = req.uri().query().ok_or(Error::MissingQueryParams)?; + let mut params: Self = serde_urlencoded::from_str(query).map_err(Error::from)?; + + // For other request types we need to know if the value was set or not, + // so we have to unwrap_or_default here rather than on QueryParams directly. + params.chunked = Some(params.chunked.unwrap_or_default()); + params.pretty = Some(params.pretty.unwrap_or_default()); + + Ok(params) + } + + pub fn from_bytes_form_urlencoded(bytes: &Bytes) -> Result { + serde_urlencoded::from_bytes(bytes).map_err(Into::into) + } + + pub fn from_hashmap_multipart(fields: HashMap) -> Result { + let mut this = Self::default(); + + if let Some(chunked) = fields.get("chunked") { + let b = chunked + .trim() + .parse() + .map_err(|e: ParseBoolError| Error::FieldRead { + name: "chunked", + error: e.to_string(), + })?; + this.chunked = Some(b); + } + + if let Some(chunk_size) = fields.get("chunk_size") { + let u = chunk_size + .trim() + .parse() + .map_err(|e: ParseIntError| Error::FieldRead { + name: "chunk_size", + error: e.to_string(), + })?; + this.chunk_size = Some(u); + } + + if let Some(epoch) = fields.get("epoch") { + let e = epoch.trim(); + let e = serde_json::from_str(e).map_err(Error::from)?; + this.epoch = Some(e) + } + + if let Some(pretty) = fields.get("pretty") { + let p = pretty + .trim() + .parse() + .map_err(|e: ParseBoolError| Error::FieldRead { + name: "pretty", + error: e.to_string(), + })?; + this.pretty = Some(p); + } + + this.database = fields.get("db").cloned(); + this.retention_policy = fields.get("rp").cloned(); + this.query = fields.get("q").cloned(); + this.params = fields.get("params").cloned(); + + Ok(this) + } + + pub fn merge(&mut self, lower_precedence: Self) { + if self.chunked.is_none() { + self.chunked = lower_precedence.chunked; + } + + if self.chunk_size.is_none() { + self.chunk_size = lower_precedence.chunk_size; + } + + if self.database.is_none() { + self.database = lower_precedence.database; + } + + if self.retention_policy.is_none() { + self.retention_policy = lower_precedence.retention_policy; + } + + if self.epoch.is_none() { + self.epoch = lower_precedence.epoch; + } + + if self.pretty.is_none() { + self.pretty = lower_precedence.pretty; + } + + if self.query.is_none() { + self.query = lower_precedence.query; + } + + if self.params.is_none() { + self.params = lower_precedence.params; + } + } +} diff --git a/iox_v1_query_api/src/response.rs b/iox_v1_query_api/src/response.rs new file 
mode 100644 index 00000000..0b7c4fc8 --- /dev/null +++ b/iox_v1_query_api/src/response.rs @@ -0,0 +1,634 @@ +use arrow::array::RecordBatch; +use serde::{ + Serialize, Serializer, + ser::{SerializeSeq, SerializeStruct}, +}; +use std::{ + collections::BTreeMap, + fmt::{Debug, Formatter}, + sync::Arc, +}; + +pub(super) mod buffered; +pub(super) mod chunked; +pub(super) mod csv; +pub(super) mod json; +pub(super) mod msgpack; +use buffered::BufferedResponseStream; +use chunked::ChunkedResponseStream; +mod stream; +use stream::{SeriesChunkMergeStream, SeriesChunkStream}; + +use super::{ + types::Precision, + value::{Value, ValueSerializer}, +}; + +#[derive(Debug, PartialEq)] +pub(super) struct Series { + measurement: String, + tags: BTreeMap, String>, +} + +impl Series { + fn new(measurement: String, tags: BTreeMap, String>) -> Self { + Self { measurement, tags } + } +} + +/// This represents a discrete chunk of data as defined by the V1 query API. +/// This can be a complete Series when: +/// 1. We are not running in chunked mode +/// 2. We are in chunked mode, and the series is smaller than the chunk size +/// +/// or this will be a Chunk (a subset of a Series) as defined by the chunk size. +/// +/// The intention is that this type can be easily converted to our output formats +/// and returned/streamed by another stream. +#[derive(PartialEq)] +pub(crate) struct SeriesChunk { + measurement_column: usize, + + tag_columns: Arc, usize>>, + + value_columns: Arc<[(Arc, usize)]>, + + /// SeriesChunks may contain one or more record batches. + data: Vec, + /// If this chunk is a partial chunk. + partial: bool, +} + +impl SeriesChunk { + /// Create a new SeriesChunk from a RecordBatch. + fn new( + measurement_column: usize, + tag_columns: Arc, usize>>, + value_columns: Arc<[(Arc, usize)]>, + batch: RecordBatch, + ) -> Self { + Self { + measurement_column, + tag_columns, + value_columns, + data: vec![batch], + partial: false, + } + } + + /// Get the measurement name this series belongs to. + fn measurement(&self) -> String { + assert!( + !self.data.is_empty(), + "SeriesChunk should have at least one record batch" + ); + Value::new(self.data[0].column(self.measurement_column), 0).to_string() + } + + /// Get the tags that define this series. + fn tags(&self) -> BTreeMap, String> { + assert!( + !self.data.is_empty(), + "SeriesChunk should have at least one record batch" + ); + self.tag_columns + .iter() + .map(|(name, idx)| (Arc::clone(name), Value::new(self.data[0].column(*idx), 0))) + .map(|(name, value)| (name, value.to_string())) + .collect() + } + + /// Get the definition of the Series this chunk is part of. + fn series(&self) -> Series { + Series::new(self.measurement(), self.tags()) + } + + // Get the names of the columns in this SeriesChunk. + fn columns(&self) -> Vec> { + self.value_columns + .iter() + .map(|(name, _)| Arc::clone(name)) + .collect() + } + + /// Get the total number of rows in this SeriesChunk. + fn num_rows(&self) -> usize { + self.data.iter().map(|x| x.num_rows()).sum() + } + + /// Get the values at the given row. + fn row(&self, row: usize) -> Option> { + if row >= self.num_rows() { + return None; + } + + // Calculate the index of the record batch, and the index of the row within that batch. + // + // For example, if we have 3 batches that look like this: + // [ + // [a, b, c], + // [d, e, f, g], + // [h, i] + // ] + // and we want to get the value of row 5 (rows starting from 0), which is "f", + // the index of the batch is 1, and the index of the row within that batch is 2. 
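        //
        // Walking the loop below with those batch sizes [3, 4, 2] and row = 5:
        //   start:     batch_idx = 0, row_idx = 5
        //   5 >= 3  -> batch_idx = 1, row_idx = 2
        //   2 <  4  -> stop; read row 2 of batch 1, which is "f"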
+ let mut batch_idx = 0; + let mut row_idx = row; + while row_idx >= self.data[batch_idx].num_rows() { + row_idx -= self.data[batch_idx].num_rows(); + batch_idx += 1; + } + + if batch_idx > self.data.len() { + return None; + } + + let mut values = Vec::new(); + for (_, idx) in self.value_columns.iter() { + values.push(Value::new(self.data[batch_idx].column(*idx), row_idx)); + } + Some(values) + } + + /// Split this SeriesChunk into two SeriesChunks at the given size. + fn split_at(self, mut size: usize) -> (Self, Self) { + let mut left = Self { + measurement_column: self.measurement_column, + tag_columns: Arc::clone(&self.tag_columns), + value_columns: Arc::clone(&self.value_columns), + data: Vec::new(), + partial: self.partial, + }; + let mut right = Self { + measurement_column: self.measurement_column, + tag_columns: self.tag_columns, + value_columns: self.value_columns, + data: Vec::new(), + partial: self.partial, + }; + let it = self.data.into_iter(); + for batch in it { + if size > 0 { + if batch.num_rows() > size { + left.data.push(batch.slice(0, size)); + right.data.push(batch.slice(size, batch.num_rows() - size)); + size = 0; + } else { + size -= batch.num_rows(); + left.data.push(batch); + } + } else { + right.data.push(batch); + } + } + (left, right) + } + + /// Merge another SeriesChunk into this one. + fn merge(&mut self, other: Self) { + assert_eq!(self.series(), other.series()); + self.data.extend(other.data); + } +} + +struct SeriesChunkSerializer<'a> { + chunk: &'a SeriesChunk, + epoch: Option, + /// Allow infinite values + allow_inf: bool, +} + +impl<'a> SeriesChunkSerializer<'a> { + fn new(chunk: &'a SeriesChunk, epoch: Option, allow_inf: bool) -> Self { + Self { + chunk, + epoch, + allow_inf, + } + } +} + +impl Serialize for SeriesChunkSerializer<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut fields = 0; + let name = self.chunk.measurement(); + if !name.is_empty() { + fields += 1; + } + let tags = self.chunk.tags(); + if !tags.is_empty() { + fields += 1; + } + let columns = self.chunk.columns(); + if !columns.is_empty() { + fields += 1; + } + if self.chunk.num_rows() > 0 { + fields += 1; + } + if self.chunk.partial { + fields += 1; + } + + let mut obj = serializer.serialize_struct("", fields)?; + if !name.is_empty() { + obj.serialize_field("name", &name)?; + } + if !tags.is_empty() { + obj.serialize_field("tags", &tags)?; + } + if !columns.is_empty() { + obj.serialize_field("columns", &self.chunk.columns())?; + } + if self.chunk.num_rows() > 0 { + obj.serialize_field( + "values", + &SeriesValues { + chunk: self.chunk, + epoch: self.epoch, + allow_inf: self.allow_inf, + }, + )?; + } + if self.chunk.partial { + obj.serialize_field("partial", &self.chunk.partial)?; + } + obj.end() + } +} + +impl Debug for SeriesChunk { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Chunk") + .field("measurement", &self.measurement()) + .field("tags", &self.tags()) + .field("columns", &self.columns()) + .finish_non_exhaustive() + } +} + +impl<'a> IntoIterator for &'a SeriesChunk { + type Item = Vec; + type IntoIter = SeriesChunkIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + SeriesChunkIter { + chunk: self, + row: 0, + } + } +} + +pub(crate) struct SeriesChunkIter<'a> { + chunk: &'a SeriesChunk, + row: usize, +} + +impl Iterator for SeriesChunkIter<'_> { + type Item = Vec; + + fn next(&mut self) -> Option { + let row = self.chunk.row(self.row); + if row.is_some() { + self.row += 1; + } + row + } + + fn 
size_hint(&self) -> (usize, Option) { + let remaining = self.chunk.num_rows() - self.row; + (remaining, Some(remaining)) + } +} + +/// This is a helper struct to serialize a SeriesChunk into the JSON +/// format +struct SeriesValues<'a> { + chunk: &'a SeriesChunk, + epoch: Option, + /// Allow infinite values + allow_inf: bool, +} + +impl Serialize for SeriesValues<'_> { + fn serialize( + &self, + serializer: S, + ) -> std::result::Result { + let mut seq = serializer.serialize_seq(Some(self.chunk.num_rows()))?; + for row in self.chunk.into_iter() { + let row = row + .iter() + .map(|e| ValueSerializer::new(e, self.epoch, self.allow_inf)) + .collect::>(); + seq.serialize_element(&row)?; + } + seq.end() + } +} + +/// The result of a single InfluxQL statement. This is equivalent to +/// [query.Result](https://github.com/influxdata/influxdb/blob/master-1.x/query/result.go#L86) +/// from InfluxDB v1. +/// +/// N.B. This doesn't support the messages field, as we have no use for +/// it. +#[derive(Debug, PartialEq)] +pub(crate) struct StatementResult { + statement_id: usize, + series: Vec, + partial: bool, + error: String, +} + +impl StatementResult { + fn new(statement_id: usize) -> Self { + Self { + statement_id, + series: Vec::new(), + partial: false, + error: String::new(), + } + } + + fn add_series(&mut self, series: SeriesChunk) { + self.series.push(series); + } + + fn set_partial(&mut self, partial: bool) { + self.partial = partial; + } + + fn set_error(&mut self, error: String) { + self.error = error; + } + + fn is_error(&self) -> bool { + !self.error.is_empty() + } +} + +struct StatementResultSerializer<'a> { + result: &'a StatementResult, + epoch: Option, + /// Allow infinite values + allow_inf: bool, +} + +impl Serialize for StatementResultSerializer<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut fields = 1; + if !self.result.series.is_empty() { + fields += 1; + } + if self.result.partial { + fields += 1; + } + if !self.result.error.is_empty() { + fields += 1; + } + let mut obj = serializer.serialize_struct("", fields)?; + obj.serialize_field("statement_id", &self.result.statement_id)?; + if !self.result.series.is_empty() { + let series = self + .result + .series + .iter() + .map(|s| SeriesChunkSerializer::new(s, self.epoch, self.allow_inf)) + .collect::>(); + obj.serialize_field("series", &series)?; + } + if self.result.partial { + obj.serialize_field("partial", &self.result.partial)?; + } + if !self.result.error.is_empty() { + obj.serialize_field("error", &self.result.error)?; + } + obj.end() + } +} + +#[derive(Debug, Default, PartialEq)] +pub(crate) struct Response(Vec); + +impl Response { + pub(crate) fn add_result(&mut self, result: StatementResult) { + self.0.push(result); + } +} + +struct ResponseSerializer<'a> { + response: &'a Response, + epoch: Option, + /// Allow infinite values + allow_inf: bool, +} + +impl<'a> ResponseSerializer<'a> { + fn new(response: &'a Response, epoch: Option, allow_inf: bool) -> Self { + Self { + response, + epoch, + allow_inf, + } + } +} + +impl Serialize for ResponseSerializer<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let fields = if self.response.0.is_empty() { 0 } else { 1 }; + let mut obj = serializer.serialize_struct("", fields)?; + if !self.response.0.is_empty() { + let result = self + .response + .0 + .iter() + .map(|r| StatementResultSerializer { + result: r, + epoch: self.epoch, + allow_inf: self.allow_inf, + }) + .collect::>(); + 
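            // Writes the v1 envelope, e.g. `{"results":[{"statement_id":0,...}]}`; when
            // there are no statement results the "results" key is omitted entirely,
            // matching the field count computed above.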
obj.serialize_field("results", &result)?; + } + obj.end() + } +} + +#[cfg(test)] +mod tests { + use crate::StatementFuture; + use crate::error::Error; + use crate::types::Statement; + use arrow::{ + array::{ArrayRef, RecordBatch}, + datatypes::{DataType, Field, Schema, SchemaRef}, + }; + use data_types::NamespaceId; + use datafusion::physical_plan::{ExecutionPlan, test::exec::MockExec}; + use generated_types::influxdata::iox::querier::v1::{ + InfluxQlMetadata, influx_ql_metadata::TagKeyColumn, + }; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::{PermitAndToken, QueryLog}; + use iox_query_params::StatementParams; + use schema::{INFLUXQL_MEASUREMENT_COLUMN_NAME, INFLUXQL_METADATA_KEY, TIME_COLUMN_NAME}; + use std::{collections::HashMap, sync::Arc}; + + #[derive(Clone)] + pub(super) enum Column { + Measurement, + Tag { + name: &'static str, + group_by: bool, + projected: bool, + }, + Time, + Field { + name: &'static str, + }, + } + + pub(super) fn make_statement( + database: &Arc, + ctx: &Arc, + log: &Arc, + columns: impl IntoIterator, + data: Vec, + ) -> StatementFuture { + let (schema, batches) = make_schema_and_batches(columns, vec![data]); + let exec: Arc = Arc::new( + MockExec::new( + batches.into_iter().map(Ok).collect(), + SchemaRef::clone(&schema), + ) + .with_use_task(false), + ); + + let database = Arc::clone(database); + let ctx = Arc::clone(ctx); + let log = Arc::clone(log); + let fut = async move { + let token = log.push( + NamespaceId::new(0), + Arc::from("test"), + "test_query", + Box::new("test_query".to_string()), + StatementParams::new(), + None, + None, + ); + let token = token.planned(ctx.as_ref(), Arc::clone(&exec)); + let permit = database.acquire_semaphore(None).await; + let query_completed_token = token.permit(); + let permit_state = Some(PermitAndToken { + permit, + query_completed_token, + }); + ctx.execute_stream(exec) + .await + .map(|stream| Statement { + schema, + permit_state, + stream, + }) + .map_err(Error::from) + }; + + Box::new(fut) + } + + pub(super) fn make_schema_and_batches( + columns: impl IntoIterator, + data: Vec>, + ) -> (SchemaRef, Vec) { + let mut measurement_column_index = None; + let mut tag_key_columns = vec![]; + let fields = columns + .into_iter() + .enumerate() + .inspect(|(i, column)| { + if let Column::Tag { + name, + group_by: true, + projected, + } = column + { + tag_key_columns.push(TagKeyColumn { + tag_key: name.to_string(), + column_index: *i as u32, + is_projected: *projected, + }); + } + if let Column::Measurement = column { + measurement_column_index = Some(*i as u32); + } + }) + .map(|(i, column)| match column { + Column::Measurement => Field::new( + INFLUXQL_MEASUREMENT_COLUMN_NAME, + data.first() + .and_then(|batch| batch.get(i)) + .map(|arr| arr.data_type().clone()) + .unwrap_or(DataType::Utf8), + false, + ), + Column::Tag { name, .. 
} => Field::new( + name, + data.first() + .and_then(|batch| batch.get(i)) + .map(|arr| arr.data_type().clone()) + .unwrap_or(DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + )), + true, + ), + Column::Time => Field::new( + TIME_COLUMN_NAME, + data.first() + .and_then(|batch| batch.get(i)) + .map(|arr| arr.data_type().clone()) + .unwrap_or(DataType::Timestamp( + arrow::datatypes::TimeUnit::Nanosecond, + None, + )), + false, + ), + Column::Field { name } => Field::new( + name, + data.first() + .and_then(|batch| batch.get(i)) + .map(|arr| arr.data_type().clone()) + .unwrap_or(DataType::Float64), + true, + ), + }) + .collect::>(); + let md = InfluxQlMetadata { + measurement_column_index: measurement_column_index.unwrap(), + tag_key_columns, + }; + let md = serde_json::to_string(&md).unwrap(); + let schema = + Schema::new(fields).with_metadata(HashMap::from([(INFLUXQL_METADATA_KEY.into(), md)])); + let schema = SchemaRef::new(schema); + + let batches = data + .into_iter() + .map(|d| RecordBatch::try_new(SchemaRef::clone(&schema), d).unwrap()) + .collect(); + + (schema, batches) + } +} diff --git a/iox_v1_query_api/src/response/buffered.rs b/iox_v1_query_api/src/response/buffered.rs new file mode 100644 index 00000000..8447e9ec --- /dev/null +++ b/iox_v1_query_api/src/response/buffered.rs @@ -0,0 +1,402 @@ +//! Streams for producing responses where chunking is not enabled. + +use super::{Response, SeriesChunk, SeriesChunkMergeStream, SeriesChunkStream, StatementResult}; +use crate::{Result, types::Statement}; +use datafusion::physical_plan::SendableRecordBatchStream; +use futures::{Stream, ready}; +use iox_query::query_log::PermitAndToken; +use std::{ + future::Future, + pin::Pin, + task::{Context, Poll}, +}; + +/// A stream of one [Response] value where the response contains the +/// result data for all the corresponding statements. 
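///
/// A minimal consumption sketch (illustrative only; `statements` is assumed to be the
/// vector of statement futures produced by the handler, `next()` comes from
/// `futures::StreamExt`, and serialization mirrors the `collect_output` helper in the
/// tests below):
///
/// ```ignore
/// let mut stream = BufferedResponseStream::new(statements);
/// // At most one buffered `Response` is yielded, covering every statement.
/// while let Some(response) = stream.next().await {
///     let ser = ResponseSerializer::new(&response, None, true);
///     // serialize `ser` with a serde Serializer and write the bytes out
/// }
/// ```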
+pub(crate) struct BufferedResponseStream { + statements: Vec> + Send>>>, + statement_id: usize, + current_statement: Option<( + Option, + BufferedResultStream>, + )>, + // Buffer the statment results for all the statements in one response + response: Option, +} + +impl BufferedResponseStream { + pub(crate) fn new(statements: Vec> + Send>>) -> Self { + let response = (!statements.is_empty()).then_some(Response::default()); + + let statements = statements.into_iter().map(Box::into_pin).collect(); + Self { + statements, + statement_id: 0, + current_statement: None, + response, + } + } + + fn poll_statement(&mut self, cx: &mut Context<'_>) -> Poll> { + self.statements[self.statement_id].as_mut().poll(cx) + } + + fn poll_next_result(&mut self, cx: &mut Context<'_>) -> Poll> { + let (_, stream) = self + .current_statement + .as_mut() + .expect("current_statement is None"); + Pin::new(stream).poll_next(cx) + } + + fn poll_next_unpin(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.statement_id >= self.statements.len() { + return Poll::Ready(self.response.take()); + } + + if self.current_statement.is_some() { + match ready!(self.poll_next_result(cx)) { + Some(result) => { + if result.is_error() { + let (permit_state, _) = self.current_statement.take().unwrap(); // safe to unwrap because we just checked above + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + self.statement_id += 1; + } + self.response.as_mut().unwrap().add_result(result); // safe to unwrap because we just checked above + } + None => { + let (permit_state, _) = self.current_statement.take().unwrap(); // safe to unwrap because we just checked above + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.success(); + } + self.statement_id += 1; + } + } + } else { + match ready!(self.poll_statement(cx)) { + Ok(Statement { + schema, + permit_state, + stream, + }) => match SeriesChunkStream::try_new(stream, schema) { + Ok(stream) => { + self.current_statement = Some(( + permit_state, + BufferedResultStream::new(stream, self.statement_id), + )); + } + Err(e) => { + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + let mut result = StatementResult::new(self.statement_id); + result.set_error(e.to_string()); + self.statement_id += 1; + self.response.as_mut().unwrap().add_result(result); // safe to unwrap because we just checked above + } + }, + Err(e) => { + let mut result = StatementResult::new(self.statement_id); + result.set_error(e.to_string()); + self.statement_id += 1; + self.response.as_mut().unwrap().add_result(result); // safe to unwrap because we just checked above + } + } + } + self.poll_next_unpin(cx) + } +} + +impl Stream for BufferedResponseStream { + type Item = Response; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_unpin(cx) + } +} + +pub(super) struct BufferedResultStream { + inner: SeriesChunkMergeStream, + result: Option, +} + +impl BufferedResultStream { + pub(super) fn new(stream: S, statement_id: usize) -> Self { + let inner = SeriesChunkMergeStream::new(stream, None); + Self { + inner, + result: Some(StatementResult::new(statement_id)), + } + } +} + +impl BufferedResultStream +where + S: Stream> + Unpin, +{ + fn poll_next_unpin(&mut self, cx: &mut Context<'_>) -> Poll> { + loop { + if self.result.is_none() { + return Poll::Ready(None); + } + + match ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(chunk)) => { + 
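                    // Unlike the chunked variant, every series chunk is accumulated into
                    // the single `StatementResult`; nothing is yielded until the inner
                    // stream is exhausted or errors.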
self.result.as_mut().unwrap().add_series(chunk); // safe to unwrap because we just checked above + } + Some(Err(e)) => { + let mut result = self.result.take().unwrap(); // safe to unwrap because we just checked above + result.set_error(e.to_string()); + return Poll::Ready(Some(result)); + } + None => return Poll::Ready(self.result.take()), + } + } + } +} + +impl Stream for BufferedResultStream +where + S: Stream> + Unpin, +{ + type Item = StatementResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_unpin(cx) + } +} + +#[cfg(test)] +mod tests { + use crate::response::ResponseSerializer; + use crate::types::Precision; + + use super::super::tests::{Column, make_statement}; + use super::*; + use arrow::array::{DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray}; + use arrow::datatypes::Int32Type; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + use serde::ser::Serialize; + use serde_json::ser::Serializer; + use std::sync::Arc; + + #[tokio::test] + async fn no_statements() { + let mut stream = BufferedResponseStream::new(vec![]); + assert!(stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_statement_single_series() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ], + ); + insta::assert_snapshot!(collect_output(BufferedResponseStream::new(vec![statement])).await); + } + + #[tokio::test] + async fn single_statement_multi_series() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ], + ); + + insta::assert_snapshot!(collect_output(BufferedResponseStream::new(vec![statement])).await); + } + + #[tokio::test] + async fn many_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement1 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + 
}, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + let statement2 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + + insta::assert_snapshot!( + collect_output(BufferedResponseStream::new(vec![statement1, statement2])).await + ); + } + + #[tokio::test] + async fn test_epoch() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ], + ); + + insta::assert_snapshot!( + collect_output_epoch( + BufferedResponseStream::new(vec![statement]), + Some(Precision::Milliseconds) + ) + .await + ); + } + + async fn collect_output(stream: impl Stream + Send) -> String { + collect_output_epoch(stream, None).await + } + + async fn collect_output_epoch( + stream: impl Stream + Send, + epoch: Option, + ) -> String { + stream + .map(|r| { + let mut w = vec![]; + let mut ser = Serializer::new(&mut w); + let r = ResponseSerializer::new(&r, epoch, true); + r.serialize(&mut ser).unwrap(); + w.push(b'\n'); + String::from_utf8(w).unwrap() + }) + .collect() + .await + } +} diff --git a/iox_v1_query_api/src/response/chunked.rs b/iox_v1_query_api/src/response/chunked.rs new file mode 100644 index 00000000..7700b2a8 --- /dev/null +++ b/iox_v1_query_api/src/response/chunked.rs @@ -0,0 +1,465 @@ +//! Streams for producing responses where chunking is enabled. + +use super::{Response, SeriesChunk, SeriesChunkMergeStream, SeriesChunkStream, StatementResult}; +use crate::{Result, StatementFuture, error::Error, types::Statement}; +use datafusion::physical_plan::SendableRecordBatchStream; +use futures::{Stream, ready}; +use iox_query::query_log::PermitAndToken; +use std::{ + num::NonZero, + pin::Pin, + task::{Context, Poll}, +}; + +/// A stream of [Response] values where each response contains a single +/// chunk of result data. 
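+///
+/// A rough sketch of the intended use; `statements`, `chunk_size` and
+/// `write_chunk` are illustrative names, not items defined in this module:
+///
+/// ```ignore
+/// // `statements` is the Vec of planned statement futures for the request and
+/// // `chunk_size` is the requested chunk size.
+/// let mut responses = ChunkedResponseStream::new(statements, chunk_size);
+/// while let Some(response) = responses.next().await {
+///     // Each Response holds one StatementResult; every chunk except the
+///     // last one for a statement is flagged as partial.
+///     write_chunk(response).await;
+/// }
+/// ```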
+pub(crate) struct ChunkedResponseStream { + statements: Vec>, + chunk_size: usize, + statement_id: usize, + current_statement: Option<( + Option, + ChunkedResultStream>, + )>, +} + +impl ChunkedResponseStream { + pub(crate) fn new(statements: Vec, chunk_size: usize) -> Self { + let statements = statements.into_iter().map(Box::into_pin).collect(); + Self { + statements, + chunk_size, + statement_id: 0, + current_statement: None, + } + } + + fn poll_statement(&mut self, cx: &mut Context<'_>) -> Poll> { + self.statements[self.statement_id] + .as_mut() + .poll(cx) + .map_err(Error::from) + } + + fn poll_next_result(&mut self, cx: &mut Context<'_>) -> Poll> { + let (_, stream) = self + .current_statement + .as_mut() + .expect("current_statement is None"); + Pin::new(stream).poll_next(cx) + } + + fn poll_next_unpin(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.statement_id >= self.statements.len() { + return Poll::Ready(None); + } + if self.current_statement.is_some() { + match ready!(self.poll_next_result(cx)) { + Some(result) => { + if result.is_error() { + let (permit_state, _) = self.current_statement.take().unwrap(); // safe to unwrap because we just checked above + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + self.statement_id += 1; + } + let mut resp = Response::default(); + resp.add_result(result); + Poll::Ready(Some(resp)) + } + None => { + let (permit_state, _) = self.current_statement.take().unwrap(); // safe to unwrap because we just checked above + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.success(); + } + self.statement_id += 1; + self.poll_next_unpin(cx) + } + } + } else { + match ready!(self.poll_statement(cx)) { + Ok(Statement { + schema, + permit_state, + stream, + }) => match SeriesChunkStream::try_new(stream, schema) { + Ok(stream) => { + self.current_statement = Some(( + permit_state, + ChunkedResultStream::new(stream, self.chunk_size, self.statement_id), + )); + self.poll_next_unpin(cx) + } + Err(e) => { + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + let mut result = StatementResult::new(self.statement_id); + result.set_error(e.to_string()); + self.statement_id += 1; + let mut resp = Response::default(); + resp.add_result(result); + Poll::Ready(Some(resp)) + } + }, + Err(e) => { + let mut result = StatementResult::new(self.statement_id); + result.set_error(e.to_string()); + self.statement_id += 1; + let mut resp = Response::default(); + resp.add_result(result); + Poll::Ready(Some(resp)) + } + } + } + } +} + +impl Stream for ChunkedResponseStream { + type Item = Response; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_unpin(cx) + } +} + +pub(super) struct ChunkedResultStream { + inner: SeriesChunkMergeStream, + statement_id: usize, + buffered: Option, + done: bool, +} + +impl ChunkedResultStream { + pub(super) fn new(stream: S, chunk_size: usize, statement_id: usize) -> Self { + let inner = SeriesChunkMergeStream::new(stream, NonZero::new(chunk_size)); + Self { + inner, + statement_id, + buffered: None, + done: false, + } + } +} + +impl ChunkedResultStream +where + S: Stream> + Unpin, +{ + fn poll_next_unpin(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.done { + return Poll::Ready(None); + } + match ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(chunk)) => { + let mut result = StatementResult::new(self.statement_id); + 
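+                // Mark this chunk as partial and buffer it, emitting the
+                // previously buffered chunk if there is one. Holding one chunk
+                // back allows the final chunk to be emitted with `partial`
+                // cleared once the inner stream is exhausted.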
result.add_series(chunk); + result.set_partial(true); + match self.buffered.replace(result) { + Some(result) => Poll::Ready(Some(result)), + None => self.poll_next_unpin(cx), + } + } + Some(Err(e)) => { + let mut result = self + .buffered + .take() + .unwrap_or_else(|| StatementResult::new(self.statement_id)); + result.set_error(e.to_string()); + Poll::Ready(Some(result)) + } + None => { + self.done = true; + match self.buffered.take() { + Some(mut result) => { + result.set_partial(false); + Poll::Ready(Some(result)) + } + None => Poll::Ready(None), + } + } + } + } +} + +impl Stream for ChunkedResultStream +where + S: Stream> + Unpin, +{ + type Item = StatementResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_unpin(cx) + } +} + +#[cfg(test)] +mod tests { + use crate::response::ResponseSerializer; + use crate::types::Precision; + + use super::super::tests::{Column, make_statement}; + use super::*; + use arrow::array::{DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray}; + use arrow::datatypes::Int32Type; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + use serde::ser::Serialize; + use serde_json::ser::Serializer; + use std::sync::Arc; + + #[tokio::test] + async fn no_statements() { + let mut stream = ChunkedResponseStream::new(vec![], 3); + assert!(stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_chunk() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ], + ); + insta::assert_snapshot!( + collect_output(ChunkedResponseStream::new(vec![statement], 2)).await + ); + } + + #[tokio::test] + async fn many_chunks() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ], + ); + insta::assert_snapshot!( + collect_output(ChunkedResponseStream::new(vec![statement], 2)).await + ); + } + + #[tokio::test] + async fn many_chunks_with_groups() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + 
&metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + insta::assert_snapshot!( + collect_output(ChunkedResponseStream::new(vec![statement], 2)).await + ); + } + + #[tokio::test] + async fn many_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement1 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + let statement2 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + insta::assert_snapshot!( + collect_output(ChunkedResponseStream::new(vec![statement1, statement2], 2)).await + ); + } + + #[tokio::test] + async fn test_epoch() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ], + ); + + insta::assert_snapshot!( + collect_output_epoch( + ChunkedResponseStream::new(vec![statement], 2), + Some(Precision::Milliseconds) + ) + .await + ); + } + + async fn collect_output(stream: impl Stream + Send) -> String { + collect_output_epoch(stream, None).await + } + + async fn collect_output_epoch( + stream: impl Stream + Send, + epoch: Option, + ) -> String { + stream + .map(|r| { + let mut w = vec![]; + let mut ser = Serializer::new(&mut w); + let r = ResponseSerializer::new(&r, epoch, true); + r.serialize(&mut ser).unwrap(); + w.push(b'\n'); + String::from_utf8(w).unwrap() + }) + .collect() + .await + } +} diff --git 
a/iox_v1_query_api/src/response/csv.rs b/iox_v1_query_api/src/response/csv.rs new file mode 100644 index 00000000..7c4dcf73 --- /dev/null +++ b/iox_v1_query_api/src/response/csv.rs @@ -0,0 +1,824 @@ +//! InfluxDB v1 compatible CSV streaming output for InfluxQL queries. + +use super::{SeriesChunk, SeriesChunkStream}; +use crate::error::Error; +use crate::types::Precision; +use crate::{Result, StatementFuture, types::Statement, value::ValueType}; +use bytes::{Bytes, BytesMut}; +use datafusion::execution::SendableRecordBatchStream; +use futures::{Stream, ready}; +use iox_query::query_log::PermitAndToken; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tracing::warn; + +/// A stream of CSV data produced by executing InfluxQL statements. +pub(crate) struct CsvStream { + statements: Vec>, + statement_id: usize, + current_statement: Option<( + Option, + SeriesChunkStream, + )>, + add_headers: bool, + add_newline: bool, + epoch: Precision, +} + +impl CsvStream { + pub(crate) fn new(statements: Vec) -> Self { + let statements = statements.into_iter().map(Box::into_pin).collect(); + Self { + statements, + statement_id: 0, + current_statement: None, + add_headers: false, + add_newline: false, + epoch: Precision::Nanoseconds, + } + } + + pub(crate) fn with_epoch(mut self, epoch: Option) -> Self { + self.epoch = epoch.unwrap_or(Precision::Nanoseconds); + self + } + + fn poll_statement(&mut self, cx: &mut Context<'_>) -> Poll> { + self.statements[self.statement_id].as_mut().poll(cx) + } + + fn poll_next_chunk(&mut self, cx: &mut Context<'_>) -> Poll>> { + let (_, stream) = self + .current_statement + .as_mut() + .expect("no active SeriesChunkStream"); + Pin::new(stream).poll_next(cx).map_err(Error::from) + } +} + +impl Stream for CsvStream { + type Item = Bytes; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + if this.statement_id >= this.statements.len() { + return Poll::Ready(None); + } + + if this.current_statement.is_none() { + let res = ready!(this.poll_statement(cx)); + let Statement { + schema, + permit_state, + stream, + } = match res { + Ok(v) => v, + Err(e) => { + warn!(error=%e, "Error executing query"); + this.statement_id += 1; + cx.waker().wake_by_ref(); + return Poll::Pending; + } + }; + let stream = match SeriesChunkStream::try_new(stream, schema) { + Ok(v) => v, + Err(e) => { + warn!(error=%e, "Error creating SeriesChunk stream"); + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + this.statement_id += 1; + cx.waker().wake_by_ref(); + return Poll::Pending; + } + }; + this.add_headers = true; + this.add_newline = this.statement_id != 0; + this.current_statement = Some((permit_state, stream)); + } + assert!(this.current_statement.is_some()); + match ready!(this.poll_next_chunk(cx)) { + None => { + let (permit_state, _) = this.current_statement.take().unwrap(); + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.success(); + } + this.statement_id += 1; + cx.waker().wake_by_ref(); + Poll::Pending + } + Some(Ok(chunk)) => { + let mut chunk = CsvSeriesChunk::new(chunk, this.epoch); + if this.add_newline { + chunk = chunk.with_newline(); + this.add_newline = false; + } + if this.add_headers { + chunk = chunk.with_headers(); + this.add_headers = false; + } + Poll::Ready(Some(chunk.into())) + } + Some(Err(e)) => { + warn!(error=%e, "Error streaming SeriesChunk"); + let (permit_state, _) = this.current_statement.take().unwrap(); + if let Some(permit_state) = 
permit_state { + permit_state.query_completed_token.fail(); + } + this.statement_id += 1; + cx.waker().wake_by_ref(); + Poll::Pending + } + } + } +} + +/// A chunk of CSV data which represents part of the result of executing +/// an InfluxQL statement. Each chunk is for a single series, but may +/// not contain a complete series. +#[derive(Debug)] +pub(crate) struct CsvSeriesChunk { + chunk: SeriesChunk, + emit_headers: bool, + emit_newline: bool, + epoch: Precision, +} + +impl CsvSeriesChunk { + fn new(chunk: SeriesChunk, epoch: Precision) -> Self { + Self { + chunk, + emit_headers: false, + emit_newline: false, + epoch, + } + } + + fn with_headers(mut self) -> Self { + self.emit_headers = true; + self + } + + fn with_newline(mut self) -> Self { + self.emit_newline = true; + self + } +} + +impl From for Bytes { + fn from(value: CsvSeriesChunk) -> Self { + let mut bytes = BytesMut::new(); + let epoch = value.epoch; + if value.emit_newline { + bytes.extend_from_slice(b"\n"); + } + if value.emit_headers { + // Measurement name and tag headers are always present. + bytes.extend_from_slice(b"name,tags"); + for column in value.chunk.columns() { + bytes.extend_from_slice(format!(",{column}").as_bytes()); + } + bytes.extend_from_slice(b"\n"); + } + let measurement = csv_escape(value.chunk.measurement()); + let mut tags = String::new(); + for (k, v) in value.chunk.tags() { + tags.push_str(k.as_ref()); + tags.push('='); + tags.push_str(&v); + tags.push(','); + } + if !tags.is_empty() { + tags.pop(); // Remove trailing comma. + } + let tags = csv_escape(tags); + + for row in value.chunk.into_iter() { + bytes.extend_from_slice(measurement.as_bytes()); + bytes.extend_from_slice(b","); + bytes.extend_from_slice(tags.as_bytes()); + for value in row.into_iter() { + bytes.extend_from_slice(b","); + match value.value_type() { + // NOTE(hiltontj): legacy /query API respects the `epoch` parameter, but only + // returns timestamps in epoch format, not in RFC3339. 
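+                    //
+                    // For example, the timestamp 1970-01-01T00:00:02Z is written
+                    // as 2000000000 under the default nanosecond precision and
+                    // as 2000 when millisecond precision is requested.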
+ // + // See: + ValueType::Timestamp(_) => { + let ts = value.as_timestamp_opt().unwrap_or_default(); + let ts = match epoch { + Precision::Nanoseconds => ts.timestamp_nanos_opt().unwrap_or_default(), + Precision::Microseconds => ts.timestamp_micros(), + Precision::Milliseconds => ts.timestamp_millis(), + Precision::Seconds => ts.timestamp(), + Precision::Minutes => ts.timestamp() / 60, + Precision::Hours => ts.timestamp() / (60 * 60), + Precision::Days => ts.timestamp() / (60 * 60 * 24), + Precision::Weeks => ts.timestamp() / (60 * 60 * 24 * 7), + }; + bytes.extend_from_slice(ts.to_string().as_bytes()); + } + _ => { + bytes.extend_from_slice(csv_escape(format!("{value}")).as_bytes()); + } + } + } + bytes.extend_from_slice(b"\n"); + } + + bytes.into() + } +} + +fn csv_escape(s: String) -> String { + if s.contains(',') || s.contains('"') { + format!("\"{}\"", s.replace("\"", "\"\"")) + } else { + s + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::super::tests::{Column, make_statement}; + use super::*; + use arrow::{ + array::{DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray}, + datatypes::Int32Type, + }; + use datafusion::error::DataFusionError; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + + #[tokio::test] + async fn no_statements() { + let mut stream = CsvStream::new(vec![]); + assert!(stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_statement_no_group() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,1 + m1,,2000000000,t1a,2 + m1,,3000000000,t1a,3 + m2,,1000000000,t1a,4 + "); + } + + #[tokio::test] + async fn single_statement_tag_group() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let 
stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,f1 + m1,t1=t1a,1000000000,1 + m1,t1=t1a,2000000000,2 + m1,t1=t1a,3000000000,3 + m2,t1=t1a,1000000000,4 + "); + } + + #[tokio::test] + async fn single_statement_tag_group_projected() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: true, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,t1=t1a,1000000000,t1a,1 + m1,t1=t1a,2000000000,t1a,2 + m1,t1=t1a,3000000000,t1a,3 + m2,t1=t1a,1000000000,t1a,4 + "); + } + + #[tokio::test] + async fn single_statement_tag_group_multiple_tags() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Tag { + name: "t2", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "\"t2a\"", "\"t2a\"", "\"t2b\"", "\"t2a\"", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,f1 + m1,"t1=t1a,t2=""t2a""",1000000000,1 + m1,"t1=t1a,t2=""t2a""",2000000000,2 + m1,"t1=t1a,t2=""t2b""",3000000000,3 + m2,"t1=t1a,t2=""t2a""",1000000000,4 + "#); + } + + #[tokio::test] + async fn single_statement_infinite_value() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + 
Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![ + f64::NEG_INFINITY, + f64::INFINITY, + f64::NEG_INFINITY, + f64::NAN, + ])), + ], + ); + let stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,-inf + m1,,2000000000,t1a,inf + m1,,3000000000,t1a,-inf + m2,,1000000000,t1a,NaN + "); + } + + #[tokio::test] + async fn multiple_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement1 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let statement2 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t2", + group_by: false, + projected: false, + }, + Column::Field { name: "f2" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m3", "m3", "m3", "m3"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 4000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t2a", "t2a", "t2a", "t2a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![statement1, statement2]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,1 + m1,,2000000000,t1a,2 + m1,,3000000000,t1a,3 + m2,,1000000000,t1a,4 + + name,tags,time,t2,f2 + m3,,1000000000,t2a,1 + m3,,2000000000,t2a,2 + m3,,3000000000,t2a,3 + m3,,4000000000,t2a,4 + "); + } + + #[tokio::test] + async fn multiple_statements_with_error() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement1 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let statement2 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t2", + group_by: false, + projected: false, + }, + Column::Field { name: "f2" }, + ], + vec![ + 
Arc::new(StringArray::from(vec!["m3", "m3", "m3", "m3"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 4000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t2a", "t2a", "t2a", "t2a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![ + statement1, + Box::new(async { Err(DataFusionError::Internal("test error".to_string()))? }), + statement2, + ]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,1 + m1,,2000000000,t1a,2 + m1,,3000000000,t1a,3 + m2,,1000000000,t1a,4 + + name,tags,time,t2,f2 + m3,,1000000000,t2a,1 + m3,,2000000000,t2a,2 + m3,,3000000000,t2a,3 + m3,,4000000000,t2a,4 + "); + } + + #[tokio::test] + async fn test_csv_epoch_handling() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = || { + make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1_000_000_000_000_000, + 2_000_000_000_000_000, + 3_000_000_000_000_000, + 4_000_000_000_000_000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ) + }; + { + let stream = CsvStream::new(vec![statement()]).with_epoch(None); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000000000,t1a,1 + m1,,2000000000000000,t1a,2 + m1,,3000000000000000,t1a,3 + m2,,4000000000000000,t1a,4 + "); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Nanoseconds)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000000000,t1a,1 + m1,,2000000000000000,t1a,2 + m1,,3000000000000000,t1a,3 + m2,,4000000000000000,t1a,4 + "); + } + { + let stream = + CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Microseconds)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000000,t1a,1 + m1,,2000000000000,t1a,2 + m1,,3000000000000,t1a,3 + m2,,4000000000000,t1a,4 + "); + } + { + let stream = + CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Milliseconds)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,1 + m1,,2000000000,t1a,2 + m1,,3000000000,t1a,3 + m2,,4000000000,t1a,4 + "); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Seconds)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + 
insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,1000000,t1a,1 + m1,,2000000,t1a,2 + m1,,3000000,t1a,3 + m2,,4000000,t1a,4 + "#); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Minutes)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,16666,t1a,1 + m1,,33333,t1a,2 + m1,,50000,t1a,3 + m2,,66666,t1a,4 + "#); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Hours)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,277,t1a,1 + m1,,555,t1a,2 + m1,,833,t1a,3 + m2,,1111,t1a,4 + "#); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Days)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,11,t1a,1 + m1,,23,t1a,2 + m1,,34,t1a,3 + m2,,46,t1a,4 + "#); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Weeks)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,1,t1a,1 + m1,,3,t1a,2 + m1,,4,t1a,3 + m2,,6,t1a,4 + "#); + } + } +} diff --git a/iox_v1_query_api/src/response/json.rs b/iox_v1_query_api/src/response/json.rs new file mode 100644 index 00000000..7e13331f --- /dev/null +++ b/iox_v1_query_api/src/response/json.rs @@ -0,0 +1,461 @@ +//! JSON encoding of InfluxQL query results. +use crate::types::Precision; + +use super::{BufferedResponseStream, ChunkedResponseStream, Response, ResponseSerializer}; +use bytes::buf::BufMut; +use bytes::{Bytes, BytesMut}; +use futures::Stream; +use serde::Serialize; +use serde_json::ser::Serializer; +use std::{ + pin::Pin, + task::{Context, Poll}, +}; +use tracing::warn; + +/// A generic JSON-encoded [Response] stream. 
+pub(crate) struct JsonStream { + stream: S, + formatter_fn: F, + epoch: Option, +} + +impl JsonStream { + pub(crate) fn new(stream: S, formatter_fn: F, epoch: Option) -> Self { + Self { + stream, + formatter_fn, + epoch, + } + } + + fn poll_next_inner(&mut self, cx: &mut Context<'_>) -> Poll> + where + S: Stream + Unpin, + { + Pin::new(&mut self.stream).poll_next(cx) + } +} + +impl Stream for JsonStream +where + S: Stream + Unpin, + F: Fn() -> Fmt + Unpin, + Fmt: serde_json::ser::Formatter, +{ + type Item = Bytes; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_inner(cx).map(|opt| { + opt.and_then(|resp| { + let mut w = BytesMut::new().writer(); + let formatter = (this.formatter_fn)(); + let mut serializer = Serializer::with_formatter(&mut w, formatter); + let resp = ResponseSerializer::new(&resp, this.epoch, false); + if let Err(e) = resp.serialize(&mut serializer) { + warn!(error = %e, "failed to serialize response"); + let mut w = BytesMut::new().writer(); + let formatter = (this.formatter_fn)(); + let mut serializer = Serializer::with_formatter(&mut w, formatter); + let error_field = serde_json::json!({ + "error": format!("{}", e), + }); + if let Err(e) = error_field.serialize(&mut serializer) { + warn!(error = %e, "failed to serialize error field"); + return None; + } + Some(w.into_inner().freeze()) + } else { + Some(w.into_inner().freeze()) + } + }) + }) + } +} + +pub(crate) type ChunkedJsonStream = JsonStream; +pub(crate) type BufferedJsonStream = JsonStream; + +#[cfg(test)] +mod tests { + use super::super::tests::{Column, make_statement}; + use super::*; + + use arrow::array::{ + ArrayRef, DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray, + }; + use arrow::datatypes::Int32Type; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + use serde_json::ser::{CompactFormatter, PrettyFormatter}; + use std::sync::Arc; + + #[tokio::test] + async fn empty_stream() { + let stream = ChunkedResponseStream::new(vec![], 2); + let mut chunked_json_stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + assert!(chunked_json_stream.next().await.is_none()); + + let stream = BufferedResponseStream::new(vec![]); + let mut buffered_json_stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + assert!(buffered_json_stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_chunk() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + + let 
statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn single_chunk_pretty() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, PrettyFormatter::new, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, PrettyFormatter::new, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn single_chunk_exponential_value() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![ + 73070599793680680000000000000000000000000000.0, + 73070599793680670000000000000000000000000000.0, + ])), + ]; + + // Only test for buffered since the purpose is to test the formatting of the exponential value + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn single_chunk_infinite_value() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![f64::INFINITY, f64::NEG_INFINITY])), + ]; + + let statement = make_statement(&db, &ctx, &log, 
columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, PrettyFormatter::new, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, PrettyFormatter::new, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_chunks() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_chunks_many_measurements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.5])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns1 = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + 
group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data1: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + let columns2 = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data2: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + + let statement1 = make_statement(&db, &ctx, &log, columns1.clone(), data1.clone()); + let statement2 = make_statement(&db, &ctx, &log, columns2.clone(), data2.clone()); + let stream = ChunkedResponseStream::new(vec![statement1, statement2], 2); + let stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement1 = make_statement(&db, &ctx, &log, columns1.clone(), data1.clone()); + let statement2 = make_statement(&db, &ctx, &log, columns2.clone(), data2.clone()); + let stream = BufferedResponseStream::new(vec![statement1, statement2]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn test_epoch() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = + ChunkedJsonStream::new(stream, || CompactFormatter, Some(Precision::Nanoseconds)); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = + BufferedJsonStream::new(stream, || CompactFormatter, Some(Precision::Nanoseconds)); + insta::assert_snapshot!(collect_output(stream).await); + } + + async fn collect_output + Send>(stream: S) -> String { + String::from_utf8( + stream + .map(Vec::::from) + .map(|mut v| { + v.push(b'\n'); + v + }) + .concat() + .await, + ) + .unwrap() + } +} diff --git a/iox_v1_query_api/src/response/msgpack.rs b/iox_v1_query_api/src/response/msgpack.rs new file mode 100644 index 00000000..089fc772 --- /dev/null +++ b/iox_v1_query_api/src/response/msgpack.rs @@ -0,0 +1,337 @@ +//! Message Pack encoding of InfluxQL query results. 
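+//!
+//! `MessagePackStream` wraps either the chunked or the buffered `Response`
+//! stream and serializes each emitted response with `rmp_serde`; unlike the
+//! JSON stream there is no pluggable formatter. A rough sketch of the wiring
+//! (the `inner`, `statements` and `epoch` bindings are illustrative):
+//!
+//! ```ignore
+//! let inner = BufferedResponseStream::new(statements);
+//! let body = BufferedMessagePackStream::new(inner, epoch);
+//! ```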
+use crate::types::Precision; + +use super::{BufferedResponseStream, ChunkedResponseStream, Response, ResponseSerializer}; +use bytes::buf::BufMut; +use bytes::{Bytes, BytesMut}; +use futures::Stream; +use rmp_serde::Serializer; +use serde::Serialize; +use std::{ + pin::Pin, + task::{Context, Poll}, +}; +use tracing::warn; + +/// A generic Message Pack-encoded [Response] stream. +pub(crate) struct MessagePackStream { + stream: S, + epoch: Option, +} + +impl MessagePackStream { + pub(crate) fn new(stream: S, epoch: Option) -> Self { + Self { stream, epoch } + } + + fn poll_next_inner(&mut self, cx: &mut Context<'_>) -> Poll> + where + S: Stream + Unpin, + { + Pin::new(&mut self.stream).poll_next(cx) + } +} + +impl Stream for MessagePackStream +where + S: Stream + Unpin, +{ + type Item = Bytes; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_inner(cx).map(|opt| { + opt.and_then(|resp| { + let mut w = BytesMut::new().writer(); + let mut serializer = Serializer::new(&mut w); + let resp = ResponseSerializer::new(&resp, this.epoch, true); + if let Err(e) = resp.serialize(&mut serializer) { + warn!(error = %e, "failed to serialize response"); + None + } else { + Some(w.into_inner().freeze()) + } + }) + }) + } +} + +pub(crate) type ChunkedMessagePackStream = MessagePackStream; +pub(crate) type BufferedMessagePackStream = MessagePackStream; + +#[cfg(test)] +mod tests { + use crate::response::buffered::BufferedResponseStream; + + use super::super::tests::{Column, make_statement}; + use super::*; + use arrow::array::{ + ArrayRef, DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray, + }; + use arrow::datatypes::Int32Type; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + use rmp_serde::Deserializer; + use serde::Deserialize; + use serde_json::Value; + use std::sync::Arc; + + #[tokio::test] + async fn empty_stream() { + let stream = ChunkedResponseStream::new(vec![], 2); + let mut stream = ChunkedMessagePackStream::new(stream, None); + assert!(stream.next().await.is_none()); + + let stream = BufferedResponseStream::new(vec![]); + let mut stream = BufferedMessagePackStream::new(stream, None); + assert!(stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_chunk() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = 
BufferedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_chunks() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_chunks_many_measurments() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns1 = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data1: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + + let columns2 = [ + Column::Measurement, + 
Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data2: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + + let statement1 = make_statement(&db, &ctx, &log, columns1.clone(), data1.clone()); + let statement2 = make_statement(&db, &ctx, &log, columns2.clone(), data2.clone()); + let stream = ChunkedResponseStream::new(vec![statement1, statement2], 2); + let stream = ChunkedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement1 = make_statement(&db, &ctx, &log, columns1.clone(), data1.clone()); + let statement2 = make_statement(&db, &ctx, &log, columns2.clone(), data2.clone()); + let stream = BufferedResponseStream::new(vec![statement1, statement2]); + let stream = BufferedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn test_epoch() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedMessagePackStream::new(stream, Some(Precision::Microseconds)); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedMessagePackStream::new(stream, Some(Precision::Microseconds)); + insta::assert_snapshot!(collect_output(stream).await); + } + + async fn collect_output + Send>(stream: S) -> String { + String::from_utf8( + stream + .map(Vec::::from) + .map(|v| { + // Docode the msgpack and recode as JSON to make it + // easier to validate. 
+ let mut de = Deserializer::new(&v[..]); + let value = Value::deserialize(&mut de).unwrap(); + let mut v = serde_json::to_vec(&value).unwrap(); + v.push(b'\n'); + v + }) + .concat() + .await, + ) + .unwrap() + } +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__epoch.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__epoch.snap new file mode 100644 index 00000000..fde51ebd --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__epoch.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/buffered.rs +expression: "collect_output_epoch(BufferedResponseStream::new(vec![statement]),\nSome(Precision::Milliseconds)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[[1000,1],[2000,2]]},{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[[3000,3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__many_statements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__many_statements.snap new file mode 100644 index 00000000..4bc626e9 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__many_statements.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/buffered.rs +expression: "collect_output(BufferedResponseStream::new(vec![statement1,\nstatement2])).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]},{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]},{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]},{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]},{"name":"m2","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_multi_series.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_multi_series.snap new file mode 100644 index 00000000..e2d64bff --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_multi_series.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/buffered.rs +expression: "collect_output(BufferedResponseStream::new(vec![statement])).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]},{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_single_series.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_single_series.snap new file mode 100644 index 00000000..c9be940f --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_single_series.snap @@ -0,0 +1,5 @@ +--- +source: 
iox_v1_query_api/src/response/buffered.rs +expression: "collect_output(BufferedResponseStream::new(vec![statement])).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__epoch.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__epoch.snap new file mode 100644 index 00000000..a3fa491d --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__epoch.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output_epoch(ChunkedResponseStream::new(vec![statement], 2),\nSome(Precision::Milliseconds)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[[1000,"t1a",1],[2000,"t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks.snap new file mode 100644 index 00000000..9743f3b7 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output(ChunkedResponseStream::new(vec![statement], 2)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks_with_groups.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks_with_groups.snap new file mode 100644 index 00000000..40c3f1a1 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks_with_groups.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output(ChunkedResponseStream::new(vec![statement], 2)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_statements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_statements.snap new file mode 100644 index 00000000..140954f4 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_statements.snap @@ -0,0 +1,10 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output(ChunkedResponseStream::new(vec![statement1, statement2],\n2)).await" +--- 
+{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]}]} +{"results":[{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]}],"partial":true}]} +{"results":[{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]}],"partial":true}]} +{"results":[{"statement_id":1,"series":[{"name":"m2","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__single_chunk.snap new file mode 100644 index 00000000..dc6cdf9e --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__single_chunk.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output(ChunkedResponseStream::new(vec![statement], 2)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch-2.snap new file mode 100644 index 00000000..0064e3bb --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[[1000000000,"t1a",1],[2000000000,"t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch.snap new file mode 100644 index 00000000..0064e3bb --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[[1000000000,"t1a",1],[2000000000,"t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks-2.snap new file mode 100644 index 00000000..55e5b019 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- 
+{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks.snap new file mode 100644 index 00000000..cb7718e5 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements-2.snap new file mode 100644 index 00000000..b39ce63c --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]},{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1.5]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements.snap new file mode 100644 index 00000000..bcf2cc05 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1.5]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements-2.snap new file mode 100644 index 00000000..202f297f --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- 
+{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]},{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]},{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]},{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]},{"name":"m2","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements.snap new file mode 100644 index 00000000..d3367a69 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements.snap @@ -0,0 +1,10 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]}]} +{"results":[{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]}],"partial":true}]} +{"results":[{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]}],"partial":true}]} +{"results":[{"statement_id":1,"series":[{"name":"m2","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk-2.snap new file mode 100644 index 00000000..407c983b --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk.snap new file mode 100644 index 00000000..407c983b --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_exponential_value.snap 
b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_exponential_value.snap new file mode 100644 index 00000000..9c665f3c --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_exponential_value.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",7.307059979368068e43],["1970-01-01T00:00:02Z","t1a",7.307059979368067e43]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value-2.snap new file mode 100644 index 00000000..b984a48e --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value-2.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{ + "error": "json: unsupported value: +Inf" +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value.snap new file mode 100644 index 00000000..b984a48e --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{ + "error": "json: unsupported value: +Inf" +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty-2.snap new file mode 100644 index 00000000..17146b1b --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty-2.snap @@ -0,0 +1,33 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{ + "results": [ + { + "statement_id": 0, + "series": [ + { + "name": "m1", + "columns": [ + "time", + "t1", + "f1" + ], + "values": [ + [ + "1970-01-01T00:00:01Z", + "t1a", + 1 + ], + [ + "1970-01-01T00:00:02Z", + "t1a", + 2 + ] + ] + } + ] + } + ] +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty.snap new file mode 100644 index 00000000..17146b1b --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty.snap @@ -0,0 +1,33 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{ + "results": [ + { + "statement_id": 0, + "series": [ + { + "name": "m1", + "columns": [ + "time", + "t1", + "f1" + ], + "values": [ + [ + "1970-01-01T00:00:01Z", + "t1a", + 1 + ], + [ + "1970-01-01T00:00:02Z", + "t1a", + 2 + ] + ] + } + ] + } + ] +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch-2.snap 
new file mode 100644 index 00000000..5793540f --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[[1000000,"t1a",1],[2000000,"t1a",2]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch.snap new file mode 100644 index 00000000..5793540f --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[[1000000,"t1a",1],[2000000,"t1a",2]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks-2.snap new file mode 100644 index 00000000..4434440c --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks.snap new file mode 100644 index 00000000..1fd528b3 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],true]],true]]] +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:03Z","t1a",3]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments-2.snap new file mode 100644 index 00000000..ce6925f1 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]],["m2",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments.snap new file mode 100644 index 00000000..7e040072 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: 
collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],true]],true]]] +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:03Z","t1a",3]]]],true]]] +[[[0,[["m2",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements-2.snap new file mode 100644 index 00000000..86899e59 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]],["m2",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1]]]]],[1,[["m1",{"t1":"t1a"},["time","f1"],[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]],["m1",{"t1":"t1b"},["time","f1"],[["1970-01-01T00:00:03Z",3]]],["m2",{"t1":"t1b"},["time","f1"],[["1970-01-01T00:00:01Z",1]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements.snap new file mode 100644 index 00000000..45c35345 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements.snap @@ -0,0 +1,10 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],true]],true]]] +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:03Z","t1a",3]]]],true]]] +[[[0,[["m2",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1]]]]]]] +[[[1,[["m1",{"t1":"t1a"},["time","f1"],[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]]],true]]] +[[[1,[["m1",{"t1":"t1b"},["time","f1"],[["1970-01-01T00:00:03Z",3]]]],true]]] +[[[1,[["m2",{"t1":"t1b"},["time","f1"],[["1970-01-01T00:00:01Z",1]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk-2.snap new file mode 100644 index 00000000..15a643dc --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk.snap new file mode 100644 index 00000000..15a643dc --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]]]]]] diff --git 
a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_eq.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_eq.snap new file mode 100644 index 00000000..05cfdbb0 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_eq.snap @@ -0,0 +1,57 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_gt.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_gt.snap new file mode 100644 index 00000000..3f8eacb9 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_gt.snap @@ -0,0 +1,75 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + partial: true +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:05Z" + - 5 +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_lt.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_lt.snap new file mode 100644 index 00000000..1d149291 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_lt.snap @@ -0,0 +1,48 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - 
"1970-01-01T00:00:05Z" + - 5 +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_single_chunk.snap new file mode 100644 index 00000000..1d149291 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_single_chunk.snap @@ -0,0 +1,48 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_eq.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_eq.snap new file mode 100644 index 00000000..746c873e --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_eq.snap @@ -0,0 +1,59 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + - - "1970-01-01T00:00:03Z" + - 3 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 + - - "1970-01-01T00:00:06Z" + - 6 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_gt.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_gt.snap new file mode 100644 index 00000000..369c05e1 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_gt.snap @@ -0,0 +1,68 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - 
"1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:05Z" + - 5 + - - "1970-01-01T00:00:06Z" + - 6 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_lt.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_lt.snap new file mode 100644 index 00000000..65d7b7f2 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_lt.snap @@ -0,0 +1,32 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_single_chunk.snap new file mode 100644 index 00000000..65d7b7f2 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_single_chunk.snap @@ -0,0 +1,32 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_multi_chunks.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_multi_chunks.snap new file mode 100644 index 00000000..6da6f7de --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_multi_chunks.snap @@ -0,0 +1,31 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + columns: + - time + - tag0 + - tag1 + - val + values: + - - "1970-01-01T00:00:01Z" + - a + - "1" + - 1 + - - "1970-01-01T00:00:02Z" + - a + - "1" + - 2 + partial: true +- name: m1 + columns: + - time + - tag0 + - tag1 + - val + values: + - - "1970-01-01T00:00:03Z" + - a + - "1" + - 3 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_single_chunk.snap 
b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_single_chunk.snap new file mode 100644 index 00000000..5ccf7d20 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_single_chunk.snap @@ -0,0 +1,23 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + columns: + - time + - tag0 + - tag1 + - val + values: + - - "1970-01-01T00:00:01Z" + - a + - "1" + - 1 + - - "1970-01-01T00:00:02Z" + - a + - "1" + - 2 + - - "1970-01-01T00:00:03Z" + - a + - "1" + - 3 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_zero_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_zero_chunk.snap new file mode 100644 index 00000000..5ccf7d20 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_zero_chunk.snap @@ -0,0 +1,23 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + columns: + - time + - tag0 + - tag1 + - val + values: + - - "1970-01-01T00:00:01Z" + - a + - "1" + - 1 + - - "1970-01-01T00:00:02Z" + - a + - "1" + - 2 + - - "1970-01-01T00:00:03Z" + - a + - "1" + - 3 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_multi_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_multi_chunk.snap new file mode 100644 index 00000000..fb810fba --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_multi_chunk.snap @@ -0,0 +1,27 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_single_chunk.snap new file mode 100644 index 00000000..5da76922 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_single_chunk.snap @@ -0,0 +1,18 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + - - "1970-01-01T00:00:03Z" + - 3 diff --git a/iox_v1_query_api/src/response/stream.rs b/iox_v1_query_api/src/response/stream.rs new file mode 100644 index 00000000..04ed9b27 --- /dev/null +++ b/iox_v1_query_api/src/response/stream.rs @@ -0,0 +1,980 @@ +use super::SeriesChunk; +use crate::Result; +use crate::error::Error; +use arrow::array::RecordBatch; +use arrow::compute::partition; +use arrow::datatypes::SchemaRef; +use futures::{Stream, ready}; +use generated_types::influxdata::iox::querier::v1::InfluxQlMetadata; +use schema::INFLUXQL_METADATA_KEY; +use std::{ + collections::{BTreeMap, BTreeSet}, + num::NonZeroUsize, + ops::Range, + pin::Pin, + sync::Arc, + 
task::{Context, Poll},
+};
+
+/// Stream that processes a stream of [SeriesChunk]s, merging
+/// subsequent chunks that are for the same series, and splitting chunks
+/// that are larger than the specified chunk size.
+pub(crate) struct SeriesChunkMergeStream<S> {
+    input: S,
+    chunk_size: Option<NonZeroUsize>,
+
+    current: Option<SeriesChunk>,
+}
+
+impl<S> SeriesChunkMergeStream<S> {
+    pub(crate) fn new(input: S, chunk_size: Option<NonZeroUsize>) -> Self {
+        Self {
+            input,
+            chunk_size,
+            current: None,
+        }
+    }
+}
+
+impl<S> Stream for SeriesChunkMergeStream<S>
+where
+    S: Stream<Item = Result<SeriesChunk>> + Unpin,
+{
+    type Item = Result<SeriesChunk>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.get_mut();
+        let mut input = Pin::new(&mut this.input);
+        loop {
+            if let Some(chunk_size) = this.chunk_size {
+                let chunk_size: usize = chunk_size.into();
+                if let Some(current) = this.current.take() {
+                    if current.num_rows() > chunk_size {
+                        let (mut left, right) = current.split_at(chunk_size);
+                        left.partial = true;
+                        this.current = Some(right);
+                        return Poll::Ready(Some(Ok(left)));
+                    } else {
+                        this.current = Some(current);
+                    }
+                }
+            }
+
+            match ready!(input.as_mut().poll_next(cx)) {
+                None => {
+                    return if let Some(current) = this.current.take() {
+                        Poll::Ready(Some(Ok(current)))
+                    } else {
+                        Poll::Ready(None)
+                    };
+                }
+                Some(Err(e)) => return Poll::Ready(Some(Err(e))),
+                Some(Ok(chunk)) => {
+                    match this.current {
+                        Some(ref mut current) => {
+                            if current.series() == chunk.series() {
+                                current.merge(chunk);
+                            } else {
+                                let current = this.current.take().unwrap(); // safe unwrap due to check above
+                                this.current = Some(chunk);
+                                return Poll::Ready(Some(Ok(current)));
+                            }
+                        }
+                        None => {
+                            this.current = Some(chunk);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// SeriesChunkStream processes a stream of [RecordBatch]es, breaking
+/// each one into [SeriesChunk]s. Each [SeriesChunk] contains the data
+/// from a single series.
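+///
+/// A rough usage sketch of how the two adapters compose; the
+/// `record_batch_stream`, `schema`, and `chunk_size` bindings below are
+/// illustrative placeholders, not part of this module's API:
+///
+/// ```ignore
+/// // Break each RecordBatch into per-series chunks...
+/// let chunks = SeriesChunkStream::try_new(record_batch_stream, schema)?;
+/// // ...then merge adjacent chunks of the same series and re-split them
+/// // to the requested chunk size (`None` disables splitting).
+/// let chunks = SeriesChunkMergeStream::new(chunks, NonZeroUsize::new(chunk_size));
+/// ```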
+pub(crate) struct SeriesChunkStream { + /// A stream that returns [RecordBatch]es + record_batch_stream: S, + + measurement: usize, + tag_columns: Arc, usize>>, + value_columns: Arc<[(Arc, usize)]>, + + batch: RecordBatch, + partitions: Vec>, + current_partition: usize, +} + +impl SeriesChunkStream { + pub(crate) fn try_new(record_batch_stream: S, schema: SchemaRef) -> Result { + let md = schema.metadata.get(INFLUXQL_METADATA_KEY).ok_or( + datafusion::error::DataFusionError::Internal( + "Missing INFLUXQL_METADATA in RecordBatch schema".to_owned(), + ), + )?; + let iox_metadata: InfluxQlMetadata = serde_json::from_str(md.as_str()) + .map_err(|x| datafusion::error::DataFusionError::Internal(x.to_string()))?; + + let measurement = iox_metadata.measurement_column_index as usize; + let mut elided_columns = BTreeSet::new(); + elided_columns.insert(measurement); + let tags = iox_metadata + .tag_key_columns + .iter() + .inspect(|x| { + if !x.is_projected { + elided_columns.insert(x.column_index as usize); + } + }) + .map(|x| (Arc::from(x.tag_key.as_str()), x.column_index as usize)) + .collect::>(); + let mut columns = Vec::new(); + + schema.fields().iter().enumerate().for_each(|(i, f)| { + if !elided_columns.contains(&i) { + columns.push((Arc::from(f.name().as_str()), i)); + } + }); + + Ok(Self { + record_batch_stream, + measurement, + tag_columns: Arc::from(tags), + value_columns: Arc::from(columns), + batch: RecordBatch::new_empty(schema), + partitions: Vec::new(), + current_partition: 0, + }) + } + + fn chunk(&self, range: &Range, batch: &RecordBatch) -> SeriesChunk { + let batch = batch.slice(range.start, range.end - range.start); + SeriesChunk::new( + self.measurement, + Arc::clone(&self.tag_columns), + Arc::clone(&self.value_columns), + batch, + ) + } + + fn get_partitions_from_record_batch( + &self, + batch: &RecordBatch, + ) -> datafusion::common::Result>> { + let mut tag_keys_columns = Vec::with_capacity(self.tag_columns.len() + 1); + tag_keys_columns.push(Arc::clone(batch.column(self.measurement))); + for (_, idx) in self.tag_columns.iter() { + tag_keys_columns.push(Arc::clone(batch.column(*idx))); + } + Ok(partition(tag_keys_columns.as_slice())?.ranges()) + } +} + +impl SeriesChunkStream +where + S: Stream> + Unpin, +{ + fn poll_next_inner(&mut self, cx: &mut Context<'_>) -> Poll>> { + Pin::new(&mut self.record_batch_stream) + .poll_next(cx) + .map_err(Error::from) + } +} + +impl Stream for SeriesChunkStream +where + S: Stream> + Unpin, +{ + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + while this.current_partition >= this.partitions.len() { + match ready!(this.poll_next_inner(cx)) { + None => return Poll::Ready(None), + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + Some(Ok(batch)) => { + this.partitions = this.get_partitions_from_record_batch(&batch)?; + this.batch = batch; + this.current_partition = 0; + } + } + } + let chunk = this.chunk(&this.partitions[this.current_partition], &this.batch); + this.current_partition += 1; + Poll::Ready(Some(Ok(chunk))) + } +} + +#[cfg(test)] +mod tests { + use crate::response::SeriesChunkSerializer; + + use super::super::tests::{Column, make_schema_and_batches}; + use super::*; + use arrow::array::{Array, DictionaryArray, Int64Array, StringArray, TimestampNanosecondArray}; + use arrow::datatypes::Int32Type; + use datafusion_util::MemoryStream; + use futures::TryStreamExt; + + macro_rules! 
insta_assert_yaml_snapshot { + ($INPUT:expr) => { + let chunks = $INPUT + .iter() + .map(|s| SeriesChunkSerializer::new(s, None, true)) + .collect::>(); + + insta::assert_yaml_snapshot!(chunks); + }; + } + + #[tokio::test] + async fn test_no_batch() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: false, + projected: false, + }, + Column::Field { name: "val" }, + ], + + data: vec![], // no record batch + chunk_size: None, + }) + .await; + + assert!(chunks.is_empty()); + } + + #[tokio::test] + async fn test_no_group_by_single_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: false, // no group by + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: false, // no group by + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: None, // single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_no_group_by_multi_chunks() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: false, // no group by + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: false, // no group by + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: Some(2), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_no_group_by_zero_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: false, // no group by + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: false, // no group by + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: Some(0), // this is equivalent to None, i.e. 
single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_single_batch_single_series_single_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, // group by tag0 + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, // group by tag1 + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: None, // single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_single_batch_single_series_multi_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, // group by tag0 + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, // group by tag1 + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: Some(2), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_single_series_single_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "a", "a", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "1", "1", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: None, // single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_single_series_multi_chunk_gt() { + // Testing batch 
size > chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "a", "a", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "1", "1", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(2), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_single_series_multi_chunk_eq() { + // Testing batch size = chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "a", "a", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "1", "1", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(3), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_single_series_multi_chunk_lt() { + // Testing batch size < chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + 
projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "a", "a", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "1", "1", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(12), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_multi_series_single_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // multi series: a1, b2 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "b"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "2"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "b", "b", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "2", "2", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: None, // single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_multi_series_multi_chunk_lt() { + // Testing batch size < chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // multi series: a1, b2 + 
Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "b"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "2"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "b", "b", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "2", "2", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(12), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_multi_series_multi_chunk_eq() { + // Testing batch size = chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // multi series: a1, b2 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "b"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "2"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "b", "b", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "2", "2", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(3), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_multi_series_multi_chunk_gt() { + // Testing batch size > chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // multi series: a1, b2 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "b"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "2"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", 
"m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "b", "b", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "2", "2", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(2), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + struct TestParams { + columns: Vec, + data: Vec>>, + chunk_size: Option, + } + + async fn make_chunks(params: TestParams) -> Vec { + let TestParams { + columns, + data, + chunk_size, + } = params; + + let (schema, batches) = make_schema_and_batches(columns, data); + + let stream = MemoryStream::new_with_schema(batches, Arc::clone(&schema)); + let stream = SeriesChunkStream::try_new(stream, schema).unwrap(); + let stream = SeriesChunkMergeStream::new(stream, chunk_size.and_then(NonZeroUsize::new)); + + let chunks: Result> = stream.try_collect().await; + chunks.unwrap() + } +} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases.snap new file mode 100644 index 00000000..f3788aa2 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW DATABASES -- on handler with SHOW DATABASES enabled +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"databases","columns":["name"],"values":[["foo"],["bar"]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz-2.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz-2.snap new file mode 100644 index 00000000..f968d2cc --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz-2.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW DATABASES -- should return mop database after adding to authz +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"databases","columns":["name"],"values":[["foo"],["bar"],["mop"]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz.snap new file mode 100644 index 00000000..2dd37332 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW DATABASES -- should not return mop database due to authz +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"databases","columns":["name"],"values":[["foo"],["bar"]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_no_impl.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_no_impl.snap new file mode 100644 index 00000000..150825be --- /dev/null 
+++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_no_impl.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW DATABASES -- on handler with SHOW DATABASES _not_ enabled +expression: res +--- +{"results":[{"statement_id":0,"error":"must specify a 'db' parameter, or provide the database in the InfluxQL query"}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-2.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-2.snap new file mode 100644 index 00000000..5a2e5ab8 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-2.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: "SHOW RETENTION POLICIES -- on `bar` database which contains one default policy and one non-default policy" +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"retention_policies","columns":["name","duration","shardGroupDuration","replicaN","futureWriteLimit","pastWriteLimit","default"],"values":[["autogen","0ns","604800s",1,"0ns","0ns",true],["short","100s","604800s",1,"0ns","0ns",false]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-3.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-3.snap new file mode 100644 index 00000000..d17c3991 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-3.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: "SHOW RETENTION POLICIES -- on `frodo` database which does not exist" +expression: res +--- +{"results":[{"statement_id":0,"error":"datafusion error: Error during planning: database not found: frodo"}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies.snap new file mode 100644 index 00000000..07dbc525 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: "SHOW RETENTION POLICIES -- on `foo` database which contains one default policy" +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"retention_policies","columns":["name","duration","shardGroupDuration","replicaN","futureWriteLimit","pastWriteLimit","default"],"values":[["autogen","0ns","604800s",1,"0ns","0ns",true]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies_with_no_impl.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies_with_no_impl.snap new file mode 100644 index 00000000..cdc81d88 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies_with_no_impl.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW RETENTION POLICIES -- on handler with SHOW RETENTION POLICIES _not_ enabled +expression: res +--- +{"results":[{"statement_id":0,"error":"Database foo not found"}]} diff --git a/iox_v1_query_api/src/types.rs b/iox_v1_query_api/src/types.rs new file mode 100644 index 00000000..04821d34 --- /dev/null +++ b/iox_v1_query_api/src/types.rs @@ -0,0 +1,131 @@ +use arrow::datatypes::SchemaRef; 
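// [Illustrative aside, not part of the patch] The handler snapshots above all use the
// InfluxDB v1 `/query` JSON response shape. A minimal sketch of consuming that shape
// with plain `serde_json` (the function name is invented; the literal is copied from
// the `show_databases` snapshot):
fn _v1_response_shape_sketch() {
    let res: serde_json::Value = serde_json::from_str(
        r#"{"results":[{"statement_id":0,"series":[{"name":"databases","columns":["name"],"values":[["foo"],["bar"]]}]}]}"#,
    )
    .unwrap();
    // Pick fields out of the nested results/series structure.
    assert_eq!(res["results"][0]["series"][0]["name"], "databases");
    assert_eq!(res["results"][0]["series"][0]["values"][1][0], "bar");
}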
+use datafusion::physical_plan::SendableRecordBatchStream; +use serde::{Deserialize, Serialize}; + +use iox_query::query_log::PermitAndToken; + +/// UNIX epoch precision. +/// Doc: +#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] +pub enum Precision { + #[serde(rename = "ns")] + Nanoseconds, + #[serde(rename = "u", alias = "µ")] + Microseconds, + #[serde(rename = "ms")] + Milliseconds, + #[serde(rename = "s")] + Seconds, + #[serde(rename = "m")] + Minutes, + #[serde(rename = "h")] + Hours, + #[serde(rename = "d")] + Days, + #[serde(rename = "w")] + Weeks, +} + +/// An executing InfluxQL statement that produces results that can be +/// streamed as CSV. +pub(crate) struct Statement { + pub schema: SchemaRef, + /// Optional Permit/Token to support commands such as `SHOW DATABASES`, + /// which do not go through the query planner/executor. + pub permit_state: Option<PermitAndToken>, + pub stream: SendableRecordBatchStream, +} + +impl Statement { + pub(crate) fn new( + schema: SchemaRef, + permit_state: Option<PermitAndToken>, + stream: SendableRecordBatchStream, + ) -> Self { + Self { + schema, + permit_state, + stream, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use serde_json::{from_str, to_string}; + + #[test] + fn test_precision_deserialize() { + // Test standard rename attributes + assert_eq!( + from_str::<Precision>(r#""ns""#).unwrap(), + Precision::Nanoseconds + ); + assert_eq!( + from_str::<Precision>(r#""u""#).unwrap(), + Precision::Microseconds + ); + assert_eq!( + from_str::<Precision>(r#""ms""#).unwrap(), + Precision::Milliseconds + ); + assert_eq!(from_str::<Precision>(r#""s""#).unwrap(), Precision::Seconds); + assert_eq!(from_str::<Precision>(r#""m""#).unwrap(), Precision::Minutes); + assert_eq!(from_str::<Precision>(r#""h""#).unwrap(), Precision::Hours); + assert_eq!(from_str::<Precision>(r#""d""#).unwrap(), Precision::Days); + assert_eq!(from_str::<Precision>(r#""w""#).unwrap(), Precision::Weeks); + + // Test the alias for Microseconds + assert_eq!( + from_str::<Precision>(r#""µ""#).unwrap(), + Precision::Microseconds + ); + } + + #[test] + fn test_precision_serialize() { + // Test that each enum variant serializes to the expected string + assert_eq!(to_string(&Precision::Nanoseconds).unwrap(), r#""ns""#); + assert_eq!(to_string(&Precision::Microseconds).unwrap(), r#""u""#); // Note: serializes to "u", not "µ" + assert_eq!(to_string(&Precision::Milliseconds).unwrap(), r#""ms""#); + assert_eq!(to_string(&Precision::Seconds).unwrap(), r#""s""#); + assert_eq!(to_string(&Precision::Minutes).unwrap(), r#""m""#); + assert_eq!(to_string(&Precision::Hours).unwrap(), r#""h""#); + assert_eq!(to_string(&Precision::Days).unwrap(), r#""d""#); + assert_eq!(to_string(&Precision::Weeks).unwrap(), r#""w""#); + } + + #[test] + fn test_precision_error_cases() { + // Test invalid inputs + let invalid_result = from_str::<Precision>(r#""invalid""#); + assert!(invalid_result.is_err()); + + // Test case sensitivity (serde is case-sensitive by default) + let uppercase_result = from_str::<Precision>(r#""NS""#); + assert!(uppercase_result.is_err()); + } + + #[test] + fn test_precision_roundtrip() { + // Test serialization and deserialization roundtrip for all variants + let variants = vec![ + Precision::Nanoseconds, + Precision::Microseconds, + Precision::Milliseconds, + Precision::Seconds, + Precision::Minutes, + Precision::Hours, + Precision::Days, + Precision::Weeks, + ]; + + for variant in variants { + let serialized = to_string(&variant).unwrap(); + let deserialized = from_str::<Precision>(&serialized).unwrap(); + assert_eq!(variant, deserialized); + } + } +} diff --git a/iox_v1_query_api/src/value.rs
b/iox_v1_query_api/src/value.rs new file mode 100644 index 00000000..1686fb9c --- /dev/null +++ b/iox_v1_query_api/src/value.rs @@ -0,0 +1,303 @@ +//! Types to represent values produced by the InfluxQL queries in +//! InfluxDB. These types are used to serialize the results for the +//! v1 API. +use arrow::array::timezone::Tz; +use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray}; +use arrow::datatypes::{ + DataType, Float64Type, Int32Type, Int64Type, TimeUnit, TimestampNanosecondType, UInt64Type, +}; +use arrow::temporal_conversions::timestamp_ns_to_datetime; +use chrono::{DateTime, SecondsFormat, Utc}; +use serde::Serialize; +use std::fmt; +use std::str::FromStr; +use std::sync::Arc; + +use crate::types::Precision; + +/// The InfluxQL type of a value. +#[derive(Debug, PartialEq)] +pub(crate) enum ValueType { + Boolean, + Integer, + Float, + String, + Timestamp(Option<Arc<str>>), + Unsigned, + Null, +} + +/// A value produced by an InfluxQL query. This is a reference to a +/// single element of an arrow array. +pub(crate) struct Value { + arr: ArrayRef, + row: usize, +} + +impl Value { + /// Create a new value wrapping the element at `row` in the `arr` + /// array. + pub(crate) fn new(arr: &ArrayRef, row: usize) -> Self { + Self { + arr: ArrayRef::clone(arr), + row, + } + } + + /// Return the InfluxQL type of the value. + pub(crate) fn value_type(&self) -> ValueType { + match self.arr.data_type() { + DataType::Boolean => ValueType::Boolean, + DataType::Int64 => ValueType::Integer, + DataType::Float64 | DataType::Float32 | DataType::Float16 => ValueType::Float, + DataType::Utf8 => ValueType::String, + DataType::Dictionary(k, v) + if k.equals_datatype(&DataType::Int32) && v.equals_datatype(&DataType::Utf8) => + { + ValueType::String + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => ValueType::Timestamp(tz.clone()), + DataType::UInt64 => ValueType::Unsigned, + DataType::Null => ValueType::Null, + dt => panic!("Unsupported InfluxQL data type: {dt}"), + } + } + + /// Return the value as a boolean, if it is one. + pub(crate) fn as_boolean_opt(&self) -> Option<bool> { + if self.arr.is_valid(self.row) { + self.arr.as_boolean_opt().map(|a| a.value(self.row)) + } else { + None + } + } + + /// Return the value as an integer, if it is one. + pub(crate) fn as_integer_opt(&self) -> Option<i64> { + self.as_primitive_opt::<Int64Type>() + } + + /// Return the value as a float, if it is one. + pub(crate) fn as_float_opt(&self) -> Option<f64> { + self.as_primitive_opt::<Float64Type>() + } + + fn as_primitive_opt<T: ArrowPrimitiveType>(&self) -> Option<T::Native> { + if self.arr.is_valid(self.row) { + self.arr.as_primitive_opt::<T>().map(|a| a.value(self.row)) + } else { + None + } + } + + /// Return the value as a string, if it is one. + pub(crate) fn as_string_opt(&self) -> Option<&str> { + if self.arr.is_valid(self.row) { + let (arr, idx) = match self.arr.as_dictionary_opt::<Int32Type>() { + Some(a) => (a.values(), a.key(self.row)), + None => (&self.arr, Some(self.row)), + }; + idx.and_then(|idx| arr.as_string_opt::<i32>().map(|a| a.value(idx))) + } else { + None + } + } + + /// Return the value as a timestamp, if it is one. + pub(crate) fn as_timestamp_opt(&self) -> Option<DateTime<Utc>> { + self.as_primitive_opt::<TimestampNanosecondType>() + .and_then(timestamp_ns_to_datetime) + .map(|t| t.and_utc()) + } + + /// Return the value as an unsigned integer, if it is one.
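// [Illustrative aside, not part of the patch] How the accessors above behave for a plain
// Int64 column, including null handling via `Array::is_valid` (values are arbitrary):
//
//     let arr: ArrayRef = Arc::new(arrow::array::Int64Array::from(vec![Some(7), None]));
//     assert_eq!(Value::new(&arr, 0).value_type(), ValueType::Integer);
//     assert_eq!(Value::new(&arr, 0).as_integer_opt(), Some(7));
//     assert_eq!(Value::new(&arr, 1).as_integer_opt(), None);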
+ pub(crate) fn as_unsigned_opt(&self) -> Option { + self.as_primitive_opt::() + } +} + +impl fmt::Debug for Value { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.value_type() { + ValueType::Boolean => write!(f, "{:?}", self.as_boolean_opt()), + ValueType::Integer => write!(f, "{:?}", self.as_integer_opt()), + ValueType::Float => write!(f, "{:?}", self.as_float_opt()), + ValueType::String => write!(f, "{:?}", self.as_string_opt()), + ValueType::Timestamp(_) => write!(f, "{:?}", self.as_timestamp_opt()), + ValueType::Unsigned => write!(f, "{:?}", self.as_unsigned_opt()), + ValueType::Null => write!(f, "null"), + } + } +} + +impl fmt::Display for Value { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.value_type() { + ValueType::Boolean => self + .as_boolean_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::Integer => self + .as_integer_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::Float => self + .as_float_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::String => self + .as_string_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::Timestamp(tz) => { + let tz = tz + .and_then(|tz| Tz::from_str(&tz).ok()) + .unwrap_or_else(|| Tz::from_str("UTC").unwrap()); + self.as_timestamp_opt() + .map(|t| write!(f, "{}", t.with_timezone(&tz).to_rfc3339())) + .unwrap_or(Ok(())) + } + ValueType::Unsigned => self + .as_unsigned_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::Null => Ok(()), + } + } +} + +impl PartialEq for Value { + fn eq(&self, other: &Self) -> bool { + if self.value_type() != other.value_type() { + return false; + } + match self.value_type() { + ValueType::Boolean => self.as_boolean_opt() == other.as_boolean_opt(), + ValueType::Integer => self.as_integer_opt() == other.as_integer_opt(), + ValueType::Float => self.as_float_opt() == other.as_float_opt(), + ValueType::String => self.as_string_opt() == other.as_string_opt(), + ValueType::Timestamp(_) => self.as_timestamp_opt() == other.as_timestamp_opt(), + ValueType::Unsigned => self.as_unsigned_opt() == other.as_unsigned_opt(), + ValueType::Null => true, + } + } +} + +pub(crate) struct ValueSerializer<'a> { + value: &'a Value, + epoch: Option, + // Allow infinite values + allow_inf: bool, +} + +impl<'a> ValueSerializer<'a> { + pub(crate) fn new(value: &'a Value, epoch: Option, allow_inf: bool) -> Self { + Self { + value, + epoch, + allow_inf, + } + } +} + +impl Serialize for ValueSerializer<'_> { + fn serialize(&self, serializer: S) -> Result { + match self.value.value_type() { + ValueType::Boolean => { + if let Some(v) = self.value.as_boolean_opt() { + serializer.serialize_bool(v) + } else { + serializer.serialize_none() + } + } + ValueType::Integer => { + if let Some(v) = self.value.as_integer_opt() { + serializer.serialize_i64(v) + } else { + serializer.serialize_none() + } + } + ValueType::Float => { + if let Some(v) = self.value.as_float_opt() { + if v.fract() == 0.0 && (v.abs() < (i64::MAX as f64)) { + // Only turn x.0 into x if it is small enough to fit in an i64. + // For example, 100.0 becomes 100. + // But 1_000_000_000_000_000_000_000.0 still stays as 1_000_000_000_000_000_000_000.0 + // because it is too large to fit in an i64. 
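// [Added note] `i64::MAX as f64` rounds up to 2^63, so the strict `<` above effectively
// checks `|v| < 2^63`; and because Rust float-to-int casts saturate, the `as i64`
// conversion below stays well-defined even at the boundary.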
+ serializer.serialize_i64(v as i64) + } else if v.is_infinite() && !self.allow_inf { + // JSON and /query 1.x doesn't support infinite values + // + // https://www.rfc-editor.org/rfc/rfc4627#:~:text=Numeric%20values%20that%20cannot%20be%20represented%20as%20sequences%20of%20digits%0A%20%20%20(such%20as%20Infinity%20and%20NaN)%20are%20not%20permitted. + if v > 0.0 { + Err(serde::ser::Error::custom("json: unsupported value: +Inf")) + } else { + Err(serde::ser::Error::custom("json: unsupported value: -Inf")) + } + } else if v.is_nan() { + // /query 1.x serilizes NaN as null for json and msgpack + serializer.serialize_none() + } else { + serializer.serialize_f64(v) + } + } else { + serializer.serialize_none() + } + } + ValueType::String => { + if let Some(v) = self.value.as_string_opt() { + serializer.serialize_str(v) + } else { + serializer.serialize_none() + } + } + ValueType::Timestamp(tz) => { + if let Some(v) = self.value.as_timestamp_opt() { + match self.epoch { + Some(Precision::Nanoseconds) => { + chrono::serde::ts_nanoseconds::serialize(&v, serializer) + } + Some(Precision::Microseconds) => { + chrono::serde::ts_microseconds::serialize(&v, serializer) + } + Some(Precision::Milliseconds) => { + chrono::serde::ts_milliseconds::serialize(&v, serializer) + } + Some(Precision::Seconds) => { + chrono::serde::ts_seconds::serialize(&v, serializer) + } + Some(Precision::Minutes) => serializer.serialize_i64(v.timestamp() / 60), + Some(Precision::Hours) => { + serializer.serialize_i64(v.timestamp() / (60 * 60)) + } + Some(Precision::Days) => { + serializer.serialize_i64(v.timestamp() / (60 * 60 * 24)) + } + Some(Precision::Weeks) => { + serializer.serialize_i64(v.timestamp() / (60 * 60 * 24 * 7)) + } + None => match tz.and_then(|tz| Tz::from_str(tz.as_ref()).ok()) { + Some(tz) => v + .with_timezone(&tz) + .to_rfc3339_opts(SecondsFormat::AutoSi, true) + .serialize(serializer), + None => v + .to_rfc3339_opts(SecondsFormat::AutoSi, true) + .serialize(serializer), + }, + } + } else { + serializer.serialize_none() + } + } + ValueType::Unsigned => { + if let Some(v) = self.value.as_unsigned_opt() { + serializer.serialize_u64(v) + } else { + serializer.serialize_none() + } + } + ValueType::Null => serializer.serialize_none(), + } + } +} diff --git a/jemalloc_stats/Cargo.toml b/jemalloc_stats/Cargo.toml index 44918295..54d7313d 100644 --- a/jemalloc_stats/Cargo.toml +++ b/jemalloc_stats/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true [dependencies] snafu = "0.8" tikv-jemalloc-ctl = { version = "0.5.4", features = ["use_std"] } -tokio = { version = "1.47.1", features = ["rt", "sync", "time"] } +tokio = { version = "1.48.0", features = ["rt", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [lints] @@ -18,4 +18,4 @@ workspace = true tikv-jemallocator = { version = "0.5", features = [ "unprefixed_malloc_on_supported_platforms", ] } -tokio = { version = "1.47.1", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.48.0", features = ["macros", "rt-multi-thread"] } diff --git a/jemalloc_stats/src/lib.rs b/jemalloc_stats/src/lib.rs index 38b3e87c..88a17358 100644 --- a/jemalloc_stats/src/lib.rs +++ b/jemalloc_stats/src/lib.rs @@ -1,6 +1,6 @@ #![expect(missing_copy_implementations)] -use std::{sync::LazyLock, time::Duration}; +use std::{sync::OnceLock, time::Duration}; use tikv_jemalloc_ctl::{epoch as epoch_ctl, stats}; use tokio::{sync::watch, task::JoinHandle}; @@ -17,8 +17,9 @@ pub use monitor::{AllocationMonitor, AllocationMonitorError}; /// 
[`Refresher::handle()`] to obtain periodic updates. /// /// The first reference to [`STATS`] MUST be made from within an async tokio -/// runtime. -pub static STATS: LazyLock = LazyLock::new(Refresher::new); +/// runtime because a background tokio task is spawned by the initialised +/// [`Refresher`]. +pub static STATS: OnceLock = OnceLock::new(); /// Defines the frequency at which updated [`Stats`] are obtained and published. /// @@ -58,7 +59,7 @@ impl Refresher { /// Construct a new [`Stats`]. /// /// Intentionally non-pub to enforce a singleton exposed via [`STATS`]. - fn new() -> Self { + pub fn new(tick_duration: Duration) -> Self { let (tx, rx) = watch::channel(Stats::default()); Self { @@ -66,7 +67,7 @@ impl Refresher { // Spawn a background task to ask jemalloc to refresh the statistics // periodically, and publish the result. - refresh_task: tokio::task::spawn(refresh(tx)), + refresh_task: tokio::task::spawn(refresh(tx, tick_duration)), } } @@ -82,8 +83,8 @@ impl Refresher { /// ```rust /// # fn do_slow_thing() {} /// # let _guard = tokio::runtime::Runtime::new().unwrap().enter(); - /// # - /// let handle = jemalloc_stats::STATS.handle(); + /// # let REFRESH_INTERVAL = std::time::Duration::from_millis(9100); + /// let handle = jemalloc_stats::STATS.get_or_init(|| jemalloc_stats::Refresher::new(REFRESH_INTERVAL)).handle(); /// /// // Good: /// let stats = handle.borrow().clone(); @@ -104,7 +105,7 @@ impl Drop for Refresher { } } -async fn refresh(tx: watch::Sender) { +async fn refresh(tx: watch::Sender, tick_duration: Duration) { let epoch = epoch_ctl::mib().unwrap(); let active = stats::active::mib().unwrap(); let allocated = stats::allocated::mib().unwrap(); @@ -135,7 +136,7 @@ async fn refresh(tx: watch::Sender) { return; } - tokio::time::sleep(REFRESH_INTERVAL).await; + tokio::time::sleep(tick_duration).await; } } @@ -147,7 +148,8 @@ mod tests { /// reported. #[tokio::test] async fn test_stats() { - let handle = STATS.handle(); + let stats = STATS.get_or_init(|| Refresher::new(REFRESH_INTERVAL)); + let handle = stats.handle(); tokio::time::timeout(Duration::from_secs(10), async move { loop { diff --git a/linear_buffer/Cargo.toml b/linear_buffer/Cargo.toml new file mode 100644 index 00000000..188fdadf --- /dev/null +++ b/linear_buffer/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "linear_buffer" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +# avoid compiling all the workspace-hack dependencies for MIRI tests +[target.'cfg(not(miri))'.dependencies] +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] diff --git a/linear_buffer/src/allocation.rs b/linear_buffer/src/allocation.rs new file mode 100644 index 00000000..0e7a5881 --- /dev/null +++ b/linear_buffer/src/allocation.rs @@ -0,0 +1,77 @@ +//! Allocation-related tools. + +use std::{ + alloc::Layout, + mem::MaybeUninit, + num::NonZeroUsize, + ops::{Deref, DerefMut}, + ptr::NonNull, +}; + +/// An allocation of potentially uninitialized memory. +/// +/// This is basically `Box<[MaybeUninit]>` but allows us to control the alignment as well. +pub(crate) struct Allocation { + layout: Layout, + ptr: NonNull, +} + +impl Allocation { + /// Create new allocation with given size and alignment. 
+ pub(crate) fn new(size: usize, alignment: NonZeroUsize) -> Self { + let layout = Layout::array::(size) + .expect("size fits `isize`") + .align_to(alignment.get()) + .expect("valid alignment"); + + let ptr = if size == 0 { + // That's basically what the standard library does for empty `Vec`s. We are allowed to create an empty + // slice based on this pointer. + NonNull::::without_provenance(alignment) + } else { + // SAFETY: we made sure that the size is non-zero + let ptr = unsafe { std::alloc::alloc(layout) }; + + match NonNull::new(ptr) { + Some(ptr) => ptr, + None => { + panic!("cannot allocate {size} bytes with alignment {alignment}") + } + } + }; + + Self { layout, ptr } + } + + /// Correctly typed pointer. + fn ptr(&self) -> NonNull> { + self.ptr.cast() + } +} + +impl Drop for Allocation { + fn drop(&mut self) { + let Self { layout, ptr } = self; + + if layout.size() != 0 { + // SAFETY: this is a valid pointer and there are no dangling references + unsafe { std::alloc::dealloc(ptr.as_ptr(), *layout) }; + } + } +} + +impl Deref for Allocation { + type Target = [MaybeUninit]; + + fn deref(&self) -> &Self::Target { + // SAFETY: this is a valid pointer + unsafe { std::slice::from_raw_parts(self.ptr().as_ptr(), self.layout.size()) } + } +} + +impl DerefMut for Allocation { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: this is a valid pointer + unsafe { std::slice::from_raw_parts_mut(self.ptr().as_ptr(), self.layout.size()) } + } +} diff --git a/linear_buffer/src/extend.rs b/linear_buffer/src/extend.rs new file mode 100644 index 00000000..64b406b2 --- /dev/null +++ b/linear_buffer/src/extend.rs @@ -0,0 +1,243 @@ +//! Extensions traits for [`LinearBuffer`] to simplify common tasks. + +use std::mem::MaybeUninit; + +use crate::LinearBuffer; + +/// Extension methods for [`LinearBuffer`] that are a safe combination of [`tail`](LinearBuffer::tail) and +/// [`bump`](LinearBuffer::bump). +pub trait LinearBufferExtend { + /// Append data to buffer. + /// + /// # Panic + /// There must be enough space left. In case of a panic, the buffer will be left untouched. + /// + /// # Example + /// ``` + /// # use linear_buffer::{LinearBuffer, LinearBufferExtend}; + /// let mut buffer = LinearBuffer::new(6); + /// + /// buffer.append(b"foo"); + /// buffer.append(b"bar"); + /// + /// assert_eq!( + /// buffer.slice_initialized_part(..).as_ref(), + /// b"foobar", + /// ); + /// ``` + fn append(&mut self, data: &[u8]); + + /// Extend buffer with constant value. + /// + /// This can be used for example to zero-extend the buffer without allocating a temporary slice for + /// [`append`](Self::append). + /// + /// # Panic + /// There must be enough space left. In case of a panic, the buffer will be left untouched. 
+ /// + /// # Example + /// ``` + /// # use linear_buffer::{LinearBuffer, LinearBufferExtend}; + /// let mut buffer = LinearBuffer::new(6); + /// + /// buffer.fill(0, 2); + /// buffer.fill(0xff, 4); + /// + /// assert_eq!( + /// buffer.slice_initialized_part(..).as_ref(), + /// [0, 0, 0xff, 0xff, 0xff, 0xff], + /// ); + /// ``` + fn fill(&mut self, value: u8, n: usize); +} + +impl LinearBufferExtend for LinearBuffer { + fn append(&mut self, data: &[u8]) { + let space_left = self.space_left(); + assert!( + data.len() <= space_left, + "want to append {} bytes but buffer only has {space_left} bytes left", + data.len(), + ); + + let tail = self.tail(); + + // SAFETY: we've just checked that there is enough space left + let target = unsafe { tail.get_unchecked_mut(0..data.len()) }; + + // there is no good stable way to write a slice, see: + // https://github.com/rust-lang/rust/issues/79995 + // so we gonna hand-roll that + + // SAFETY: &[T] and &[MaybeUninit] have the same layout + let uninit_src: &[MaybeUninit] = unsafe { std::mem::transmute(data) }; + target.copy_from_slice(uninit_src); + + // SAFETY: we just wrote that data + unsafe { self.bump(data.len()) }; + } + + fn fill(&mut self, value: u8, n: usize) { + let space_left = self.space_left(); + assert!( + n <= space_left, + "want to fill {n} bytes but buffer only has {space_left} bytes left", + ); + + let tail = self.tail(); + + // SAFETY: we've just checked that there is enough space left + let target = unsafe { tail.get_unchecked_mut(0..n) }; + + // filling `MaybeUninit` is currently not simple on stable, see + // https://github.com/rust-lang/rust/issues/117428 + // + // So we just hand-roll it. In contrast to the stdlib implementation though, we don't need to care about `Drop` because `u8` doesn't need it. 
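// [Added note] A one-line alternative (sketch, not part of the patch) would be
//     unsafe { target.as_mut_ptr().write_bytes(value, n) };
// but the explicit loop below keeps this function free of additional `unsafe`.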
+ for x in target.iter_mut() { + x.write(value); + } + + // SAFETY: we just wrote that data + unsafe { self.bump(n) }; + } +} + +#[cfg(test)] +mod test { + use std::panic::AssertUnwindSafe; + + use super::*; + + #[test] + fn append() { + let mut buffer = LinearBuffer::new(5); + buffer.append(b"foo"); + buffer.append(b"ba"); + assert_eq!(buffer.slice_initialized_part(..).as_ref(), b"fooba"); + } + + #[test] + fn append_empty() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + + buffer.append(b""); + + // buffer init position didn't change + assert_eq!(buffer.space_left(), 3); + + // our pre-initialized tail wasn't overridden + unsafe { + buffer.bump(3); + } + assert_eq!( + buffer.slice_initialized_part(0..3).as_ref(), + [0xff, 0xff, 0xff] + ); + } + + #[test] + fn panic_append_to_much() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + + buffer.append(b"ab"); + assert_eq!(buffer.slice_initialized_part(0..2).as_ref(), b"ab"); + + let err = std::panic::catch_unwind(AssertUnwindSafe(|| { + buffer.append(b"cd"); + })) + .unwrap_err(); + assert_eq!( + err.downcast_ref::().unwrap(), + "want to append 2 bytes but buffer only has 1 bytes left", + ); + + // buffer init position didn't change + assert_eq!(buffer.space_left(), 1); + + // our pre-initialized tail wasn't overridden + unsafe { + buffer.bump(1); + } + assert_eq!(buffer.slice_initialized_part(0..3).as_ref(), b"ab\xff"); + } + + #[test] + fn fill() { + let mut buffer = LinearBuffer::new(5); + buffer.fill(1, 3); + buffer.fill(42, 2); + assert_eq!( + buffer.slice_initialized_part(..).as_ref(), + [1, 1, 1, 42, 42] + ); + } + + #[test] + fn fill_n_0() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + + buffer.fill(1, 0); + + // buffer init position didn't change + assert_eq!(buffer.space_left(), 3); + + // our pre-initialized tail wasn't overridden + unsafe { + buffer.bump(3); + } + assert_eq!( + buffer.slice_initialized_part(0..3).as_ref(), + [0xff, 0xff, 0xff] + ); + } + + #[test] + fn panic_fill_to_much() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + + buffer.fill(0, 2); + assert_eq!(buffer.slice_initialized_part(0..2).as_ref(), [0, 0]); + + let err = std::panic::catch_unwind(AssertUnwindSafe(|| { + buffer.fill(0, 2); + })) + .unwrap_err(); + assert_eq!( + err.downcast_ref::().unwrap(), + "want to fill 2 bytes but buffer only has 1 bytes left", + ); + + // buffer init position didn't change + assert_eq!(buffer.space_left(), 1); + + // our pre-initialized tail wasn't overridden + unsafe { + buffer.bump(1); + } + assert_eq!(buffer.slice_initialized_part(0..3).as_ref(), [0, 0, 0xff]); + } + + #[test] + fn test_fill_buffer_with_ff() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + unsafe { + buffer.bump(3); + } + assert_eq!( + buffer.slice_initialized_part(0..3).as_ref(), + [0xff, 0xff, 0xff] + ); + } + + /// Fill buffer with pattern `0xff` without advancing the "initialized" position so we can check certain behavior. + fn fill_buffer_with_ff(buffer: &mut LinearBuffer) { + for x in buffer.tail().iter_mut() { + x.write(0xff); + } + } +} diff --git a/linear_buffer/src/lib.rs b/linear_buffer/src/lib.rs new file mode 100644 index 00000000..758fc18b --- /dev/null +++ b/linear_buffer/src/lib.rs @@ -0,0 +1,12 @@ +//! Crate that implements [`LinearBuffer`]. +mod allocation; +mod extend; +mod linear_buffer; + +// Workaround for "unused crate" lint false positives. 
+// This is only done if we do NOT run under MIRI to avoid costlly compliation of a lot of unused dependencies. +#[cfg(not(miri))] +use workspace_hack as _; + +pub use extend::LinearBufferExtend; +pub use linear_buffer::{LinearBuffer, Slice}; diff --git a/linear_buffer/src/linear_buffer.rs b/linear_buffer/src/linear_buffer.rs new file mode 100644 index 00000000..21631afc --- /dev/null +++ b/linear_buffer/src/linear_buffer.rs @@ -0,0 +1,529 @@ +//! Implementation of the buffer construct itself. +use std::{ + cell::UnsafeCell, + mem::MaybeUninit, + num::NonZeroUsize, + ops::{Bound, Deref, Range, RangeBounds}, + sync::Arc, +}; + +use crate::allocation::Allocation; + +/// Fixed-size buffer that supports [append] and +/// [reading initialized parts](Self::slice_initialized_part) at the same time. +/// +/// # Use Case +/// This construct allows you to [append] data to a buffer but at the same time hand out slices to the +/// already-initialized part of it. This is normally not possible with Rust's borrowing rules. An example is when you +/// receive data from a network and want to cache data in-memory (like an entire file), but also want to run write +/// operations for the already-received data to disk (e.g. for caching). +/// +/// Furthermore, the buffer can be [initialized with a desired alignment](Self::with_alignment). +/// +/// Neither of this is possible with purely safe standard library tooling nor with the famous [`bytes`] crate. +/// +/// # Implementation +/// The data layout looks like this: +/// +/// ```text +/// |<-----------------total_size------------------------------>| +/// | | +/// [============== allocation =================================] +/// [✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓????????????????????????????????????????] +/// | | | +/// |<---initialized--->| | +/// |<---unitialized / space left / tail--->| +/// ^ +/// | +/// first_uninit_element +/// ``` +/// +/// The _allocation_ is held as a [`MaybeUninit`] slice to avoid zeroing the buffer just to overwrite the data shortly +/// after. The _allocation_ NEVER moves and is only dropped when the [`LinearBuffer`] and all [`Slice`]s are dropped. +/// +/// The user can [get slices of the _initialized_ part](Self::slice_initialized_part). At the same time there exists +/// only at max one [`LinearBuffer`] which acts as a mutable reference to the uninitialized part: +/// +/// ```text +/// |<----------------LinearBuffer----------------------------->| +/// | | +/// | | +/// V V +/// [============== allocation =================================] +/// [✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓??????????????????????????????????] +/// ^ ^ ^ ^ +/// | | | | +/// | | | | +/// | |<---Slice 1--->| +/// | | +/// | | +/// |<--Slice 2-->| +/// ``` +/// +/// +/// [append]: crate::LinearBufferExtend::append +/// [`bytes`]: https://crates.io/crates/bytes +#[derive(Debug)] +pub struct LinearBuffer { + data: SharedAllocation, + first_uninit_element: usize, +} + +impl LinearBuffer { + /// Allocate new buffer of given size in bytes. + /// + /// # Panic + /// If we cannot allocate the buffer, we panic. + pub fn new(len: usize) -> Self { + Self::with_alignment(len, NonZeroUsize::MIN) + } + + /// Allocate new buffer of given size in bytes and alignment. + /// + /// # Panic + /// If we cannot allocate the buffer, we panic. + /// + /// Alignment must be a power of 2. + /// + /// # Alignment Rust Type + /// Once is closed and we have a proper stable `Alignment` type, + /// we should use that. For now we only enforce "not zero" on the type level and the rest during runtime. 
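// [Illustrative aside, not part of the patch] Typical use of `with_alignment`, e.g. for an
// I/O buffer that must start on a 512-byte boundary (the sizes here are arbitrary):
//
//     use std::num::NonZeroUsize;
//     let buffer = LinearBuffer::with_alignment(4096, NonZeroUsize::new(512).unwrap());
//     assert_eq!(buffer.total_size(), 4096);
//     assert_eq!(buffer.space_left(), 4096);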
+ #[expect(clippy::arc_with_non_send_sync)] + pub fn with_alignment(len: usize, alignment: NonZeroUsize) -> Self { + Self { + data: SharedAllocation(Arc::new(UnsafeCell::new(Allocation::new(len, alignment)))), + first_uninit_element: 0, + } + } + + /// Size of the entire buffer, including the initialized part and the uninitialized part. + /// + /// Also see [`space_left`](Self::space_left) and [`initialized_bytes`](Self::initialized_bytes). + pub fn total_size(&self) -> usize { + self.data.total_size() + } + + /// How much space is left. + /// + /// This is identical to the length of the [`tail`](Self::tail), but does not require a mutable reference to obtain. + pub fn space_left(&self) -> usize { + self.total_size() - self.first_uninit_element + } + + /// Number of initialized bytes. + pub fn initialized_bytes(&self) -> usize { + self.first_uninit_element + } + + /// Number of references that point to the allocation. + pub fn strong_count(&self) -> usize { + self.data.strong_count() + } + + /// The uninitialized part of the buffer. + /// + /// This can be used as a target for I/O operations. After writing data in, call [`bump`](Self::bump) to specify + /// the amount of data written to the START of the tail. + /// + /// If you want to append data from an existing slice or a constant value, it is easier to use + /// [`LinearBufferExtend`]. However, using this low-level interface might work better if you have I/O operations + /// that can read into a pre-allocated buffer. + /// + /// # Example + /// ``` + /// # use linear_buffer::LinearBuffer; + /// let mut buffer = LinearBuffer::new(3); + /// + /// let tail = buffer.tail(); + /// tail[0].write(b'f'); + /// tail[1].write(b'o'); + /// tail[2].write(b'o'); + /// + /// unsafe { buffer.bump(3) }; + /// + /// assert_eq!( + /// buffer.slice_initialized_part(0..3).as_ref(), + /// b"foo", + /// ); + /// ``` + /// + /// + /// [`LinearBufferExtend`]: crate::LinearBufferExtend + pub fn tail(&mut self) -> &mut [MaybeUninit] { + let data_ptr = self.data.0.get(); + + // SAFETY: there can only be one caller that accesses the tail due to Rust's borrowing rules + let partially_initialized_buffer = unsafe { &mut *data_ptr }; + + // SAFETY: first_uninit_element is always in bounds because we reject "overshooting" in `bump` + unsafe { partially_initialized_buffer.get_unchecked_mut(self.first_uninit_element..) } + } + + /// Bump initialized part of the buffer by given amount of bytes (= delta). + /// + /// # Panic + /// There must be enough space left in buffer. + /// + /// # Safety + /// The caller must ensure that they initialized the respective portion of the buffer using [`tail`](Self::tail). + pub unsafe fn bump(&mut self, initialized: usize) { + let space_left = self.space_left(); + assert!( + initialized <= space_left, + "buffer only has {space_left} bytes left but initialized part should be bumped by {initialized} bytes", + ); + + self.first_uninit_element += initialized; + } + + /// Get a slice of the initialized portion of the buffer. + /// + /// You may hold multiple overlapping slices to the same initialized memory. + /// + /// # Panic + /// The range must be well-formed and within the range of the initialized part. 
+ #[track_caller] + pub fn slice_initialized_part(&self, range: impl RangeBounds) -> Slice { + let len = self.total_size(); + + let begin = match range.start_bound() { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n.checked_add(1).expect("out of range"), + Bound::Unbounded => 0, + }; + + let end = match range.end_bound() { + Bound::Included(&n) => n.checked_add(1).expect("out of range"), + Bound::Excluded(&n) => n, + Bound::Unbounded => len, + }; + + assert!( + begin <= end, + "range start must not be greater than end: {begin} <= {end}", + ); + assert!( + end <= self.first_uninit_element, + "range end out of bounds: {end} <= {}", + self.first_uninit_element, + ); + + Slice { + data: self.data.clone(), + range: begin..end, + } + } +} + +/// Wrapper around the half-initialized buffer. +#[derive(Debug, Clone)] +struct SharedAllocation(Arc>); + +// SAFETY: We manually make sure that: +// - the inner allocation never changes +// - there is only at max one mutable reference to the tail part of the buffer +// - there is NO mutable reference to the initialized part of the buffer +// - slices (i.e. non-mut references) only exist to the initialized part of the buffer +unsafe impl Send for SharedAllocation {} +unsafe impl Sync for SharedAllocation {} + +impl SharedAllocation { + /// Size of the entire buffer, including the initialized part and the uninitialized part. + fn total_size(&self) -> usize { + let data_ptr = self.0.get(); + + // SAFETY: we NEVER change the underlying allocation + let allocation = unsafe { &*data_ptr }; + + allocation.len() + } + + /// Number of references that point to the allocation. + fn strong_count(&self) -> usize { + Arc::strong_count(&self.0) + } +} + +/// A slice of initialized data from a [`LinearBuffer`]. +#[derive(Clone)] +pub struct Slice { + data: SharedAllocation, + range: Range, +} + +impl Slice { + /// Size of the underlying allocation in bytes. + pub fn allocation_size(&self) -> usize { + self.data.total_size() + } + + /// Number of references that point to the allocation. 
+ pub fn strong_count(&self) -> usize { + self.data.strong_count() + } +} + +impl std::fmt::Debug for Slice { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.deref().fmt(f) + } +} + +impl Deref for Slice { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + let data_ptr = self.data.0.get(); + + // SAFETY: the actual allocation is never changed + let partially_initialized_buffer = unsafe { &*data_ptr }; + + // SAFETY: we've check the bounds in LinearBuffer::slice_initialized_part + let init_part = unsafe { partially_initialized_buffer.get_unchecked(self.range.clone()) }; + + // SAFETY: this is only the initialized part + unsafe { + // Slice methods of "assume init" aren't stable yet, see + // https://github.com/rust-lang/rust/issues/63569 + // + // So we just use the code from the stdlib + &*(init_part as *const [MaybeUninit] as *const [u8]) + } + } +} + +impl AsRef<[u8]> for Slice { + fn as_ref(&self) -> &[u8] { + self.deref() + } +} + +#[cfg(test)] +mod test { + use crate::LinearBufferExtend; + + use super::*; + + #[test] + #[should_panic( + expected = "buffer only has 2 bytes left but initialized part should be bumped by 3 bytes" + )] + fn panic_bump_too_much() { + let mut buffer = LinearBuffer::new(3); + + buffer.tail()[0].write(1); + unsafe { buffer.bump(1) }; + + buffer.tail()[0].write(1); + buffer.tail()[1].write(1); + unsafe { buffer.bump(3) }; + } + + #[test] + #[should_panic(expected = "out of range")] + fn panic_slice_begin_usize_out_of_range() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + buffer.slice_initialized_part((Bound::Excluded(usize::MAX), Bound::Unbounded)); + } + + #[test] + #[should_panic(expected = "out of range")] + fn panic_slice_end_usize_out_of_range() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + buffer.slice_initialized_part(..=usize::MAX); + } + + #[test] + #[should_panic(expected = "range start must not be greater than end: 2 <= 1")] + #[expect(clippy::reversed_empty_ranges)] + fn panic_slice_begin_past_end() { + let mut buffer = LinearBuffer::new(10); + buffer.append(b"foo"); + + buffer.slice_initialized_part(2..1); + } + + #[test] + #[should_panic(expected = "range end out of bounds: 4 <= 3")] + fn panic_slice_end_past_init_part() { + let mut buffer = LinearBuffer::new(10); + buffer.append(b"foo"); + + buffer.slice_initialized_part(..4); + } + + #[test] + fn empty_slice() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + let bytes = buffer.slice_initialized_part(0..0); + assert_eq!(bytes.as_ref(), b""); + } + + #[test] + fn slices_are_zero_copy() { + let mut buffer = LinearBuffer::new(10); + buffer.append(b"foo"); + + let bytes_1 = buffer.slice_initialized_part(..3); + let ptr_1 = bytes_1.as_ptr().expose_provenance(); + assert_eq!(bytes_1.as_ref(), b"foo".as_slice()); + + buffer.append(b"bar"); + + let bytes_2 = buffer.slice_initialized_part(..6); + let ptr_2 = bytes_1.as_ptr().expose_provenance(); + assert_eq!(bytes_2.as_ref(), b"foobar".as_slice()); + assert_eq!(ptr_1, ptr_2); + + buffer.append(b"xxxx"); + + let data = buffer.slice_initialized_part(..); + let data_ptr = data.as_ptr().expose_provenance(); + assert_eq!(data_ptr, ptr_1); + assert_eq!(data.as_ref(), b"foobarxxxx".as_slice()); + } + + #[test] + fn can_read_slice_after_buffer_drop() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + let bytes = buffer.slice_initialized_part(..); + drop(buffer); + + assert_eq!(bytes.as_ref(), b"foo".as_slice()); + } + + #[test] 
+ fn slice_clone() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + let bytes = buffer.slice_initialized_part(..); + assert_eq!(bytes.as_ref(), b"foo".as_slice()); + assert_eq!(bytes.strong_count(), 2); + + drop(buffer); + assert_eq!(bytes.strong_count(), 1); + + let bytes2 = bytes.clone(); + assert_eq!(bytes2.as_ref(), b"foo".as_slice()); + assert_eq!(bytes.strong_count(), 2); + assert_eq!(bytes2.strong_count(), 2); + assert_eq!( + bytes.as_ptr().expose_provenance(), + bytes2.as_ptr().expose_provenance(), + "slice cloning MUST NOT clone the actual data", + ); + + drop(bytes); + assert_eq!(bytes2.strong_count(), 1); + assert_eq!(bytes2.as_ref(), b"foo".as_slice()); + } + + #[test] + fn slice_debug() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + let slice = buffer.slice_initialized_part(..); + assert_eq!(format!("{slice:?}"), "[102, 111, 111]"); + assert_eq!(format!("{slice:x?}"), "[66, 6f, 6f]"); + } + + #[test] + fn empty_buffer() { + let buffer = LinearBuffer::new(0); + let slice = buffer.slice_initialized_part(..); + assert_eq!(slice.as_ref(), b""); + } + + #[test] + #[should_panic(expected = "size fits `isize`")] + fn new_panics_larger_than_isize() { + LinearBuffer::new(usize::MAX); + } + + #[test] + #[should_panic(expected = "cannot allocate 9223372036854775807 bytes with alignment 1")] + #[cfg(not(miri))] // MIRI cannot handle this + fn new_panics_out_of_memory() { + LinearBuffer::new(isize::MAX as usize); + } + + #[test] + #[should_panic(expected = "valid alignment")] + fn new_panics_if_alignment_is_not_power_of_two() { + LinearBuffer::with_alignment(1, NonZeroUsize::new(3).unwrap()); + } + + #[test] + fn alignment() { + for size in [0, 13] { + for shift in 0..13 { + let alignment = NonZeroUsize::new(1 << shift).unwrap(); + println!("size={size} alignment={alignment}"); + + let mut buffer = LinearBuffer::with_alignment(size, alignment); + assert_eq!(buffer.total_size(), size); + + let slice = buffer.slice_initialized_part(0..0); + assert_eq!(slice.as_ptr().align_offset(alignment.get()), 0); + + buffer.fill(0, size); + let slice = buffer.slice_initialized_part(..size); + assert_eq!(slice.as_ptr().align_offset(alignment.get()), 0); + } + } + } + + #[test] + fn strong_count() { + let buffer = LinearBuffer::new(3); + assert_eq!(buffer.strong_count(), 1); + + let slice_1 = buffer.slice_initialized_part(..0); + assert_eq!(buffer.strong_count(), 2); + assert_eq!(slice_1.strong_count(), 2); + + let slice_2 = buffer.slice_initialized_part(..0); + assert_eq!(buffer.strong_count(), 3); + assert_eq!(slice_1.strong_count(), 3); + assert_eq!(slice_2.strong_count(), 3); + + drop(slice_1); + assert_eq!(buffer.strong_count(), 2); + assert_eq!(slice_2.strong_count(), 2); + + drop(buffer); + assert_eq!(slice_2.strong_count(), 1); + } + + #[test] + #[ignore = "this is unsound, it just demonstrates that MIRI will find out about it"] + fn miri_finds_it() { + let mut buffer = LinearBuffer::new(3); + + buffer.tail()[0].write(1); + + // we lie about the amount of data written + unsafe { buffer.bump(3) }; + + let bytes = buffer.slice_initialized_part(..); + assert_ne!(bytes.as_ref(), b"xxx".as_slice()); + } + + const fn assert_send() {} + const fn assert_sync() {} + + const _: () = assert_send::(); + const _: () = assert_sync::(); + const _: () = assert_send::(); + const _: () = assert_sync::(); +} diff --git a/meta_data_cache/Cargo.toml b/meta_data_cache/Cargo.toml index d87af7a7..1166d667 100644 --- a/meta_data_cache/Cargo.toml +++ 
b/meta_data_cache/Cargo.toml @@ -22,7 +22,7 @@ futures = { version = "0.3.31" } [dev-dependencies] arrow_util = { path = "../arrow_util" } bytes = "1.10" -tokio = { version = "1.47.1", default-features = false } +tokio = { version = "1.48.0", default-features = false } uuid = { version = "1", features = ["v4"] } [lints] diff --git a/object_store_mem_cache/Cargo.toml b/object_store_mem_cache/Cargo.toml index be94e16b..28b4fa91 100644 --- a/object_store_mem_cache/Cargo.toml +++ b/object_store_mem_cache/Cargo.toml @@ -12,15 +12,17 @@ bytes = { version = "1.10.1", default-features = false } dashmap = "6.1.0" data_types = { path = "../data_types" } futures = { version = "0.3.31" } -indexmap = { version = "2.11", features = ["std"] } +http = { workspace = true } +indexmap = { version = "2.12", features = ["std"] } iox_time = { path = "../iox_time" } +linear_buffer = { path = "../linear_buffer" } metric = { path = "../metric" } object_store.workspace = true object_store_metrics = { path = "../object_store_metrics" } object_store_mock = { path = "../object_store_mock" } object_store_size_hinting = { path = "../object_store_size_hinting" } tracing = { workspace = true } -tokio = { version = "1.47.1", default-features = false } +tokio = { version = "1.48.0", default-features = false } tracker = { path = "../tracker" } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/object_store_mem_cache/src/buffer_channel.rs b/object_store_mem_cache/src/buffer_channel.rs new file mode 100644 index 00000000..d451beb3 --- /dev/null +++ b/object_store_mem_cache/src/buffer_channel.rs @@ -0,0 +1,175 @@ +//! Channel to hand a buffer from an inner store to the in-mem cache. +//! +//! Normally we would just use [`bytes`], however the crate suffers from gate-keeping and even though many users would +//! like to see it, there is currently no proper way to build [`bytes`]-based buffers with proper alignment or a +//! custom vtable. So we work around it. + +use std::{ + pin::Pin, + sync::{ + Arc, Mutex, + atomic::{AtomicBool, Ordering}, + }, + task::{Context, Poll}, +}; + +use futures::FutureExt; +use linear_buffer::Slice; +use tokio::sync::oneshot::{Receiver, Sender, error::RecvError}; + +/// Create channel that can be used ONCE to send a [`Slice`]. +/// +/// The sender may choose not to accept the transfer (by not calling [`accept`](BufferSender::accept)), i.e. if it does +/// not implement buffer handling. +pub fn channel() -> (BufferSender, BufferReceiver) { + let accepted = Arc::new(AtomicBool::new(false)); + let (sender, receiver) = tokio::sync::oneshot::channel(); + let sender = BufferSender { + accepted: Arc::clone(&accepted), + sender: Arc::new(Mutex::new(Some(sender))), + }; + let receiver = BufferReceiver { accepted, receiver }; + (sender, receiver) +} + +/// Sender-side for a [`Slice`]. +/// +/// The sender is clonable so it can be used with [`http::Extensions`], but you must only call [`accept`](Self::accept) +/// at most once. +#[derive(Debug, Clone)] +pub struct BufferSender { + accepted: Arc, + sender: Arc>>>, +} + +impl BufferSender { + /// Accept that we will have a [`Slice`] available at some point. + /// + /// After calling this function, the sender MUST provide a slice at some point. Dropping the returned + /// [handle](BufferSenderAccepted) without doing so will result in an error on the + /// [receiver side](BufferReceiverAccepted). + /// + /// # Panic + /// Across all clones, this method must only be called at most once. 
+ pub fn accept(self) -> BufferSenderAccepted { + let Self { accepted, sender } = self; + let maybe_sender = { + let mut guard = sender.lock().unwrap(); + guard.take() + }; + let sender = maybe_sender.expect("can only accept once"); + accepted.store(true, Ordering::SeqCst); + BufferSenderAccepted { sender } + } +} + +/// Sender-side in an [accepted](BufferSender::accept) state. +#[derive(Debug)] +pub struct BufferSenderAccepted { + sender: Sender, +} + +impl BufferSenderAccepted { + /// Send slice. + pub fn send(self, buffer: Slice) { + let Self { sender } = self; + sender.send(buffer).ok(); + } +} + +/// Receiver side of a [`Slice`]. +#[derive(Debug)] +pub struct BufferReceiver { + accepted: Arc, + receiver: Receiver, +} + +impl BufferReceiver { + pub fn accepted(self) -> Option { + let Self { accepted, receiver } = self; + accepted + .load(Ordering::SeqCst) + .then_some(BufferReceiverAccepted { receiver }) + } +} + +/// Receiver side of the [`Slice`] for which the sender has accepted the transfer. +#[derive(Debug)] +pub struct BufferReceiverAccepted { + receiver: Receiver, +} + +impl Future for BufferReceiverAccepted { + type Output = Result; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.receiver.poll_unpin(cx) + } +} + +#[cfg(test)] +mod test { + use linear_buffer::LinearBuffer; + + use super::*; + + #[test] + #[should_panic(expected = "can only accept once")] + fn panic_accept_twice() { + let (tx, _rx) = channel(); + let tx2 = tx.clone(); + + tx.accept(); + tx2.accept(); + } + + #[tokio::test] + async fn err_accepted_sender_dropped() { + let (tx, rx) = channel(); + let tx = tx.accept(); + let rx = rx.accepted().unwrap(); + drop(tx); + rx.await.unwrap_err(); + } + + #[tokio::test] + async fn accept_accepted_send_receive() { + let buffer = LinearBuffer::new(0); + let slice = buffer.slice_initialized_part(..); + + let (tx, rx) = channel(); + let tx = tx.accept(); + let rx = rx.accepted().unwrap(); + tx.send(slice.clone()); + let slice2 = rx.await.unwrap(); + + assert_eq!( + slice.as_ptr().expose_provenance(), + slice2.as_ptr().expose_provenance(), + ); + } + + #[tokio::test] + async fn accept_send_accepted_receive() { + let buffer = LinearBuffer::new(0); + let slice = buffer.slice_initialized_part(..); + + let (tx, rx) = channel(); + let tx = tx.accept(); + tx.send(slice.clone()); + let rx = rx.accepted().unwrap(); + let slice2 = rx.await.unwrap(); + + assert_eq!( + slice.as_ptr().expose_provenance(), + slice2.as_ptr().expose_provenance(), + ); + } + + #[tokio::test] + async fn not_accepted() { + let (tx, rx) = channel(); + drop(tx); + assert!(rx.accepted().is_none()); + } +} diff --git a/object_store_mem_cache/src/cache_system/mod.rs b/object_store_mem_cache/src/cache_system/mod.rs index cbe76ac8..2a3f841b 100644 --- a/object_store_mem_cache/src/cache_system/mod.rs +++ b/object_store_mem_cache/src/cache_system/mod.rs @@ -86,6 +86,12 @@ where } } +impl HasSize for () { + fn size(&self) -> usize { + 0 + } +} + /// Dynamic error type. pub type DynError = Arc; diff --git a/object_store_mem_cache/src/cache_system/s3_fifo_cache/fifo.rs b/object_store_mem_cache/src/cache_system/s3_fifo_cache/fifo.rs index 9fb36b28..88a4a3b5 100644 --- a/object_store_mem_cache/src/cache_system/s3_fifo_cache/fifo.rs +++ b/object_store_mem_cache/src/cache_system/s3_fifo_cache/fifo.rs @@ -15,10 +15,21 @@ impl Fifo where T: HasSize, { + /// Create a new Fifo from a VecDeque. 
+ pub(crate) fn new(queue: VecDeque<T>) -> Self { + let memory_size = queue.iter().map(|o| o.size()).sum(); + Self { queue, memory_size } + } + pub(crate) fn memory_size(&self) -> usize { self.memory_size } + /// Return the number of items in the queue. + pub(crate) fn len(&self) -> usize { + self.queue.len() + } + pub(crate) fn iter(&self) -> vec_deque::Iter<'_, T> { self.queue.iter() } @@ -37,6 +48,15 @@ where None => None, } } + + /// Drain all elements from the queue, consuming the underlying VecDeque + /// and returning an iterator over the items. + /// + /// This preserves the ordering of elements and avoids re-allocation. + pub(crate) fn drain(&mut self) -> impl Iterator<Item = T> { + self.memory_size = 0; + std::mem::take(&mut self.queue).into_iter() + } } impl Default for Fifo diff --git a/object_store_mem_cache/src/cache_system/s3_fifo_cache/mod.rs b/object_store_mem_cache/src/cache_system/s3_fifo_cache/mod.rs index 1ab24109..ddf11079 100644 --- a/object_store_mem_cache/src/cache_system/s3_fifo_cache/mod.rs +++ b/object_store_mem_cache/src/cache_system/s3_fifo_cache/mod.rs @@ -15,8 +15,8 @@ use tracker::{ AsyncSemaphoreMetrics, InstrumentedAsyncOwnedSemaphorePermit, InstrumentedAsyncSemaphore, }; -// for benchmarks -pub use s3_fifo::{S3Config, S3Fifo}; +// for benchmarks and tests +pub use s3_fifo::{S3Config, S3Fifo, s3_fifo_entry_overhead_size}; use crate::cache_system::{AsyncDrop, DynError, InUse}; @@ -264,9 +264,27 @@ where /// /// Note that the keys listed in the cache are those which have returned from the /// [`CacheFn`] function, i.e. they are the keys that have been successfully fetched. + /// + /// These keys do not have any guaranteed ordering. pub fn list(&self) -> impl Iterator<Item = Arc<K>> { self.cache.keys() } + + /// Evict multiple keys from the S3FifoCache in a blocking manner. + /// + /// This method directly removes entries from the cache without going through + /// the normal eviction process, where the S3-Fifo algorithm decides what to evict. + /// This is useful for cache management operations like repair/validation. + /// + /// This method is blocking, and holds a mutex so that the [`S3Fifo`] state can be + /// replaced in a single operation. + /// + /// Returns the number of keys that were successfully evicted. If a key does not + /// exist in the cache and cannot be evicted, it will be ignored (and the + /// returned count of evicted items will be lower).
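Since keys that are not currently resident (for example, keys that only live in the ghost queue) are skipped rather than reported as errors, callers should treat the return value as the number of entries actually dropped. A small usage sketch; the concrete `S3FifoCache<Arc<str>, Arc<str>, ()>` instantiation mirrors the tests further down, and the import path plus the `repair` helper name are assumptions for illustration:

```rust
use std::sync::Arc;

// `cache` is an already-populated S3FifoCache<Arc<str>, Arc<str>, ()>, as
// constructed in the tests below; the exact module path is assumed here.
fn repair(cache: &S3FifoCache<Arc<str>, Arc<str>, ()>, suspect_keys: Vec<Arc<str>>) {
    let requested = suspect_keys.len();

    // Blocking call: takes the internal mutex once and rebuilds the queues.
    let evicted = cache.evict_keys(suspect_keys.into_iter());

    if evicted < requested {
        // The remaining keys were not resident (e.g. ghost-only) and were ignored.
        println!("evicted {evicted} of {requested} suspect keys");
    }
}
```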
+ pub fn evict_keys(&self, keys: impl Iterator) -> usize { + self.cache.remove_keys(keys) + } } #[async_trait] @@ -1135,4 +1153,256 @@ mod tests { let result2 = res2.await.unwrap(); assert_eq!(result2, Arc::from("value2")); } + + #[tokio::test] + async fn test_evict_keys_small_queue() { + let hook = Arc::new(TestHook::default()); + let cache = S3FifoCache::, Arc, ()>::new( + S3Config { + max_memory_size: 1000, + max_ghost_memory_size: 500, + move_to_main_threshold: 0.1, + hook: Arc::clone(&hook) as _, + inflight_bytes: 250, + }, + &metric::Registry::new(), + ); + + // Insert 5 keys in order: key1, key2, key3, key4, key5 + let keys = vec!["key1", "key2", "key3", "key4", "key5"]; + let mut inserted_keys = Vec::new(); + + for key_str in &keys { + let key = Arc::from(*key_str); + let value = Arc::from(format!("value_{}", key_str)); + inserted_keys.push(Arc::clone(&key)); + + let (res, _, state) = cache.get_or_fetch( + &key, + Box::new({ + let value = Arc::clone(&value); + move || futures::future::ready(Ok(value)).boxed() + }), + (), + Some(value.size()), + ); + assert_eq!(state, CacheState::NewEntry); + res.await.unwrap(); + } + + // Verify all keys are in the cache + assert_eq!(cache.len(), 5); + for key in &inserted_keys { + assert!(cache.get(key).is_some(), "Key {key:?} should be in cache"); + } + + // Get list of keys before eviction to verify ordering preservation + let keys_before: Vec> = cache + .cache + .small_queue_keys() + .into_iter() + .map(Arc::unwrap_or_clone) + .collect(); + assert_eq!(keys_before.len(), 5, "Should have 5 keys before eviction"); + + // Confirm have empty ghost queue + assert_eq!(cache.cache.ghost_len(), 0, "Ghost queue should be empty"); + + // Evict only key2 and key4 (selective eviction) + let keys_to_evict = vec![ + Arc::clone(&inserted_keys[1]), // key2 + Arc::clone(&inserted_keys[3]), // key4 + ]; + let evicted_count = cache.evict_keys(keys_to_evict.clone().into_iter()); + assert_eq!(evicted_count, 2, "Should have evicted exactly 2 keys"); + + // Verify cache size is reduced + assert_eq!( + cache.len(), + 3, + "Cache should contain 3 entries after eviction" + ); + + // Get list of keys after eviction + let keys_after: Vec> = cache + .cache + .small_queue_keys() + .into_iter() + .map(Arc::unwrap_or_clone) + .collect(); + let expected_remaining_keys: Vec> = keys_before + .into_iter() + .filter(|key| !keys_to_evict.contains(key)) + .collect(); + assert_eq!( + keys_after, expected_remaining_keys, + "Remaining keys should match expected keys, and retain the same ordering" + ); + + // Check that evicted keys are removed from S3Fifo::entries + for evicted_key in &keys_to_evict { + assert!( + !cache.cache.contains_key_in_entries(evicted_key), + "Evicted key {evicted_key:?} should be removed from entries" + ); + } + + // Check that remaining keys are still in S3Fifo::entries + for remaining_key in &expected_remaining_keys { + assert!( + cache.cache.contains_key_in_entries(remaining_key), + "Remaining key {remaining_key:?} should still be in entries" + ); + } + + // Check ghost queue is still empty + assert_eq!( + cache.cache.ghost_len(), + 0, + "Ghost queue should remain empty" + ); + } + + #[tokio::test] + async fn test_evict_keys_main_queue_and_ghost() { + let hook = Arc::new(TestHook::default()); + let cache = S3FifoCache::, Arc, ()>::new( + S3Config { + max_memory_size: 150 + 100, + max_ghost_memory_size: 150, + move_to_main_threshold: 0.3, + hook: Arc::clone(&hook) as _, + inflight_bytes: 50, + }, + &metric::Registry::new(), + ); + + // Insert 6 keys in order: 
key1, key2, key3, key4, key5, key6 + let keys = vec!["key1", "key2", "key3", "key4", "key5", "key6"]; + let mut inserted_keys = Vec::new(); + + for key_str in &keys { + let key = Arc::from(*key_str); + let value = Arc::from(format!("value_{}", key_str)); + inserted_keys.push(Arc::clone(&key)); + + let (res, _, state) = cache.get_or_fetch( + &key, + Box::new({ + let value = Arc::clone(&value); + move || futures::future::ready(Ok(value)).boxed() + }), + (), + Some(value.size()), + ); + assert_eq!(state, CacheState::NewEntry); + res.await.unwrap(); + } + + // Verify cache only has the last 3 keys (key4, key5, key6) due to eviction + assert_eq!(cache.len(), 3, "Cache should contain exactly 3 entries"); + + // The first 3 keys should have been evicted and logged in the ghost + assert_eq!(cache.cache.ghost_len(), 3, "Ghost should have 3 entries"); + + // Re-insert the first 3 keys (key1, key2, key3) + for i in 0..3 { + let key = Arc::clone(&inserted_keys[i]); + let value = Arc::from(format!("value_{}", keys[i])); + + let (res, _, state) = cache.get_or_fetch( + &key, + Box::new({ + let value = Arc::clone(&value); + move || futures::future::ready(Ok(value)).boxed() + }), + (), + Some(value.size()), + ); + assert_eq!(state, CacheState::NewEntry); + res.await.unwrap(); + } + + // Verify the first 3 keys are now in the main queue (since they were in ghost) + let main_queue_keys = cache.cache.main_queue_keys(); + assert_eq!( + main_queue_keys, + vec![ + Arc::new(Arc::clone(&inserted_keys[0])), + Arc::new(Arc::clone(&inserted_keys[1])), + Arc::new(Arc::clone(&inserted_keys[2])), + ] + ); + + // Verify they are no longer in the ghost + assert!( + !cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[0]))), + "key1 should no longer be in ghost" + ); + assert!( + !cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[1]))), + "key2 should no longer be in ghost" + ); + assert!( + !cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[2]))), + "key3 should no longer be in ghost" + ); + // Instead, we have key4 & key5 & key6 in the ghost + assert_eq!( + cache.cache.ghost_len(), + 3, + "Ghost should have 3 NEW entries" + ); + assert!( + cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[3]))), + "key4 should be in ghost" + ); + + // Evict key1 (main queue) and key4 (ghost) from the cache + let keys_to_evict = vec![Arc::clone(&inserted_keys[0]), Arc::clone(&inserted_keys[3])]; // key1, key4 + let evicted_count = cache.evict_keys(keys_to_evict.clone().into_iter()); + assert_eq!( + evicted_count, 1, + "Should have evicted exactly 1 key -- since only 1 is currently in the queue" + ); + + // Verify key1 is removed from main queue + let main_queue_keys_after = cache.cache.main_queue_keys(); + assert!( + !main_queue_keys_after.contains(&Arc::new(Arc::clone(&inserted_keys[0]))), + "key1 should be removed from main queue" + ); + + // Verify key1 is removed from entries (should not be in cache anymore) + assert!( + !cache.cache.contains_key_in_entries(&inserted_keys[0]), + "key1 should be removed from entries" + ); + + // Verify key2 & key 3 are still in main queue, as well as the ordering is retained. 
+ assert_eq!( + main_queue_keys_after, + vec![ + Arc::new(Arc::clone(&inserted_keys[1])), + Arc::new(Arc::clone(&inserted_keys[2])), + ], + "key2 & key3 should still be in main queue" + ); + + // Verify key4 is still in ghost queue (should remain there) + assert!( + cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[3]))), + "key4 should still be in ghost after eviction" + ); + } } diff --git a/object_store_mem_cache/src/cache_system/s3_fifo_cache/ordered_set.rs b/object_store_mem_cache/src/cache_system/s3_fifo_cache/ordered_set.rs index a4dfc6ce..85eea7da 100644 --- a/object_store_mem_cache/src/cache_system/s3_fifo_cache/ordered_set.rs +++ b/object_store_mem_cache/src/cache_system/s3_fifo_cache/ordered_set.rs @@ -139,6 +139,11 @@ where pub(crate) fn len(&self) -> usize { self.set.len() } + + #[cfg(test)] + pub(crate) fn contains(&self, o: &T) -> bool { + self.set.contains(&Entry::Data(o)) + } } /// Encode implementation, with trait bounds for `T`. diff --git a/object_store_mem_cache/src/cache_system/s3_fifo_cache/s3_fifo.rs b/object_store_mem_cache/src/cache_system/s3_fifo_cache/s3_fifo.rs index bd923005..78c82672 100644 --- a/object_store_mem_cache/src/cache_system/s3_fifo_cache/s3_fifo.rs +++ b/object_store_mem_cache/src/cache_system/s3_fifo_cache/s3_fifo.rs @@ -1,6 +1,7 @@ use bincode::{Decode, Encode}; use dashmap::DashMap; use std::{ + collections::{HashSet, VecDeque}, fmt::{Debug, Formatter}, hash::Hash, sync::{ @@ -123,6 +124,23 @@ where } } +/// Returns the overhead size of the [`S3FifoEntry`] +/// placed into the S3 Fifo cache manager. +/// +/// This is useful for testing, since it's the size used +/// for eviction decisions. +pub fn s3_fifo_entry_overhead_size() -> usize { + // The overhead size is the size of the S3FifoEntry struct, + // which is used to store the cache entry in the S3 FIFO cache manager. + Arc::new(S3FifoEntry { + key: Arc::new(()), + value: Arc::new(()), + generation: 0, + freq: AtomicU8::new(0), + }) + .size() +} + pub(crate) type CacheEntry = Arc>; type Entries = DashMap, CacheEntry>; pub(crate) type Evicted = Vec>>; @@ -356,6 +374,33 @@ where self.entries.iter().map(|entry| Arc::clone(entry.key())) } + /// Remove multiple keys from the cache, in a blocking manner. + /// + /// This method directly removes entries from the cache without going through + /// the normal eviction process. This is useful for cache management operations + /// like repair/validation. + /// + /// Returns the number of keys that were successfully removed. If a key does not + /// exist in the cache and cannot be removed, it will be ignored (and the returned count + /// of removed items will be lower). + pub fn remove_keys(&self, keys: impl Iterator) -> usize + where + K: Sized + Clone + Debug, + { + let mut guard = self.locked_state.lock(); + + // Remove keys from the entries map + let to_remove_from_state: HashSet = keys + .filter_map(|k| self.entries.remove(&k).map(|_| k)) + .collect(); + + // Remove from locked state. + let count_removed = guard.remove_keys(&to_remove_from_state); + drop(guard); + + count_removed + } + /// Create a snapshot of the locked state. 
/// /// This function serializes the [`S3Fifo`] inner state using bincode, allowing for @@ -423,6 +468,38 @@ where let guard = self.locked_state.lock(); guard.ghost.len() } + + #[cfg(test)] + pub(crate) fn small_queue_keys(&self) -> Vec> { + let guard = self.locked_state.lock(); + guard + .small + .iter() + .map(|entry| Arc::clone(&entry.key)) + .collect() + } + + #[cfg(test)] + pub(crate) fn main_queue_keys(&self) -> Vec> { + let guard = self.locked_state.lock(); + guard + .main + .iter() + .map(|entry| Arc::clone(&entry.key)) + .collect() + } + + #[cfg(test)] + pub(crate) fn contains_key_in_entries(&self, key: &K) -> bool { + self.entries.contains_key(key) + } + + #[cfg(test)] + pub(crate) fn contains_key_in_ghost(&self, key: &Arc) -> bool { + let guard = self.locked_state.lock(); + // The ghost stores Arc, so we need to check by content + guard.ghost.contains(key) + } } /// Calls [`drop`] but isn't inlined, so it is easier to see on profiles. @@ -734,6 +811,41 @@ where } } } + + /// Remove multiple keys from the small and main queues. + /// + /// This method efficiently removes multiple keys by iterating through each queue once. + /// It first checks the small queue for all keys, then checks the main queue for any + /// remaining keys that weren't found in the small queue. + /// + /// Returns the number of keys that were successfully removed. If a key does not + /// exist in the cache and cannot be removed, it will be ignored (and the returned count + /// of removed items will be lower). + fn remove_keys(&mut self, keys_to_remove: &HashSet) -> usize + where + K: Sized + Clone + Debug, + { + let initial_count = self.small.len() + self.main.len(); + + // Remove from small queue + let filtered_small: VecDeque<_> = self + .small + .drain() + .filter(|entry| !keys_to_remove.contains(entry.key.as_ref())) + .collect(); + self.small = Fifo::new(filtered_small); + + // Remove from main queue + let filtered_main: VecDeque<_> = self + .main + .drain() + .filter(|entry| !keys_to_remove.contains(entry.key.as_ref())) + .collect(); + self.main = Fifo::new(filtered_main); + + // Return the number of keys that were actually removed + initial_count - (self.small.len() + self.main.len()) + } } #[cfg(test)] diff --git a/object_store_mem_cache/src/lib.rs b/object_store_mem_cache/src/lib.rs index faefcf54..90e7094f 100644 --- a/object_store_mem_cache/src/lib.rs +++ b/object_store_mem_cache/src/lib.rs @@ -7,6 +7,7 @@ use clap as _; use rand as _; use workspace_hack as _; +pub mod buffer_channel; pub mod cache_system; pub mod object_store_cache_tests; pub mod object_store_helpers; diff --git a/object_store_mem_cache/src/object_store_cache_tests.rs b/object_store_mem_cache/src/object_store_cache_tests.rs index 13af15aa..7bf18d3c 100644 --- a/object_store_mem_cache/src/object_store_cache_tests.rs +++ b/object_store_mem_cache/src/object_store_cache_tests.rs @@ -2,8 +2,10 @@ use std::sync::Arc; use bytes::Bytes; use futures::future::BoxFuture; +use http::Extensions; use object_store::{ - DynObjectStore, Error, GetResult, GetResultPayload, ObjectMeta, PutPayload, path::Path, + DynObjectStore, Error, GetOptions, GetResult, GetResultPayload, ObjectMeta, PutPayload, + path::Path, }; /// Abstract test setup. @@ -25,6 +27,11 @@ pub trait Setup: Send { /// /// This store MUST reject writes. fn outer(&self) -> &Arc; + + /// Extensions used by the store. 
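Test setups that want the zero-copy buffer path exercised can override this hook so that every mocked `get_opts` call carries a `BufferSender`. A sketch of such an override, assuming the `buffer_channel` API from this diff (the mem-cache test setup further down does essentially the same; the function name here is just illustrative):

```rust
use http::Extensions;
use object_store_mem_cache::buffer_channel;

// Hypothetical override for a test `Setup` implementation: attach a
// BufferSender so the shared store tests see it in the request extensions.
fn extensions_with_buffer_sender() -> Extensions {
    let mut ext = Extensions::default();
    let (tx, _rx) = buffer_channel::channel();
    ext.insert(tx);
    ext
}
```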
+ fn extensions(&self) -> Extensions { + Default::default() + } } fn get_result(data: &'static [u8], path: &Path) -> GetResult { @@ -53,14 +60,19 @@ where let location_a = Path::parse("x").unwrap(); let location_b = Path::parse("y").unwrap(); + let get_ops = GetOptions { + extensions: setup.extensions(), + ..Default::default() + }; + Arc::clone(setup.inner()) .mock_next(object_store_mock::MockCall::GetOpts { - params: (location_a.clone(), Default::default()), + params: (location_a.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"foo", &location_a)), }) .mock_next(object_store_mock::MockCall::GetOpts { - params: (location_b.clone(), Default::default()), + params: (location_b.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"bar", &location_b)), }); @@ -107,14 +119,19 @@ where let location_a = Path::parse("x").unwrap(); let location_b = Path::parse("y").unwrap(); + let get_ops = GetOptions { + extensions: setup.extensions(), + ..Default::default() + }; + Arc::clone(setup.inner()) .mock_next(object_store_mock::MockCall::GetOpts { - params: (location_a.clone(), Default::default()), + params: (location_a.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"foo", &location_a)), }) .mock_next(object_store_mock::MockCall::GetOpts { - params: (location_b.clone(), Default::default()), + params: (location_b.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"bar", &location_b)), }); @@ -147,8 +164,13 @@ where let location = Path::parse("x").unwrap(); + let get_ops = GetOptions { + extensions: setup.extensions(), + ..Default::default() + }; + Arc::clone(setup.inner()).mock_next(object_store_mock::MockCall::GetOpts { - params: (location.clone(), Default::default()), + params: (location.clone(), get_ops.clone().into()), barriers: vec![], res: Err(Error::NotFound { path: location.to_string(), @@ -171,8 +193,13 @@ where let location = Path::parse("x").unwrap(); + let get_ops = GetOptions { + extensions: setup.extensions(), + ..Default::default() + }; + Arc::clone(setup.inner()).mock_next(object_store_mock::MockCall::GetOpts { - params: (location.clone(), Default::default()), + params: (location.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"foo", &location)), }); @@ -185,9 +212,9 @@ where assert_eq!(data_1.as_ref(), b"foo"); let res_2 = setup.outer().get(&location).await.unwrap(); - assert_eq!( + assert_ne!( CacheState::try_from(res_2.attributes.get(&ATTR_CACHE_STATE).unwrap()).unwrap(), - CacheState::WasCached, + CacheState::NewEntry, // should be loading, or in cache ); let data_2 = res_2.bytes().await.unwrap(); assert_eq!(data_1, data_2); @@ -221,8 +248,11 @@ where let location = Path::parse("x").unwrap(); let data = b"foo"; + let mut get_ops = hint_size(data.len() as u64); + get_ops.extensions.extend(setup.extensions()); + Arc::clone(setup.inner()).mock_next(object_store_mock::MockCall::GetOpts { - params: (location.clone(), hint_size(data.len() as u64).into()), + params: (location.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(data, &location)), }); diff --git a/object_store_mem_cache/src/store.rs b/object_store_mem_cache/src/store.rs index 486c30c3..ffba3573 100644 --- a/object_store_mem_cache/src/store.rs +++ b/object_store_mem_cache/src/store.rs @@ -3,6 +3,7 @@ use std::{num::NonZeroUsize, ops::Range, sync::Arc}; use async_trait::async_trait; use bytes::Bytes; use futures::{FutureExt, StreamExt, TryStreamExt, stream::BoxStream}; +use linear_buffer::Slice; use 
metric::U64Counter; use object_store::{ AttributeValue, Attributes, DynObjectStore, Error, GetOptions, GetResult, GetResultPayload, @@ -27,9 +28,38 @@ use crate::{ const CACHE_NAME: &str = "object_store"; const STORE_NAME: &str = "mem_cache"; +#[derive(Debug)] +enum CacheValueData { + Owned(Bytes), + Shared(Slice), +} + +impl CacheValueData { + fn size(&self) -> usize { + match self { + Self::Owned(bytes) => bytes.len(), + Self::Shared(slice) => slice.allocation_size(), + } + } + + fn as_bytes(&self) -> Bytes { + match self { + Self::Owned(bytes) => bytes.clone(), + Self::Shared(slice) => Bytes::from_owner(slice.clone()), + } + } + + fn is_unique(&self) -> bool { + match self { + Self::Owned(bytes) => bytes.is_unique(), + Self::Shared(slice) => slice.strong_count() == 1, + } + } +} + #[derive(Debug)] struct CacheValue { - data: Bytes, + data: CacheValueData, meta: ObjectMeta, } @@ -39,29 +69,48 @@ impl CacheValue { location: &Path, size_hint: Option, ) -> Result { - let options = match size_hint { + let mut options = match size_hint { Some(size) => hint_size(size), None => GetOptions::default(), }; + + let (buffer_tx, buffer_rx) = crate::buffer_channel::channel(); + options.extensions.insert(buffer_tx); + let res = store.get_opts(location, options).await?; let meta = res.meta.clone(); - // HACK: `Bytes` is a view-based type and may reference and underlying larger buffer. Maybe that causes - // https://github.com/influxdata/influxdb_iox/issues/13765 (there it was a catalog issue, but we - // seem to have a similar issue with the disk cache interaction?) . So we "unshare" the buffer by - // round-tripping it through an owned type. - // - // We try to be clever by creating 1 "landing buffer" instead of using `res.bytes()` and then an - // additional clone. See https://github.com/influxdata/influxdb_iox/issues/15078#issuecomment-3223376485 - let mut stream = res.into_stream(); - let mut buffer = Vec::with_capacity(meta.size as usize); - while let Some(next) = stream.try_next().await? { - buffer.extend_from_slice(&next); - } - let data = buffer.into(); + let data = if let Some(buffer_rx) = buffer_rx.accepted() { + // drain stream because metric wrappers might depend on it + let mut stream = res.into_stream(); + while stream.try_next().await?.is_some() {} + + CacheValueData::Shared(buffer_rx.await.map_err(|e| Error::Generic { + store: STORE_NAME, + source: Box::new(e), + })?) + } else { + // HACK: `Bytes` is a view-based type and may reference and underlying larger buffer. Maybe that causes + // https://github.com/influxdata/influxdb_iox/issues/13765 (there it was a catalog issue, but we + // seem to have a similar issue with the disk cache interaction?) . So we "unshare" the buffer by + // round-tripping it through an owned type. + // + // We try to be clever by creating 1 "landing buffer" instead of using `res.bytes()` and then an + // additional clone. See https://github.com/influxdata/influxdb_iox/issues/15078#issuecomment-3223376485 + let mut stream = res.into_stream(); + let mut buffer = Vec::with_capacity(meta.size as usize); + while let Some(next) = stream.try_next().await? 
{ + buffer.extend_from_slice(&next); + } + CacheValueData::Owned(buffer.into()) + }; Ok(Self { data, meta }) } + + fn data(&self) -> Bytes { + self.data.as_bytes() + } } impl HasSize for CacheValue { @@ -75,7 +124,7 @@ impl HasSize for CacheValue { version, } = meta; - data.len() + data.size() + location.as_ref().len() + e_tag.as_ref().map(|s| s.capacity()).unwrap_or_default() + version.as_ref().map(|s| s.capacity()).unwrap_or_default() @@ -269,11 +318,13 @@ impl ObjectStore for MemCacheObjectStore { } let (v, state) = self.get_or_fetch(location, size_hint).await?; + let data = v.data(); + let data_len = data.len(); Ok(GetResult { - payload: GetResultPayload::Stream(futures::stream::iter([Ok(v.data.clone())]).boxed()), + payload: GetResultPayload::Stream(futures::stream::iter([Ok(data)]).boxed()), meta: v.meta.clone(), - range: 0..(v.data.len() as u64), + range: 0..(data_len as u64), attributes: Attributes::from_iter([(ATTR_CACHE_STATE, AttributeValue::from(state))]), }) } @@ -289,17 +340,18 @@ impl ObjectStore for MemCacheObjectStore { async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let (v, _state) = self.get_or_fetch(location, None).await?; + let data = v.data(); ranges .iter() .map(|range| { - if range.end > (v.data.len() as u64) { + if range.end > (data.len() as u64) { return Err(Error::Generic { store: STORE_NAME, source: format!( "Range end ({}) out of bounds, object size is {}", range.end, - v.data.len() + data.len(), ) .into(), }); @@ -314,7 +366,7 @@ impl ObjectStore for MemCacheObjectStore { .into(), }); } - Ok(v.data.slice((range.start as usize)..(range.end as usize))) + Ok(data.slice((range.start as usize)..(range.end as usize))) }) .collect() } @@ -374,9 +426,12 @@ impl ObjectStore for MemCacheObjectStore { #[cfg(test)] mod tests { use futures::FutureExt; - use object_store_mock::MockStore; + use http::Extensions; + use linear_buffer::{LinearBuffer, LinearBufferExtend}; + use object_store_mock::{MockCall, MockParam, MockStore, path}; + use tokio::sync::Barrier; - use crate::{gen_store_tests, object_store_cache_tests::Setup}; + use crate::{buffer_channel::BufferSender, gen_store_tests, object_store_cache_tests::Setup}; use super::*; @@ -414,7 +469,129 @@ mod tests { fn outer(&self) -> &Arc { &self.store } + + fn extensions(&self) -> Extensions { + let mut ext = Extensions::default(); + let (tx, _rx) = crate::buffer_channel::channel(); + ext.insert(tx); + ext + } } gen_store_tests!(TestSetup); + + #[tokio::test] + async fn test_cache_value_buffer_copy() { + let location = path(); + let data = Bytes::from(b"foobar".to_vec()); + + let (tx, _rx) = crate::buffer_channel::channel(); + let mut get_ops = GetOptions::default(); + get_ops.extensions.insert(tx); + + let store = MockStore::new() + .mock_next(MockCall::GetOpts { + params: (location.clone(), get_ops.clone().into()), + barriers: vec![], + res: Ok(GetResult { + payload: GetResultPayload::Stream( + futures::stream::iter([Ok(data.clone())]).boxed(), + ), + meta: meta(&location, &data), + range: 0..(data.len() as u64), + attributes: Default::default(), + }), + }) + .as_store(); + + let value = CacheValue::fetch(&store, &location, None).await.unwrap(); + assert!(!value.in_use()); + + let slice = value.data(); + assert_eq!(slice, data); + assert_ne!( + slice.as_ptr().expose_provenance(), + data.as_ptr().expose_provenance(), + "data was copied", + ); + assert!(value.in_use()); + + drop(slice); + assert!(!value.in_use()); + } + + #[tokio::test] + async fn test_cache_value_buffer_nocopy() { + let location = 
path(); + let data = Bytes::from(b"foobar".to_vec()); + + const OVERALLOCATE: usize = 10; + let mut buffer = LinearBuffer::new(data.len() + OVERALLOCATE); + buffer.append(&data); + + let (tx, _rx) = crate::buffer_channel::channel(); + let mut get_ops = GetOptions::default(); + get_ops.extensions.insert(tx); + + let barrier = Arc::new(Barrier::new(2)); + + let store = MockStore::new().mock_next(MockCall::GetOpts { + params: (location.clone(), get_ops.clone().into()), + barriers: vec![Arc::clone(&barrier)], + res: Ok(GetResult { + payload: GetResultPayload::Stream( + futures::stream::iter([Ok(data.clone())]).boxed(), + ), + meta: meta(&location, &data), + range: 0..(data.len() as u64), + attributes: Default::default(), + }), + }); + let mut store_params = store.observed_params(); + let store = store.as_store(); + + let fut_value = async { CacheValue::fetch(&store, &location, None).await.unwrap() }; + let fut_buffer = async { + let param = store_params.recv().await.unwrap(); + let MockParam::GetOpts((_path, get_options)) = param else { + unreachable!() + }; + let tx = get_options.extensions.get::().unwrap(); + let tx = tx.clone().accept(); + tx.send(buffer.slice_initialized_part(0..data.len())); + barrier.wait().await; + }; + + let (value, ()) = tokio::join!(fut_value, fut_buffer); + assert!(value.in_use()); + + let buffer_ptr = buffer + .slice_initialized_part(0..0) + .as_ptr() + .expose_provenance(); + drop(buffer); + assert!(!value.in_use()); + + let slice = value.data(); + assert_eq!(slice, data); + assert_eq!( + slice.as_ptr().expose_provenance(), + buffer_ptr, + "data was NOT copied", + ); + assert!(value.in_use()); + + drop(slice); + assert!(!value.in_use()); + } + + fn meta(location: &Path, data: &[u8]) -> ObjectMeta { + ObjectMeta { + location: location.clone(), + last_modified: Default::default(), + size: data.len() as u64, + e_tag: None, + version: None, + } + } } diff --git a/object_store_metrics/Cargo.toml b/object_store_metrics/Cargo.toml index 15ddb150..02b070ab 100644 --- a/object_store_metrics/Cargo.toml +++ b/object_store_metrics/Cargo.toml @@ -18,7 +18,7 @@ metric = { version = "0.1.0", path = "../metric" } object_store = { workspace = true } tracing = { workspace = true } pin-project = "1.1.10" -tokio = { version = "1.47", features = ["io-util"] } +tokio = { version = "1.48", features = ["io-util"] } tracker = { path = "../tracker" } workspace-hack = { version = "0.1", path = "../workspace-hack" } @@ -27,7 +27,7 @@ futures_test_utils = { path = "../futures_test_utils" } insta = { version = "1", features = ["yaml"] } object_store_mem_cache = { path = "../object_store_mem_cache" } object_store_mock = { version = "0.1", path = "../object_store_mock" } -rust_decimal = "1.38.0" -tempfile = "3.22.0" +rust_decimal = "1.39.0" +tempfile = "3.23.0" test_helpers = { path = "../test_helpers" } -tokio = { version = "1.47", features = ["macros", "io-util"] } +tokio = { version = "1.48", features = ["macros", "io-util"] } diff --git a/object_store_metrics/src/cache_metrics.rs b/object_store_metrics/src/cache_metrics.rs index 73222902..65fded09 100644 --- a/object_store_metrics/src/cache_metrics.rs +++ b/object_store_metrics/src/cache_metrics.rs @@ -1061,10 +1061,15 @@ mod tests { let capture = capture(); let location = path(); + + let mut get_opts = GetOptions::default(); + let (tx, _rx) = object_store_mem_cache::buffer_channel::channel(); + get_opts.extensions.insert(tx); + let barrier = Arc::new(Barrier::new(2)); let inner: Arc = MockStore::new() .mock_next(GetOpts { - params: 
(location.clone(), Default::default()), + params: (location.clone(), get_opts.into()), barriers: vec![Arc::clone(&barrier)], res: Ok(get_result_stream()), }) diff --git a/object_store_mock/Cargo.toml b/object_store_mock/Cargo.toml index af5b0c83..45e1ce29 100644 --- a/object_store_mock/Cargo.toml +++ b/object_store_mock/Cargo.toml @@ -12,7 +12,7 @@ async-trait = { version = "0.1.89", default-features = false } bytes = { version = "1.10.1", default-features = false } futures = { version = "0.3.31" } object_store.workspace = true -tokio = { version = "1.47.1", default-features = false, features = [ +tokio = { version = "1.48.0", default-features = false, features = [ "macros", "rt-multi-thread", ] } diff --git a/object_store_mock/src/lib.rs b/object_store_mock/src/lib.rs index 50e62c7b..aa6f6af0 100644 --- a/object_store_mock/src/lib.rs +++ b/object_store_mock/src/lib.rs @@ -1,6 +1,6 @@ use std::{ fmt::Display, - ops::Range, + ops::{Deref, Range}, sync::{Arc, Mutex}, }; @@ -13,7 +13,10 @@ use object_store::{ GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, path::Path, }; -use tokio::sync::Barrier; +use tokio::sync::{ + Barrier, + mpsc::{UnboundedReceiver, UnboundedSender}, +}; // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -63,6 +66,12 @@ impl From for WrappedGetOptions { } } +impl From for GetOptions { + fn from(options: WrappedGetOptions) -> Self { + options.0 + } +} + impl Clone for WrappedGetOptions { fn clone(&self) -> Self { Self(GetOptions { @@ -78,6 +87,14 @@ impl Clone for WrappedGetOptions { } } +impl Deref for WrappedGetOptions { + type Target = GetOptions; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// Wrapper for PutPayload that implements PartialEQ #[derive(Debug, Clone)] pub struct PutPayloadWrapper(PutPayload); @@ -172,6 +189,17 @@ macro_rules! calls { } } } + + #[derive(Debug)] + #[expect( + unused_parens, + reason = "a single param will expand to ($param)" + )] + pub enum MockParam { + $( + $name (($($param),*),), + )* + } }; } @@ -279,9 +307,46 @@ struct MockStoreState { index_counter: usize, } -#[derive(Debug, Default)] pub struct MockStore { state: Mutex, + tx: UnboundedSender, + rx: Mutex>>, +} + +impl Default for MockStore { + fn default() -> Self { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + + Self { + state: Default::default(), + tx, + rx: Mutex::new(Some(rx)), + } + } +} + +impl std::fmt::Debug for MockStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + state, + tx: _, + rx: _, + } = self; + + match state.try_lock() { + Ok(state) => { + let MockStoreState { + calls, + index_counter, + } = state.deref(); + f.debug_struct("MockStore") + .field("calls", calls) + .field("index_counter", index_counter) + .finish_non_exhaustive() + } + Err(_) => f.debug_struct("MockStore").finish_non_exhaustive(), + } + } } impl Drop for MockStore { @@ -327,6 +392,18 @@ impl MockStore { pub fn as_store(self: Arc) -> Arc { self as Arc } + + /// Get receiver for mocked operations. + /// + /// The data will be sent BEFORE barriers are passed and contains the original instance of the parameters, not the + /// one passed to [`mock_next`](Self::mock_next)/[`mock_next_multi`](Self::mock_next_multi). + /// + /// # Panic + /// Since the parameters are not [`Clone`]able, you can only extract the receiver once. 
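Because the observed parameters are delivered before any barrier is awaited, a test can inspect exactly what the caller passed (for example, to pull a `BufferSender` out of the `GetOptions` extensions) while the mocked call is still blocked. A sketch of that pattern; the `store`, `path`, and `barrier` arguments are assumed to come from a mock configured with a single default-options `GetOpts` expectation guarded by one barrier, as in the tests below, and the helper name is hypothetical:

```rust
use std::sync::Arc;

use object_store::{GetOptions, ObjectStore, path::Path};
use object_store_mock::{MockParam, MockStore};
use tokio::sync::Barrier;

async fn inspect_blocked_get(store: Arc<MockStore>, path: Path, barrier: Arc<Barrier>) {
    let mut observed = store.observed_params();

    // The mocked call will block on the barrier once polled.
    let do_get = async {
        store.get_opts(&path, GetOptions::default()).await.unwrap();
    };

    let inspect = async {
        // The parameters arrive before the barrier is released ...
        let MockParam::GetOpts((observed_path, _observed_options)) =
            observed.recv().await.unwrap()
        else {
            unreachable!("expected a GetOpts call");
        };
        assert_eq!(observed_path, path);

        // ... so the test can react first and only then unblock the store.
        barrier.wait().await;
    };

    tokio::join!(do_get, inspect);
}
```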
+ pub fn observed_params(&self) -> UnboundedReceiver { + let maybe_rx = { self.rx.lock().unwrap().take() }; + maybe_rx.expect("cannot take receiver twice") + } } macro_rules! barrier_wait { @@ -383,6 +460,8 @@ macro_rules! mock { params, ); + $self.tx.send(MockParam::$variant(actual)).ok(); + let res = res.into(); if barriers.is_empty() { @@ -635,7 +714,7 @@ mod tests { fn test_debug() { assert_eq!( format!("{:?}", MockStore::new()), - "MockStore { state: Mutex { data: MockStoreState { calls: [], index_counter: 0 }, poisoned: false, .. } }", + "MockStore { calls: [], index_counter: 0, .. }", ); } @@ -814,6 +893,72 @@ mod tests { assert!(stream.next().await.is_none()); } + #[test] + #[should_panic(expected = "cannot take receiver twice")] + fn test_take_param_reciever_twice() { + let store = MockStore::new(); + store.observed_params(); + store.observed_params(); + } + + /// There are two parameters created: + /// + /// 1. the one for the [`MockCall`] that is tested for "equality" + /// 2. the one that the API user of [`ObjectStore`] passes into the respective trait method. + /// + /// The [`MockStore::observed_params`] should return (2), so a test can use it for various things. Returning (1) + /// would be redundant because the mock/test setup actually had that parameter at hand already. + #[tokio::test] + async fn test_param_receiver_has_original_instance() { + let payload_1 = PutPayload::from_bytes(Bytes::from(b"foo".to_vec())); + let payload_1_ptr = payload_1.as_ref().as_ptr().expose_provenance(); + let store = MockStore::new().mock_next(MockCall::Put { + params: (path(), payload_1.clone().into()), + barriers: vec![], + res: Ok(PutResult { + e_tag: None, + version: None, + }), + }); + + let payload_2 = PutPayload::from_bytes(Bytes::from(b"foo".to_vec())); + let payload_2_ptr = payload_2.as_ref().as_ptr().expose_provenance(); + assert_ne!(payload_1_ptr, payload_2_ptr); + store.put(&path(), payload_2.clone()).await.unwrap(); + + let MockParam::Put((_path, payload_3)) = store.observed_params().recv().await.unwrap() + else { + unreachable!() + }; + let payload_3_ptr = payload_3.0.as_ref().as_ptr().expose_provenance(); + assert_eq!(payload_2_ptr, payload_3_ptr); + } + + #[tokio::test] + async fn test_param_receiver_gets_data_before_barrier() { + let barrier = Arc::new(Barrier::new(2)); + let store = MockStore::new().mock_next(MockCall::Copy { + params: (path(), path()), + barriers: vec![Arc::clone(&barrier)], + res: Ok(()), + }); + + let mut recv = store.observed_params(); + + let path = path(); + let mut fut = store.copy(&path, &path); + fut.assert_pending().await; + + // the barrier is still blocked, but we can already retrieve the parameters + assert!(matches!(recv.recv().await.unwrap(), MockParam::Copy(_))); + + // now unblock the barrier + let (res, _) = tokio::join!(fut, async move { + barrier.wait().await; + },); + res.unwrap(); + } + #[test] fn test_paths_different() { assert_ne!(path(), path2()); diff --git a/parquet_file/Cargo.toml b/parquet_file/Cargo.toml index 4b6b6406..085ffc0e 100644 --- a/parquet_file/Cargo.toml +++ b/parquet_file/Cargo.toml @@ -27,9 +27,9 @@ pbjson-types = { workspace = true } prost = { workspace = true } schema = { path = "../schema" } snafu = "0.8" -thiserror = "2.0.16" +thiserror = "2.0.17" thrift = "0.17" -tokio = { version = "1.47", features = [ +tokio = { version = "1.48", features = [ "macros", "parking_lot", "rt", diff --git a/parquet_file/src/lib.rs b/parquet_file/src/lib.rs index f967a4e6..817f316a 100644 --- a/parquet_file/src/lib.rs +++ 
b/parquet_file/src/lib.rs @@ -163,7 +163,7 @@ impl From<&ParquetFile> for ParquetFilePath { table_id: f.table_id, partition_id: TransitionPartitionId::from_parts( f.partition_id, - f.partition_hash_id.clone(), + Some(f.partition_hash_id.clone()), ), object_store_id: f.object_store_id, } @@ -173,7 +173,7 @@ impl From<&ParquetFile> for ParquetFilePath { impl From<&ParquetFileParams> for ParquetFilePath { fn from(f: &ParquetFileParams) -> Self { let partition_id = - TransitionPartitionId::from_parts(f.partition_id, f.partition_hash_id.clone()); + TransitionPartitionId::from_parts(f.partition_id, Some(f.partition_hash_id.clone())); Self { partition_id, diff --git a/parquet_file/src/metadata.rs b/parquet_file/src/metadata.rs index 00d9291f..1ff7d307 100644 --- a/parquet_file/src/metadata.rs +++ b/parquet_file/src/metadata.rs @@ -465,7 +465,7 @@ impl IoxMetadata { pub fn to_parquet_file( &self, partition_id: PartitionId, - partition_hash_id: Option, + partition_hash_id: PartitionHashId, file_size_bytes: u64, metadata: &IoxParquetMetaData, column_id_map: F, diff --git a/parquet_file/src/serialize.rs b/parquet_file/src/serialize.rs index 7843732b..eb115619 100644 --- a/parquet_file/src/serialize.rs +++ b/parquet_file/src/serialize.rs @@ -42,7 +42,7 @@ use crate::{ pub const ROW_GROUP_WRITE_SIZE: usize = 1024 * 1024; /// ensure read and write work well together -const _: () = assert!(ROW_GROUP_WRITE_SIZE % BATCH_SIZE == 0); +const _: () = assert!(ROW_GROUP_WRITE_SIZE.is_multiple_of(BATCH_SIZE)); /// [`RecordBatch`] to Parquet serialisation errors. /// diff --git a/parquet_file/tests/metadata.rs b/parquet_file/tests/metadata.rs index dc7c4875..696eda7f 100644 --- a/parquet_file/tests/metadata.rs +++ b/parquet_file/tests/metadata.rs @@ -441,7 +441,7 @@ async fn test_derive_parquet_file_params() { let partition_id = PartitionId::new(1); let catalog_data = meta.to_parquet_file( partition_id, - Some(partition_hash_id), + partition_hash_id, file_size, &iox_parquet_meta, |name| *column_id_map.get(name).unwrap(), diff --git a/partition/Cargo.toml b/partition/Cargo.toml index 2e9eb169..87170b2d 100644 --- a/partition/Cargo.toml +++ b/partition/Cargo.toml @@ -16,7 +16,7 @@ hashbrown = { workspace = true } mutable_batch = { path = "../mutable_batch" } percent-encoding = "2.3.2" schema = { path = "../schema" } -thiserror = "2.0.16" +thiserror = "2.0.17" unicode-segmentation = "1.12.0" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/query_functions/Cargo.toml b/query_functions/Cargo.toml index 842ff9fb..d6bcfebb 100644 --- a/query_functions/Cargo.toml +++ b/query_functions/Cargo.toml @@ -13,7 +13,7 @@ arrow = { workspace = true } chrono = { version = "0.4", default-features = false } datafusion = { workspace = true } regex = "1" -regex-syntax = "0.8.6" +regex-syntax = "0.8.8" schema = { path = "../schema" } snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } @@ -21,4 +21,4 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] datafusion_util = { path = "../datafusion_util" } itertools = "0.13.0" -tokio = { version = "1.47", features = ["macros", "parking_lot"] } +tokio = { version = "1.48", features = ["macros", "parking_lot"] } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 908d2ecb..43e5784a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.89.0" +channel = "1.90.0" components = ["rustfmt", "clippy"] diff --git a/schema/Cargo.toml b/schema/Cargo.toml index 
fcf44f45..fb2d33f4 100644 --- a/schema/Cargo.toml +++ b/schema/Cargo.toml @@ -13,7 +13,7 @@ workspace = true arrow = { workspace = true } base64 = { version = "0.22", optional = true } hashbrown = { workspace = true } -indexmap = { version = "2.11", features = ["std"] } +indexmap = { version = "2.12", features = ["std"] } tracing = { workspace = true } snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } @@ -24,6 +24,6 @@ rstest = { version = "0.26.1" } [features] # Enable features from the newly proposed v3 data model, see: # https://github.com/influxdata/influxdb/issues/24979 -# +# # This feature is experimental, and is not enabled by default. v3 = ["dep:base64"] diff --git a/schema/src/lib.rs b/schema/src/lib.rs index 7870edcf..f0403f61 100644 --- a/schema/src/lib.rs +++ b/schema/src/lib.rs @@ -56,6 +56,7 @@ use hashbrown::HashSet; use crate::sort::SortKey; use snafu::{OptionExt, Snafu}; + #[cfg(feature = "v3")] use tracing::warn; @@ -1321,6 +1322,10 @@ mod test { use crate::test_util::make_field; use super::{builder::SchemaBuilder, *}; + + use rstest as _; // workaround for "unused crate" false positive + + #[cfg(feature = "v3")] use rstest::rstest; #[test] diff --git a/service_grpc_flight/Cargo.toml b/service_grpc_flight/Cargo.toml index aae1583e..ed6068d4 100644 --- a/service_grpc_flight/Cargo.toml +++ b/service_grpc_flight/Cargo.toml @@ -34,7 +34,7 @@ prost = { workspace = true } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.145" snafu = "0.8" -tokio = { version = "1.47", features = [ +tokio = { version = "1.48", features = [ "macros", "net", "parking_lot", diff --git a/test_helpers/Cargo.toml b/test_helpers/Cargo.toml index edf091dd..631ae35d 100644 --- a/test_helpers/Cargo.toml +++ b/test_helpers/Cargo.toml @@ -11,14 +11,14 @@ workspace = true [dependencies] # In alphabetical order async-trait = "0.1.89" dotenvy = "0.15.7" -ordered-float = "5.0.0" +ordered-float = "5.1.0" parking_lot = "0.12" prometheus-parse = "0.2.5" reqwest = { workspace = true, features = ["stream", "rustls-tls-native-roots"] } serde = { version = "1.0", features = ["derive"] } -tempfile = "3.22.0" -thiserror = "2.0.16" -tokio = { version = "1.47.1", default-features = false, features = ["time"] } +tempfile = "3.23.0" +thiserror = "2.0.17" +tokio = { version = "1.48.0", default-features = false, features = ["time"] } tracing = { workspace = true } tracing-log = { workspace = true } tracing-subscriber = { workspace = true } diff --git a/test_helpers_authz/Cargo.toml b/test_helpers_authz/Cargo.toml index b51bc51d..3ee9d27c 100644 --- a/test_helpers_authz/Cargo.toml +++ b/test_helpers_authz/Cargo.toml @@ -14,7 +14,7 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" } # Crates.io dependencies, in alphabetical order futures = "0.3" rand = "0.9.2" -tokio = "1.47" +tokio = "1.48" [lints] workspace = true diff --git a/tokio_metrics_bridge/Cargo.toml b/tokio_metrics_bridge/Cargo.toml index d7c797df..17a41fa8 100644 --- a/tokio_metrics_bridge/Cargo.toml +++ b/tokio_metrics_bridge/Cargo.toml @@ -10,8 +10,8 @@ workspace = true [dependencies] metric = { path = "../metric" } -parking_lot = "0.12.4" -tokio = { version = "1.47", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } +parking_lot = "0.12.5" +tokio = { version = "1.48", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/tokio_watchdog/Cargo.toml 
b/tokio_watchdog/Cargo.toml index 14857513..1872dada 100644 --- a/tokio_watchdog/Cargo.toml +++ b/tokio_watchdog/Cargo.toml @@ -11,7 +11,7 @@ workspace = true [dependencies] metric = { path = "../metric" } tracing = { workspace = true } -tokio = { version = "1.47", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.48", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/trace_exporters/Cargo.toml b/trace_exporters/Cargo.toml index f459ba2a..1bef8cb1 100644 --- a/trace_exporters/Cargo.toml +++ b/trace_exporters/Cargo.toml @@ -16,9 +16,9 @@ futures = "0.3" iox_time = { path = "../iox_time" } tracing = { workspace = true } snafu = "0.8" -socket2 = "0.6.0" +socket2 = "0.6.1" thrift = { version = "0.17.0" } -tokio = { version = "1.47", features = ["macros", "parking_lot", "rt", "sync"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "rt", "sync"] } trace = { path = "../trace" } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/tracker/Cargo.toml b/tracker/Cargo.toml index 4accba77..fb6476a2 100644 --- a/tracker/Cargo.toml +++ b/tracker/Cargo.toml @@ -20,13 +20,13 @@ parking_lot = "0.12" pin-project = "1.1" # Delaying upgrade until is fixed sysinfo = "<0.38" -tokio = { version = "1.47", features = ["macros", "parking_lot", "sync", "time"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "sync", "time"] } tokio-util = { version = "0.7.16" } trace = { path = "../trace"} workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -tempfile = "3.22.0" +tempfile = "3.23.0" # Need the multi-threaded executor for testing -tokio = { version = "1.47", features = ["macros", "parking_lot", "rt-multi-thread", "time"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "rt-multi-thread", "time"] } test_helpers = { path = "../test_helpers" } diff --git a/trogging/Cargo.toml b/trogging/Cargo.toml index 402ab2fa..789bfcc0 100644 --- a/trogging/Cargo.toml +++ b/trogging/Cargo.toml @@ -12,7 +12,7 @@ workspace = true [dependencies] clap = { version = "4", features = ["derive", "env"], optional = true } logfmt = { path = "../logfmt" } -thiserror = "2.0.16" +thiserror = "2.0.17" tracing-log = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 925c95cc..04ff1188 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -23,76 +23,114 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] arrayvec = { version = "0.7", default-features = false, features = ["std"] } arrow-ipc = { version = "55", features = ["lz4", "zstd"] } arrow-schema = { version = "55", default-features = false, features = ["canonical_extension_types"] } +aws-credential-types = { version = "1", default-features = false, features = ["test-util"] } +aws-sdk-s3 = { version = "1", features = ["behavior-version-latest"] } +aws-smithy-runtime = { version = "1", default-features = false, features = ["client", "default-https-client", "rt-tokio", "tls-rustls"] } +aws-smithy-runtime-api = { version = "1", features = ["client", "http-02x", "http-auth", "test-util"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", 
"test-util"] } base64 = { version = "0.22" } +bigdecimal = { version = "0.4", features = ["serde"] } +bincode = { version = "2", default-features = false, features = ["alloc", "derive", "serde"] } +bloom2 = { version = "0.5", default-features = false, features = ["serde"] } byteorder = { version = "1" } bytes = { version = "1" } -chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } -clap = { version = "4", features = ["derive", "env"] } -clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "suggestions", "usage"] } +chrono = { version = "0.4", features = ["serde"] } +clap = { version = "4", features = ["derive", "env", "string"] } +clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } crypto-common = { version = "0.1", default-features = false, features = ["std"] } -datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc", default-features = false, features = ["object_store", "parquet_encryption", "recursive_protection"] } -datafusion-expr = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc", default-features = false, features = ["recursive_protection"] } -digest = { version = "0.10", features = ["mac", "std"] } +datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d", default-features = false, features = ["object_store", "parquet_encryption", "recursive_protection"] } +datafusion-expr = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d", default-features = false, features = ["recursive_protection"] } +digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1", features = ["serde", "use_std"] } fastrand = { version = "2" } +flatbuffers = { version = "25" } flate2 = { version = "1", features = ["zlib-rs"] } form_urlencoded = { version = "1" } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } -futures-util = { version = "0.3", default-features = false, features = ["async-await-macro", "channel", "io", "sink"] } -getrandom = { version = "0.3", default-features = false, features = ["std"] } +futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } +getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3", default-features = false, features = ["std"] } +getrandom-6f8ce4dd05d13bba = { package = "getrandom", version = "0.2", default-features = false, features = ["std"] } hashbrown-3575ec1268b04181 = { package = "hashbrown", version = "0.15" } hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hickory-proto = { version = "0.25", default-features = false, features = ["serde", "text-parsing", "tokio"] } httparse = { version = "1" } hyper = { version = "1", features = ["client", "http1", "http2", 
"server"] } -hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } indexmap = { version = "2" } -libc = { version = "0.2", features = ["use_std"] } +insta = { version = "1", features = ["json", "redactions", "yaml"] } +ipnet = { version = "2", features = ["serde"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } +lock_api = { version = "0.4", features = ["arc_lock"] } log = { version = "0.4", default-features = false, features = ["std"] } md-5 = { version = "0.10" } memchr = { version = "2" } +moka = { version = "0.12", features = ["future", "sync"] } +num-bigint = { version = "0.4", features = ["serde"] } +num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } object_store = { version = "0.12", features = ["aws", "azure", "gcp"] } +once_cell = { version = "1", features = ["critical-section"] } +parking_lot = { version = "0.12", features = ["arc_lock"] } parquet = { version = "55", features = ["encryption", "object_store"] } percent-encoding = { version = "2" } +portable-atomic = { version = "1" } +proptest = { version = "1" } prost = { version = "0.13", features = ["prost-derive"] } prost-types = { version = "0.13" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8", features = ["small_rng"] } rand_chacha = { version = "0.9", default-features = false, features = ["std"] } +rand_core = { version = "0.9", default-features = false, features = ["os_rng", "std"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-build", "dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "std", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.12", default-features = false, features = ["http2", "json", "rustls-tls-native-roots", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["gzip", "http2", "json", "multipart", "rustls-tls", "rustls-tls-native-roots", "stream"] } +ring = { version = "0.17", features = ["std"] } +rustls = { version = "0.23", default-features = false, features = ["logging", "prefer-post-quantum", "ring", "std", "tls12"] } +rustls-pemfile = { version = "2" } +rustls-webpki = { version = "0.103", default-features = false, features = ["aws-lc-rs", "ring", "std"] } serde = { version = "1", features = ["alloc", "derive", "rc"] } -serde_core = { version = "1", default-features = false, features = ["alloc", "rc", "result", "std"] } +serde_core = { version = "1", features = ["alloc", "rc"] } serde_json = { version = "1", features = ["raw_value"] } -sha2 = { version = "0.10" } +sha2 = { version = "0.10", features = ["oid"] } +signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] } similar = { version = "2", features = ["inline"] } smallvec = { version = "1", default-features = false, features = ["const_new", "serde", "union"] } -socket2 = { version = "0.6", default-features = false, features = ["all"] } +snafu = { version = "0.8", features = ["futures"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +spin = { version = "0.9" } sqlparser = { version = "0.55", default-features = false, features = ["recursive-protection", "visitor"] } +sqlx = { 
version = "0.8", features = ["postgres", "runtime-tokio-rustls", "sqlite", "tls-rustls", "uuid"] } sqlx-core = { version = "0.8", features = ["_rt-tokio", "_tls-rustls-ring-webpki", "any", "json", "migrate", "offline", "uuid"] } sqlx-postgres = { version = "0.8", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } sqlx-sqlite = { version = "0.8", default-features = false, features = ["any", "bundled", "json", "migrate", "offline", "uuid"] } +subtle = { version = "2" } sync_wrapper = { version = "1", default-features = false, features = ["futures"] } +thiserror = { version = "2" } thrift = { version = "0.17" } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "parking_lot", "rt-multi-thread", "signal", "test-util"] } +time = { version = "0.3", features = ["formatting", "macros", "parsing"] } +tokio = { version = "1", features = ["full", "test-util", "tracing"] } +tokio-metrics = { version = "0.4" } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["fs", "net"] } -tokio-util = { version = "0.7", features = ["codec", "io"] } +tokio-util = { version = "0.7", features = ["codec", "compat", "io"] } tonic = { version = "0.12", features = ["gzip", "tls-roots", "zstd"] } -tower = { version = "0.5", default-features = false, features = ["util"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed"] } tracing = { version = "0.1", features = ["log", "max_level_trace"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } -twox-hash = { version = "2", default-features = false, features = ["xxhash32", "xxhash64"] } -uuid = { version = "1", features = ["js", "v4"] } +twox-hash = { version = "2" } +url = { version = "2" } +uuid = { version = "1", features = ["js", "serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive", "std"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } @@ -104,123 +142,188 @@ arrow-schema = { version = "55", default-features = false, features = ["canonica base64 = { version = "0.22" } byteorder = { version = "1" } bytes = { version = "1" } -chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } +cc = { version = "1", default-features = false, features = ["parallel"] } +chrono = { version = "0.4", features = ["serde"] } crossbeam-utils = { version = "0.8" } crypto-common = { version = "0.1", default-features = false, features = ["std"] } -datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc", default-features = false, features = ["object_store", "parquet_encryption", "recursive_protection"] } -datafusion-expr = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc", default-features = false, features = ["recursive_protection"] } -digest = { version = "0.10", features = ["mac", "std"] } +datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d", default-features = false, features = ["object_store", "parquet_encryption", "recursive_protection"] } +datafusion-expr = { git = 
"https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d", default-features = false, features = ["recursive_protection"] } +digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1", features = ["serde", "use_std"] } fastrand = { version = "2" } +flatbuffers = { version = "25" } flate2 = { version = "1", features = ["zlib-rs"] } form_urlencoded = { version = "1" } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } -futures-util = { version = "0.3", default-features = false, features = ["async-await-macro", "channel", "io", "sink"] } -getrandom = { version = "0.3", default-features = false, features = ["std"] } +futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } +getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3", default-features = false, features = ["std"] } +getrandom-6f8ce4dd05d13bba = { package = "getrandom", version = "0.2", default-features = false, features = ["std"] } hashbrown-3575ec1268b04181 = { package = "hashbrown", version = "0.15" } hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } httparse = { version = "1" } hyper = { version = "1", features = ["client", "http1", "http2", "server"] } indexmap = { version = "2" } -libc = { version = "0.2", features = ["use_std"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } +lock_api = { version = "0.4", features = ["arc_lock"] } log = { version = "0.4", default-features = false, features = ["std"] } md-5 = { version = "0.10" } memchr = { version = "2" } +num-bigint = { version = "0.4", features = ["serde"] } +num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } object_store = { version = "0.12", features = ["aws", "azure", "gcp"] } +once_cell = { version = "1", features = ["critical-section"] } +parking_lot = { version = "0.12", features = ["arc_lock"] } parquet = { version = "55", features = ["encryption", "object_store"] } percent-encoding = { version = "2" } +portable-atomic = { version = "1" } prost = { version = "0.13", features = ["prost-derive"] } prost-types = { version = "0.13" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8", features = ["small_rng"] } rand_chacha = { version = "0.9", default-features = false, features = ["std"] } +rand_core = { version = "0.9", default-features = false, features = ["os_rng", "std"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-build", "dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "std", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.12", default-features = false, features = ["http2", "json", "rustls-tls-native-roots", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["gzip", "http2", "json", "multipart", "rustls-tls", "rustls-tls-native-roots", "stream"] } +ring = { version = "0.17", features = ["std"] } +rustls = { version = "0.23", default-features = false, features 
= ["logging", "prefer-post-quantum", "ring", "std", "tls12"] } +rustls-pemfile = { version = "2" } +rustls-webpki = { version = "0.103", default-features = false, features = ["aws-lc-rs", "ring", "std"] } serde = { version = "1", features = ["alloc", "derive", "rc"] } -serde_core = { version = "1", default-features = false, features = ["alloc", "rc", "result", "std"] } +serde_core = { version = "1", features = ["alloc", "rc"] } serde_json = { version = "1", features = ["raw_value"] } -sha2 = { version = "0.10" } +sha2 = { version = "0.10", features = ["oid"] } smallvec = { version = "1", default-features = false, features = ["const_new", "serde", "union"] } +spin = { version = "0.9" } sqlparser = { version = "0.55", default-features = false, features = ["recursive-protection", "visitor"] } sqlx-core = { version = "0.8", features = ["_rt-tokio", "_tls-rustls-ring-webpki", "any", "json", "migrate", "offline", "uuid"] } +sqlx-macros = { version = "0.8", features = ["_rt-tokio", "_tls-rustls-ring-webpki", "derive", "json", "macros", "migrate", "postgres", "sqlite", "uuid"] } +sqlx-macros-core = { version = "0.8", features = ["_rt-tokio", "_tls-rustls-ring-webpki", "derive", "json", "macros", "migrate", "postgres", "sqlite", "uuid"] } sqlx-postgres = { version = "0.8", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } sqlx-sqlite = { version = "0.8", default-features = false, features = ["any", "bundled", "json", "migrate", "offline", "uuid"] } +subtle = { version = "2" } syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } sync_wrapper = { version = "1", default-features = false, features = ["futures"] } +thiserror = { version = "2" } thrift = { version = "0.17" } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "parking_lot", "rt-multi-thread", "signal", "test-util"] } +tokio = { version = "1", features = ["full", "test-util", "tracing"] } tokio-stream = { version = "0.1", features = ["fs", "net"] } -tokio-util = { version = "0.7", features = ["codec", "io"] } +tokio-util = { version = "0.7", features = ["codec", "compat", "io"] } tracing = { version = "0.1", features = ["log", "max_level_trace"] } tracing-core = { version = "0.1" } -twox-hash = { version = "2", default-features = false, features = ["xxhash32", "xxhash64"] } -uuid = { version = "1", features = ["js", "v4"] } +twox-hash = { version = "2" } +url = { version = "2" } +uuid = { version = "1", features = ["js", "serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive", "std"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [target.x86_64-unknown-linux-gnu.dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", default-features = false, features = ["client-proxy"] } -once_cell = { version = "1" } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +nix = { version = "0.30", default-features = false, features = ["fs", "ioctl", "poll", "signal", "socket", "term"] } tower = 
{ version = "0.5", default-features = false, features = ["retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-unknown-linux-gnu.build-dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "service"] } -once_cell = { version = "1" } -socket2 = { version = "0.6", default-features = false, features = ["all"] } -tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } +ipnet = { version = "2", features = ["serde"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed", "retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-apple-darwin.dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", default-features = false, features = ["client-proxy"] } -once_cell = { version = "1" } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +nix = { version = "0.30", default-features = false, features = ["fs", "ioctl", "poll", "signal", "socket", "term"] } tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-apple-darwin.build-dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "service"] } -once_cell = { version = "1" } -socket2 = { version = "0.6", default-features = false, features = ["all"] } -tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } +ipnet = { version = "2", features = ["serde"] } +lzma-sys = { version = "0.1", default-features = false, 
features = ["static"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed", "retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.aarch64-apple-darwin.dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", default-features = false, features = ["client-proxy"] } -once_cell = { version = "1" } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +nix = { version = "0.30", default-features = false, features = ["fs", "ioctl", "poll", "signal", "socket", "term"] } tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.aarch64-apple-darwin.build-dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "service"] } -once_cell = { version = "1" } -socket2 = { version = "0.6", default-features = false, features = ["all"] } -tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } +ipnet = { version = "2", features = ["serde"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed", "retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-pc-windows-msvc.dependencies] -hyper-util = { version = "0.1", default-features = false, features = ["client-proxy"] } -once_cell = { version = "1" } +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +socket2-d8f496e17d97b5cb = { package = "socket2", version = "0.5", default-features = false, features = ["all"] } 
tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } -windows-sys-73dcd821b1037cfd = { package = "windows-sys", version = "0.59", features = ["Wdk_Foundation", "Wdk_Storage_FileSystem", "Wdk_System_IO", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_IO", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } -windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Threading", "Win32_System_WindowsProgramming"] } -windows-sys-d4189bed749088b6 = { package = "windows-sys", version = "0.61", features = ["Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_SystemInformation"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +winapi = { version = "0.3", default-features = false, features = ["cfg", "evntrace", "in6addr", "inaddr", "minwinbase", "minwindef", "ntsecapi", "profileapi", "windef", "winioctl", "winnt"] } +windows-sys-4db8c43aad08e7ae = { package = "windows-sys", version = "0.60", features = ["Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse"] } +windows-sys-73dcd821b1037cfd = { package = "windows-sys", version = "0.59", features = ["Wdk_Foundation", "Wdk_Storage_FileSystem", "Wdk_System_IO", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse"] } +windows-sys-c8eced492e86ede7 = { package = "windows-sys", version = "0.48", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Diagnostics_Debug", "Win32_System_Registry", "Win32_System_Time", "Win32_UI_Shell"] } +windows-sys-d4189bed749088b6 = { package = "windows-sys", version = "0.61", features = ["Win32_Security_Authentication_Identity", "Win32_Security_Credentials", "Win32_Security_Cryptography", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_LibraryLoader", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_SystemInformation", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Shell"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-pc-windows-msvc.build-dependencies] -hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "service"] } -once_cell = { version = "1" } -socket2 = { version = "0.6", default-features = false, features = ["all"] } -tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } -windows-sys-73dcd821b1037cfd = { package = "windows-sys", version = "0.59", features = ["Wdk_Foundation", "Wdk_Storage_FileSystem", "Wdk_System_IO", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_IO", "Win32_System_Memory", "Win32_System_Pipes", 
"Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } -windows-sys-d4189bed749088b6 = { package = "windows-sys", version = "0.61", features = ["Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_SystemInformation"] } +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } +ipnet = { version = "2", features = ["serde"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed", "retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +windows-sys-4db8c43aad08e7ae = { package = "windows-sys", version = "0.60", features = ["Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse"] } +windows-sys-73dcd821b1037cfd = { package = "windows-sys", version = "0.59", features = ["Wdk_Foundation", "Wdk_Storage_FileSystem", "Wdk_System_IO", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse"] } +windows-sys-c8eced492e86ede7 = { package = "windows-sys", version = "0.48", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Diagnostics_Debug", "Win32_System_Registry", "Win32_System_Time", "Win32_UI_Shell"] } +windows-sys-d4189bed749088b6 = { package = "windows-sys", version = "0.61", features = ["Win32_Security_Authentication_Identity", "Win32_Security_Credentials", "Win32_Security_Cryptography", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_LibraryLoader", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_SystemInformation", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Shell"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } ### END HAKARI SECTION