diff --git a/Cargo.toml b/Cargo.toml index 63f540e2..c0c9223b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,9 +15,11 @@ members = [ "influxdb2_client", "iox_http", "iox_query_influxql", + "iox_query_influxql_rewrite", "iox_query", "iox_system_tables", "iox_time", + "iox_v1_query_api", "logfmt", "meta_data_cache", "metric_exporters", @@ -74,26 +76,26 @@ arrow-schema = { version = "55" } bincode = { version = "2", default-features = false, features = ["alloc", "derive"] } # Use DataFusion fork # See https://github.com/influxdata/arrow-datafusion/pull/73 for contents -datafusion = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc" } -datafusion-proto = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc" } +datafusion = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d" } +datafusion-proto = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d" } hashbrown = { version = "0.14.5" } http = { version = "1" } http-body = { version = "1" } http-body-util = { version = "0.1" } hyper = { version = "1" } hyper-util = { version = "0.1" } -object_store = { version = "0.12.3", features = ["aws", "azure", "gcp"] } +object_store = { version = "0.12.4", features = ["aws", "azure", "gcp"] } parquet = { version = "55", features = ["object_store"] } -pbjson = { version = "0.7" } -pbjson-build = { version = "0.7" } +pbjson = { version = "0.8" } +pbjson-build = { version = "0.8" } pbjson-types = { version = "0.7" } proptest = { version = "1", default-features = false, features = ["std"] } prost = { version = "0.13" } prost-build = { version = "0.13" } prost-types = { version = "0.13" } reqwest = { version = "0.12", default-features = false } -rstest = { version = "0.21" } -sqlx = { version = "0.8.6", features = ["sqlite"] } +rstest = { version = "0.26" } +sqlx = { version = "0.8.6" } tower = { version = "0.5" } tracing = { version = "0.1", features = ["log", "max_level_trace"] } tracing-log = { version = "0.2" } diff --git a/arrow_util/Cargo.toml b/arrow_util/Cargo.toml index da4f41ad..ec8759a5 100644 --- a/arrow_util/Cargo.toml +++ b/arrow_util/Cargo.toml @@ -22,7 +22,7 @@ comfy-table = { version = "7.2", default-features = false } hashbrown = { workspace = true } num-traits = "0.2" parquet = { workspace = true } -regex = "1.11.2" +regex = "1.12.2" snafu = "0.8" uuid = "1" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/authz/Cargo.toml b/authz/Cargo.toml index cc8333fd..d59a697d 100644 --- a/authz/Cargo.toml +++ b/authz/Cargo.toml @@ -25,10 +25,10 @@ snafu = "0.8" [dev-dependencies] assert_matches = "1.5.0" -parking_lot = "0.12.4" +parking_lot = "0.12.5" paste = "1.0.15" test_helpers_authz = { path = "../test_helpers_authz" } -tokio = "1.47.1" +tokio = "1.48.0" [features] http = ["dep:http"] diff --git a/backoff/Cargo.toml b/backoff/Cargo.toml index a1e54563..8de749ff 100644 --- a/backoff/Cargo.toml +++ b/backoff/Cargo.toml @@ -9,7 +9,7 @@ license.workspace = true workspace = true [dependencies] -tokio = { version = "1.47", features = ["macros", "time"] } +tokio = { version = "1.48", features = ["macros", "time"] } tracing = { workspace = true } rand = "0.9" snafu = "0.8" diff --git a/catalog_cache/Cargo.toml b/catalog_cache/Cargo.toml index 9bc2783d..eafc31b1 100644 --- a/catalog_cache/Cargo.toml +++ b/catalog_cache/Cargo.toml @@ -19,7 +19,7 @@ 
iox_http_util = { path = "../iox_http_util" } tracing = { workspace = true } reqwest = { workspace = true } snafu = "0.8" -tokio = { version = "1.47", default-features = false, features = [ +tokio = { version = "1.48", default-features = false, features = [ "macros", "rt", ] } diff --git a/catalog_cache/benches/list_encode.rs b/catalog_cache/benches/list_encode.rs index 17a4d009..018d52ed 100644 --- a/catalog_cache/benches/list_encode.rs +++ b/catalog_cache/benches/list_encode.rs @@ -103,12 +103,25 @@ fn encode_partition_snapshot(i: usize) -> Bytes { let partition_key = PartitionKey::from(format!("arbitrary_{i}")); let expected_partition_hash_id = PartitionHashId::new(table_id, &partition_key); let generation = 6; - let parquet_file_defaults = ParquetFile { + + let partition = Partition::new_catalog_only( + partition_id, + table_id, + partition_key.clone(), + Default::default(), + Default::default(), + Default::default(), + Default::default(), + None, // max_time + Default::default(), + ); + // Create associated Parquet file + let parquet_files = vec![ParquetFile { id: ParquetFileId::new(7 + i as i64), namespace_id, table_id, partition_id, - partition_hash_id: Some(expected_partition_hash_id.clone()), + partition_hash_id: expected_partition_hash_id.clone(), object_store_id: ObjectStoreId::from_str("00000000-0000-0001-0000-000000000000").unwrap(), min_time: Timestamp::new(2), max_time: Timestamp::new(3), @@ -120,31 +133,9 @@ fn encode_partition_snapshot(i: usize) -> Bytes { column_set: ColumnSet::empty(), max_l0_created_at: Timestamp::new(6), source: None, - }; + }]; - let partition = Partition::new_catalog_only( - partition_id, - Some(expected_partition_hash_id.clone()), - table_id, - partition_key.clone(), - Default::default(), - Default::default(), - Default::default(), - Default::default(), - None, // max_time - ); - // Create associated Parquet files: - let parquet_files = vec![ - // one addressed by numeric ID, - ParquetFile { - partition_hash_id: None, - ..parquet_file_defaults.clone() - }, - // one addressed by hash ID. - parquet_file_defaults.clone(), - ]; - - // Encode the partition and its Parquet files, + // Encode the partition and its Parquet file let snapshot = PartitionSnapshot::encode( namespace_id, partition, diff --git a/client_util/Cargo.toml b/client_util/Cargo.toml index ff2c1341..aa8698f6 100644 --- a/client_util/Cargo.toml +++ b/client_util/Cargo.toml @@ -16,13 +16,13 @@ reqwest = { workspace = true, features = ["stream", "rustls-tls-native-roots"] } # This direct dependency on rustls can probably be removed when tonic is upgraded to 0.13+. # See for more details. 
rustls = { version = "0.23", default-features = false } -thiserror = "2.0.16" +thiserror = "2.0.17" tonic = { version = "0.12", features = ["gzip", "tls", "tls-native-roots", "zstd"] } tower = { workspace = true } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -tokio = { version = "1.47", features = [ +tokio = { version = "1.48", features = [ "macros", "parking_lot", "rt-multi-thread", diff --git a/data_types/Cargo.toml b/data_types/Cargo.toml index b5f2e5c9..0c4654b2 100644 --- a/data_types/Cargo.toml +++ b/data_types/Cargo.toml @@ -14,7 +14,7 @@ arrow = { workspace = true } arrow-buffer = { workspace = true } bytes = "1.10" chrono = { version = "0.4", default-features = false } -croaring = "2.4.0" +croaring = "2.5.1" influxdb-line-protocol = { path = "../influxdb_line_protocol" } iox_time = { path = "../iox_time" } generated_types = { path = "../generated_types" } @@ -33,7 +33,7 @@ sqlx = { workspace = true, features = [ "postgres", "uuid", ] } -thiserror = "2.0.16" +thiserror = "2.0.17" uuid = { version = "1", features = ["v4"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index 2c44c84b..18d0d3c6 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -810,15 +810,30 @@ pub struct TableSchema { /// the table's columns by their name pub columns: ColumnsByName, + + /// Whether or not iceberg is enabled for this table + pub iceberg_enabled: bool, } impl TableSchema { - /// Initialize new `TableSchema` from the information in the given `Table`. + /// Initialize new [`TableSchema`] from the information in the given [`Table`]. pub fn new_empty_from(table: &Table) -> Self { Self { id: table.id, partition_template: table.partition_template.clone(), columns: ColumnsByName::default(), + iceberg_enabled: table.iceberg_enabled, + } + } + + /// Initialize a new [`TableSchema`] with the given id, no columns, default partition, and + /// iceberg disabled. 
+ pub fn new_with(id: TableId) -> Self { + Self { + id, + partition_template: TablePartitionTemplateOverride::default(), + columns: ColumnsByName::default(), + iceberg_enabled: false, } } @@ -1077,8 +1092,8 @@ pub struct ParquetFile { pub table_id: TableId, /// the partition identifier pub partition_id: PartitionId, - /// the optional partition hash id - pub partition_hash_id: Option, + /// the partition hash id + pub partition_hash_id: PartitionHashId, /// the uuid used in the object store path for this file pub object_store_id: ObjectStoreId, /// the min timestamp of data in this file @@ -1178,11 +1193,7 @@ impl ParquetFile { /// Estimate the memory consumption of this object and its contents pub fn size(&self) -> usize { - let hash_id = self - .partition_hash_id - .as_ref() - .map(|x| x.size()) - .unwrap_or_default(); + let hash_id = self.partition_hash_id.size(); size_of_val(self) + hash_id + self.column_set.size() - size_of_val(&self.column_set) } @@ -1211,7 +1222,7 @@ impl ParquetFile { /// Temporary to aid incremental migration pub fn transition_partition_id(&self) -> TransitionPartitionId { - TransitionPartitionId::from_parts(self.partition_id, self.partition_hash_id.clone()) + TransitionPartitionId::from_parts(self.partition_id, Some(self.partition_hash_id.clone())) } } @@ -1222,10 +1233,7 @@ impl From for catalog_proto::ParquetFile { namespace_id: v.namespace_id.get(), table_id: v.table_id.get(), partition_id: v.partition_id.get(), - partition_hash_id: v - .partition_hash_id - .map(|x| x.as_bytes().to_vec()) - .unwrap_or_default(), + partition_hash_id: v.partition_hash_id.as_bytes().to_vec(), object_store_id: v.object_store_id.to_string(), min_time: v.min_time.get(), max_time: v.max_time.get(), @@ -1266,11 +1274,7 @@ impl TryFrom for ParquetFile { namespace_id: NamespaceId::new(v.namespace_id), table_id: TableId::new(v.table_id), partition_id: PartitionId::new(v.partition_id), - partition_hash_id: if v.partition_hash_id.is_empty() { - None - } else { - Some(v.partition_hash_id[..].try_into()?) - }, + partition_hash_id: v.partition_hash_id[..].try_into()?, object_store_id: ObjectStoreId::from_str(&v.object_store_id)?, min_time: Timestamp::new(v.min_time), max_time: Timestamp::new(v.max_time), @@ -1346,7 +1350,7 @@ pub struct ParquetFileParams { /// the partition identifier pub partition_id: PartitionId, /// the partition hash ID - pub partition_hash_id: Option, + pub partition_hash_id: PartitionHashId, /// the uuid used in the object store path for this file pub object_store_id: ObjectStoreId, /// the min timestamp of data in this file @@ -3329,6 +3333,7 @@ mod tests { id: TableId::new(1), partition_template: Default::default(), columns: ColumnsByName::default(), + iceberg_enabled: false, }; let schema2 = TableSchema { id: TableId::new(2), @@ -3339,6 +3344,7 @@ mod tests { name: String::from("foo"), column_type: ColumnType::Bool, }]), + iceberg_enabled: false, }; assert!(schema1.size() < schema2.size()); } @@ -3361,11 +3367,7 @@ mod tests { id: NamespaceId::new(1), active_tables: BTreeMap::from([( String::from("foo"), - TableSchema { - id: TableId::new(1), - columns: ColumnsByName::default(), - partition_template: Default::default(), - }, + TableSchema::new_with(TableId::new(1)), )]), deleted_tables: BTreeSet::new(), partition_template: Default::default(), @@ -3412,41 +3414,13 @@ mod tests { #[test] fn catalog_service_parquet_file_serde_roundtrip() { - // This part of the test can be removed when all partitions have hash IDs. 
- let old_style_parquet_file = ParquetFile { - id: ParquetFileId::new(3), - namespace_id: NamespaceId::new(4), - table_id: TableId::new(5), - partition_id: PartitionId::new(6), - partition_hash_id: None, // this is the important part for this test - object_store_id: ObjectStoreId::new(), - min_time: Timestamp::new(30), - max_time: Timestamp::new(50), - to_delete: None, - file_size_bytes: 1024, - row_count: 42, - compaction_level: CompactionLevel::Initial, - created_at: Timestamp::new(70), - column_set: ColumnSet::empty(), - max_l0_created_at: Timestamp::new(70), - source: None, - }; - let catalog_proto_old_style_parquet_file = - catalog_proto::ParquetFile::from(old_style_parquet_file.clone()); - let round_trip_old_style_parquet_file = - ParquetFile::try_from(catalog_proto_old_style_parquet_file).unwrap(); - assert_eq!(old_style_parquet_file, round_trip_old_style_parquet_file); - let table_id = TableId::new(5); let parquet_file = ParquetFile { id: ParquetFileId::new(3), namespace_id: NamespaceId::new(4), table_id, partition_id: PartitionId::new(6), - partition_hash_id: Some(PartitionHashId::new( - table_id, - &PartitionKey::from("arbitrary"), - )), + partition_hash_id: PartitionHashId::new(table_id, &PartitionKey::from("arbitrary")), object_store_id: ObjectStoreId::new(), min_time: Timestamp::new(30), max_time: Timestamp::new(50), diff --git a/data_types/src/partition.rs b/data_types/src/partition.rs index 61260e13..5c4f2ea9 100644 --- a/data_types/src/partition.rs +++ b/data_types/src/partition.rs @@ -578,9 +578,8 @@ impl sqlx::postgres::PgHasArrayType for PartitionHashId { pub struct Partition { /// the id of the partition pub id: PartitionId, - /// The unique hash derived from the table ID and partition key, if available. This will become - /// required when partitions without the value have aged out. - hash_id: Option, + /// The unique hash derived from the table ID and partition key. + hash_id: PartitionHashId, /// the table the partition is under pub table_id: TableId, /// the string key of the partition @@ -640,7 +639,6 @@ impl Partition { #[expect(clippy::too_many_arguments)] pub fn new_catalog_only( id: PartitionId, - hash_id: Option, table_id: TableId, partition_key: PartitionKey, sort_key_ids: SortKeyIds, @@ -648,10 +646,11 @@ impl Partition { cold_compact_at: Option, created_at: Option, max_time: Option, + estimated_size_bytes: Option, ) -> Self { Self { id, - hash_id, + hash_id: PartitionHashId::new(table_id, &partition_key), table_id, partition_key, sort_key_ids, @@ -659,19 +658,19 @@ impl Partition { cold_compact_at, created_at, max_time, - estimated_size_bytes: None, + estimated_size_bytes, } } /// If this partition has a `PartitionHashId` stored in the catalog, use that. Otherwise, use /// the database-assigned `PartitionId`. pub fn transition_partition_id(&self) -> TransitionPartitionId { - TransitionPartitionId::from((self.id, self.hash_id.as_ref())) + TransitionPartitionId::from((self.id, Some(&self.hash_id))) } - /// The unique hash derived from the table ID and partition key, if it exists in the catalog. 
- pub fn hash_id(&self) -> Option<&PartitionHashId> { - self.hash_id.as_ref() + /// The unique hash derived from the table ID and partition key + pub fn hash_id(&self) -> &PartitionHashId { + &self.hash_id } /// The sort key IDs, if the sort key has been set diff --git a/data_types/src/snapshot/partition.rs b/data_types/src/snapshot/partition.rs index 23e30de5..fe46200c 100644 --- a/data_types/src/snapshot/partition.rs +++ b/data_types/src/snapshot/partition.rs @@ -64,7 +64,7 @@ pub struct PartitionSnapshot { /// The [`PartitionId`] partition_id: PartitionId, /// The [`PartitionHashId`] - partition_hash_id: Option, + partition_hash_id: PartitionHashId, /// The generation of this snapshot generation: u64, /// The partition key @@ -84,6 +84,9 @@ pub struct PartitionSnapshot { /// The time this Partition was created at, or `None` if this partition was created before this /// field existed. Not the time the snapshot was created. created_at: Option, + /// Estimated size in bytes of all the active files in this partition, or `None` + /// if the partition size has not been computed yet. + estimated_size_bytes: Option, } impl PartitionSnapshot { @@ -122,7 +125,8 @@ impl PartitionSnapshot { max_l0_created_at: file.max_l0_created_at.0, column_mask: Some(mask.finish().into()), source: file.source.map(|i| i as i32).unwrap_or_default(), - use_numeric_partition_id: Some(file.partition_hash_id.is_none()), + #[expect(deprecated)] + use_numeric_partition_id: Some(false), } }) .collect(); @@ -132,7 +136,7 @@ impl PartitionSnapshot { columns, namespace_id, partition_id: partition.id, - partition_hash_id: partition.hash_id().cloned(), + partition_hash_id: partition.hash_id().clone(), key: partition.partition_key.as_bytes().to_vec().into(), files: MessageList::encode(files).context(FileEncodeSnafu)?, sort_key: partition.sort_key_ids().cloned().unwrap_or_default(), @@ -141,15 +145,14 @@ impl PartitionSnapshot { skipped_compaction: skipped_compaction.map(|sc| sc.into()), cold_compact_at: partition.cold_compact_at, created_at: partition.created_at(), + estimated_size_bytes: partition.estimated_size_bytes, }) } /// Create a new [`PartitionSnapshot`] from a `proto` and generation pub fn decode(proto: proto::Partition, generation: u64) -> Self { let table_id = TableId::new(proto.table_id); - let partition_hash_id = proto - .partition_hash_id - .then(|| PartitionHashId::from_raw(table_id, proto.key.as_ref())); + let partition_hash_id = PartitionHashId::from_raw(table_id, proto.key.as_ref()); Self { generation, @@ -165,6 +168,7 @@ impl PartitionSnapshot { skipped_compaction: proto.skipped_compaction, cold_compact_at: proto.cold_compact_at.map(Timestamp::new), created_at: proto.created_at.map(Timestamp::new), + estimated_size_bytes: proto.estimated_size_bytes, } } @@ -179,8 +183,8 @@ impl PartitionSnapshot { } /// Returns the [`PartitionHashId`] if any - pub fn partition_hash_id(&self) -> Option<&PartitionHashId> { - self.partition_hash_id.as_ref() + pub fn partition_hash_id(&self) -> &PartitionHashId { + &self.partition_hash_id } /// Returns the file at index `idx` @@ -204,22 +208,7 @@ impl PartitionSnapshot { namespace_id: self.namespace_id, table_id: self.table_id, partition_id: self.partition_id, - partition_hash_id: match file.use_numeric_partition_id { - // If the Parquet file uses the numeric partition ID, don't set a - // `partition_hash_id`, regardless of whether the Partition uses a `hash_id` - Some(true) => None, - Some(false) => Some(match self.partition_hash_id.clone() { - Some(hash_id) => hash_id, - // If 
the Parquet file uses the hash ID but the Partition doesn't yet, - // compute it - None => self - .key() - .map(|key| PartitionHashId::new(self.table_id, &key))?, - }), - // If the Parquet file doesn't specify whether it uses a hash ID, fall back to - // whatever the Partition uses - None => self.partition_hash_id.clone(), - }, + partition_hash_id: self.partition_hash_id.clone(), object_store_id: ObjectStoreId::from_uuid(uuid.into()), min_time: Timestamp(file.min_time), max_time: Timestamp(file.max_time), @@ -249,7 +238,6 @@ impl PartitionSnapshot { pub fn partition(&self) -> Result { Ok(Partition::new_catalog_only( self.partition_id, - self.partition_hash_id.clone(), self.table_id, self.key()?, self.sort_key.clone(), @@ -257,6 +245,7 @@ impl PartitionSnapshot { self.cold_compact_at, self.created_at, None, // max_time - not stored in snapshot (can be computed from partition key) + self.estimated_size_bytes, )) } @@ -272,6 +261,13 @@ impl PartitionSnapshot { .cloned() .map(|sc| sc.into()) } + + /// Returns the estimated size of the partition in bytes. + pub fn estimated_size_bytes(&self) -> i64 { + // Treat None as 0. Since this is an estimated size, + // it is acceptable to treat partitions with None as having size 0. + self.estimated_size_bytes.unwrap_or(0) + } } impl From for proto::Partition { @@ -282,13 +278,14 @@ impl From for proto::Partition { namespace_id: value.namespace_id.get(), table_id: value.table_id.get(), partition_id: value.partition_id.get(), - partition_hash_id: value.partition_hash_id.is_some(), + partition_hash_id: true, column_ids: value.columns.iter().map(|x| x.get()).collect(), sort_key_ids: value.sort_key.iter().map(|x| x.get()).collect(), new_file_at: value.new_file_at.map(|x| x.get()), skipped_compaction: value.skipped_compaction, cold_compact_at: value.cold_compact_at.map(|x| x.get()), created_at: value.created_at.map(|x| x.get()), + estimated_size_bytes: value.estimated_size_bytes, } } } @@ -296,94 +293,11 @@ impl From for proto::Partition { #[cfg(test)] mod tests { use super::*; - use crate::{CompactionLevel, PartitionKey}; - use std::str::FromStr; - - #[test] - fn partition_hash_id_transition_parquet_files_individually() { - let namespace_id = NamespaceId::new(3); - let table_id = TableId::new(4); - let partition_id = PartitionId::new(5); - let partition_key = PartitionKey::from("arbitrary"); - let expected_partition_hash_id = PartitionHashId::new(table_id, &partition_key); - let generation = 6; - let parquet_file_defaults = ParquetFile { - id: ParquetFileId::new(7), - namespace_id, - table_id, - partition_id, - partition_hash_id: Some(expected_partition_hash_id.clone()), - object_store_id: ObjectStoreId::from_str("00000000-0000-0001-0000-000000000000") - .unwrap(), - min_time: Timestamp::new(2), - max_time: Timestamp::new(3), - to_delete: None, - file_size_bytes: 4, - row_count: 5, - compaction_level: CompactionLevel::Initial, - created_at: Timestamp::new(6), - column_set: ColumnSet::empty(), - max_l0_created_at: Timestamp::new(6), - source: None, - }; - - let encode_and_compare = |use_partition_hash_id: bool| { - // For a partition with or without a hash ID as specified, - let partition = Partition::new_catalog_only( - partition_id, - if use_partition_hash_id { - Some(expected_partition_hash_id.clone()) - } else { - None - }, - table_id, - partition_key.clone(), - Default::default(), - Default::default(), - Default::default(), - Default::default(), - None, // max_time - ); - // Create associated Parquet files: - let parquet_files = vec![ - // one addressed 
by numeric ID, - ParquetFile { - partition_hash_id: None, - ..parquet_file_defaults.clone() - }, - // one addressed by hash ID. - parquet_file_defaults.clone(), - ]; - - // Encode the partition and its Parquet files, - let encoded_partition = PartitionSnapshot::encode( - namespace_id, - partition, - parquet_files.clone(), - None, - generation, - ) - .unwrap(); - - // then ensure accessing each Parquet file returns the same information as was encoded. - assert_eq!( - &encoded_partition.file(0).unwrap(), - &parquet_files[0], - "use_partition_hash_id: {use_partition_hash_id}" - ); - assert_eq!( - &encoded_partition.file(1).unwrap(), - &parquet_files[1], - "use_partition_hash_id: {use_partition_hash_id}" - ); - }; - - // Encoding and accessing Parquet files should work whether their associated Partition - // has a hash ID or not. - encode_and_compare(true); - encode_and_compare(false); - } + use crate::PartitionKey; + // Even though all partitions now have hash IDs, keep this test to ensure we can continue to + // decode and use any cached proto that doesn't use hash IDs. + #[expect(deprecated)] #[test] fn decode_old_cached_proto() { let partition_key = PartitionKey::from("arbitrary"); @@ -448,6 +362,7 @@ mod tests { new_file_at: Default::default(), skipped_compaction: Default::default(), sort_key_ids: Default::default(), + estimated_size_bytes: Default::default(), }; let numeric_id_partition_proto = proto::Partition { partition_hash_id: false, @@ -458,37 +373,46 @@ mod tests { let decoded_hash_id_partition = PartitionSnapshot::decode(hash_id_partition_proto, 1); let decoded_numeric_id_partition = PartitionSnapshot::decode(numeric_id_partition_proto, 1); - // For the Parquet file without `use_numeric_partition_id` set, it should be addressed in - // the same way as its partition is. + // For the Parquet file without `use_numeric_partition_id` set, it should be addressed + // with hash ID because this should be impossible now. let pf0_hash_id_partition = decoded_hash_id_partition.file(0).unwrap(); assert_eq!( pf0_hash_id_partition.partition_hash_id, - Some(decoded_hash_id_partition.partition_hash_id.clone().unwrap()) + decoded_hash_id_partition.partition_hash_id.clone() ); let pf0_numeric_id_partition = decoded_numeric_id_partition.file(0).unwrap(); - assert_eq!(pf0_numeric_id_partition.partition_hash_id, None); + assert_eq!( + pf0_numeric_id_partition.partition_hash_id, + decoded_hash_id_partition.partition_hash_id + ); // For the Parquet file with `use_numeric_partition_id` set to `false`, it should be // addressed with hash ID, regardless of how the partition is addressed. let pf1_hash_id_partition = decoded_hash_id_partition.file(1).unwrap(); assert_eq!( pf1_hash_id_partition.partition_hash_id, - Some(decoded_hash_id_partition.partition_hash_id.clone().unwrap()) + decoded_hash_id_partition.partition_hash_id.clone() ); let pf1_numeric_id_partition = decoded_numeric_id_partition.file(1).unwrap(); assert_eq!( pf1_numeric_id_partition.partition_hash_id, - Some(PartitionHashId::new( + PartitionHashId::new( decoded_numeric_id_partition.table_id, &decoded_numeric_id_partition.key().unwrap() - )) + ) ); // For the Parquet file with `use_numeric_partition_id` set to `true`, it should be - // addressed with numeric ID, regardless of how the partition is addressed. + // addressed with hash ID because this should be impossible now. 
let pf1_hash_id_partition = decoded_hash_id_partition.file(2).unwrap(); - assert_eq!(pf1_hash_id_partition.partition_hash_id, None); + assert_eq!( + pf1_hash_id_partition.partition_hash_id, + decoded_hash_id_partition.partition_hash_id.clone() + ); let pf1_numeric_id_partition = decoded_numeric_id_partition.file(2).unwrap(); - assert_eq!(pf1_numeric_id_partition.partition_hash_id, None); + assert_eq!( + pf1_numeric_id_partition.partition_hash_id, + decoded_hash_id_partition.partition_hash_id.clone() + ); } } diff --git a/datafusion_util/Cargo.toml b/datafusion_util/Cargo.toml index d7951579..892fbf38 100644 --- a/datafusion_util/Cargo.toml +++ b/datafusion_util/Cargo.toml @@ -16,7 +16,7 @@ futures = "0.3" object_store = { workspace = true } pin-project = "1.1" schema = { path = "../schema" } -tokio = { version = "1.47", features = ["parking_lot", "sync"] } +tokio = { version = "1.48", features = ["parking_lot", "sync"] } tokio-stream = "0.1" tracing = { workspace = true } url = "2.5" diff --git a/executor/Cargo.toml b/executor/Cargo.toml index 340c4eca..226d3a4e 100644 --- a/executor/Cargo.toml +++ b/executor/Cargo.toml @@ -13,7 +13,7 @@ futures = "0.3" metric = { path = "../metric" } parking_lot = "0.12" snafu = "0.8" -tokio = { version = "1.47" } +tokio = { version = "1.48" } tokio_metrics_bridge = { path = "../tokio_metrics_bridge" } tokio_watchdog = { path = "../tokio_watchdog" } tracing = { workspace = true } diff --git a/generated_types/protos/influxdata/iox/catalog/v2/service.proto b/generated_types/protos/influxdata/iox/catalog/v2/service.proto index 7984d263..e016db07 100644 --- a/generated_types/protos/influxdata/iox/catalog/v2/service.proto +++ b/generated_types/protos/influxdata/iox/catalog/v2/service.proto @@ -68,7 +68,10 @@ service CatalogService { rpc PartitionNewFileBetween(PartitionNewFileBetweenRequest) returns (stream PartitionNewFileBetweenResponse); rpc PartitionNeedingColdCompact(PartitionNeedingColdCompactRequest) returns (stream PartitionNeedingColdCompactResponse); rpc PartitionUpdateColdCompact(PartitionUpdateColdCompactRequest) returns (PartitionUpdateColdCompactResponse); - rpc PartitionListOldStyle(PartitionListOldStyleRequest) returns (stream PartitionListOldStyleResponse); + rpc PartitionUpdateStorageSize(PartitionUpdateStorageSizeRequest) returns (PartitionUpdateStorageSizeResponse); + rpc PartitionListOldStyle(PartitionListOldStyleRequest) returns (stream PartitionListOldStyleResponse) { + option deprecated = true; + }; rpc PartitionDeleteByRetention(PartitionDeleteByRetentionRequest) returns (stream PartitionDeleteByRetentionResponse); rpc PartitionDeleteBatch(PartitionDeleteBatchRequest) returns (stream PartitionDeleteBatchResponse); rpc PartitionSnapshot(PartitionSnapshotRequest) returns (PartitionSnapshotResponse); @@ -352,13 +355,19 @@ message TableEnableIcebergRequest { int64 table_id = 1; } -message TableEnableIcebergResponse {} +message TableEnableIcebergResponse { + Table table = 1; + int64 router_version = 2; +} message TableDisableIcebergRequest { int64 table_id = 1; } -message TableDisableIcebergResponse {} +message TableDisableIcebergResponse { + Table table = 1; + int64 router_version = 2; +} message TableSoftDeleteRequest { int64 table_id = 1; @@ -568,6 +577,7 @@ message PartitionNewFileBetweenResponse { message PartitionNeedingColdCompactRequest { int64 maximum_time = 1; uint64 n = 2; + optional int64 redo_before_nanos = 3; } message PartitionNeedingColdCompactResponse { @@ -581,6 +591,13 @@ message PartitionUpdateColdCompactRequest { 
message PartitionUpdateColdCompactResponse {} +message PartitionUpdateStorageSizeRequest { + int64 partition_id = 1; + int64 estimated_size_bytes = 2; +} + +message PartitionUpdateStorageSizeResponse {} + message PartitionListOldStyleRequest {} message PartitionListOldStyleResponse { diff --git a/generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto b/generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto index c03ce8d3..5bc99f26 100644 --- a/generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto +++ b/generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto @@ -82,6 +82,10 @@ message Partition { // The time this partition was created, or `None` if this partition was // created before this field existed optional int64 created_at = 13; + + /// Estimated size in bytes of all the active files in this partition, or `None` + /// if the partition size has not been computed yet. + optional int64 estimated_size_bytes = 14; } enum ParquetFileSource { @@ -123,16 +127,8 @@ message PartitionFile { // Which component created this parquet file ParquetFileSource source = 11; - // Present and true if this parquet file's object store path uses the partition's numeric ID - // (as opposed to the partition's hash ID, which is what most files use). - // Exists to enable file-by-file transition to always using the partition's hash ID. - // - // The current code always sets this value when encoding. This value's absence indicates the - // protobuf was cached before the code adding this field was deployed. In that case, the code - // will fall back to the behavior the catalog cache exhibited previously, which was looking at - // the partition to know which ID to use and assuming all Parquet files in a partition were - // addressed with the same ID. - optional bool use_numeric_partition_id = 12; + // No longer needed because all partitions and Parquet files should have partition hash IDs. + optional bool use_numeric_partition_id = 12 [deprecated = true]; } message Table { diff --git a/generated_types/protos/influxdata/iox/catalog_storage/v1/service.proto b/generated_types/protos/influxdata/iox/catalog_storage/v1/service.proto index 4afb4833..4cecaedd 100644 --- a/generated_types/protos/influxdata/iox/catalog_storage/v1/service.proto +++ b/generated_types/protos/influxdata/iox/catalog_storage/v1/service.proto @@ -73,6 +73,10 @@ message GetNamespacesWithStorageRequest { // Filter namespaces by name (case-insensitive partial match) // If provided, only namespaces with names containing this string are returned optional string name_filter = 6; + + // Filter namespaces by ID (partial match) + // If provided, only namespaces with IDs containing this string are returned + optional string id_filter = 7; } // Request to get a specific namespace with storage. @@ -170,6 +174,14 @@ message GetTablesWithStorageRequest { // Filter by soft-deleted status // If not specified, only the active tables are returned optional influxdata.iox.common.v1.SoftDeleted deleted = 6; + + // Filter tables by name (case-insensitive partial match) + // If provided, only tables with names containing this string are returned + optional string name_filter = 7; + + // Filter tables by ID (partial match) + // If provided, only tables with IDs containing this string are returned + optional string id_filter = 8; } // Request to get a specific table with storage. 
diff --git a/generated_types/protos/influxdata/iox/gossip/v1/schema.proto b/generated_types/protos/influxdata/iox/gossip/v1/schema.proto index d5b5a3b5..8d3cbfd8 100644 --- a/generated_types/protos/influxdata/iox/gossip/v1/schema.proto +++ b/generated_types/protos/influxdata/iox/gossip/v1/schema.proto @@ -60,6 +60,9 @@ message TableUpdated { // should follow the same rules about what value to hold (and when) as the // TableDropped.router_version field) int64 router_version = 6; + + // if the version of the table accompanying router_version has the `iceberg_enabled` field set + bool iceberg_enabled = 7; } // Initialisation of a new table occurred. diff --git a/generated_types/protos/influxdata/iox/namespace/v1/service.proto b/generated_types/protos/influxdata/iox/namespace/v1/service.proto index b4186e41..4c8e4fab 100644 --- a/generated_types/protos/influxdata/iox/namespace/v1/service.proto +++ b/generated_types/protos/influxdata/iox/namespace/v1/service.proto @@ -276,6 +276,10 @@ message GetNamespacesWithStorageRequest { // Filter namespaces by name (case-insensitive partial match) // If provided, only namespaces with names containing this string are returned optional string name_filter = 7; + + // Filter namespaces by ID (partial match) + // If provided, only namespaces with IDs containing this string are returned + optional string id_filter = 8; } // Request to get a specific namespace with storage. diff --git a/generated_types/protos/influxdata/iox/table/v1/service.proto b/generated_types/protos/influxdata/iox/table/v1/service.proto index 33cd7e94..9efc7729 100644 --- a/generated_types/protos/influxdata/iox/table/v1/service.proto +++ b/generated_types/protos/influxdata/iox/table/v1/service.proto @@ -181,6 +181,14 @@ message GetTablesWithStorageRequest { // Filter by soft-deleted status // If not specified, only the active tables are returned optional influxdata.iox.common.v1.SoftDeleted deleted = 6; + + // Filter tables by name (case-insensitive partial match) + // If provided, only tables with names containing this string are returned + optional string name_filter = 7; + + // Filter tables by ID (partial match) + // If provided, only tables with IDs containing this string are returned + optional string id_filter = 8; } // Request to get a specific table with storage. @@ -212,6 +220,7 @@ message GetTableWithStorageResponse { // Request to enable iceberg exports for the given table. message EnableIcebergRequest { int64 table_id = 1; + int64 namespace_id = 2; } message EnableIcebergResponse {} @@ -219,6 +228,7 @@ message EnableIcebergResponse {} // Request to disable iceberg exports for the given table. 
message DisableIcebergRequest { int64 table_id = 1; + int64 namespace_id = 2; } message DisableIcebergResponse {} diff --git a/influxdb2_client/Cargo.toml b/influxdb2_client/Cargo.toml index 6c56d339..27da1489 100644 --- a/influxdb2_client/Cargo.toml +++ b/influxdb2_client/Cargo.toml @@ -24,5 +24,5 @@ smallvec = { workspace = true } [dev-dependencies] # In alphabetical order mockito = { version ="1.7", default-features = false } parking_lot = "0.12" -tokio = { version = "1.47", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } test_helpers = { path = "../test_helpers" } diff --git a/influxdb_influxql_parser/Cargo.toml b/influxdb_influxql_parser/Cargo.toml index 0ec2e2eb..229b3490 100644 --- a/influxdb_influxql_parser/Cargo.toml +++ b/influxdb_influxql_parser/Cargo.toml @@ -18,7 +18,7 @@ num-integer = { version = "0.1", default-features = false, features = [ "std", ] } num-traits = "0.2" -thiserror = "2.0.16" +thiserror = "2.0.17" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order diff --git a/influxdb_iox_client/Cargo.toml b/influxdb_iox_client/Cargo.toml index 2d2161c3..b3243da8 100644 --- a/influxdb_iox_client/Cargo.toml +++ b/influxdb_iox_client/Cargo.toml @@ -43,14 +43,15 @@ rand = { version = "0.9.2", optional = true } reqwest = { workspace = true, features = ["stream", "rustls-tls-native-roots"] } schema = { path = "../schema", optional = true } serde_json = { version = "1.0.145", optional = true } -tokio = { version = "1.47", features = [ +tokio = { version = "1.48", features = [ "macros", "parking_lot", "rt-multi-thread", ] } tokio-stream = "0.1.17" -thiserror = "2.0.16" +thiserror = "2.0.17" tonic-reflection = { version = "0.12" } +tracing = { workspace = true } [dev-dependencies] insta = { version = "1" } diff --git a/influxdb_iox_client/src/client.rs b/influxdb_iox_client/src/client.rs index f7f532ed..38067c39 100644 --- a/influxdb_iox_client/src/client.rs +++ b/influxdb_iox_client/src/client.rs @@ -47,3 +47,6 @@ pub mod test; /// Client for write API pub mod write; + +/// Batched write client for efficient bulk writes +pub mod batched_write; diff --git a/influxdb_iox_client/src/client/batched_write.rs b/influxdb_iox_client/src/client/batched_write.rs new file mode 100644 index 00000000..7d8fc48e --- /dev/null +++ b/influxdb_iox_client/src/client/batched_write.rs @@ -0,0 +1,379 @@ +//! Batched write client for efficient bulk writing of line protocol data. +//! +//! This module provides a `BatchedWriteClient` that wraps the standard write client +//! and batches multiple write requests together before sending them. This is particularly +//! useful for high-throughput scenarios like query logging where many small writes +//! can be combined into fewer, larger requests. + +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; + +use tokio::sync::Mutex; +use tokio::task::JoinHandle; +use tracing::error; + +use crate::{ + error::Error, + write::{Client as WriteClient, DatabaseName}, +}; + +/// A wrapper around either a regular or batched write client. +/// +/// This enum allows code to work with either client type transparently, +/// which is particularly useful for optional batching functionality. 
+#[derive(Debug, Clone)]
+pub enum MaybeBatchedWriteClient {
+    /// Regular write client without batching
+    Unbatched(WriteClient),
+    /// Batched write client
+    Batched(Arc<BatchedWriteClient>),
+}
+
+impl MaybeBatchedWriteClient {
+    /// Write line protocol data to the specified database.
+    pub async fn write_lp(
+        &mut self,
+        database: impl Into<DatabaseName> + Send,
+        lp_data: impl Into<String> + Send,
+    ) -> Result<(), Error> {
+        match self {
+            Self::Unbatched(client) => {
+                client.write_lp(database, lp_data).await?;
+                Ok(())
+            }
+            Self::Batched(client) => client.write_lp(database, lp_data).await,
+        }
+    }
+}
+
+/// Default maximum number of line protocol entries to batch before flushing
+const DEFAULT_MAX_BATCH_SIZE: usize = 100;
+
+/// Default flush interval for periodic flushing
+const DEFAULT_FLUSH_INTERVAL: Duration = Duration::from_secs(3);
+
+/// Configuration for the batched write client
+#[derive(Debug, Clone, Copy)]
+pub struct BatchedWriteClientConfig {
+    /// Maximum number of line protocol entries to batch before flushing
+    pub max_batch_size: usize,
+    /// Interval at which to automatically flush pending writes, even if the batch size hasn't been reached
+    pub flush_interval: Duration,
+}
+
+impl Default for BatchedWriteClientConfig {
+    fn default() -> Self {
+        Self {
+            max_batch_size: DEFAULT_MAX_BATCH_SIZE,
+            flush_interval: DEFAULT_FLUSH_INTERVAL,
+        }
+    }
+}
+
+/// A batched write client that accumulates writes and flushes them in batches.
+///
+/// This client wraps a [`WriteClient`] and batches multiple write requests together
+/// before sending them to reduce network overhead. Writes are flushed when:
+/// - The batch reaches `max_batch_size` line protocol entries
+/// - The `flush_interval` timer expires (default: 3 seconds)
+/// - The client is dropped (graceful shutdown)
+/// - `flush()` is explicitly called
+///
+/// # Example
+///
+/// ```no_run
+/// # use influxdb_iox_client::{
+/// #     connection::Builder,
+/// #     write::Client as WriteClient,
+/// #     batched_write::{BatchedWriteClient, BatchedWriteClientConfig},
+/// # };
+/// # #[tokio::main]
+/// # async fn main() {
+/// let connection = Builder::default()
+///     .build("http://127.0.0.1:8080")
+///     .await
+///     .unwrap();
+///
+/// let write_client = WriteClient::new(connection);
+/// let config = BatchedWriteClientConfig::default();
+/// let batched_client = BatchedWriteClient::new(write_client, config);
+///
+/// // Writes are automatically batched
+/// batched_client.write_lp("my_db", "cpu,host=a usage=0.5").await.unwrap();
+/// batched_client.write_lp("my_db", "cpu,host=b usage=0.7").await.unwrap();
+/// # }
+/// ```
+pub struct BatchedWriteClient {
+    /// Internal state protected by a mutex
+    inner: Arc<Mutex<BatchedWriteClientInner>>,
+
+    /// Configuration
+    config: BatchedWriteClientConfig,
+
+    /// Shutdown flag for the background flush task
+    shutdown: Arc<AtomicBool>,
+
+    /// Handle to the background flush task
+    _flush_task: JoinHandle<()>,
+}
+
+/// Internal state for the batched write client
+#[derive(Debug)]
+struct BatchedWriteClientInner {
+    /// The underlying write client
+    client: WriteClient,
+
+    /// Buffer for accumulating writes per database
+    buffer: Vec<(DatabaseName, String)>,
+}
+
+impl BatchedWriteClient {
+    /// Creates a new batched write client with the given configuration.
+    pub fn new(client: WriteClient, config: BatchedWriteClientConfig) -> Self {
+        let inner = BatchedWriteClientInner {
+            client,
+            buffer: Vec::new(),
+        };
+
+        let inner = Arc::new(Mutex::new(inner));
+        let shutdown = Arc::new(AtomicBool::new(false));
+
+        // Spawn background task to periodically flush
+        let flush_task = {
+            let inner = Arc::clone(&inner);
+            let shutdown = Arc::clone(&shutdown);
+            let flush_interval = config.flush_interval;
+
+            tokio::spawn(async move {
+                loop {
+                    tokio::time::sleep(flush_interval).await;
+
+                    if shutdown.load(Ordering::Relaxed) {
+                        break;
+                    }
+
+                    let mut guard = inner.lock().await;
+                    if let Err(e) = flush_buffer_internal(&mut guard).await {
+                        error!("Failed to flush batched writes from timer: {}", e);
+                    }
+                }
+            })
+        };
+
+        Self {
+            inner,
+            config,
+            shutdown,
+            _flush_task: flush_task,
+        }
+    }
+
+    /// Creates a new batched write client with default configuration.
+    pub fn new_with_defaults(client: WriteClient) -> Self {
+        Self::new(client, BatchedWriteClientConfig::default())
+    }
+
+    /// Write line protocol data to the specified database.
+    ///
+    /// The write is buffered internally and will be flushed when the
+    /// configured batch size is reached.
+    pub async fn write_lp(
+        &self,
+        database: impl Into<DatabaseName> + Send,
+        data: impl Into<String> + Send,
+    ) -> Result<(), Error> {
+        let database = database.into();
+        let data = data.into();
+
+        let mut inner = self.inner.lock().await;
+
+        inner.buffer.push((database, data));
+
+        if inner.buffer.len() >= self.config.max_batch_size {
+            flush_buffer_internal(&mut inner).await?;
+        }
+
+        Ok(())
+    }
+
+    /// Explicitly flush all pending writes.
+    ///
+    /// This method blocks until all currently buffered writes have been sent.
+    pub async fn flush(&self) -> Result<(), Error> {
+        let mut inner = self.inner.lock().await;
+        flush_buffer_internal(&mut inner).await
+    }
+}
+
+impl std::fmt::Debug for BatchedWriteClient {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("BatchedWriteClient")
+            .field("inner", &self.inner)
+            .field("config", &self.config)
+            .field("shutdown", &self.shutdown)
+            .field("_flush_task", &"")
+            .finish()
+    }
+}
+
+impl Drop for BatchedWriteClient {
+    fn drop(&mut self) {
+        // Signal the background task to shut down
+        self.shutdown.store(true, Ordering::Relaxed);
+
+        // Try to flush remaining data on drop
+        // We spawn a task since we can't use async in Drop
+        let inner = Arc::clone(&self.inner);
+        tokio::spawn(async move {
+            let mut guard = inner.lock().await;
+            if !guard.buffer.is_empty()
+                && let Err(e) = flush_buffer_internal(&mut guard).await
+            {
+                error!("Failed to flush remaining batched writes on drop: {}", e);
+            }
+        });
+    }
+}
+
+/// Flush the buffer by grouping writes by database and sending them
+async fn flush_buffer_internal(inner: &mut BatchedWriteClientInner) -> Result<(), Error> {
+    if inner.buffer.is_empty() {
+        return Ok(());
+    }
+
+    let mut by_database: std::collections::BTreeMap<DatabaseName, Vec<String>> =
+        std::collections::BTreeMap::new();
+
+    for (db, data) in inner.buffer.drain(..) {
+        by_database.entry(db).or_default().push(data);
+    }
+
+    for (db_name, data_vec) in by_database {
+        let combined = data_vec.join("\n");
+
+        if let Err(e) = inner.client.write_lp(db_name.clone(), combined).await {
+            error!(
+                "Failed to write batched data for database {:?}: {}",
+                db_name, e
+            );
+            return Err(e);
+        }
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::write::RequestMaker;
+    use futures_util::FutureExt;
+    use futures_util::future::BoxFuture;
+    use std::sync::{Arc, Mutex};
+
+    #[derive(Debug)]
+    struct MockRequestMaker {
+        requests: Mutex<Vec<String>>,
+    }
+
+    impl MockRequestMaker {
+        fn new() -> Self {
+            Self {
+                requests: Mutex::new(vec![]),
+            }
+        }
+
+        fn requests(&self) -> Vec<String> {
+            self.requests.lock().unwrap().clone()
+        }
+    }
+
+    impl RequestMaker for MockRequestMaker {
+        fn write_source(
+            &self,
+            _org_id: String,
+            _bucket_id: String,
+            body: String,
+        ) -> BoxFuture<'_, Result<usize, Error>> {
+            let sz = body.len();
+            self.requests.lock().unwrap().push(body);
+            async move { Ok(sz) }.boxed()
+        }
+    }
+
+    #[tokio::test]
+    async fn test_batching_by_size() {
+        let mock = Arc::new(MockRequestMaker::new());
+        let client = WriteClient::new_with_maker(Arc::clone(&mock) as _);
+
+        let config = BatchedWriteClientConfig {
+            max_batch_size: 3,
+            flush_interval: Duration::from_secs(3600), // Long interval to not interfere with test
+        };
+
+        let batched = BatchedWriteClient::new(client, config);
+
+        batched.write_lp("test_db", "m1 f=1").await.unwrap();
+        batched.write_lp("test_db", "m2 f=2").await.unwrap();
+        batched.write_lp("test_db", "m3 f=3").await.unwrap();
+
+        let requests = mock.requests();
+        assert_eq!(requests.len(), 1);
+        assert!(requests[0].contains("m1 f=1"));
+        assert!(requests[0].contains("m2 f=2"));
+        assert!(requests[0].contains("m3 f=3"));
+    }
+
+    #[tokio::test]
+    async fn test_explicit_flush() {
+        let mock = Arc::new(MockRequestMaker::new());
+        let client = WriteClient::new_with_maker(Arc::clone(&mock) as _);
+
+        let config = BatchedWriteClientConfig {
+            max_batch_size: 100,
+            flush_interval: Duration::from_secs(3600), // Long interval to not interfere with test
+        };
+
+        let batched = BatchedWriteClient::new(client, config);
+
+        batched.write_lp("test_db", "m1 f=1").await.unwrap();
+        batched.write_lp("test_db", "m2 f=2").await.unwrap();
+
+        batched.flush().await.unwrap();
+
+        let requests = mock.requests();
+        assert_eq!(requests.len(), 1);
+        assert!(requests[0].contains("m1 f=1"));
+        assert!(requests[0].contains("m2 f=2"));
+    }
+
+    #[tokio::test]
+    async fn test_timer_flush() {
+        let mock = Arc::new(MockRequestMaker::new());
+        let client = WriteClient::new_with_maker(Arc::clone(&mock) as _);
+
+        let config = BatchedWriteClientConfig {
+            max_batch_size: 100, // High batch size so it won't trigger
+            flush_interval: Duration::from_millis(100), // Short interval for testing
+        };
+
+        let batched = BatchedWriteClient::new(client, config);
+
+        // Write some data that won't trigger batch size flush
+        batched.write_lp("test_db", "m1 f=1").await.unwrap();
+        batched.write_lp("test_db", "m2 f=2").await.unwrap();
+
+        // Initially no flush should have happened
+        assert_eq!(mock.requests().len(), 0);
+
+        // Wait for the timer to trigger
+        tokio::time::sleep(Duration::from_millis(150)).await;
+
+        // Now the timer should have flushed the data
+        let requests = mock.requests();
+        assert_eq!(requests.len(), 1);
+        assert!(requests[0].contains("m1 f=1"));
+        assert!(requests[0].contains("m2 f=2"));
+    }
+}
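For reference, a minimal sketch of how a caller might opt into the batching added above. The `enable_batching` flag and the connection URL are illustrative assumptions; only `MaybeBatchedWriteClient`, `BatchedWriteClient`, `BatchedWriteClientConfig`, and the existing `WriteClient`/`Builder` come from this diff.

```rust
// Illustrative sketch only: `enable_batching` and the URL are assumptions, not part of the diff.
use std::sync::Arc;

use influxdb_iox_client::{
    batched_write::{BatchedWriteClient, BatchedWriteClientConfig, MaybeBatchedWriteClient},
    connection::Builder,
    write::Client as WriteClient,
};

async fn make_write_client(enable_batching: bool) -> MaybeBatchedWriteClient {
    // Build a connection and the plain write client as in the doc example above.
    let connection = Builder::default()
        .build("http://127.0.0.1:8080")
        .await
        .expect("failed to connect");
    let write_client = WriteClient::new(connection);

    if enable_batching {
        // Defaults: flush after 100 buffered entries or every 3 seconds.
        let batched =
            BatchedWriteClient::new(write_client, BatchedWriteClientConfig::default());
        MaybeBatchedWriteClient::Batched(Arc::new(batched))
    } else {
        MaybeBatchedWriteClient::Unbatched(write_client)
    }
}
```

Holding a single `MaybeBatchedWriteClient` lets call sites invoke `write_lp` the same way whether batching is enabled or not.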
diff --git a/influxdb_iox_client/src/client/table.rs b/influxdb_iox_client/src/client/table.rs
index 0fef4244..3283d34d 100644
--- a/influxdb_iox_client/src/client/table.rs
+++ b/influxdb_iox_client/src/client/table.rs
@@ -103,19 +103,25 @@ impl Client {
     }

     /// Enable iceberg exports for a table
-    pub async fn enable_iceberg(&mut self, table_id: i64) -> Result<(), Error> {
+    pub async fn enable_iceberg(&mut self, table_id: i64, namespace_id: i64) -> Result<(), Error> {
         let _ = self
             .inner
-            .enable_iceberg(EnableIcebergRequest { table_id })
+            .enable_iceberg(EnableIcebergRequest {
+                table_id,
+                namespace_id,
+            })
             .await?;
         Ok(())
     }

     /// Disable iceberg exports for a table
-    pub async fn disable_iceberg(&mut self, table_id: i64) -> Result<(), Error> {
+    pub async fn disable_iceberg(&mut self, table_id: i64, namespace_id: i64) -> Result<(), Error> {
         let _ = self
             .inner
-            .disable_iceberg(DisableIcebergRequest { table_id })
+            .disable_iceberg(DisableIcebergRequest {
+                table_id,
+                namespace_id,
+            })
             .await?;
         Ok(())
     }
diff --git a/influxdb_iox_client/src/client/write.rs b/influxdb_iox_client/src/client/write.rs
index 718b793e..5d321491 100644
--- a/influxdb_iox_client/src/client/write.rs
+++ b/influxdb_iox_client/src/client/write.rs
@@ -13,7 +13,7 @@ use reqwest::{Body, Method};
 pub const DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES: Option<usize> = Some(1024 * 1024);

 /// Name of a database.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct DatabaseName {
     /// The database name.
     database: String,
@@ -53,7 +53,7 @@ impl DatabaseName {
     /// Internally, we speak the v2 protocol which has an "org" parameter. Single tenant instances of InfluxDB
     /// will tolerate the presence of an "org" parameter provided it's an empty string.
-    fn get_org_bucket(&self) -> (String, String) {
+    pub fn get_org_bucket(&self) -> (String, String) {
         let name = self.clone();
         (name.org.unwrap_or_default(), name.database)
     }
@@ -123,7 +123,7 @@ impl Client {
     }

     /// Creates a new client with the provided request maker
-    fn new_with_maker(inner: Arc<dyn RequestMaker>) -> Self {
+    pub fn new_with_maker(inner: Arc<dyn RequestMaker>) -> Self {
         Self {
             inner,
             max_request_payload_size_bytes: DEFAULT_MAX_REQUEST_PAYLOAD_SIZE_BYTES,
         }
     }
@@ -225,7 +225,7 @@ impl Client {

 /// Something that knows how to send http data.
Exists so it can be /// mocked out for testing -trait RequestMaker: Debug + Send + Sync { +pub trait RequestMaker: Debug + Send + Sync { /// Write the body data to the specified org, bucket, and /// returning the number of bytes written /// diff --git a/iox_query/Cargo.toml b/iox_query/Cargo.toml index ad969a4e..013dc63b 100644 --- a/iox_query/Cargo.toml +++ b/iox_query/Cargo.toml @@ -29,7 +29,7 @@ datafusion_util = { path = "../datafusion_util" } executor = { path = "../executor" } futures = "0.3" hashbrown = { workspace = true } -indexmap = { version = "2.11", features = ["std"] } +indexmap = { version = "2.12", features = ["std"] } influxdb-line-protocol = { path = "../influxdb_line_protocol" } itertools = "0.13.0" iox_query_params = { path = "../iox_query_params" } @@ -46,7 +46,7 @@ parquet_file = { path = "../parquet_file" } query_functions = { path = "../query_functions" } schema = { path = "../schema" } snafu = "0.8" -tokio = { version = "1.47", features = ["macros", "parking_lot"] } +tokio = { version = "1.48", features = ["macros", "parking_lot"] } tokio-stream = "0.1" trace = { path = "../trace" } tracker = { path = "../tracker" } diff --git a/iox_query/src/analyzer/handle_gapfill.rs b/iox_query/src/analyzer/handle_gapfill.rs index 5012cb28..b5c2f20b 100644 --- a/iox_query/src/analyzer/handle_gapfill.rs +++ b/iox_query/src/analyzer/handle_gapfill.rs @@ -4,7 +4,7 @@ pub mod range_predicate; mod virtual_function; -use crate::exec::gapfill::{FillStrategy, GapFill, GapFillParams}; +use crate::exec::gapfill::{FillExpr, FillStrategy, GapFill}; use datafusion::common::{ DFSchema, ExprSchema, internal_datafusion_err, plan_datafusion_err, plan_err, }; @@ -150,7 +150,7 @@ fn build_gapfill_node( new_aggr_plan: LogicalPlan, date_bin_gapfill_index: usize, date_bin_gapfill_args: Vec, - date_bin_udf: Arc, + date_bin_udf: Arc, ) -> Result { match date_bin_gapfill_args.len() { 2 | 3 => (), @@ -159,7 +159,7 @@ fn build_gapfill_node( "DATE_BIN_GAPFILL expects 2 or 3 arguments, got {nargs}", ))); } - } + }; let mut args_iter = date_bin_gapfill_args.into_iter(); @@ -206,6 +206,15 @@ fn build_gapfill_node( .schema() .qualified_field(date_bin_gapfill_index), )); + let time_column_alias = time_column.name_for_alias()?; + + let time_expr = date_bin_udf + .call(if let Some(origin) = origin { + vec![stride, time_column, origin] + } else { + vec![stride, time_column] + }) + .alias(time_column_alias); let LogicalPlan::Aggregate(aggr) = &new_aggr_plan else { return Err(DataFusionError::Internal(format!( @@ -213,26 +222,19 @@ fn build_gapfill_node( new_aggr_plan.display() ))); }; - let mut new_group_expr: Vec<_> = aggr - .schema - .iter() - .map(|(qualifier, field)| { + + let mut col_it = aggr.schema.iter(); + let series_expr = (&mut col_it) + .take(aggr.group_expr.len()) + .enumerate() + .filter(|(idx, _)| *idx != date_bin_gapfill_index) + .map(|(_, (qualifier, field))| { Expr::Column(datafusion::common::Column::from(( qualifier, field.as_ref(), ))) }) .collect(); - let aggr_expr = new_group_expr.split_off(aggr.group_expr.len()); - - match (aggr_expr.len(), aggr.aggr_expr.len()) { - (f, e) if f != e => { - return Err(internal_datafusion_err!( - "The number of aggregate expressions has gotten lost; expected {e}, found {f}. This is a bug, please report it." - )); - } - _ => (), - } // this schema is used for the `FillStrategy::Default` checks below. 
It also represents the // schema of the projection of `aggr`, meaning that it shows the columns/fields as they exist @@ -241,9 +243,13 @@ fn build_gapfill_node( // value of those types according to the AggregateFunction below, it all works out. let schema = &aggr.schema; - let fill_behavior = aggr_expr - .iter() - .cloned() + let fill_expr = col_it + .map(|(qualifier, field)| { + Expr::Column(datafusion::common::Column::from(( + qualifier, + field.as_ref(), + ))) + }) // `aggr_expr` and `aggr.aggr_expr` should line up in the sense that `aggr.aggr_expr[n]` // represents a transformation that was done to produce `aggr_expr[n]`, so we can zip them // together like this to determine the correct fill type for the produced expression @@ -257,25 +263,32 @@ fn build_gapfill_node( // `col_expr` should be the 'computed'/'transformed' representation of `aggr_expr`, we // `aggr_expr`, we need to make sure that it's a column or else this doesn't really // matter to calculate. - default_return_value_for_aggr_fn(aggr_expr, schema, col_expr.try_as_col()) - .map(|rt| (col_expr, FillStrategy::Default(rt))) + default_return_value_for_aggr_fn(aggr_expr, schema, col_expr.try_as_col()).map(|rt| { + FillExpr { + expr: col_expr, + strategy: FillStrategy::Default(rt), + } + }) }) - .collect::>()?; + .collect::>>()?; + + match (fill_expr.len(), aggr.aggr_expr.len()) { + (f, e) if f != e => { + return Err(internal_datafusion_err!( + "The number of aggregate expressions has gotten lost; expected {e}, found {f}. This is a bug, please report it." + )); + } + _ => (), + } Ok(LogicalPlan::Extension(Extension { node: Arc::new( GapFill::try_new( Arc::new(new_aggr_plan), - new_group_expr, - aggr_expr, - GapFillParams { - date_bin_udf, - stride, - time_column, - origin, - time_range, - fill_strategy: fill_behavior, - }, + series_expr, + time_expr, + fill_expr, + time_range, ) .map_err(|e| e.context("GapFill::try_new"))?, ), @@ -329,7 +342,7 @@ enum RewriteInfo { // The arguments to the call to DATE_BIN_GAPFILL. date_bin_gapfill_args: Vec, // The name of the UDF that provides the DATE_BIN like functionality. 
- date_bin_udf: Arc, + date_bin_udf: Arc, }, } @@ -375,7 +388,7 @@ fn replace_date_bin_gapfill(aggr: Aggregate) -> Result { } }; - let date_bin_udf = Arc::from(date_bin.name()); + let date_bin_udf = Arc::clone(&date_bin); let mut rewriter = DateBinGapfillRewriter { args: None, date_bin, @@ -985,7 +998,7 @@ mod test { insta::assert_yaml_snapshot!( format_analyzed_plan(plan)?, @r#" - - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[avg(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - "GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[avg(temps.temp)], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1014,7 +1027,7 @@ mod test { insta::assert_yaml_snapshot!( format_analyzed_plan(plan)?, @r#" - - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None))], aggr=[[avg(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None)), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - "GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None)), TimestampNanosecond(7, None)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None)), fill=[avg(temps.temp)], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time, TimestampNanosecond(7, None)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time,TimestampNanosecond(7, None))]], aggr=[[avg(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1043,7 +1056,7 @@ mod test { insta::assert_yaml_snapshot!( format_analyzed_plan(plan)?, @r#" - - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), temps.loc], aggr=[[avg(temps.temp)]], 
time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - "GapFill: series=[temps.loc], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[avg(temps.temp)], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), temps.loc]], aggr=[[avg(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1094,7 +1107,7 @@ mod test { format_analyzed_plan(plan)?, @r#" - "Projection: date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), avg(temps.temp)" - - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[avg(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[avg(temps.temp)], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1128,7 +1141,7 @@ mod test { format_analyzed_plan(plan)?, @r#" - "Projection: date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), avg(temps.temp) AS locf(avg(temps.temp)), min(temps.temp) AS locf(min(temps.temp))" - - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[LOCF(avg(temps.temp)), LOCF(min(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), 
date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[LOCF(avg(temps.temp)), LOCF(min(temps.temp))], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp), min(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1161,7 +1174,7 @@ mod test { format_analyzed_plan(plan)?, @r#" - "Projection: date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), min(temps.temp) AS locf(min(temps.temp)) AS locf_min_temp" - - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[avg(temps.temp), LOCF(min(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[avg(temps.temp), LOCF(min(temps.temp))], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp), min(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1195,7 +1208,7 @@ mod test { format_analyzed_plan(plan)?, @r#" - "Projection: date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), avg(temps.temp) AS interpolate(avg(temps.temp)), min(temps.temp) AS interpolate(min(temps.temp))" - - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[INTERPOLATE(avg(temps.temp)), INTERPOLATE(min(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[INTERPOLATE(avg(temps.temp)), INTERPOLATE(min(temps.temp))], 
range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[avg(temps.temp), min(temps.temp)]]" - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" - " TableScan: temps" @@ -1231,7 +1244,7 @@ mod test { insta::assert_yaml_snapshot!( format_analyzed_plan(plan).unwrap(), @r#" - - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)], aggr=[[]], time_column=date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - "GapFill: series=[], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time), fill=[], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"),temps.time)]], aggr=[[]]" - " TableScan: temps projection=[time], full_filters=[temps.time >= TimestampNanosecond(1000, None), temps.time < TimestampNanosecond(2000, None), temps.loc = Utf8(\"foo\")]" "#); diff --git a/iox_query/src/exec.rs b/iox_query/src/exec.rs index c1abe928..e65376cf 100644 --- a/iox_query/src/exec.rs +++ b/iox_query/src/exec.rs @@ -5,6 +5,7 @@ pub(crate) mod context; pub mod gapfill; mod metrics; pub mod query_tracing; +pub mod series_limit; pub mod sleep; pub(crate) mod split; use datafusion_util::config::register_iox_object_store; diff --git a/iox_query/src/exec/context.rs b/iox_query/src/exec/context.rs index a0f69bcf..98ee0a6f 100644 --- a/iox_query/src/exec/context.rs +++ b/iox_query/src/exec/context.rs @@ -4,6 +4,7 @@ use super::{ cross_rt_stream::CrossRtStream, gapfill::{GapFill, plan_gap_fill}, + series_limit::{SeriesLimit, plan_series_limit}, sleep::SleepNode, split::StreamSplitNode, }; @@ -131,6 +132,10 @@ impl ExtensionPlanner for IOxExtensionPlanner { let gap_fill_exec = plan_gap_fill(session_state, gap_fill, logical_inputs, physical_inputs)?; Some(Arc::new(gap_fill_exec)) + } else if let Some(series_limit) = any.downcast_ref::() { + let series_limit_exec = + plan_series_limit(session_state, series_limit, logical_inputs, physical_inputs)?; + Some(Arc::new(series_limit_exec)) } else if let Some(sleep) = any.downcast_ref::() { let sleep = sleep.plan(planner, logical_inputs, physical_inputs, session_state)?; Some(Arc::new(sleep)) diff --git a/iox_query/src/exec/gapfill/algo.rs b/iox_query/src/exec/gapfill/algo.rs index 1c6e283f..b9829931 100644 --- a/iox_query/src/exec/gapfill/algo.rs +++ b/iox_query/src/exec/gapfill/algo.rs @@ -132,17 +132,17 @@ impl GapFiller { pub fn build_gapfilled_output( &mut self, schema: SchemaRef, - input_time_array: (usize, 
&TimestampNanosecondArray), - group_arrays: &[(usize, ArrayRef)], - aggr_arrays: &[(usize, ArrayRef)], + input_time_array: &TimestampNanosecondArray, + series_arrays: &[ArrayRef], + fill_arrays: &[(usize, ArrayRef)], ) -> Result { - let series_ends = self.plan_output_batch(input_time_array.1, group_arrays)?; + let series_ends = self.plan_output_batch(input_time_array, series_arrays)?; self.cursor.remaining_output_batch_size = self.batch_size; self.build_output( schema, input_time_array, - group_arrays, - aggr_arrays, + series_arrays, + fill_arrays, &series_ends, ) } @@ -167,18 +167,15 @@ impl GapFiller { fn plan_output_batch( &mut self, input_time_array: &TimestampNanosecondArray, - group_arr: &[(usize, ArrayRef)], + series_arr: &[ArrayRef], ) -> Result> { - if group_arr.is_empty() { + if series_arr.is_empty() { // there are no group columns, so the output // will be just one big series. return Ok(vec![input_time_array.len()]); } - let sort_columns = group_arr - .iter() - .map(|(_, arr)| Arc::clone(arr)) - .collect::>(); + let sort_columns = series_arr.to_vec(); let mut ranges = partition(&sort_columns)?.ranges().into_iter(); @@ -218,32 +215,27 @@ impl GapFiller { fn build_output( &mut self, schema: SchemaRef, - input_time_array: (usize, &TimestampNanosecondArray), - group_arr: &[(usize, ArrayRef)], - aggr_arr: &[(usize, ArrayRef)], + input_time_array: &TimestampNanosecondArray, + series_arr: &[ArrayRef], + fill_arr: &[(usize, ArrayRef)], series_ends: &[usize], ) -> Result { - let mut output_arrays: Vec<(usize, ArrayRef)> = - Vec::with_capacity(group_arr.len() + aggr_arr.len() + 1); // plus one for time column + let mut output_arrays: Vec = + Vec::with_capacity(series_arr.len() + fill_arr.len() + 1); // plus one for time column // build the time column let mut cursor = self.cursor.clone_for_aggr_col(None)?; - let (time_idx, input_time_array) = input_time_array; let time_vec = cursor.build_time_vec(&self.params, series_ends, input_time_array)?; let output_time_len = time_vec.len(); - output_arrays.push(( - time_idx, - Arc::new( - TimestampNanosecondArray::from(time_vec) - .with_timezone_opt(input_time_array.timezone()), - ), - )); + let time_arr = Arc::new( + TimestampNanosecondArray::from(time_vec).with_timezone_opt(input_time_array.timezone()), + ); // There may not be any aggregate or group columns, so use this cursor state as the new // GapFiller cursor once this output batch is complete. 
let mut final_cursor = cursor; // build the other group columns - for (idx, ga) in group_arr { + for ga in series_arr { let mut cursor = self.cursor.clone_for_aggr_col(None)?; let take_vec = cursor.build_group_take_vec(&self.params, series_ends, input_time_array)?; @@ -255,11 +247,12 @@ impl GapFiller { ))); } let take_arr = UInt64Array::from(take_vec); - output_arrays.push((*idx, take::take(ga, &take_arr, None)?)); + output_arrays.push(take::take(ga, &take_arr, None)?); } + output_arrays.push(time_arr); // Build the aggregate columns - for (idx, aa) in aggr_arr { + for (idx, aa) in fill_arr { let mut cursor = self.cursor.clone_for_aggr_col(Some(*idx))?; let output_array = cursor.build_aggr_col(&self.params, series_ends, input_time_array, aa)?; @@ -270,14 +263,13 @@ impl GapFiller { output_time_len ))); } - output_arrays.push((*idx, output_array)); + output_arrays.push(output_array); final_cursor.merge_aggr_col_cursor(cursor); } - output_arrays.sort_by(|(a, _), (b, _)| a.cmp(b)); - let output_arrays: Vec<_> = output_arrays.into_iter().map(|(_, arr)| arr).collect(); - let batch = RecordBatch::try_new(Arc::clone(&schema), output_arrays) - .map_err(|err| DataFusionError::ArrowError(Box::new(err), None))?; + let batch = RecordBatch::try_new(Arc::clone(&schema), output_arrays).map_err(|err| { + DataFusionError::ArrowError(Box::new(err), None).context("build_output") + })?; self.cursor = final_cursor; Ok(batch) diff --git a/iox_query/src/exec/gapfill/buffered_input.rs b/iox_query/src/exec/gapfill/buffered_input.rs index 82a17a3f..8911dadb 100644 --- a/iox_query/src/exec/gapfill/buffered_input.rs +++ b/iox_query/src/exec/gapfill/buffered_input.rs @@ -26,8 +26,9 @@ use super::{FillStrategy, params::GapFillParams}; /// [`FillStrategy::LinearInterpolate`]: super::FillStrategy::LinearInterpolate /// [`GapFillStream`]: super::stream::GapFillStream pub(super) struct BufferedInput { - /// Indexes of group columns in the schema (not including time). - group_cols: Vec, + /// Indexes of series columns in the schema. These are the columns + /// that will have consistent values for all rows in a time series. + series_cols: Vec, /// Indexes of aggregate columns filled via interpolation. interpolate_cols: Vec, /// Buffered records from the input stream. 
@@ -42,7 +43,7 @@ pub(super) struct BufferedInput { } impl BufferedInput { - pub(super) fn new(params: &GapFillParams, group_cols: Vec) -> Self { + pub(super) fn new(params: &GapFillParams, series_cols: Vec) -> Self { let interpolate_cols = params .fill_strategy .iter() @@ -51,7 +52,7 @@ impl BufferedInput { }) .collect::>(); Self { - group_cols, + series_cols, interpolate_cols, batches: vec![], row_converter: None, @@ -170,7 +171,7 @@ impl BufferedInput { /// /// [`arrow::row`]: https://docs.rs/arrow-row/36.0.0/arrow_row/index.html fn group_columns_changed(&mut self, last_output_row_idx: (usize, usize)) -> Result { - if self.group_cols.is_empty() { + if self.series_cols.is_empty() { return Ok(false); } @@ -193,7 +194,7 @@ impl BufferedInput { if self.row_converter.is_none() { let batch = self.batches.first().expect("at least one batch"); let sort_fields = self - .group_cols + .series_cols .iter() .map(|c| SortField::new(batch.column(*c).data_type().clone())) .collect(); @@ -208,7 +209,7 @@ impl BufferedInput { fn convert_row(&mut self, row_idxs: (usize, usize)) -> Result { let batch = &self.batches[row_idxs.0]; let columns: Vec = self - .group_cols + .series_cols .iter() .map(|col_idx| batch.column(*col_idx).slice(row_idxs.1, 1)) .collect(); diff --git a/iox_query/src/exec/gapfill/exec_tests.rs b/iox_query/src/exec/gapfill/exec_tests.rs index bd5d5dfc..995e12f1 100644 --- a/iox_query/src/exec/gapfill/exec_tests.rs +++ b/iox_query/src/exec/gapfill/exec_tests.rs @@ -15,10 +15,9 @@ use arrow_util::test_util::batches_to_lines; use datafusion::{ error::Result, execution::runtime_env::RuntimeEnvBuilder, - functions::datetime::date_bin::DateBinFunc, physical_plan::{ collect, - expressions::{col as phys_col, lit as phys_lit}, + expressions::{Column, col as phys_col, lit as phys_lit}, test::exec::MockExec, }, prelude::{SessionConfig, SessionContext}, @@ -42,11 +41,13 @@ fn test_gapfill_simple() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; // For this simple test case, also test that // memory is tracked correctly, which is done by @@ -84,11 +85,13 @@ fn test_gapfill_simple_tz() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; // For this simple test case, also test that // memory is tracked correctly, which is done by @@ -129,11 +132,13 @@ fn test_gapfill_simple_no_group_no_aggr() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, Some(975), 1_125); + let time_expr= get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -169,11 +174,13 @@ fn test_gapfill_multi_group_simple() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + 
time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -215,11 +222,13 @@ fn test_gapfill_multi_group_simple_origin() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_origin_fill_strategy(&records, 25, Some(975), 1_125, Some(3), None); + let time_expr = get_date_bin_expr(&records, 25, Some(3)); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -286,11 +295,13 @@ fn test_gapfill_multi_group_with_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -370,11 +381,13 @@ fn test_gapfill_multi_group_cols_with_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -431,11 +444,13 @@ fn test_gapfill_multi_group_cols_with_more_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_025); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_025), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -519,11 +534,13 @@ fn test_gapfill_multi_aggr_cols_with_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&records, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -568,11 +585,13 @@ fn test_gapfill_simple_no_lower_bound() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, None, 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(None, 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -637,11 +656,13 @@ fn test_gapfill_fill_prev() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, Some(FillStrategy::PrevNullAsIntentional)); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::PrevNullAsIntentional), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -716,11 +737,13 @@ fn 
test_gapfill_fill_prev_null_as_missing() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, Some(FillStrategy::PrevNullAsMissing)); + let time_expr = get_date_bin_expr(&records, 25, None ); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::PrevNullAsMissing), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -816,11 +839,13 @@ fn test_gapfill_fill_prev_null_as_missing_many_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, Some(FillStrategy::PrevNullAsMissing)); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::PrevNullAsMissing), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -926,17 +951,13 @@ fn test_gapfill_fill_interpolate() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms_with_fill_strategy( - &records, - 25, - Some(975), - 1_125, - Some(FillStrategy::LinearInterpolate) - ); + let time_expr = get_date_bin_expr(&records,25,None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::LinearInterpolate), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -1024,11 +1045,13 @@ fn test_gapfill_simple_no_lower_bound_with_nulls() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, None, 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(None, 1_125), + fill_strategy: None, }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -1074,11 +1097,13 @@ fn test_gapfill_oom() { struct_cols: vec![], input_batch_size, }; - let params = get_params_ms(&batch, 25, Some(975), 1_125); + let time_expr = get_date_bin_expr(&batch, 25, None); let tc = TestCase { test_records: batch, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: None, }; let result = tc.run_with_memory_limit(1); assert_error!(result, DataFusionError::ResourcesExhausted(_)); @@ -1145,17 +1170,13 @@ fn test_gapfill_interpolate_struct() { ]], input_batch_size, }; - let params = get_params_ms_with_fill_strategy( - &records, - 25, - Some(975), - 1_125, - Some(FillStrategy::LinearInterpolate) - ); + let time_expr= get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + fill_strategy: Some(FillStrategy::LinearInterpolate), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -1251,17 +1272,13 @@ fn test_gapfill_interpolate_struct_additional_data() { ]], input_batch_size, }; - let params = get_params_ms_with_fill_strategy( - &records, - 25, - Some(975), - 1_125, - Some(FillStrategy::LinearInterpolate) - ); + let time_expr = get_date_bin_expr(&records, 25, None); let tc = TestCase { test_records: records, output_batch_size, - params, + time_expr, + time_range: get_time_range(Some(975), 1_125), + 
fill_strategy: Some(FillStrategy::LinearInterpolate), }; let batches = tc.run().unwrap(); let actual = batches_to_lines(&batches); @@ -1391,7 +1408,7 @@ impl TestRecords { for i in 0..self.schema().fields().len() { match i.cmp(&ngroup_cols) { Ordering::Less => group_expr.push(Arc::new(Column::new(&format!("g{i}"), i))), - Ordering::Equal => group_expr.push(Arc::new(Column::new("t", i))), + Ordering::Equal => continue, Ordering::Greater => { let idx = i - ngroup_cols + 1; aggr_expr.push(Arc::new(Column::new(&format!("a{idx}"), i))); @@ -1464,7 +1481,9 @@ impl TryFrom for Vec { struct TestCase { test_records: TestRecords, output_batch_size: usize, - params: GapFillExecParams, + time_expr: Arc, + time_range: Range>>, + fill_strategy: Option, } impl TestCase { @@ -1502,8 +1521,8 @@ impl TestCase { fn plan(self) -> Result> { let schema = self.test_records.schema(); - let (group_expr, aggr_expr) = self.test_records.exprs()?; - + let (series_expr, _) = self.test_records.exprs()?; + let fill_expr = phys_fill_expr(&self.test_records, self.fill_strategy)?; let input_batch_size = self.test_records.input_batch_size; let num_records = self.test_records.len(); @@ -1522,11 +1541,13 @@ impl TestCase { MockExec::new(batches.into_iter().map(Ok).collect(), Arc::clone(&schema)) .with_use_task(false), ); + let plan = Arc::new(GapFillExec::try_new( input, - group_expr, - aggr_expr, - self.params.clone(), + series_expr, + Arc::clone(&self.time_expr), + fill_expr, + self.time_range.clone(), )?); Ok(plan) } @@ -1548,75 +1569,62 @@ fn bound_included_from_option(o: Option) -> Bound { } } -fn phys_fill_strategies( +fn phys_fill_expr( records: &TestRecords, fill_strategy: Option, -) -> Result, FillStrategy)>> { +) -> Result> { let start = records.group_cols.len() + 1; // 1 is for time col let end = start + records.agg_cols.len() + records.struct_cols.len(); let mut v = Vec::with_capacity(records.agg_cols.len()); for f in &records.schema().fields()[start..end] { - v.push(( - phys_col(f.name(), &records.schema())?, - match fill_strategy { + v.push(PhysicalFillExpr { + expr: phys_col(f.name(), &records.schema())?, + strategy: match fill_strategy { Some(ref fs) => fs.clone(), None => FillStrategy::Default(f.data_type().try_into()?), }, - )); + }); } Ok(v) } -fn get_params_ms_with_fill_strategy( - batch: &TestRecords, - stride_ms: i64, - start: Option, - end: i64, - fill_strategy: Option, -) -> GapFillExecParams { - get_params_ms_with_origin_fill_strategy(batch, stride_ms, start, end, None, fill_strategy) -} - -fn get_params_ms_with_origin_fill_strategy( +fn get_date_bin_expr( batch: &TestRecords, stride_ms: i64, - start: Option, - end: i64, origin_ms: Option, - fill_strategy: Option, -) -> GapFillExecParams { - // stride is in ms - let stride = ScalarValue::new_interval_mdn(0, 0, stride_ms * 1_000_000); - let origin = - origin_ms.map(|o| phys_lit(ScalarValue::TimestampNanosecond(Some(o * 1_000_000), None))); +) -> Arc { + let mut args = vec![ + phys_lit(ScalarValue::new_interval_mdn(0, 0, stride_ms * 1_000_000)), + Arc::new(Column::new("t", batch.group_cols.len())), + ]; + args.extend( + origin_ms + .iter() + .map(|ms| phys_lit(ScalarValue::TimestampNanosecond(Some(ms * 1_000_000), None))), + ); + Arc::new(ScalarFunctionExpr::new( + "time", + datafusion::functions::datetime::date_bin(), + args, + Arc::new(Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, batch.timezone.clone()), + true, + )), + )) +} - GapFillExecParams { - date_bin_udf: Arc::new(ScalarUDF::new_from_impl(DateBinFunc::new())), - stride: 
phys_lit(stride), - time_column: Column::new("t", batch.group_cols.len()), - origin, - // timestamps are nanos, so scale them accordingly - time_range: Range { - start: bound_included_from_option(start.map(|start| { - phys_lit(ScalarValue::TimestampNanosecond( - Some(start * 1_000_000), - None, - )) - })), - end: Bound::Included(phys_lit(ScalarValue::TimestampNanosecond( - Some(end * 1_000_000), +fn get_time_range(start: Option, end: i64) -> Range>> { + Range { + start: bound_included_from_option(start.map(|start| { + phys_lit(ScalarValue::TimestampNanosecond( + Some(start * 1_000_000), None, - ))), - }, - fill_strategy: phys_fill_strategies(batch, fill_strategy).unwrap(), + )) + })), + end: Bound::Included(phys_lit(ScalarValue::TimestampNanosecond( + Some(end * 1_000_000), + None, + ))), } } - -fn get_params_ms( - batch: &TestRecords, - stride: i64, - start: Option, - end: i64, -) -> GapFillExecParams { - get_params_ms_with_fill_strategy(batch, stride, start, end, None) -} diff --git a/iox_query/src/exec/gapfill/mod.rs b/iox_query/src/exec/gapfill/mod.rs index 6ccf28fa..3e23043f 100644 --- a/iox_query/src/exec/gapfill/mod.rs +++ b/iox_query/src/exec/gapfill/mod.rs @@ -12,9 +12,11 @@ mod params; mod stream; use self::stream::GapFillStream; +use arrow::datatypes::Schema; use arrow::{compute::SortOptions, datatypes::SchemaRef}; -use datafusion::common::plan_datafusion_err; -use datafusion::physical_expr::{LexOrdering, OrderingRequirements}; +use datafusion::common::{DFSchema, plan_datafusion_err}; +use datafusion::logical_expr::ExprSchemable; +use datafusion::physical_expr::{LexOrdering, OrderingRequirements, ScalarFunctionExpr}; use datafusion::physical_plan::metrics::MetricsSet; use datafusion::{ common::DFSchemaRef, @@ -23,12 +25,11 @@ use datafusion::{ context::{SessionState, TaskContext}, memory_pool::MemoryConsumer, }, - logical_expr::{LogicalPlan, ScalarUDF, UserDefinedLogicalNodeCore}, + logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}, physical_expr::{EquivalenceProperties, PhysicalSortExpr}, physical_plan::{ DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, - Partitioning, PhysicalExpr, PlanProperties, SendableRecordBatchStream, Statistics, - expressions::Column, + PhysicalExpr, PlanProperties, SendableRecordBatchStream, Statistics, metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, }, prelude::Expr, @@ -49,13 +50,19 @@ use std::{ pub struct GapFill { /// The incoming logical plan pub input: Arc, - /// Grouping expressions - pub group_expr: Vec, - /// Aggregate expressions - pub aggr_expr: Vec, - /// Parameters to configure the behavior of the - /// gap-filling operation - pub params: GapFillParams, + /// Series expressions + pub series_expr: Vec, + /// Time binning expr + pub time_expr: Expr, + /// Filling expressions + pub fill_expr: Vec, + /// The time range of the time column inferred from predicates + /// in the overall query. The lower bound may be [`Bound::Unbounded`] + /// which implies that gap-filling should just start from the + /// first point in each series. 
+ pub time_range: Range>, + /// The schema after the gap-fill operation + pub schema: DFSchemaRef, } // Manual impl because GapFillParams has a Range and is not PartialOrd @@ -64,31 +71,21 @@ impl PartialOrd for GapFill { other .input .partial_cmp(&self.input) - .then_with_opt(|| self.group_expr.partial_cmp(&other.group_expr)) - .then_with_opt(|| self.aggr_expr.partial_cmp(&other.aggr_expr)) + .then_with_opt(|| self.series_expr.partial_cmp(&other.series_expr)) + .then_with_opt(|| self.fill_expr.partial_cmp(&other.fill_expr)) } } -/// Parameters to the GapFill operation -#[derive(Clone, Debug, Hash, PartialEq, Eq)] -pub struct GapFillParams { - /// The name of the UDF that provides the DATE_BIN like functionality. - pub date_bin_udf: Arc, - /// The stride argument from the call to DATE_BIN_GAPFILL - pub stride: Expr, - /// The source time column - pub time_column: Expr, - /// The origin argument from the call to DATE_BIN_GAPFILL - pub origin: Option, - /// The time range of the time column inferred from predicates - /// in the overall query. The lower bound may be [`Bound::Unbounded`] - /// which implies that gap-filling should just start from the - /// first point in each series. - pub time_range: Range>, - /// What to do when filling aggregate columns. - /// The first item in the tuple will be the column - /// reference for the aggregate column. - pub fill_strategy: Vec<(Expr, FillStrategy)>, +#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd)] +pub struct FillExpr { + pub expr: Expr, + pub strategy: FillStrategy, +} + +impl std::fmt::Display for FillExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.strategy.display_with_expr(&self.expr)) + } } /// Describes how to fill gaps in an aggregate column. @@ -116,85 +113,8 @@ impl FillStrategy { Self::PrevNullAsMissing => format!("LOCF({expr})"), Self::LinearInterpolate => format!("INTERPOLATE({expr})"), Self::Default(scalar) if scalar.is_null() => expr.to_string(), - Self::Default(val) => format!("COALESCE({val}, {expr})"), - } - } -} - -impl GapFillParams { - // Extract the expressions so they can be optimized. 
- fn expressions(&self) -> Vec { - let mut exprs = vec![self.stride.clone(), self.time_column.clone()]; - if let Some(e) = self.origin.as_ref() { - exprs.push(e.clone()) - } - if let Some(start) = bound_extract(&self.time_range.start) { - exprs.push(start.clone()); - } - exprs.push( - bound_extract(&self.time_range.end) - .unwrap_or_else(|| panic!("upper time bound is required")) - .clone(), - ); - exprs - } - - #[expect(clippy::wrong_self_convention)] // follows convention of UserDefinedLogicalNode - fn from_template(&self, exprs: &[Expr], aggr_expr: &[Expr]) -> Self { - let mut e_iter = exprs.iter().cloned(); - - // we only need the third item in the iter if `Some(_) == self.origin` so that's why we - // match against `None | Some(Some(_))` - that ensures either origin is None, or origin is - // Some and e_iter.next() is Some - let (Some(stride), Some(time_column), origin @ (None | Some(Some(_)))) = ( - e_iter.next(), - e_iter.next(), - self.origin.as_ref().map(|_| e_iter.next()), - ) else { - panic!("`exprs` should contain at least a stride, source, and origin"); - }; - - let origin = origin.flatten(); - - let time_range = match try_map_range(&self.time_range, |b| { - try_map_bound(b.as_ref(), |_| { - Ok::<_, Infallible>(e_iter.next().expect("expr count should match template")) - }) - }) { - Ok(tr) => tr, - Err(infallible) => match infallible {}, - }; - - let fill_strategy = aggr_expr - .iter() - .cloned() - .zip( - self.fill_strategy - .iter() - .map(|(_expr, fill_strategy)| fill_strategy.clone()), - ) - .collect(); - - Self { - date_bin_udf: Arc::clone(&self.date_bin_udf), - stride, - time_column, - origin, - time_range, - fill_strategy, - } - } - - // Find the expression that matches `e` and replace its fill strategy. - // If such an expression is found, return the old strategy, and `None` otherwise. - fn replace_fill_strategy(&mut self, e: &Expr, mut fs: FillStrategy) -> Option { - for expr_fs in &mut self.fill_strategy { - if &expr_fs.0 == e { - std::mem::swap(&mut fs, &mut expr_fs.1); - return Some(fs); - } + Self::Default(val) => format!("COALESCE({expr}, {val})"), } - None } } @@ -202,21 +122,61 @@ impl GapFill { /// Create a new gap-filling operator. 
pub fn try_new( input: Arc, - group_expr: Vec, - aggr_expr: Vec, - params: GapFillParams, + series_expr: Vec, + time_expr: Expr, + fill_expr: Vec, + time_range: Range>, ) -> Result { - if params.time_range.end == Bound::Unbounded { + let (time_alias, time_col) = { + let (time_alias, time_expr) = if let Expr::Alias(alias) = &time_expr { + (Some(alias.name.clone()), alias.expr.as_ref()) + } else { + (None, &time_expr) + }; + let Expr::ScalarFunction(time_func) = time_expr else { + return Err(DataFusionError::Internal( + "GapFill time expression must be a ScalarFunctionExpr".to_string(), + )); + }; + let time_col = time_func.args.get(1).ok_or_else(|| { + DataFusionError::Internal( + "GapFill time expression must have at least two arguments".to_string(), + ) + })?; + (time_alias, time_col.clone()) + }; + + if time_range.end == Bound::Unbounded { return Err(DataFusionError::Internal( "missing upper bound in GapFill time range".to_string(), )); } + let time_schema_expr = if let Some(alias) = &time_alias { + time_col.alias(alias) + } else { + time_col + }; + + let fields = series_expr + .iter() + .chain(std::iter::once(&time_schema_expr)) + .chain(fill_expr.iter().map(|fe| &fe.expr)) + .map(|expr| expr.to_field(input.schema().as_ref())) + .collect::>>()?; + + let schema = Arc::new(DFSchema::new_with_metadata( + fields, + input.schema().metadata().clone(), + )?); + Ok(Self { input, - group_expr, - aggr_expr, - params, + series_expr, + time_expr, + fill_expr, + time_range, + schema, }) } @@ -225,9 +185,15 @@ impl GapFill { pub(crate) fn replace_fill_strategy( &mut self, e: &Expr, - fs: FillStrategy, + mut fs: FillStrategy, ) -> Option { - self.params.replace_fill_strategy(e, fs) + for fe in &mut self.fill_expr { + if &fe.expr == e { + std::mem::swap(&mut fe.strategy, &mut fs); + return Some(fs); + } + } + None } } @@ -241,29 +207,39 @@ impl UserDefinedLogicalNodeCore for GapFill { } fn schema(&self) -> &DFSchemaRef { - self.input.schema() + &self.schema } fn expressions(&self) -> Vec { - self.group_expr - .iter() - .chain(&self.aggr_expr) - .chain(&self.params.expressions()) - .cloned() - .collect() + let mut exprs = Vec::with_capacity(self.series_expr.len() + 1 + self.fill_expr.len() + 2); + for e in &self.series_expr { + exprs.push(e.clone()); + } + exprs.push(self.time_expr.clone()); + for fe in &self.fill_expr { + exprs.push(fe.expr.clone()); + } + if let Some(start) = bound_extract(&self.time_range.start) { + exprs.push(start.clone()); + } + exprs.push( + bound_extract(&self.time_range.end) + .unwrap_or_else(|| panic!("upper time bound is required")) + .clone(), + ); + exprs } fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let aggr_expr: String = self - .params - .fill_strategy + let fill_expr: String = self + .fill_expr .iter() - .map(|(e, fs)| fs.display_with_expr(e)) + .map(|e| e.to_string()) .collect::>() .join(", "); - let group_expr = self - .group_expr + let series_expr = self + .series_expr .iter() .map(|e| e.to_string()) .collect::>() @@ -271,24 +247,48 @@ impl UserDefinedLogicalNodeCore for GapFill { write!( f, - "{}: groupBy=[{group_expr}], aggr=[[{aggr_expr}]], time_column={}, stride={}, range={:?}", + "{}: series=[{series_expr}], time={}, fill=[{fill_expr}], range={:?}", self.name(), - self.params.time_column, - self.params.stride, - self.params.time_range, + self.time_expr, + self.time_range, ) } fn with_exprs_and_inputs( &self, - mut group_expr: Vec, + mut series_expr: Vec, inputs: Vec, ) -> Result { let plan = inputs[0].clone(); - let mut 
aggr_expr = group_expr.split_off(self.group_expr.len()); - let param_expr = aggr_expr.split_off(self.aggr_expr.len()); - let params = self.params.from_template(¶m_expr, &aggr_expr); - Self::try_new(Arc::new(plan), group_expr, aggr_expr, params) + let mut fill_expr = series_expr.split_off(self.series_expr.len() + 1); + let time_expr = series_expr + .pop() + .expect("there should be at least one series expr (the time expr)"); + let mut e_iter = fill_expr.split_off(self.fill_expr.len()).into_iter(); + let time_range = match try_map_range(&self.time_range, |b| { + try_map_bound(b.as_ref(), |_| { + Ok::<_, Infallible>(e_iter.next().expect("expr count should match template")) + }) + }) { + Ok(tr) => tr, + Err(infallible) => match infallible {}, + }; + + let fill_expr = fill_expr + .into_iter() + .zip(self.fill_expr.iter().map(|fe| fe.strategy.clone())) + .map(|(e, fs)| FillExpr { + expr: e, + strategy: fs, + }) + .collect(); + Self::try_new( + Arc::new(plan), + series_expr, + time_expr, + fill_expr, + time_range, + ) } /// Projection pushdown is an optmization that pushes a `Projection` node further down @@ -353,79 +353,37 @@ pub(crate) fn plan_gap_fill( } }; - let input_schema = phys_input.schema(); - let input_schema = input_schema.as_ref(); - - let group_expr: Result> = gap_fill - .group_expr - .iter() - .map(|e| session_state.create_physical_expr(e.clone(), input_dfschema)) - .collect(); - let group_expr = group_expr?; - - let aggr_expr: Result> = gap_fill - .aggr_expr + let series_expr = gap_fill + .series_expr .iter() - .map(|e| session_state.create_physical_expr(e.clone(), input_dfschema)) - .collect(); - let aggr_expr = aggr_expr?; - - let Some(logical_time_column) = gap_fill.params.time_column.try_as_col() else { - return Err(DataFusionError::Internal( - "GapFillExec: time column must be a `Column` expression".to_string(), - )); - }; - let time_column = Column::new_with_schema(&logical_time_column.name, input_schema)?; - - let stride = - session_state.create_physical_expr(gap_fill.params.stride.clone(), input_dfschema)?; - - let time_range = &gap_fill.params.time_range; - let time_range = try_map_range(time_range, |b| { + .map(|expr| session_state.create_physical_expr(expr.clone(), input_dfschema)) + .collect::>>()?; + let time_expr = + session_state.create_physical_expr(gap_fill.time_expr.clone(), input_dfschema)?; + let time_range = try_map_range(&gap_fill.time_range, |b| { try_map_bound(b.as_ref(), |e| { session_state.create_physical_expr(e.clone(), input_dfschema) }) })?; - let origin = gap_fill - .params - .origin - .as_ref() - .map(|e| session_state.create_physical_expr(e.clone(), input_dfschema)) - .transpose()?; - - let fill_strategy = gap_fill - .params - .fill_strategy + let fill_expr = gap_fill + .fill_expr .iter() - .map(|(e, fs)| { - Ok(( - session_state.create_physical_expr(e.clone(), input_dfschema)?, - fs.clone(), - )) + .map(|fe| { + Ok(PhysicalFillExpr { + expr: session_state.create_physical_expr(fe.expr.clone(), input_dfschema)?, + strategy: fe.strategy.clone(), + }) }) - .collect::, FillStrategy)>>>()?; - - let date_bin_udf = session_state - .scalar_functions() - .get(gap_fill.params.date_bin_udf.as_ref()) - .cloned() - .ok_or_else(|| { - DataFusionError::Execution(format!( - "ScalarUDF {} not found", - gap_fill.params.date_bin_udf - )) - })?; - - let params = GapFillExecParams { - date_bin_udf, - stride, - time_column, - origin, + .collect::>>()?; + + GapFillExec::try_new( + Arc::clone(phys_input), + series_expr, + time_expr, + fill_expr, time_range, - 
fill_strategy, - }; - GapFillExec::try_new(Arc::clone(phys_input), group_expr, aggr_expr, params) + ) } fn try_map_range(tr: &Range, mut f: F) -> Result, E> @@ -459,96 +417,97 @@ fn bound_extract(b: &Bound) -> Option<&T> { /// A physical node for the gap-fill operation. pub struct GapFillExec { input: Arc, - // The group by expressions from the original aggregation node. - group_expr: Vec>, - // The aggregate expressions from the original aggregation node. - aggr_expr: Vec>, + // Expressions which separate the time-series that are being filled. + series_expr: Vec>, + /// The time expression within the series. + time_expr: Arc, + /// Expressions that describe how values are filled. + fill_expr: Vec, + /// The output schema. + schema: SchemaRef, // The sort expressions for the required sort order of the input: // all of the group expressions, with the time column being last. sort_expr: LexOrdering, - // Parameters (besides streaming data) to gap filling - params: GapFillExecParams, + /// The time range of source input to DATE_BIN_GAPFILL. + /// Inferred from predicates in the overall query. + time_range: Range>>, /// Metrics reporting behavior during execution. metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning, output ordering etc. cache: PlanProperties, } -#[derive(Clone, Debug)] -struct GapFillExecParams { - /// The scalar function used to bin the timestamps. - date_bin_udf: Arc, - /// The uniform interval of incoming timestamps - stride: Arc, - /// The timestamp column produced by date_bin - time_column: Column, - /// The origin argument from the all to DATE_BIN_GAPFILL - origin: Option>, - /// The time range of source input to DATE_BIN_GAPFILL. - /// Inferred from predicates in the overall query. - time_range: Range>>, - /// What to do when filling aggregate columns. - /// The 0th element in each tuple is the aggregate column. - fill_strategy: Vec<(Arc, FillStrategy)>, -} - impl GapFillExec { fn try_new( input: Arc, - group_expr: Vec>, - aggr_expr: Vec>, - params: GapFillExecParams, + series_expr: Vec>, + time_expr: Arc, + fill_expr: Vec, + time_range: Range>>, ) -> Result { + let time_col = { + let Some(time_func) = time_expr.as_any().downcast_ref::() else { + return Err(DataFusionError::Internal(format!( + "GapFill time expression must be a ScalarFunctionExpr: {}", + time_expr + ))); + }; + let Some(time_col) = time_func.args().get(1) else { + return Err(DataFusionError::Internal(format!( + "GapFill time expression must have at least two arguments: {}", + time_expr + ))); + }; + + Arc::clone(time_col) + }; let sort_expr = { - let mut sort_expr: Vec<_> = group_expr + let sort_expr: Vec<_> = series_expr .iter() .map(|expr| PhysicalSortExpr { expr: Arc::clone(expr), options: SortOptions::default(), }) + // Add the time input as the lowest priority sort key. + .chain(std::iter::once(PhysicalSortExpr { + expr: Arc::clone(&time_col), + options: SortOptions::default(), + })) .collect(); - // Ensure that the time column is the last component in the sort - // expressions.
- let time_idx = group_expr - .iter() - .enumerate() - .find(|(_i, e)| { - e.as_any() - .downcast_ref::() - .is_some_and(|c| c.index() == params.time_column.index()) - }) - .map(|(i, _)| i); - - if let Some(time_idx) = time_idx { - let last_elem = sort_expr.len() - 1; - sort_expr.swap(time_idx, last_elem); - } else { - return Err(DataFusionError::Internal( - "could not find time column for GapFillExec".to_string(), - )); - } - LexOrdering::new(sort_expr) .ok_or_else(|| plan_datafusion_err!("GapFill sort key empty"))? }; - let cache = Self::compute_properties(&input); + let input_schema = input.schema(); + let fields = series_expr + .iter() + .chain(std::iter::once(&time_col)) + .chain(fill_expr.iter().map(|fe| &fe.expr)) + .map(|expr| expr.return_field(&input_schema)) + .collect::>>()?; + let schema = Arc::new(Schema::new_with_metadata( + fields, + input_schema.metadata().clone(), + )); + + let cache = Self::compute_properties(&input, Arc::clone(&schema)); Ok(Self { input, - group_expr, - aggr_expr, + series_expr, + time_expr, + fill_expr, + schema, sort_expr, - params, + time_range, metrics: ExecutionPlanMetricsSet::new(), cache, }) } /// This function creates the cache object that stores the plan properties such as equivalence properties, partitioning, ordering, etc. - fn compute_properties(input: &Arc) -> PlanProperties { - let schema = input.schema(); + fn compute_properties(input: &Arc, schema: SchemaRef) -> PlanProperties { let eq_properties = match input.properties().output_ordering() { None => EquivalenceProperties::new(schema), Some(output_ordering) => EquivalenceProperties::new_with_orderings( @@ -557,11 +516,9 @@ impl GapFillExec { ), }; - let output_partitioning = Partitioning::UnknownPartitioning(1); - PlanProperties::new( eq_properties, - output_partitioning, + input.properties().output_partitioning().clone(), input.pipeline_behavior(), input.boundedness(), ) @@ -584,7 +541,7 @@ impl ExecutionPlan for GapFillExec { } fn schema(&self) -> SchemaRef { - self.input.schema() + Arc::clone(&self.schema) } fn properties(&self) -> &PlanProperties { @@ -592,9 +549,14 @@ impl ExecutionPlan for GapFillExec { } fn required_input_distribution(&self) -> Vec { - // It seems like it could be possible to partition on all the - // group keys except for the time expression. For now, keep it simple. - vec![Distribution::SinglePartition] + vec![if self.series_expr.is_empty() { + // If there are no series expressions then the input is a + // single time series. There is no advantage to partitioning + // in that case. 
+ Distribution::SinglePartition + } else { + Distribution::HashPartitioned(self.series_expr.clone()) + }] } fn required_input_ordering(&self) -> Vec> { @@ -618,9 +580,10 @@ impl ExecutionPlan for GapFillExec { match children.as_slice() { [child] => Ok(Arc::new(Self::try_new( Arc::clone(child), - self.group_expr.clone(), - self.aggr_expr.clone(), - self.params.clone(), + self.series_expr.clone(), + Arc::clone(&self.time_expr), + self.fill_expr.clone(), + self.time_range.clone(), )?)), _ => Err(DataFusionError::Internal(format!( "GapFillExec wrong number of children: expected 1, found {}", @@ -634,9 +597,15 @@ impl ExecutionPlan for GapFillExec { partition: usize, context: Arc, ) -> Result { - if partition != 0 { + if partition + >= self + .input + .properties() + .output_partitioning() + .partition_count() + { return Err(DataFusionError::Internal(format!( - "GapFillExec invalid partition {partition}, there can be only one partition" + "GapFillExec invalid partition {partition}" ))); } @@ -669,15 +638,19 @@ impl DisplayAs for GapFillExec { DisplayFormatType::Default | DisplayFormatType::Verbose | DisplayFormatType::TreeRender => { - let group_expr: Vec<_> = self.group_expr.iter().map(|e| e.to_string()).collect(); - let aggr_expr: Vec<_> = self - .params - .fill_strategy + let series_expr: Vec<_> = self.series_expr.iter().map(|e| e.to_string()).collect(); + let fill_expr: Vec<_> = self + .fill_expr .iter() - .map(|(e, fs)| fs.display_with_expr(e)) + .map( + |PhysicalFillExpr { + expr: e, + strategy: fs, + }| fs.display_with_expr(e), + ) .collect(); - let time_range = match try_map_range(&self.params.time_range, |b| { + let time_range = match try_map_range(&self.time_range, |b| { try_map_bound(b.as_ref(), |e| Ok::<_, Infallible>(e.to_string())) }) { Ok(tr) => tr, @@ -686,10 +659,10 @@ impl DisplayAs for GapFillExec { write!( f, - "GapFillExec: group_expr=[{}], aggr_expr=[{}], stride={}, time_range={:?}", - group_expr.join(", "), - aggr_expr.join(", "), - self.params.stride, + "GapFillExec: series_expr=[{}], time_expr={}, fill_expr=[{}], time_range={:?}", + series_expr.join(", "), + self.time_expr, + fill_expr.join(", "), time_range ) } @@ -697,6 +670,19 @@ impl DisplayAs for GapFillExec { } } +/// A physical expression that represents a fill operation. 
+#[derive(Debug, Clone)] +pub struct PhysicalFillExpr { + pub expr: Arc, + pub strategy: FillStrategy, +} + +impl std::fmt::Display for PhysicalFillExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.strategy.display_with_expr(&self.expr)) + } +} + #[cfg(test)] mod test { use std::ops::{Bound, Range}; @@ -712,8 +698,8 @@ mod test { common::DFSchema, datasource::empty::EmptyTable, error::Result, - logical_expr::{ExprSchemable, Extension, UserDefinedLogicalNode, logical_plan}, - prelude::{col, lit}, + logical_expr::{ExprSchemable, Extension, logical_plan}, + prelude::{col, date_bin, lit}, scalar::ScalarValue, }; use datafusion_util::lit_timestamptz_nano; @@ -737,12 +723,15 @@ mod test { logical_plan::table_scan(Some("temps"), &schema, None)?.build() } - fn fill_strategy_null(cols: Vec, schema: &DFSchema) -> Vec<(Expr, FillStrategy)> { + fn fill_strategy_null(cols: Vec, schema: &DFSchema) -> Vec { cols.into_iter() .map(|e| { e.get_type(schema) .and_then(|dt| dt.try_into()) - .map(|null| (e, FillStrategy::Default(null))) + .map(|null| FillExpr { + expr: e, + strategy: FillStrategy::Default(null), + }) }) .collect::>>() .unwrap() @@ -754,113 +743,22 @@ mod test { let schema = Arc::clone(scan.schema()); let result = GapFill::try_new( Arc::new(scan), - vec![col("loc"), col("time")], - vec![col("temp")], - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: None, - time_range: Range { - start: Bound::Included(lit_timestamptz_nano(1000)), - end: Bound::Unbounded, - }, - fill_strategy: fill_strategy_null(vec![col("temp")], schema.as_ref()), + vec![col("loc")], + date_bin( + lit(ScalarValue::new_interval_dt(0, 60_000)), + col("time"), + lit_timestamptz_nano(0), + ), + fill_strategy_null(vec![col("temp")], schema.as_ref()), + Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Unbounded, }, ); assert_error!(result, DataFusionError::Internal(ref msg) if msg == "missing upper bound in GapFill time range"); } - fn assert_gapfill_from_template_roundtrip(gapfill: &GapFill) { - let gapfill_as_node: &dyn UserDefinedLogicalNode = gapfill; - let scan = table_scan().unwrap(); - let exprs = gapfill_as_node.expressions(); - let want_exprs = gapfill.group_expr.len() - + gapfill.aggr_expr.len() - + 2 // stride, time - + gapfill.params.origin.iter().count() - + bound_extract(&gapfill.params.time_range.start).iter().count() - + bound_extract(&gapfill.params.time_range.end).iter().count(); - assert_eq!(want_exprs, exprs.len()); - let gapfill_ft = gapfill_as_node - .with_exprs_and_inputs(exprs, vec![scan]) - .expect("should be able to create a new `UserDefinedLogicalNode` node"); - let gapfill_ft = gapfill_ft - .as_any() - .downcast_ref::() - .expect("should be a GapFill"); - assert_eq!(gapfill.group_expr, gapfill_ft.group_expr); - assert_eq!(gapfill.aggr_expr, gapfill_ft.aggr_expr); - assert_eq!(gapfill.params, gapfill_ft.params); - } - - #[test] - fn test_from_template() { - let schema = schema().try_into().unwrap(); - - for params in vec![ - // no origin, no start bound - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: None, - time_range: Range { - start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), - }, - // no origin, yes start bound - GapFillParams { - 
date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: None, - time_range: Range { - start: Bound::Included(lit_timestamptz_nano(1000)), - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), - }, - // yes origin, no start bound - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: Some(lit_timestamptz_nano(1_000_000_000)), - time_range: Range { - start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), - }, - // yes origin, yes start bound - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: Some(lit_timestamptz_nano(1_000_000_000)), - time_range: Range { - start: Bound::Included(lit_timestamptz_nano(1000)), - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), - }, - ] { - let scan = table_scan().unwrap(); - let gapfill = GapFill::try_new( - Arc::new(scan.clone()), - vec![col("loc"), col("time")], - vec![col("temp")], - params, - ) - .unwrap(); - assert_gapfill_from_template_roundtrip(&gapfill); - } - } - #[test] fn fmt_logical_plan() -> Result<()> { // This test case does not make much sense but @@ -870,18 +768,16 @@ mod test { let schema = Arc::clone(scan.schema()); let gapfill = GapFill::try_new( Arc::new(scan), - vec![col("loc"), col("time")], - vec![col("temp")], - GapFillParams { - date_bin_udf: Arc::from("date_bin"), - stride: lit(ScalarValue::new_interval_dt(0, 60_000)), - time_column: col("time"), - origin: None, - time_range: Range { - start: Bound::Included(lit_timestamptz_nano(1000)), - end: Bound::Excluded(lit_timestamptz_nano(2000)), - }, - fill_strategy: fill_strategy_null(vec![col("temp")], &schema), + vec![col("loc")], + date_bin( + lit(ScalarValue::new_interval_dt(0, 60_000)), + col("time"), + lit_timestamptz_nano(0), + ), + fill_strategy_null(vec![col("temp")], &schema), + Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, )?; let plan = LogicalPlan::Extension(Extension { @@ -891,7 +787,7 @@ mod test { insta::assert_yaml_snapshot!( format_logical_plan(&plan), @r#" - - " GapFill: groupBy=[loc, time], aggr=[[temp]], time_column=time, stride=IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" + - " GapFill: series=[loc], time=date_bin(IntervalDayTime(\"IntervalDayTime { days: 0, milliseconds: 60000 }\"), time, TimestampNanosecond(0, None)), fill=[temp], range=Included(Literal(TimestampNanosecond(1000, None), None))..Excluded(Literal(TimestampNanosecond(2000, None), None))" - " TableScan: temps" "# ); @@ -922,7 +818,7 @@ mod test { explain, @r#" - " ProjectionExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as minute, avg(temps.temp)@1 as avg(temps.temp)]" - - " GapFillExec: group_expr=[date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0], 
aggr_expr=[avg(temps.temp)@1], stride=IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" + - " GapFillExec: series_expr=[], time_expr=date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0, 0), fill_expr=[avg(temps.temp)@1], time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" - " SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC], preserve_partitioning=[false]" - " AggregateExec: mode=Single, gby=[date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[avg(temps.temp)]" - " EmptyExec" @@ -949,8 +845,8 @@ mod test { insta::assert_yaml_snapshot!( explain, @r#" - - " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as minute, concat(Utf8(\"zz\"),temps.loc)@2 as loczz, avg(temps.temp)@3 as avg(temps.temp)]" - - " GapFillExec: group_expr=[loc@0, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, concat(Utf8(\"zz\"),temps.loc)@2], aggr_expr=[avg(temps.temp)@3], stride=IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" + - " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@2 as minute, concat(Utf8(\"zz\"),temps.loc)@1 as loczz, avg(temps.temp)@3 as avg(temps.temp)]" + - " GapFillExec: series_expr=[loc@0, concat(Utf8(\"zz\"),temps.loc)@2], time_expr=date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, 0), fill_expr=[avg(temps.temp)@3], time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" - " SortExec: expr=[loc@0 ASC, concat(Utf8(\"zz\"),temps.loc)@2 ASC, date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC], preserve_partitioning=[false]" - " AggregateExec: mode=Single, gby=[loc@1 as loc, date_bin(IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 60000000000 }\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[avg(temps.temp)]" - " EmptyExec" diff --git a/iox_query/src/exec/gapfill/params.rs b/iox_query/src/exec/gapfill/params.rs index b8a9ff88..e53df364 100644 --- a/iox_query/src/exec/gapfill/params.rs +++ b/iox_query/src/exec/gapfill/params.rs @@ -1,5 +1,5 @@ //! 
Evaluate the parameters to be used for gap filling. -use std::ops::Bound; +use std::ops::{Bound, Range}; use std::sync::Arc; use arrow::{ @@ -7,20 +7,23 @@ use arrow::{ record_batch::RecordBatch, }; use chrono::Duration; +use datafusion::physical_plan::expressions::Column; use datafusion::{ common::exec_err, error::{DataFusionError, Result}, functions::datetime::date_bin::DateBinFunc, logical_expr::ScalarFunctionArgs, - physical_expr::PhysicalExpr, - physical_plan::{ColumnarValue, expressions::Column}, + physical_expr::{PhysicalExpr, ScalarFunctionExpr}, + physical_plan::ColumnarValue, scalar::ScalarValue, }; use hashbrown::HashMap; use query_functions::date_bin_wallclock::DateBinWallclockUDF; +use crate::exec::gapfill::PhysicalFillExpr; + use super::{ - FillStrategy, GapExpander, GapFillExecParams, date_bin_gap_expander::DateBinGapExpander, + FillStrategy, GapExpander, date_bin_gap_expander::DateBinGapExpander, date_bin_wallclock_gap_expander::DateBinWallclockGapExpander, try_map_bound, try_map_range, }; @@ -47,22 +50,40 @@ pub(crate) struct GapFillParams { impl GapFillParams { /// Create a new [GapFillParams] by figuring out the actual values (as native i64) for the stride, /// first and last timestamp for gap filling. - pub(super) fn try_new(schema: SchemaRef, params: &GapFillExecParams) -> Result { - let time_data_type = params.time_column.data_type(schema.as_ref())?; + pub(super) fn try_new( + schema: SchemaRef, + time_expr: &Arc, + fill_expr: &[PhysicalFillExpr], + time_range: &Range>>, + ) -> Result { + let Some(time_func) = time_expr.as_any().downcast_ref::() else { + return Err(DataFusionError::Internal(format!( + "time_expr was not a function call: {time_expr}" + ))); + }; + + let time_data_type = time_func.data_type(schema.as_ref())?; let DataType::Timestamp(_, tz) = time_data_type else { return exec_err!("invalid data type for time column: {time_data_type}"); }; let batch = RecordBatch::new_empty(schema); - let stride = params.stride.evaluate(&batch)?; - let origin = params - .origin - .as_ref() - .map(|e| e.evaluate(&batch)) - .transpose()?; + let (stride, origin) = match time_func.args() { + [stride, _] => (Arc::clone(stride), None), + [stride, _, origin] => (Arc::clone(stride), Some(Arc::clone(origin))), + _ => { + return Err(DataFusionError::Internal(format!( + "unexpected arguments to time_expr: {:?}", + time_func.args() + ))); + } + }; + + let stride = stride.evaluate(&batch)?; + let origin = origin.as_ref().map(|e| e.evaluate(&batch)).transpose()?; // Evaluate the upper and lower bounds of the time range - let range = try_map_range(¶ms.time_range, |b| { + let range = try_map_range(time_range, |b| { try_map_bound(b.as_ref(), |pe| { extract_timestamp_nanos(&pe.evaluate(&batch)?) }) @@ -103,55 +124,48 @@ impl GapFillParams { )); let first_ts = first_ts .map(|_| { - extract_timestamp_nanos(¶ms.date_bin_udf.invoke_with_args( - ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields(&args), - number_rows: 1, - return_field: Arc::clone(&return_field), - }, - )?) + extract_timestamp_nanos(&time_func.fun().invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields(&args), + number_rows: 1, + return_field: Arc::clone(&return_field), + })?) 
}) .transpose()?; args[1] = i64_to_columnar_ts(Some(last_ts), &tz); - let last_ts = extract_timestamp_nanos(¶ms.date_bin_udf.invoke_with_args( - ScalarFunctionArgs { + let last_ts = + extract_timestamp_nanos(&time_func.fun().invoke_with_args(ScalarFunctionArgs { args: args.clone(), arg_fields: arg_fields(&args), number_rows: 1, return_field: Arc::clone(&return_field), - }, - )?)?; + })?)?; let gap_expander: Arc = - if params.date_bin_udf.inner().as_any().is::() { + if time_func.fun().inner().as_any().is::() { Arc::new(DateBinGapExpander::new(stride_nanos)) - } else if params - .date_bin_udf - .inner() - .as_any() - .is::() - { + } else if time_func.fun().inner().as_any().is::() { Arc::new(DateBinWallclockGapExpander::try_from_df_args(&args)?) } else { return Err(DataFusionError::Execution(format!( "gap filling not supported for {}", - params.date_bin_udf.name() + time_func.fun().name() ))); }; - let fill_strategy = params - .fill_strategy + let fill_strategy = fill_expr .iter() - .map(|(e, fs)| { - let idx = e + .map(|pfe| { + let idx = pfe + .expr .as_any() .downcast_ref::() .ok_or(DataFusionError::Internal(format!( - "fill strategy aggr expr was not a column: {e:?}", + "fill strategy aggr expr was not a column: {:?}", + pfe.expr )))? .index(); - Ok((idx, fs.clone())) + Ok((idx, pfe.strategy.clone())) }) .collect::>>()?; @@ -237,7 +251,7 @@ mod tests { use crate::exec::{ Executor, - gapfill::{FillStrategy, GapFillExec, GapFillExecParams}, + gapfill::{FillStrategy, GapFillExec}, }; #[tokio::test] @@ -367,23 +381,29 @@ mod tests { #[test] fn test_params_no_start() { - let exec_params = GapFillExecParams { - date_bin_udf: Arc::new(ScalarUDF::new_from_impl(DateBinFunc::new())), - stride: interval(1_000_000_000), - time_column: Column::new("time", 0), - origin: None, - time_range: Range { - start: Bound::Unbounded, - end: Bound::Excluded(timestamp(20_000_000_000)), - }, - fill_strategy: std::iter::once(( - Arc::new(Column::new("a0", 1)) as Arc, - FillStrategy::Default(ScalarValue::Null), - )) - .collect(), + let time_range = Range { + start: Bound::Unbounded, + end: Bound::Excluded(timestamp(20_000_000_000)), }; - let params = GapFillParams::try_new(schema().into(), &exec_params).unwrap(); + let time_expr: Arc = Arc::new(ScalarFunctionExpr::new( + "time", + Arc::new(ScalarUDF::new_from_impl(DateBinFunc::new())), + vec![interval(1_000_000_000), Arc::new(Column::new("time", 0))], + Arc::new(Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )), + )); + + let fill_expr = vec![PhysicalFillExpr { + expr: Arc::new(Column::new("a0", 1)), + strategy: FillStrategy::Default(ScalarValue::Null), + }]; + + let params = + GapFillParams::try_new(schema().into(), &time_expr, &fill_expr, &time_range).unwrap(); assert_eq!( params.gap_expander.to_string(), "DateBinGapExpander [stride=PT1S]" @@ -419,9 +439,11 @@ mod tests { let physical_plan = context.sql_to_physical_plan(sql).await?; let gapfill_node = &physical_plan.children()[0]; let gapfill_node = gapfill_node.as_any().downcast_ref::().unwrap(); - let exec_params = &gapfill_node.params; + let time_expr = &gapfill_node.time_expr; + let fill_expr = &gapfill_node.fill_expr; + let time_range = &gapfill_node.time_range; let schema = schema(); - GapFillParams::try_new(schema.into(), exec_params) + GapFillParams::try_new(schema.into(), time_expr, fill_expr, time_range) } fn simple_fill_strategy() -> HashMap { diff --git a/iox_query/src/exec/gapfill/stream.rs b/iox_query/src/exec/gapfill/stream.rs index 214e7271..fda86e07 100644 --- 
a/iox_query/src/exec/gapfill/stream.rs +++ b/iox_query/src/exec/gapfill/stream.rs @@ -14,6 +14,7 @@ use arrow_util::optimize::optimize_dictionaries; use datafusion::{ error::{DataFusionError, Result}, execution::memory_pool::MemoryReservation, + physical_expr::ScalarFunctionExpr, physical_plan::{ ExecutionPlan, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, expressions::Column, @@ -35,15 +36,14 @@ use super::{GapFillExec, algo::GapFiller, buffered_input::BufferedInput, params: pub(super) struct GapFillStream { /// The schema of the input and output. schema: SchemaRef, + /// The columns that define the time series that a value belongs to. + series_expr: Vec>, /// The column from the input that contains the timestamps for each row. /// This column has already had `date_bin` applied to it by a previous `Aggregate` /// operator. time_expr: Arc, - /// The other columns from the input that appeared in the GROUP BY clause of the - /// original query. - group_expr: Vec>, /// The aggregate columns from the select list of the original query. - aggr_expr: Vec>, + fill_expr: Vec>, /// The producer of the input record batches. input: SendableRecordBatchStream, /// Input that has been read from the input stream. @@ -69,35 +69,40 @@ impl GapFillStream { ) -> Result { let schema = exec.schema(); let GapFillExec { - sort_expr, - aggr_expr, - params, + series_expr, + time_expr, + fill_expr, + time_range, .. } = exec; - if sort_expr.is_empty() { + let series_cols = series_expr.iter().map(expr_to_index).collect::>(); + let params = GapFillParams::try_new(Arc::clone(&schema), time_expr, fill_expr, time_range)?; + let buffered_input = BufferedInput::new(¶ms, series_cols); + + let time_expr = if let Some(func) = time_expr.as_any().downcast_ref::() + { + // The time_expr has already been determined to be a + // date_bin call. Thie input time column is the second + // argument. 
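+            // (`date_bin`-style calls take `(stride, source[, origin])` arguments;
+            // using the source argument directly gives the stream a plain column
+            // reference to the already-binned values rather than re-evaluating
+            // `date_bin` for every input batch.)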
+ Arc::clone(&func.args()[1]) + } else { return Err(DataFusionError::Internal( - "empty sort_expr vector for gap filling; should have at least a time expression" - .to_string(), + "time_expr must be a ScalarFunctionExpr".to_string(), )); - } - let mut group_expr = sort_expr + }; + + let fill_expr = fill_expr .iter() - .map(|se| Arc::clone(&se.expr)) + .map(|pfe| Arc::clone(&pfe.expr)) .collect::>(); - let aggr_expr = aggr_expr.to_owned(); - let time_expr = group_expr.split_off(group_expr.len() - 1).pop().unwrap(); - - let group_cols = group_expr.iter().map(expr_to_index).collect::>(); - let params = GapFillParams::try_new(Arc::clone(&schema), params)?; - let buffered_input = BufferedInput::new(¶ms, group_cols); let gap_filler = GapFiller::new(params, batch_size); Ok(Self { schema, + series_expr: series_expr.clone(), time_expr, - group_expr, - aggr_expr, + fill_expr, input, buffered_input, gap_filler, @@ -180,7 +185,7 @@ impl GapFillStream { let old_size = batches.iter().map(|rb| rb.get_array_memory_size()).sum(); - let mut batch = arrow::compute::concat_batches(&self.schema, &batches) + let mut batch = arrow::compute::concat_batches(&batches[0].schema(), &batches) .map_err(|err| DataFusionError::ArrowError(Box::new(err), None))?; self.reservation.try_grow(batch.get_array_memory_size())?; @@ -212,10 +217,9 @@ impl GapFillStream { .ok_or(DataFusionError::Internal( "time array must be a TimestampNanosecondArray".to_string(), ))?; - let input_time_array = (expr_to_index(&self.time_expr), input_time_array); - let group_arrays = self.group_arrays(&input_batch)?; - let aggr_arrays = self.aggr_arrays(&input_batch)?; + let series_arrays = self.series_arrays(&input_batch)?; + let fill_arrays = self.fill_arrays(&input_batch)?; let timer = elapsed_compute.timer(); let output_batch = self @@ -223,8 +227,8 @@ impl GapFillStream { .build_gapfilled_output( Arc::clone(&self.schema), input_time_array, - &group_arrays, - &aggr_arrays, + &series_arrays, + &fill_arrays, ) .record_output(&self.baseline_metrics)?; timer.done(); @@ -241,23 +245,17 @@ impl GapFillStream { /// Produces the arrays for the group columns in the input. /// The first item in the 2-tuple is the arrays offset in the schema. - fn group_arrays(&self, input_batch: &RecordBatch) -> Result> { - self.group_expr + fn series_arrays(&self, input_batch: &RecordBatch) -> Result> { + self.series_expr .iter() - .map(|e| { - Ok(( - expr_to_index(e), - e.evaluate(input_batch)? - .into_array(input_batch.num_rows())?, - )) - }) + .map(|e| e.evaluate(input_batch)?.into_array(input_batch.num_rows())) .collect::>>() } /// Produces the arrays for the aggregate columns in the input. /// The first item in the 2-tuple is the arrays offset in the schema. - fn aggr_arrays(&self, input_batch: &RecordBatch) -> Result> { - self.aggr_expr + fn fill_arrays(&self, input_batch: &RecordBatch) -> Result> { + self.fill_expr .iter() .map(|e| { Ok(( diff --git a/iox_query/src/exec/series_limit/logical.rs b/iox_query/src/exec/series_limit/logical.rs new file mode 100644 index 00000000..30a0be2f --- /dev/null +++ b/iox_query/src/exec/series_limit/logical.rs @@ -0,0 +1,1252 @@ +//! Logical plan node for the SeriesLimit operation. 
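+//!
+//! As a rough sketch (column names here are illustrative and the default
+//! value must match the limited column's type), an InfluxQL-style
+//! `SELECT temp FROM weather GROUP BY loc LIMIT 3 OFFSET 1` could be
+//! represented as:
+//!
+//! ```ignore
+//! let node = SeriesLimit::try_new(
+//!     input,                                 // Arc<LogicalPlan> producing loc, temp, time
+//!     vec![col("loc")],                      // series key columns (GROUP BY tags)
+//!     vec![SortExpr { expr: col("time"), asc: true, nulls_first: false }],
+//!     vec![LimitExpr {
+//!         expr: col("temp"),
+//!         null_treatment: NullTreatment::RespectNulls,
+//!         default_value: lit(ScalarValue::Float64(None)), // NULL of the column's type
+//!     }],
+//!     Some(Box::new(lit(1_i64))),            // OFFSET
+//!     Some(Box::new(lit(3_i64))),            // LIMIT
+//! )?;
+//! ```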
+ +use arrow::datatypes::{DataType, Field}; +use datafusion::{ + common::{ + DFSchema, DFSchemaRef, ExprSchema, Result, TableReference, internal_err, + tree_node::{Transformed, TreeNodeContainer, TreeNodeRecursion}, + }, + error::DataFusionError, + logical_expr::{Expr, ExprSchemable, LogicalPlan, SortExpr, UserDefinedLogicalNodeCore}, + sql::sqlparser::ast::NullTreatment, +}; +use std::{collections::BTreeMap, sync::Arc}; + +/// Expression type that describes a time-series column to which per-series +/// `LIMIT` and `OFFSET` operations should be applied. +/// +/// This type represents a single value column in a time series that will have +/// limiting applied independently per series group. It encapsulates not just the +/// expression to evaluate, but also how NULL values should be handled and what +/// default value to use when a row falls outside the limit range. +/// +/// # Purpose +/// +/// `LimitExpr` is used as part of the logical planning phase for InfluxQL queries +/// that apply LIMIT/OFFSET on a per-series basis. Each `LimitExpr` corresponds to +/// one value column in the SELECT clause that needs series-based limiting. +/// +/// # NULL Treatment Modes +/// +/// The `null_treatment` field controls how NULL values are counted: +/// +/// - **`RespectNulls`**: NULL values count toward the row limit and are included +/// in row numbering. This is the default SQL behavior. +/// +/// - **`IgnoreNulls`**: NULL values are skipped and don't count toward the limit. +/// Only non-NULL values contribute to the row count. +/// +/// # Default Values +/// +/// The `default_value` field specifies what value to output when a row is filtered +/// out due to LIMIT/OFFSET constraints, but the timestamp exists in another series. +/// This enables time-aligned output across multiple series even when some series +/// have fewer points. +/// +/// # Examples +/// +/// ## Basic Usage +/// +/// ```text +/// Query: SELECT temperature FROM weather GROUP BY location LIMIT 3 +/// +/// LimitExpr { +/// expr: Column("temperature"), +/// null_treatment: RespectNulls, +/// default_value: Literal(NULL), +/// } +/// ``` +/// +/// ## With Default Values (FILL) +/// +/// ```text +/// Query: SELECT FILL(0, temperature) FROM weather GROUP BY location LIMIT 3 +/// +/// LimitExpr { +/// expr: Column("temperature"), +/// null_treatment: RespectNulls, +/// default_value: Literal(0), +/// } +/// ``` +/// +/// ## Ignoring NULLs +/// +/// ```text +/// Query: SELECT temperature IGNORE NULLS FROM weather GROUP BY location LIMIT 3 +/// +/// LimitExpr { +/// expr: Column("temperature"), +/// null_treatment: IgnoreNulls, +/// default_value: Literal(NULL), +/// } +/// ``` +/// +/// # Type Safety +/// +/// The `expr` and `default_value` must have the same data type. This is validated +/// by the `get_type()` method and enforced during logical plan construction. Type +/// mismatches result in an error during query planning. +/// +/// # Relation to Physical Plan +/// +/// During physical planning, each `LimitExpr` is converted to a [`PhysicalLimitExpr`] +/// which performs the actual row numbering and filtering during query execution. +/// +/// [`PhysicalLimitExpr`]: crate::exec::series_limit::physical::PhysicalLimitExpr +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] +pub struct LimitExpr { + /// The expression for the values to which the limit will be applied. + /// This must reference exactly one column in the input which will + /// be replaced with the limited version. 
+ pub expr: Expr, + + /// How nulls in the series should be treated. + pub null_treatment: NullTreatment, + + /// The default value that should be output if a point in time is + /// outside of the limits for (or not present in) this series, but + /// the time is present in another series. + pub default_value: Expr, +} + +impl std::fmt::Display for LimitExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} {} (default: {})", + self.expr, self.null_treatment, self.default_value + ) + } +} + +impl LimitExpr { + /// Returns the data type of this expression when evaluated against the given schema. + /// + /// This method ensures that both the expression and default value have the same type, + /// returning an error if they differ. + pub fn get_type(&self, schema: &dyn ExprSchema) -> Result { + let expr_dt = self.expr.get_type(schema)?; + let default_value_dt = self.default_value.get_type(schema)?; + if expr_dt != default_value_dt { + return internal_err!( + "LimitExpr expr and default_value must have the same type, got expr: {expr_dt:?}, default_value: {default_value_dt:?}" + ); + } + Ok(expr_dt) + } + + /// Returns whether this expression is nullable when evaluated against the given schema. + /// + /// The result is nullable if either the expression or the default value is nullable, + /// since either could contribute to the final result. + pub fn nullable(&self, input_schema: &dyn ExprSchema) -> Result { + let expr_nullable = self.expr.nullable(input_schema)?; + let default_nullable = self.default_value.nullable(input_schema)?; + Ok(match self.null_treatment { + // If ignoring nulls, the expression's nullability does not affect the result + NullTreatment::IgnoreNulls => default_nullable, + // If respecting nulls, both expression and default value nullability matter + NullTreatment::RespectNulls => expr_nullable || default_nullable, + }) + } + + /// Returns both the data type and nullability of this expression. + /// + /// This is a convenience method that combines `get_type` and `nullable`. + pub fn data_type_and_nullable(&self, schema: &dyn ExprSchema) -> Result<(DataType, bool)> { + let data_type = self.get_type(schema)?; + let nullable = self.nullable(schema)?; + Ok((data_type, nullable)) + } + + /// Returns a field representation of this expression with its name, data type, and nullability. + /// + /// The field is derived from the underlying expression but with potentially updated + /// nullability based on both the expression and default value. 
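+    ///
+    /// For example, limiting a non-nullable column with a NULL literal as the
+    /// `default_value` (under `RespectNulls`) yields a nullable field, because
+    /// rows outside the limit window are filled with that default.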
+ pub fn to_field( + &self, + input_schema: &dyn ExprSchema, + ) -> Result<(Option, Arc)> { + let (qualifier, field) = self.expr.to_field(input_schema)?; + + // Get the data type and nullability, which may differ from the base expression + let data_type = self.get_type(input_schema)?; + let nullable = self.nullable(input_schema)?; + + // Create a new field with the potentially updated type and nullability + let new_field = + Field::new(field.name(), data_type, nullable).with_metadata(field.metadata().clone()); + + Ok((qualifier, Arc::new(new_field))) + } +} + +impl<'a> TreeNodeContainer<'a, Expr> for LimitExpr { + fn apply_elements Result>( + &'a self, + mut f: F, + ) -> Result { + // Apply to the series expression + let recursion = f(&self.expr)?; + if recursion == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + + // Apply to the default value expression + f(&self.default_value) + } + + fn map_elements Result>>( + self, + mut f: F, + ) -> Result> { + // Transform the series expression + let expr_result = f(self.expr)?; + let mut transformed = expr_result.transformed; + let mut tnr = expr_result.tnr; + let expr = expr_result.data; + + // Transform the default value expression (if we should continue) + let default_value = match tnr { + TreeNodeRecursion::Continue | TreeNodeRecursion::Jump => { + let default_value_result = f(self.default_value)?; + transformed |= default_value_result.transformed; + tnr = default_value_result.tnr; + default_value_result.data + } + TreeNodeRecursion::Stop => self.default_value, + }; + + Ok(Transformed { + data: Self { + expr, + null_treatment: self.null_treatment, + default_value, + }, + transformed, + tnr, + }) + } +} + +/// Logical plan node for per-series LIMIT and OFFSET operations. +/// +/// This logical plan node represents the InfluxQL-style series limiting operation, +/// which applies LIMIT and OFFSET constraints independently to each time series +/// rather than globally across all results. This is a key semantic difference +/// from standard SQL LIMIT/OFFSET. +/// +/// # Purpose +/// +/// `SeriesLimit` is a custom logical plan node used during query planning for +/// InfluxQL queries. It captures the intent to limit rows on a per-series basis +/// before being converted to a physical execution plan ([`SeriesLimitExec`]). +/// +/// # InfluxQL vs SQL Semantics +/// +/// ## Standard SQL LIMIT +/// ```sql +/// SELECT value FROM measurements LIMIT 10 +/// ``` +/// Returns 10 rows total across all series. +/// +/// ## InfluxQL Series-based LIMIT +/// ```sql +/// SELECT value1 FROM measurements GROUP BY location LIMIT 10 +/// ``` +/// Returns up to 10 rows **per location** (per series), potentially returning +/// many more than 10 rows total. +/// +/// ```sql +/// SELECT value1, value2 FROM measurements GROUP BY location LIMIT 5 OFFSET 2 +/// ``` +/// Returns up to 5 rows **per location**, skipping the first 2 rows in each +/// series. Where the series do not have values with matching timestamps the +/// default_value is used to fill in the gaps where required. 
+/// +/// # Important Note: SLIMIT vs LIMIT +/// +/// **This operation does NOT implement InfluxQL's `SLIMIT` or `SOFFSET` clauses.** +/// +/// - `SLIMIT`/`SOFFSET`: Limit the number of *series* loaded from storage +/// - `LIMIT`/`OFFSET` (this operation): Limit the number of *rows per series* +/// +/// For example: +/// - `SLIMIT 5` → Load at most 5 different series from storage +/// - `LIMIT 10` → Return at most 10 rows from each series +/// +/// The implementation of `SLIMIT`/`SOFFSET` is tracked by +/// [issue 6940](https://github.com/influxdata/influxdb_iox/issues/6940). +/// +/// # Query Structure +/// +/// A typical InfluxQL query using series limiting looks like: +/// +/// ```text +/// SELECT ... +/// FROM +/// WHERE +/// GROUP BY ... +/// ORDER BY time [ASC|DESC] +/// LIMIT OFFSET +/// ``` +/// +/// This translates to a `SeriesLimit` node with: +/// - `series_expr`: The GROUP BY columns that define series boundaries +/// - `order_expr`: The time column and sort direction (from ORDER BY) +/// - `limit_expr`: The value columns from SELECT clause +/// - `skip`: The OFFSET value (number of rows to skip per series) +/// - `fetch`: The LIMIT value (max rows to return per series) +/// +/// # Components +/// +/// ## Series Expressions (`series_expr`) +/// +/// Define what constitutes a unique time series. Typically these are tag columns. +/// Rows with identical values for all series expressions belong to the same series. +/// +/// Example: `GROUP BY location, sensor_id` → `series_expr = [Column("location"), Column("sensor_id")]` +/// +/// ## Order Expressions (`order_expr`) +/// +/// Defines the sort ordering within each series. This will always include the time +/// column, but may also include additional columns. Each series is independently +/// sorted by this expression before applying LIMIT/OFFSET. +/// +/// Example: `ORDER BY time DESC` → `order_expr = [SortExpr { expr: Column("time"), asc: false, ... }]` +/// +/// ## Limit Expressions (`limit_expr`) +/// +/// The value columns that should be included in the output. Each has associated +/// NULL handling and default value semantics. See [`LimitExpr`] for details. +/// +/// ## Skip and Fetch +/// +/// - `skip`: Number of rows to skip at the start of each series (OFFSET) +/// - `fetch`: Maximum number of rows to return from each series (LIMIT) +/// +/// Both are optional `Expr` types wrapped in `Box` to allow for dynamic values +/// or literals. If `skip` is `None`, no rows are skipped. If `fetch` is `None`, +/// all remaining rows (after skip) are returned. +/// +/// # Examples +/// +/// ## Example 1: Basic Per-Series LIMIT +/// +/// ```text +/// Query: SELECT temperature FROM weather GROUP BY location LIMIT 3 +/// +/// SeriesLimit { +/// input: , +/// series_expr: [Column("location")], +/// order_expr: [SortExpr { expr: Column("time"), asc: true, ... }], +/// limit_expr: [LimitExpr { +/// expr: Column("temperature"), +/// null_treatment: RespectNulls, +/// default_value: Literal(NULL), +/// }], +/// skip: None, +/// fetch: Some(Box::new(Literal(3))), +/// } +/// +/// Result: Up to 3 temperature readings per location +/// ``` +/// +/// ## Example 2: LIMIT with OFFSET +/// +/// ```text +/// Query: SELECT value FROM sensors GROUP BY sensor_id LIMIT 10 OFFSET 5 +/// +/// SeriesLimit { +/// input: , +/// series_expr: [Column("sensor_id")], +/// order_expr: [SortExpr { expr: Column("time"), asc: true, ... }], +/// limit_expr: [LimitExpr { expr: Column("value"), ... 
}], +/// skip: Some(Box::new(Literal(5))), +/// fetch: Some(Box::new(Literal(10))), +/// } +/// +/// Result: Rows 6-15 from each sensor (skip first 5, take next 10) +/// ``` +/// +/// ## Example 3: Multiple Series Keys and Value Columns +/// +/// ```text +/// Query: SELECT temp, humidity FROM weather +/// GROUP BY location, elevation +/// LIMIT 5 +/// +/// SeriesLimit { +/// series_expr: [Column("location"), Column("elevation")], +/// limit_expr: [ +/// LimitExpr { expr: Column("temp"), ... }, +/// LimitExpr { expr: Column("humidity"), ... }, +/// ], +/// fetch: Some(Box::new(Literal(5))), +/// ... +/// } +/// +/// Result: Up to 5 rows each for temp and humidity per (location, elevation) combination, +/// if temp and humidity have different timestamps this could result it up to 10 output +/// rows per (location, elevation). +/// ``` +/// +/// # See Also +/// +/// - [`LimitExpr`]: The expression type for individual value columns +/// - [`SeriesLimitExec`]: The physical execution plan that implements this operation +/// +/// [`SeriesLimitExec`]: crate::exec::series_limit::physical::SeriesLimitExec +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub struct SeriesLimit { + /// The input for this operation. + pub input: Arc, + + /// The expressions that definge which series a particular row is + /// part of. + pub series_expr: Vec, + + /// The expression that defines the ordering of the rows within a + /// series. + pub order_expr: Vec, + + /// The expressions that define the values of each time series that + /// needs to be processed. Each expression must reference exactly + /// one column in the input which will be replaced with the output + /// with the limited version of that column. No two limit + /// expressions may reference the same input column. + pub limit_expr: Vec, + + /// The number of rows to skip (OFFSET) in each time series. + pub skip: Option>, + + /// The maximum number of rows (LIMIT) to include in each time + /// series. + pub fetch: Option>, + + /// The schema of the output of this operation. + pub schema: DFSchemaRef, +} + +impl SeriesLimit { + pub fn try_new( + input: Arc, + series_expr: Vec, + order_expr: Vec, + limit_expr: Vec, + skip: Option>, + fetch: Option>, + ) -> Result { + // Validate that the expressions are all valid against the input schema + let input_schema = input.schema(); + + let mut limited_fields = BTreeMap::new(); + for le in &limit_expr { + let cols = le.expr.column_refs(); + if cols.len() != 1 { + return internal_err!( + "LimitExpr expr must reference exactly one column, found {} columns", + cols.len() + ); + } + let col = cols.into_iter().next().unwrap(); + let idx = input_schema.index_of_column(col)?; + if limited_fields + .insert(idx, le.to_field(input_schema)?) + .is_some() + { + return internal_err!("LimitExpr contains duplicate column reference: {}", col); + } + } + + // The schema is the same as the input with the limited fields + // potentially modified. 
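+        // For example, an input of (loc, temp, time) with a single LimitExpr
+        // over `temp` keeps `loc` and `time` unchanged and replaces only the
+        // `temp` field, whose nullability may widen if the default value is
+        // nullable.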
+ let qualified_fields = input_schema + .iter() + .enumerate() + .map(|(idx, (qualifier, field))| { + if let Some((qualifier, field)) = limited_fields.remove(&idx) { + (qualifier, field) + } else { + (qualifier.cloned(), Arc::clone(field)) + } + }) + .collect::>(); + + let schema = Arc::new(DFSchema::new_with_metadata( + qualified_fields, + std::collections::HashMap::new(), + )?); + + Ok(Self { + input, + series_expr, + order_expr, + limit_expr, + skip, + fetch, + schema, + }) + } + + pub fn apply_expressions Result>( + &self, + mut f: F, + ) -> Result { + if self.series_expr.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + if self.order_expr.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + if self.limit_expr.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + if self.skip.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + if self.fetch.apply_elements(&mut f)? == TreeNodeRecursion::Stop { + return Ok(TreeNodeRecursion::Stop); + } + Ok(TreeNodeRecursion::Continue) + } +} + +// Manual impl because DFSchemaRef doesn't implement PartialOrd +impl PartialOrd for SeriesLimit { + fn partial_cmp(&self, other: &Self) -> Option { + use std::cmp::Ordering; + + // Compare inputs + match self.input.partial_cmp(&other.input) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare series expressions + match self.series_expr.partial_cmp(&other.series_expr) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare order expressions + match self.order_expr.partial_cmp(&other.order_expr) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare limit expressions + match self.limit_expr.partial_cmp(&other.limit_expr) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare skip + match self.skip.partial_cmp(&other.skip) { + Some(Ordering::Equal) => {} + other => return other, + } + + // Compare fetch (skip schema since it doesn't implement PartialOrd) + self.fetch.partial_cmp(&other.fetch) + } +} + +impl UserDefinedLogicalNodeCore for SeriesLimit { + fn name(&self) -> &str { + "SeriesLimit" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![self.input.as_ref()] + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + let mut exprs = Vec::with_capacity( + self.series_expr.len() + self.order_expr.len() + 2 * self.limit_expr.len() + 2, + ); + + self.apply_expressions(|expr| { + exprs.push(expr.clone()); + Ok(TreeNodeRecursion::Continue) + }) + .expect("cannot error"); + + exprs + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let series_expr = self + .series_expr + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + let order_expr = self + .order_expr + .iter() + .map(|se| se.to_string()) + .collect::>() + .join(", "); + + let limit_expr = self + .limit_expr + .iter() + .map(|le| le.to_string()) + .collect::>() + .join(", "); + + write!( + f, + "{}: series=[{}], order=[{}], limit_expr=[{}]", + self.name(), + series_expr, + order_expr, + limit_expr, + )?; + + if let Some(skip) = &self.skip { + write!(f, ", skip={}", skip)?; + } + + if let Some(fetch) = &self.fetch { + write!(f, ", fetch={}", fetch)?; + } + + Ok(()) + } + + fn with_exprs_and_inputs(&self, exprs: Vec, inputs: Vec) -> Result { + if inputs.len() != 1 { + return internal_err!("SeriesLimit 
expects exactly 1 input, got {}", inputs.len()); + } + + let input = Arc::new(inputs.into_iter().next().unwrap()); + + fn map_exprs<'a, C: TreeNodeContainer<'a, Expr>>( + c: C, + exprs: &mut impl Iterator, + ) -> Result { + c.map_elements(|old| { + exprs + .next() + .map(|new| { + let transformed = new == old; + Transformed::new(new, transformed, TreeNodeRecursion::Continue) + }) + .ok_or(DataFusionError::Internal(String::from( + "not enough input expressions for SeriesLimit", + ))) + }) + .map(|t| t.data) + } + + let Self { + series_expr, + order_expr, + limit_expr, + skip, + fetch, + .. + } = self; + + let mut exprs = exprs.into_iter(); + + let series_expr = map_exprs(series_expr.clone(), &mut exprs)?; + let order_expr = map_exprs(order_expr.clone(), &mut exprs)?; + let limit_expr = map_exprs(limit_expr.clone(), &mut exprs)?; + let skip = map_exprs(skip.clone(), &mut exprs)?; + let fetch = map_exprs(fetch.clone(), &mut exprs)?; + + if exprs.next().is_some() { + return internal_err!("too many input expressions for SeriesLimit"); + } + + Self::try_new(input, series_expr, order_expr, limit_expr, skip, fetch) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion::{ + logical_expr::{col, lit}, + prelude::SessionContext, + scalar::ScalarValue, + }; + use insta::assert_snapshot; + + /// Helper function to create a simple test schema + fn test_schema() -> DFSchemaRef { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let ctx = SessionContext::new(); + Arc::clone( + ctx.sql("SELECT 1 as a, 2 as b, 3 as time") + .await + .unwrap() + .into_optimized_plan() + .unwrap() + .schema(), + ) + }) + } + + /// Helper function to create a simple LogicalPlan for testing + fn test_plan() -> Arc { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let ctx = SessionContext::new(); + Arc::new( + ctx.sql("SELECT 1 as a, 2 as b, 3 as time") + .await + .unwrap() + .into_optimized_plan() + .unwrap(), + ) + }) + } + + fn test_plan2() -> Arc { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let ctx = SessionContext::new(); + Arc::new( + ctx.sql("SELECT 3 as a, 2 as b, 1 as time") + .await + .unwrap() + .into_optimized_plan() + .unwrap(), + ) + }) + } + + /// Helper function to create a LogicalPlan with more columns for testing + fn test_plan_multi_column() -> Arc { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let ctx = SessionContext::new(); + Arc::new( + ctx.sql("SELECT 1 as a, 2 as b, 3 as c, 4 as time") + .await + .unwrap() + .into_optimized_plan() + .unwrap(), + ) + }) + } + + mod limit_expr_tests { + use super::*; + + #[test] + fn test_display() { + let limit_expr = LimitExpr { + expr: col("temperature"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(0), + }; + + assert_snapshot!(limit_expr.to_string(), @r#"temperature RESPECT NULLS (default: Int32(0))"#); + } + + #[test] + fn test_get_type_matching_types() { + let schema = test_schema(); + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(42))), + }; + + assert_eq!( + limit_expr.get_type(schema.as_ref()).unwrap(), + DataType::Int64 + ); + } + + #[test] + fn test_nullable() { + let schema = test_schema(); + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(42))), + }; + + assert!(!limit_expr.nullable(schema.as_ref()).unwrap()); + } + + #[test] 
+ fn test_data_type_and_nullable() { + let schema = test_schema(); + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(42))), + }; + + assert_eq!( + limit_expr.data_type_and_nullable(schema.as_ref()).unwrap(), + (DataType::Int64, false) + ); + } + + #[test] + fn test_to_field() { + let schema = test_schema(); + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(42))), + }; + + assert_eq!( + limit_expr.to_field(schema.as_ref()).unwrap(), + (None, Arc::new(Field::new("a", DataType::Int64, false))) + ); + } + + #[test] + fn test_tree_node_container_apply_elements() { + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(42), + }; + + let mut count = 0; + let result = limit_expr.apply_elements(|_expr| { + count += 1; + Ok(TreeNodeRecursion::Continue) + }); + + assert_eq!(result.unwrap(), TreeNodeRecursion::Continue); + assert_eq!(count, 2); // Should visit both expr and default_value + } + + #[test] + fn test_tree_node_container_apply_elements_stop() { + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(42), + }; + + let mut count = 0; + let result = limit_expr.apply_elements(|_expr| { + count += 1; + Ok(TreeNodeRecursion::Stop) + }); + + assert!(result.is_ok()); + assert_eq!(result.unwrap(), TreeNodeRecursion::Stop); + assert_eq!(count, 1); // Should stop after first expression + } + + #[test] + fn test_tree_node_container_map_elements() { + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(42), + }; + + let result = limit_expr + .clone() + .map_elements(|expr| Ok(Transformed::no(expr))); + + assert_eq!(result.unwrap(), Transformed::no(limit_expr)); + } + + #[test] + fn test_tree_node_container_map_elements_with_transform() { + let limit_expr = LimitExpr { + expr: col("a"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(42), + }; + + let result = limit_expr + .clone() + .map_elements(|expr| Ok(Transformed::yes(expr))); + + assert_eq!(result.unwrap(), Transformed::yes(limit_expr)); + } + } + + mod series_limit_tests { + use super::*; + use arrow::datatypes::Fields; + use datafusion::logical_expr::{Extension, SortExpr}; + + fn create_test_series_limit() -> SeriesLimit { + let input = test_plan(); + let series_expr = vec![col("a")]; + let order_expr = vec![SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }]; + let limit_expr = vec![LimitExpr { + expr: col("b"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None).unwrap() + } + + #[test] + fn test_try_new() { + let input = test_plan(); + let series_expr = vec![col("a")]; + let order_expr = vec![SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }]; + let limit_expr = vec![LimitExpr { + expr: col("b"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let result = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None); + assert!(result.is_ok()); + } + + #[test] + fn test_try_new_with_skip_and_fetch() { + let input = test_plan(); + let series_expr = vec![col("a")]; + let order_expr = vec![SortExpr { + expr: 
col("time"), + asc: true, + nulls_first: false, + }]; + let limit_expr = vec![LimitExpr { + expr: col("b"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + let skip = Some(Box::new(lit(10))); + let fetch = Some(Box::new(lit(100))); + + let result = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, skip, fetch); + assert!(result.is_ok()); + + let series_limit = result.unwrap(); + assert!(series_limit.skip.is_some()); + assert!(series_limit.fetch.is_some()); + } + + #[test] + fn test_name() { + let series_limit = create_test_series_limit(); + assert_eq!(series_limit.name(), "SeriesLimit"); + } + + #[test] + fn test_inputs() { + let series_limit = create_test_series_limit(); + let inputs = series_limit.inputs(); + assert_eq!(inputs.len(), 1); + } + + #[test] + fn test_schema() { + let series_limit = create_test_series_limit(); + let schema = series_limit.schema(); + assert_eq!( + schema.fields(), + &Fields::from(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Int64, false), + Field::new("time", DataType::Int64, false), + ]) + ); + } + + #[test] + fn test_expressions() { + let series_limit = create_test_series_limit(); + assert_eq!( + series_limit.expressions(), + vec![col("a"), col("time"), col("b"), lit(0_i64)] + ); + } + + #[test] + fn test_expressions_with_skip_and_fetch() { + let input = test_plan(); + let series_expr = vec![col("a")]; + let order_expr = vec![SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }]; + let limit_expr = vec![LimitExpr { + expr: col("b"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + let skip = Some(Box::new(lit(10))); + let fetch = Some(Box::new(lit(100))); + + let series_limit = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, skip, fetch) + .unwrap(); + + assert_eq!( + series_limit.expressions(), + vec![ + col("a"), + col("time"), + col("b"), + lit(0_i64), + lit(10), + lit(100) + ] + ); + } + + #[test] + fn test_fmt_for_explain() { + let series_limit = create_test_series_limit(); + assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST], limit_expr=[b RESPECT NULLS (default: Int64(0))] + Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS time + EmptyRelation + ") + } + + #[test] + fn test_with_exprs_and_inputs() { + let series_limit = create_test_series_limit(); + let original_exprs = series_limit.expressions(); + + // Create new input + let new_input = test_plan2(); + + let series_limit = series_limit + .with_exprs_and_inputs(original_exprs, vec![(*new_input).clone()]) + .unwrap(); + + assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST], limit_expr=[b RESPECT NULLS (default: Int64(0))] + Projection: Int64(3) AS a, Int64(2) AS b, Int64(1) AS time + EmptyRelation + ") + } + + #[test] + fn test_with_exprs_and_inputs_wrong_input_count() { + let series_limit = create_test_series_limit(); + let original_exprs = series_limit.expressions(); + + // Try with wrong number of inputs (0 instead of 1) + let result = series_limit.with_exprs_and_inputs(original_exprs, vec![]); + + assert!(result.is_err()); + let err_str = result.unwrap_err().to_string(); + assert!(err_str.contains("expects exactly 1 input")); + } + + #[test] + fn test_partial_ord() { + let series_limit1 = 
create_test_series_limit(); + let series_limit2 = create_test_series_limit(); + + // Should be equal + assert_eq!( + series_limit1.partial_cmp(&series_limit2), + Some(std::cmp::Ordering::Equal) + ); + } + + #[test] + fn test_eq() { + let series_limit1 = create_test_series_limit(); + let series_limit2 = create_test_series_limit(); + + assert_eq!(series_limit1, series_limit2); + } + + #[test] + fn test_hash() { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let series_limit1 = create_test_series_limit(); + let series_limit2 = create_test_series_limit(); + + let mut hasher1 = DefaultHasher::new(); + series_limit1.hash(&mut hasher1); + let hash1 = hasher1.finish(); + + let mut hasher2 = DefaultHasher::new(); + series_limit2.hash(&mut hasher2); + let hash2 = hasher2.finish(); + + assert_eq!(hash1, hash2); + } + + #[test] + fn test_multiple_order_expressions() { + // Test with multiple order expressions (e.g., ORDER BY time, b) + // Schema has: a, b, c, time + let input = test_plan_multi_column(); + let series_expr = vec![col("a")]; + let order_expr = vec![ + SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }, + SortExpr { + expr: col("b"), + asc: false, + nulls_first: true, + }, + ]; + let limit_expr = vec![LimitExpr { + expr: col("c"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let series_limit = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None) + .unwrap(); + + assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST, b DESC NULLS FIRST], limit_expr=[c RESPECT NULLS (default: Int64(0))] + Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c, Int64(4) AS time + EmptyRelation + ") + } + + #[test] + fn test_expressions_with_multiple_order() { + // Test that expressions() includes all order expressions + let input = test_plan_multi_column(); + let series_expr = vec![col("a")]; + let order_expr = vec![ + SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }, + SortExpr { + expr: col("b"), + asc: false, + nulls_first: true, + }, + ]; + let limit_expr = vec![LimitExpr { + expr: col("c"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let series_limit = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None) + .unwrap(); + + let exprs = series_limit.expressions(); + assert_eq!( + exprs, + vec![col("a"), col("time"), col("b"), col("c"), lit(0_i64)] + ); + } + + #[test] + fn test_with_exprs_and_inputs_multiple_order() { + // Test with_exprs_and_inputs preserves multiple order expressions + let input = test_plan_multi_column(); + let series_expr = vec![col("a")]; + let order_expr = vec![ + SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }, + SortExpr { + expr: col("b"), + asc: false, + nulls_first: true, + }, + ]; + let limit_expr = vec![LimitExpr { + expr: col("c"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let series_limit = SeriesLimit::try_new( + Arc::clone(&input), + series_expr, + order_expr, + limit_expr, + None, + None, + ) + .unwrap(); + + let original_exprs = series_limit.expressions(); + let new_input = test_plan_multi_column(); + + let series_limit = series_limit + .with_exprs_and_inputs(original_exprs, vec![(*new_input).clone()]) + .unwrap(); + + 
assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST, b DESC NULLS FIRST], limit_expr=[c RESPECT NULLS (default: Int64(0))] + Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c, Int64(4) AS time + EmptyRelation + ") + } + + #[test] + fn test_fmt_for_explain_multiple_order() { + // Test that fmt_for_explain includes all order expressions + let input = test_plan_multi_column(); + let series_expr = vec![col("a")]; + let order_expr = vec![ + SortExpr { + expr: col("time"), + asc: true, + nulls_first: false, + }, + SortExpr { + expr: col("b"), + asc: false, + nulls_first: true, + }, + ]; + let limit_expr = vec![LimitExpr { + expr: col("c"), + null_treatment: NullTreatment::RespectNulls, + default_value: lit(ScalarValue::Int64(Some(0))), + }]; + + let series_limit = + SeriesLimit::try_new(input, series_expr, order_expr, limit_expr, None, None) + .unwrap(); + + assert_snapshot!(format!("{}", LogicalPlan::Extension(Extension{node: Arc::new(series_limit)})), @r" + SeriesLimit: series=[a], order=[time ASC NULLS LAST, b DESC NULLS FIRST], limit_expr=[c RESPECT NULLS (default: Int64(0))] + Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c, Int64(4) AS time + EmptyRelation + ") + } + } +} diff --git a/iox_query/src/exec/series_limit/mod.rs b/iox_query/src/exec/series_limit/mod.rs new file mode 100644 index 00000000..5a66dd39 --- /dev/null +++ b/iox_query/src/exec/series_limit/mod.rs @@ -0,0 +1,158 @@ +//! Handling of InfluxQL style `LIMIT` and `OFFSET` clauses. +//! +//! This module provides functionality to apply `LIMIT` and `OFFSET` +//! clauses individually to time series data. It is designed to be +//! compatible with older version of InfluxDB that applied `LIMIT` and +//! `OFFSET` conditions in each iterator before they are combined. + +use arrow::compute::SortOptions; +use datafusion::{ + common::{Result, internal_err}, + execution::context::SessionState, + logical_expr::{Expr, LogicalPlan}, + physical_plan::{ExecutionPlan, expressions::PhysicalSortExpr}, + scalar::ScalarValue, + sql::sqlparser::ast::NullTreatment, +}; +use std::sync::Arc; + +mod logical; +mod physical; + +pub use logical::{LimitExpr, SeriesLimit}; +pub use physical::{PhysicalLimitExpr, SeriesLimitExec}; + +/// Plan a SeriesLimit logical node into a physical SeriesLimitExec. +/// +/// This function converts the logical representation of per-series limiting +/// into a physical execution plan that can be executed by DataFusion. +/// +/// # Arguments +/// +/// * `session_state` - The DataFusion session state for creating physical expressions +/// * `series_limit` - The logical SeriesLimit node to plan +/// * `logical_inputs` - The logical input plans (must be exactly 1) +/// * `physical_inputs` - The physical input plans (must be exactly 1) +/// +/// # Returns +/// +/// Returns a `SeriesLimitExec` physical execution plan on success, or an error +/// if the inputs are invalid or expression conversion fails. 
+/// +/// # Errors +/// +/// This function returns an error if: +/// - The number of logical or physical inputs is not exactly 1 +/// - Skip or fetch expressions cannot be evaluated to usize values +/// - Expression conversion from logical to physical fails +/// - Default value conversion fails +pub(crate) fn plan_series_limit( + session_state: &SessionState, + series_limit: &SeriesLimit, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], +) -> Result { + // Validate inputs + let input_dfschema = match logical_inputs { + [input] => input.schema().as_ref(), + _ => { + return internal_err!( + "SeriesLimitExec: wrong number of logical inputs; expected 1, found {}", + logical_inputs.len() + ); + } + }; + + let phys_input = match physical_inputs { + [input] => Arc::clone(input), + _ => { + return internal_err!( + "SeriesLimitExec: wrong number of physical inputs; expected 1, found {}", + physical_inputs.len() + ); + } + }; + + // Convert series expressions to physical + let series_expr = series_limit + .series_expr + .iter() + .map(|expr| session_state.create_physical_expr(expr.clone(), input_dfschema)) + .collect::>>()?; + + // Convert order expressions to physical + let order_expr = series_limit + .order_expr + .iter() + .map(|sort_expr| { + Ok(PhysicalSortExpr { + expr: session_state.create_physical_expr(sort_expr.expr.clone(), input_dfschema)?, + options: SortOptions { + descending: !sort_expr.asc, + nulls_first: sort_expr.nulls_first, + }, + }) + }) + .collect::>>()?; + + // Convert limit expressions to physical + let limit_expr = series_limit + .limit_expr + .iter() + .map(|le| { + let expr = session_state.create_physical_expr(le.expr.clone(), input_dfschema)?; + let ignore_nulls = matches!(le.null_treatment, NullTreatment::IgnoreNulls); + + // Evaluate the default value expression to get a ScalarValue + let default_value = if let Expr::Literal(scalar, _) = &le.default_value { + scalar.clone() + } else { + return internal_err!( + "SeriesLimit default_value must be a literal, got: {:?}", + le.default_value + ); + }; + + Ok(PhysicalLimitExpr::new(expr, ignore_nulls, default_value)) + }) + .collect::>>()?; + + // Evaluate skip and fetch expressions + let skip = if let Some(skip_expr) = &series_limit.skip { + if let Expr::Literal(ScalarValue::UInt64(Some(skip_val)), _) = skip_expr.as_ref() { + *skip_val as usize + } else if let Expr::Literal(ScalarValue::Int64(Some(skip_val)), _) = skip_expr.as_ref() { + if *skip_val < 0 { + return internal_err!("SeriesLimit skip must be non-negative, got: {}", skip_val); + } + *skip_val as usize + } else { + return internal_err!( + "SeriesLimit skip must be a non-negative integer literal, got: {:?}", + skip_expr + ); + } + } else { + 0 + }; + + let fetch = if let Some(fetch_expr) = &series_limit.fetch { + if let Expr::Literal(ScalarValue::UInt64(Some(fetch_val)), _) = fetch_expr.as_ref() { + Some(*fetch_val as usize) + } else if let Expr::Literal(ScalarValue::Int64(Some(fetch_val)), _) = fetch_expr.as_ref() { + if *fetch_val < 0 { + return internal_err!("SeriesLimit fetch must be non-negative, got: {}", fetch_val); + } + Some(*fetch_val as usize) + } else { + return internal_err!( + "SeriesLimit fetch must be a non-negative integer literal, got: {:?}", + fetch_expr + ); + } + } else { + None + }; + + SeriesLimitExec::try_new(phys_input, series_expr, order_expr, limit_expr, skip, fetch) +} diff --git a/iox_query/src/exec/series_limit/physical.rs b/iox_query/src/exec/series_limit/physical.rs new file mode 100644 index 00000000..528e6ea1 --- 
/dev/null +++ b/iox_query/src/exec/series_limit/physical.rs @@ -0,0 +1,2594 @@ +//! Physical executor for the series limit operation. + +use std::{ + collections::BTreeMap, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use arrow::{ + array::{ + Array, ArrayRef, BooleanArray, Datum, PrimitiveArray, RecordBatch, Scalar, UInt64Builder, + new_null_array, + }, + compute::{Partitions, partition}, + datatypes::{SchemaRef, UInt64Type}, + error::ArrowError, +}; +use datafusion::{ + common::tree_node::{TreeNode, TreeNodeRecursion}, + error::{DataFusionError, Result}, + execution::{ + RecordBatchStream, SendableRecordBatchStream, + context::TaskContext, + memory_pool::{MemoryConsumer, MemoryReservation}, + }, + physical_expr::{ + EquivalenceProperties, LexOrdering, LexRequirement, OrderingRequirements, PhysicalExprRef, + PhysicalSortExpr, PhysicalSortRequirement, + }, + physical_plan::{ + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, + PlanProperties, SendableRecordBatchStream as SendableStream, Statistics, + expressions::Column, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, + }, + scalar::ScalarValue, +}; +use futures::{Stream, StreamExt, ready}; + +#[derive(Debug, Clone)] +pub struct PhysicalLimitExpr { + /// The expression to evaluate for the limit. This must be a Column. + expr: PhysicalExprRef, + + /// Whether to ignore null values in the limit calculation. + ignore_nulls: bool, + + /// The default value to use when a row is filtered out of a time + /// series, but required in an output batch. Typically this is NULL + /// of the same type as the expression, however it could be 0 or another + /// value. When using with InfluxQL this should be the value specified by + /// the FILL clause mapped to an appropriate type for the column. + default_value: ScalarValue, +} + +impl PhysicalLimitExpr { + /// Create a new PhysicalLimitExpr. + pub fn new(expr: PhysicalExprRef, ignore_nulls: bool, default_value: ScalarValue) -> Self { + Self { + expr, + ignore_nulls, + default_value, + } + } +} + +impl std::fmt::Display for PhysicalLimitExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{} {}NULLS (default: {})", + self.expr, + if self.ignore_nulls { + "IGNORE " + } else { + "RESPECT " + }, + self.default_value + ) + } +} + +/// Physical execution plan for per-series LIMIT and OFFSET operations. +/// +/// This operator implements InfluxQL-style series limiting, which applies +/// LIMIT and OFFSET constraints independently to each time series. Unlike +/// standard SQL LIMIT/OFFSET which apply globally to the result set, this +/// operator applies them separately to each group of rows sharing the same +/// series key (typically tag values). +/// +/// # Purpose +/// +/// This execution plan is designed to support InfluxQL queries where LIMIT +/// and OFFSET need to apply per-series rather than globally. This is a key +/// semantic difference from SQL that makes InfluxQL suitable for time series +/// data analysis where users want to limit the number of points returned +/// from each individual series independently. +/// +/// # Query Semantics +/// +/// For a query like: +/// ```sql +/// SELECT value FROM measurement WHERE time > now() - 1h GROUP BY tag LIMIT 10 OFFSET 5 +/// ``` +/// +/// Standard SQL would return 10 rows total across all series. This operator +/// returns up to 10 rows **per series** (after skipping the first 5 rows of +/// each series). +/// +/// # Execution Flow +/// +/// 1. 
**Input Requirements**: The input must be pre-sorted by series expressions +/// followed by order expressions (enforced via `required_input_ordering`). +/// +/// 2. **Partitioning**: If series expressions are present, the operator requires +/// hash partitioning on those expressions to ensure all rows for a given series +/// are processed by the same partition. +/// +/// 3. **Stream Processing**: Each partition creates a [`SeriesLimitStream`] that: +/// - Identifies series boundaries in the sorted input +/// - Assigns row numbers within each series +/// - Filters rows based on skip/fetch values +/// - Maintains state across batch boundaries +/// +/// 4. **Output**: Produces a stream of record batches containing only the rows +/// that fall within the LIMIT/OFFSET window for each series. +/// +/// # NULL Handling +/// +/// The `limit_expr` field supports both RESPECT NULLS and IGNORE NULLS modes: +/// +/// - **RESPECT NULLS**: NULL values count toward the row limit +/// - **IGNORE NULLS**: NULL values are skipped and don't count toward the limit +/// +/// This is controlled by the `ignore_nulls` field in [`PhysicalLimitExpr`]. +/// +/// # Default Values +/// +/// Each limited expression can specify a default value (in [`PhysicalLimitExpr`]) +/// that is used when a row is filtered out. This supports queries where all +/// series need to be time-aligned even when some series have no data for certain +/// timestamps. +/// +/// # Examples +/// +/// ## Basic LIMIT +/// ```text +/// Input (2 series, 4 rows each): +/// tag | time | value +/// ----|------|------ +/// a | 1 | 10 +/// a | 2 | 20 +/// a | 3 | 30 +/// a | 4 | 40 +/// b | 1 | 50 +/// b | 2 | 60 +/// b | 3 | 70 +/// b | 4 | 80 +/// +/// With LIMIT 2 (skip=0, fetch=Some(2)): +/// Output: +/// tag | time | value +/// ----|------|------ +/// a | 1 | 10 +/// a | 2 | 20 +/// b | 1 | 50 +/// b | 2 | 60 +/// ``` +/// +/// ## LIMIT with OFFSET +/// ```text +/// Same input as above. +/// +/// With LIMIT 2 OFFSET 1 (skip=1, fetch=Some(2)): +/// Output: +/// tag | time | value +/// ----|------|------ +/// a | 2 | 20 +/// a | 3 | 30 +/// b | 2 | 60 +/// b | 3 | 70 +/// ``` +/// +/// ## Only OFFSET +/// ```text +/// Same input as above. 
+///
+/// With OFFSET 2 (skip=2, fetch=None):
+/// Output:
+/// tag | time | value
+/// ----|------|------
+/// a   | 3    | 30
+/// a   | 4    | 40
+/// b   | 3    | 70
+/// b   | 4    | 80
+/// ```
+///
+/// ## Respecting NULLs
+/// ```text
+/// Input (2 series, 4 rows each):
+/// tag | time | value1 | value2
+/// ----|------|--------|-------
+/// a   | 1    | 10     |
+/// a   | 2    |        | 20
+/// a   | 3    | 30     | 30
+/// a   | 4    | 40     | 40
+/// b   | 1    |        |
+/// b   | 2    |        | 60
+/// b   | 3    | 70     |
+/// b   | 4    | 80     | 80
+///
+/// With LIMIT 2 OFFSET 1 on both value1 and value2, respecting NULLs, default value NULL:
+/// Output:
+/// tag | time | value1 | value2
+/// ----|------|--------|-------
+/// a   | 2    |        | 20
+/// a   | 3    | 30     | 30
+/// b   | 2    |        | 60
+/// b   | 3    | 70     |
+/// ```
+///
+/// ## Ignoring NULLs
+/// ```text
+/// Input (2 series):
+/// tag | time | value1 | value2
+/// ----|------|--------|-------
+/// a   | 1    | 10     |
+/// a   | 2    |        | 20
+/// a   | 3    | 30     | 30
+/// a   | 4    | 40     | 40
+/// b   | 1    |        |
+/// b   | 2    |        | 60
+/// b   | 3    | 70     | 70
+/// b   | 4    | 80     | 80
+/// b   | 5    | 90     | 90
+///
+/// With LIMIT 2 OFFSET 1 on both value1 and value2, ignoring NULLs, default value NULL:
+/// Output:
+/// tag | time | value1 | value2
+/// ----|------|--------|-------
+/// a   | 3    | 30     | 30
+/// a   | 4    | 40     | 40
+/// b   | 3    |        | 70
+/// b   | 4    | 80     | 80
+/// b   | 5    | 90     |
+/// ```
+///
+/// # Performance Considerations
+///
+/// - **Memory**: Maintains minimal state (current series key + row counts)
+///   across batches, making it suitable for large datasets.
+///
+/// - **Streaming**: Processes data in a streaming fashion without materializing
+///   entire series in memory.
+///
+/// - **Early Termination**: Once a series exceeds its limit, subsequent rows
+///   for that series can be efficiently filtered without full evaluation.
+pub struct SeriesLimitExec {
+    /// The input execution plan to apply series limiting to.
+    input: Arc<dyn ExecutionPlan>,
+
+    /// Expressions that define the series grouping.
+    ///
+    /// Rows with the same values for these expressions belong to the same series.
+    /// Typically these are tag columns in InfluxQL queries.
+    series_expr: Vec<PhysicalExprRef>,
+
+    /// The expressions used for sorting within each series.
+    ///
+    /// Each series is sorted by these expressions (typically ascending timestamp)
+    /// before applying LIMIT and OFFSET operations.
+    order_expr: Vec<PhysicalSortExpr>,
+
+    /// The limited expressions, together with their null handling and default values.
+    ///
+    /// Each entry identifies a value column to which the per-series LIMIT and
+    /// OFFSET are applied.
+    limit_expr: Vec<PhysicalLimitExpr>,
+
+    /// Number of rows to skip at the beginning of each series (OFFSET).
+    ///
+    /// A value of 0 means no rows are skipped.
+    skip: usize,
+
+    /// Maximum number of rows to return from each series (LIMIT).
+    ///
+    /// `None` means no limit is applied (return all remaining rows after skip).
+    /// `Some(n)` limits each series to at most `n` rows.
+    fetch: Option<usize>,
+
+    /// `limit_expr` modified for use when processing batches.
+    limited: Arc<BTreeMap<usize, LimitParams>>,
+
+    /// Metrics tracking execution statistics for this plan node.
+    ///
+    /// Collects metrics like elapsed time, number of output rows, etc.
+    metrics: ExecutionPlanMetricsSet,
+
+    /// Cached plan properties for efficient access.
+    ///
+    /// Contains schema, partitioning, execution mode, and sort order information
+    /// that are computed once and reused across multiple accesses.
+    cache: PlanProperties,
+
+    /// The required ordering for the input to this plan.
+    required_ordering: Option<OrderingRequirements>,
+}
+
+impl SeriesLimitExec {
+    /// Create a new SeriesLimitExec.
+    pub fn try_new(
+        input: Arc<dyn ExecutionPlan>,
+        series_expr: Vec<PhysicalExprRef>,
+        order_expr: Vec<PhysicalSortExpr>,
+        limit_expr: Vec<PhysicalLimitExpr>,
+        skip: usize,
+        fetch: Option<usize>,
+    ) -> Result<Self> {
+        let input_schema = input.schema();
+
+        let mut limited = BTreeMap::new();
+        for le in &limit_expr {
+            let mut index = None;
+            le.expr.apply(|pe| {
+                if let Some(column) = pe.as_any().downcast_ref::<Column>() {
+                    match index {
+                        None => index = Some(column.index()),
+                        Some(idx) if idx == column.index() => {}
+                        Some(_) => {
+                            return Err(DataFusionError::Plan(
+                                "PhysicalLimitExpr requires a single Column expression".to_string(),
+                            ));
+                        }
+                    }
+                }
+                Ok(TreeNodeRecursion::Continue)
+            })?;
+            let index = index.ok_or(DataFusionError::Plan(
+                "PhysicalLimitExpr requires a Column expression".to_string(),
+            ))?;
+            if limited.insert(index, LimitParams::try_from(le)?).is_some() {
+                return Err(DataFusionError::Plan(
+                    "SeriesLimitExec limit expressions must refer to distinct columns".to_string(),
+                ));
+            }
+        }
+
+        // The output schema is the same as the input schema except for columns
+        // referenced in limit_expr; these are potentially renamed and may have
+        // their nullability changed.
+        let fields = input_schema
+            .fields()
+            .iter()
+            .enumerate()
+            .map(|(idx, field)| match limited.get(&idx) {
+                Some(params) => {
+                    let field = params.expr.return_field(input_schema.as_ref())?;
+                    let nullable = field.is_nullable();
+                    Ok(Arc::new(
+                        Arc::unwrap_or_clone(field).with_nullable(params.is_nullable(nullable)),
+                    ))
+                }
+                None => Ok(Arc::clone(field)),
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        let schema = Arc::new(arrow::datatypes::Schema::new_with_metadata(
+            fields,
+            input_schema.metadata().clone(),
+        ));
+
+        let limited = Arc::new(limited);
+        let required_ordering = Self::compute_ordering(&series_expr, &order_expr);
+        let cache = Self::compute_properties(&input, schema, &limited);
+
+        Ok(Self {
+            input,
+            series_expr,
+            order_expr,
+            limit_expr,
+            skip,
+            fetch,
+            limited,
+            metrics: ExecutionPlanMetricsSet::new(),
+            cache,
+            required_ordering,
+        })
+    }
+
+    /// This function creates the cache object that stores the plan properties
+    /// such as equivalence properties, partitioning, ordering, etc.
+    fn compute_properties(
+        input: &Arc<dyn ExecutionPlan>,
+        schema: SchemaRef,
+        limited: &BTreeMap<usize, LimitParams>,
+    ) -> PlanProperties {
+        // The output ordering is the same as the input ordering so long as
+        // it does not depend on any of the limited columns. Iterate through the
+        // input ordering, stopping at the first ordering expression that depends on a
+        // limited column.
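+        // For example (illustrative): if the input is ordered by [tag, time, value]
+        // and `value` is a limited column, the retained output ordering is
+        // [tag, time], since values in limited columns may be replaced by defaults.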
+ let ordering = input.output_ordering().and_then(|ordering| { + LexOrdering::new( + ordering + .iter() + .take_while(|pse| { + !pse.expr + .exists(|pe| { + Ok(if let Some(col) = pe.as_any().downcast_ref::() { + limited.contains_key(&col.index()) + } else { + false + }) + }) + .expect("cannot error") + }) + .cloned(), + ) + }); + + let eq_properties = if let Some(ordering) = ordering { + EquivalenceProperties::new_with_orderings(schema, std::iter::once(ordering)) + } else { + EquivalenceProperties::new(Arc::clone(&schema)) + }; + + PlanProperties::new( + eq_properties, + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + ) + } + + fn compute_ordering( + series_expr: &[PhysicalExprRef], + order_expr: &[PhysicalSortExpr], + ) -> Option { + let sort_requirements = series_expr + .iter() + .map(|expr| PhysicalSortRequirement { + expr: Arc::clone(expr), + options: None, + }) + .chain(order_expr.iter().map(|se| PhysicalSortRequirement { + expr: Arc::clone(&se.expr), + options: Some(se.options), + })); + + LexRequirement::new(sort_requirements).map(OrderingRequirements::new) + } +} + +impl std::fmt::Debug for SeriesLimitExec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SeriesLimitExec") + .field("series_expr", &self.series_expr) + .field("order_expr", &self.order_expr) + .field("limit_expr", &self.limit_expr) + .field("skip", &self.skip) + .field("fetch", &self.fetch) + .finish_non_exhaustive() + } +} + +impl ExecutionPlan for SeriesLimitExec { + fn name(&self) -> &str { + Self::static_name() + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &PlanProperties { + &self.cache + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + match children.as_slice() { + [child] => Ok(Arc::new(Self::try_new( + Arc::clone(child), + self.series_expr.clone(), + self.order_expr.clone(), + self.limit_expr.clone(), + self.skip, + self.fetch, + )?)), + _ => Err(DataFusionError::Internal(format!( + "SeriesLimitExec wrong number of children: expected 1, found {}", + children.len() + ))), + } + } + + fn execute(&self, partition: usize, context: Arc) -> Result { + if partition + >= self + .input + .properties() + .output_partitioning() + .partition_count() + { + return Err(DataFusionError::Internal(format!( + "SeriesLimitExec invalid partition {partition}" + ))); + } + + let input_stream = self.input.execute(partition, Arc::clone(&context))?; + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + let reservation = MemoryConsumer::new(format!("SeriesLimitExec[{partition}]")) + .register(context.memory_pool()); + + let series_expr = self.series_expr.clone(); + let limited = Arc::clone(&self.limited); + + let stream = SeriesLimitStream::try_new( + input_stream, + self.schema(), + baseline_metrics, + reservation, + series_expr, + limited, + self.skip as u64, + self.fetch.map(|f| f as u64), + )?; + + Ok(Box::pin(stream)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn required_input_distribution(&self) -> Vec { + vec![if self.series_expr.is_empty() { + Distribution::UnspecifiedDistribution + } else { + Distribution::HashPartitioned(self.series_expr.iter().map(Arc::clone).collect()) + }] + } + + fn required_input_ordering(&self) -> Vec> { + vec![self.required_ordering.clone()] + } 
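+
+    // Note: the operator only filters rows within each series; it never
+    // reorders them, which is why `maintains_input_order` below reports `true`.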
+ + fn maintains_input_order(&self) -> Vec { + vec![true] + } +} + +impl DisplayAs for SeriesLimitExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + let series_expr = self + .series_expr + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + let order_expr = self + .order_expr + .iter() + .map(|se| se.to_string()) + .collect::>() + .join(", "); + + let limit_expr = self + .limit_expr + .iter() + .map(|le| le.to_string()) + .collect::>() + .join(", "); + + write!( + f, + "SeriesLimitExec: series=[{}], order=[{}], limit_expr=[{}]", + series_expr, order_expr, limit_expr + )?; + + if self.skip > 0 { + write!(f, ", skip={}", self.skip)?; + } + + if let Some(fetch) = self.fetch { + write!(f, ", fetch={}", fetch)?; + } + + Ok(()) + } + } + } +} + +/// A streaming implementation of per-series LIMIT and OFFSET operations. +/// +/// This stream processes incoming record batches and applies LIMIT and OFFSET +/// constraints independently to each time series (group of rows with the same +/// series key values). It maintains state across batches to correctly handle +/// series that span multiple batches. +/// +/// # Behavior +/// +/// For each incoming batch, the stream: +/// 1. Evaluates series expressions to determine series boundaries +/// 2. Detects series changes and resets row counters accordingly +/// 3. Assigns row numbers within each series using [`row_number`] +/// 4. Filters rows based on LIMIT (fetch) and OFFSET (skip) constraints +/// 5. Replaces filtered-out values with default values +/// 6. Tracks state for series that continue into subsequent batches +/// +/// # Series Continuation +/// +/// When a series spans multiple batches, the stream maintains: +/// - The current series key values in `current_series` +/// - Row counts for each limited expression in `counts` +/// +/// This allows row numbering to continue correctly across batch boundaries. +/// For example, if a series has 100 rows split across 3 batches, LIMIT 10 OFFSET 5 +/// will correctly skip the first 5 rows (even if they're in the first batch) and +/// return the next 10 rows (even if they span multiple batches). +/// +/// # Memory Management +/// +/// The stream tracks memory usage via `reservation` and grows/shrinks it as the +/// `current_series` state is updated. This ensures proper memory accounting in +/// DataFusion's memory pool system. +/// +/// # Example +/// +/// Given input with two series (tag='a' and tag='b'), each with 4 rows: +/// ```text +/// Input: +/// tag | time | value +/// ----|------|------ +/// a | 1 | 10 +/// a | 2 | 20 +/// a | 3 | 30 +/// a | 4 | 40 +/// b | 1 | 50 +/// b | 2 | 60 +/// b | 3 | 70 +/// b | 4 | 80 +/// +/// With LIMIT 2 OFFSET 1: +/// Output: +/// tag | time | value +/// ----|------|------ +/// a | 2 | 20 (skipped row 1, included rows 2-3) +/// a | 3 | 30 +/// b | 2 | 60 (skipped row 1, included rows 2-3) +/// b | 3 | 70 +/// ``` +/// +/// # Default Values +/// +/// For rows that are filtered out but whose timestamps appear in other series, +/// the stream can emit default values (typically NULL or 0) to maintain time +/// alignment across series. This is controlled by the `limited` field's default +/// value component. +struct SeriesLimitStream { + /// The stream of input batches. + input: SendableRecordBatchStream, + + /// The schema of the output batches. 
+    schema: SchemaRef,
+
+    /// Metrics for tracking execution statistics.
+    metrics: BaselineMetrics,
+
+    /// Memory reservation for this stream.
+    reservation: MemoryReservation,
+
+    /// Physical expressions that define the series grouping.
+    ///
+    /// Rows with the same values for these expressions belong to the same series.
+    /// Typically these are tag columns in InfluxQL queries.
+    series_expr: Vec<PhysicalExprRef>,
+
+    /// Limited expressions with their null handling and default values.
+    ///
+    /// Each entry maps the index of a limited column to its [`LimitParams`]:
+    /// the expression to evaluate (typically a value column), whether to
+    /// ignore nulls (IGNORE NULLS vs RESPECT NULLS), and the default value to
+    /// use for filtered-out rows.
+    limited: Arc<BTreeMap<usize, LimitParams>>,
+
+    /// Range of row numbers to allow through the filter. This is a
+    /// half-open interval of the form (lower, upper]. Rows are numbered
+    /// from 1 so a lower bound of 0 will allow all rows up to upper. An
+    /// upper value of u64::MAX is used to mean there is effectively no
+    /// limit.
+    lower: Scalar<PrimitiveArray<UInt64Type>>,
+    upper: Scalar<PrimitiveArray<UInt64Type>>,
+
+    /// The current series key being processed.
+    current_series: Vec<Scalar<ArrayRef>>,
+
+    /// Row counts for each limited expression in the current series.
+    counts: BTreeMap<usize, u64>,
+}
+
+impl SeriesLimitStream {
+    #[expect(clippy::too_many_arguments)]
+    fn try_new(
+        input: SendableRecordBatchStream,
+        schema: SchemaRef,
+        metrics: BaselineMetrics,
+        mut reservation: MemoryReservation,
+        series_expr: Vec<PhysicalExprRef>,
+        limited: Arc<BTreeMap<usize, LimitParams>>,
+        skip: u64,
+        fetch: Option<u64>,
+    ) -> Result<Self> {
+        // Set the initial series to be all nulls.
+        let current_series = series_expr
+            .iter()
+            .map(|expr| expr.data_type(input.schema().as_ref()))
+            .map(|data_type| data_type.map(|data_type| new_null_array(&data_type, 1)))
+            .collect::<Result<Vec<_>>>()?;
+        // Set the initial memory size.
+        reservation.resize(
+            current_series
+                .iter()
+                .map(|arr| arr.get_array_memory_size())
+                .sum::<usize>(),
+        );
+        let current_series = current_series.into_iter().map(Scalar::new).collect();
+        let counts = limited.keys().map(|idx| (*idx, 0u64)).collect();
+        let lower = PrimitiveArray::<UInt64Type>::new_scalar(skip);
+        let upper =
+            PrimitiveArray::<UInt64Type>::new_scalar(fetch.map(|n| n + skip).unwrap_or(u64::MAX));
+        Ok(Self {
+            input,
+            schema,
+            metrics,
+            reservation,
+            series_expr,
+            limited,
+            lower,
+            upper,
+            current_series,
+            counts,
+        })
+    }
+
+    fn process_batch(&mut self, batch: RecordBatch) -> Result<RecordBatch> {
+        let num_rows = batch.num_rows();
+        if num_rows == 0 {
+            return Ok(RecordBatch::new_empty(Arc::clone(&self.schema)));
+        }
+
+        let series_arrs = self
+            .series_expr
+            .iter()
+            .map(|pe| pe.evaluate(&batch))
+            .map(|res| res.and_then(|cv| cv.to_array(num_rows)))
+            .collect::<Result<Vec<_>>>()?;
+
+        // Check if the series has changed compared to the current series.
+        // Short-circuit on first mismatch to avoid unnecessary comparisons.
+        let mut series_changed = false;
+        for (arr, current) in series_arrs.iter().zip(self.current_series.iter()) {
+            let first_value = Scalar::new(arr.slice(0, 1));
+
+            if !arrow::compute::kernels::cmp::eq(&first_value, current)?.value(0) {
+                series_changed = true;
+                break;
+            }
+        }
+
+        if series_changed {
+            // Series has changed; reset counts.
+            for count in &mut self.counts.values_mut() {
+                *count = 0;
+            }
+        }
+
+        // Partition the series.
+        let partitions = partition(&series_arrs)?;
+
+        // All columns that have ignore_nulls as false will produce the
+        // same filter; remember it to avoid recomputing.
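+        // For example, with skip=1 and fetch=2 we have lower=1 and upper=3, so
+        // only rows whose per-series row number n satisfies 1 < n <= 3 (rows 2
+        // and 3 of each series) pass the filter.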
+ let mut respect_nulls_cache: Option<(Arc, u64)> = None; + let mut limited_arrs: BTreeMap = BTreeMap::default(); + let mut filters = Vec::with_capacity(self.limited.len()); + + for (idx, params) in self.limited.iter() { + let LimitParams { + expr, + ignore_nulls, + default_value, + } = params; + let arr = expr.evaluate(&batch)?.into_array(num_rows)?; + + let (filter, count) = match (*ignore_nulls, &respect_nulls_cache) { + (true, _) => { + let (arr, count) = row_number(&arr, self.counts[idx], true, &partitions); + let filter = arrow::compute::and( + &arrow::compute::kernels::cmp::gt(&arr, &self.lower)?, + &arrow::compute::kernels::cmp::lt_eq(&arr, &self.upper)?, + )?; + (Arc::new(filter), count) + } + (false, Some((filter, count))) => (Arc::clone(filter), *count), + (false, None) => { + let (arr, count) = row_number(&arr, self.counts[idx], false, &partitions); + let filter = Arc::new(arrow::compute::and( + &arrow::compute::kernels::cmp::gt(&arr, &self.lower)?, + &arrow::compute::kernels::cmp::lt_eq(&arr, &self.upper)?, + )?); + respect_nulls_cache = Some((Arc::clone(&filter), count)); + (filter, count) + } + }; + limited_arrs.insert( + *idx, + arrow::compute::kernels::zip::zip(&filter, &arr, default_value)?, + ); + filters.push(filter); + self.counts.insert(*idx, count); + } + + // Compute the batch filter efficiently by building it in one pass. + // Instead of folding with or_kleene (which creates N-1 intermediate arrays), + // we build the result directly by checking if any filter is true at each position. + let batch_filter = if filters.is_empty() { + BooleanArray::new_null(num_rows) + } else if filters.len() == 1 { + // Fast path: single filter, no need to combine + Arc::unwrap_or_clone(Arc::clone(&filters[0])) + } else { + // Multiple filters: combine them efficiently + let mut batch_filter_builder = arrow::array::BooleanBuilder::with_capacity(num_rows); + + for row_idx in 0..num_rows { + // Check if any filter is true for this row + let any_true = filters.iter().any(|filter| filter.value(row_idx)); + batch_filter_builder.append_value(any_true); + } + + batch_filter_builder.finish() + }; + + let output_arrs = batch + .into_parts() + .1 + .iter() + .enumerate() + .map(|(idx, arr)| { + if let Some(limited_arr) = limited_arrs.get(&idx) { + limited_arr + } else { + arr + } + }) + .map(|arr| arrow::compute::filter(arr, &batch_filter)) + .collect::, ArrowError>>()?; + + // Store the current series. Tracking the memory use. + for (idx, arr) in series_arrs.iter().enumerate() { + let arr = arr.slice(num_rows - 1, 1); + self.reservation.try_grow(arr.get_array_memory_size())?; + let mut value = Scalar::new(arr); + std::mem::swap(&mut self.current_series[idx], &mut value); + let arr = value.into_inner(); + self.reservation.shrink(arr.get_array_memory_size()); + } + + Ok(RecordBatch::try_new(Arc::clone(&self.schema), output_arrs)?) 
+    }
+}
+
+impl RecordBatchStream for SeriesLimitStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+impl Stream for SeriesLimitStream {
+    type Item = Result<RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // Poll the input stream for the next batch
+        match ready!(self.input.poll_next_unpin(cx)) {
+            Some(Ok(batch)) => {
+                // Process the batch through our series limiting logic
+                let elapsed_compute = self.metrics.elapsed_compute().clone();
+                let result = {
+                    let _timer = elapsed_compute.timer();
+                    self.process_batch(batch)
+                };
+                match result {
+                    Ok(output_batch) => {
+                        // Record the number of output rows
+                        self.metrics.record_output(output_batch.num_rows());
+                        Poll::Ready(Some(Ok(output_batch)))
+                    }
+                    Err(e) => Poll::Ready(Some(Err(e))),
+                }
+            }
+            Some(Err(e)) => Poll::Ready(Some(Err(e))),
+            None => Poll::Ready(None),
+        }
+    }
+}
+
+/// Parameters defining how to process a limited column.
+///
+/// `LimitParams` encapsulates the processing rules for a single value column that
+/// has per-series LIMIT/OFFSET constraints applied to it. During query execution,
+/// these parameters control how rows are numbered, filtered, and replaced with
+/// default values.
+struct LimitParams {
+    /// The expression for the limited column.
+    expr: PhysicalExprRef,
+
+    /// Whether to ignore nulls in the limit calculation.
+    ignore_nulls: bool,
+
+    /// The default value to use for filtered-out rows.
+    default_value: Scalar<ArrayRef>,
+}
+
+impl LimitParams {
+    /// Determine if the limited column can be nullable in the output.
+    fn is_nullable(&self, input_nullable: bool) -> bool {
+        let default_nullable = self.default_value.get().0.is_nullable();
+        if self.ignore_nulls {
+            // Any nulls will be replaced by the default value
+            default_nullable
+        } else {
+            // Respect nulls, so nullable if input is nullable or
+            // default is nullable
+            input_nullable || default_nullable
+        }
+    }
+}
+
+impl TryFrom<&PhysicalLimitExpr> for LimitParams {
+    type Error = DataFusionError;
+
+    fn try_from(value: &PhysicalLimitExpr) -> Result<Self, Self::Error> {
+        let default_value = value.default_value.to_scalar().map_err(|e| {
+            DataFusionError::Plan(format!(
+                "PhysicalLimitExpr failed to convert default value to scalar: {}",
+                e
+            ))
+        })?;
+        Ok(Self {
+            expr: Arc::clone(&value.expr),
+            ignore_nulls: value.ignore_nulls,
+            default_value,
+        })
+    }
+}
+
+/// Assigns row numbers to elements in an array, respecting partition boundaries.
+///
+/// This function generates sequential row numbers for each element in the input array,
+/// with special handling for partitions and null values. Row numbers restart at 1 for
+/// each new partition.
+///
+/// # Arguments
+///
+/// * `arr` - The input array for which to generate row numbers. This is typically a
+///   value column, and is used only to check for null values when `ignore_nulls` is true.
+/// * `start` - The starting row number for the first partition. Subsequent partitions
+///   always start at 1. This allows continuing numbering across multiple batches within
+///   the same partition.
+/// * `ignore_nulls` - Controls null handling behavior:
+///   - `false` (RESPECT NULLS): Null values receive row numbers like any other value
+///   - `true` (IGNORE NULLS): Null values are skipped and assigned null row numbers
+/// * `partitions` - Defines the partition boundaries within the array. Each partition
+///   represents a distinct group (e.g., time series) where row numbering should restart.
+/// +/// # Returns +/// +/// Returns a tuple of: +/// * `PrimitiveArray` - An array of row numbers corresponding to each element +/// in the input array. Elements may be null if `ignore_nulls` is true and the corresponding +/// input element is null. +/// * `u64` - The final row number assigned in the last partition. This can be used as the +/// `start` value for subsequent calls to continue numbering within the same partition. +/// +/// # Examples +/// +/// ```text +/// // Single partition, no nulls, starting from 0: +/// arr = [10, 20, 30] +/// partitions = single partition covering all elements +/// result = ([1, 2, 3], 3) +/// +/// // Single partition with RESPECT NULLS: +/// arr = [10, null, 30] +/// ignore_nulls = false +/// result = ([1, 2, 3], 3) +/// +/// // Single partition with IGNORE NULLS: +/// arr = [10, null, 30] +/// ignore_nulls = true +/// result = ([1, null, 2], 2) +/// +/// // Multiple partitions (e.g., two different series): +/// arr = [1, 2, 3, 4, 5, 6] +/// partitions = [0..3, 3..6] (two partitions) +/// result = ([1, 2, 3, 1, 2, 3], 3) +/// // Note: row numbering resets for second partition +/// +/// // Continuing numbering across batches: +/// // Batch 1: +/// arr1 = [10, 20] +/// (result1, last1) = row_number(arr1, 0, false, single_partition) +/// // result1 = [1, 2], last1 = 2 +/// +/// // Batch 2 (same partition continues): +/// arr2 = [30, 40] +/// (result2, last2) = row_number(arr2, last1, false, single_partition) +/// // result2 = [3, 4], last2 = 4 +/// ``` +fn row_number( + arr: &ArrayRef, + start: u64, + ignore_nulls: bool, + partitions: &Partitions, +) -> (PrimitiveArray, u64) { + let mut builder = UInt64Builder::with_capacity(arr.len()); + let mut row_number = start; + + for (idx, range) in partitions.ranges().iter().enumerate() { + if idx > 0 { + row_number = 0; + } + for idx in range.start..range.end { + if ignore_nulls && arr.is_null(idx) { + builder.append_null(); + continue; + } + row_number += 1; + builder.append_value(row_number); + } + } + + (builder.finish(), row_number) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{Int64Array, StringArray}; + use datafusion::physical_expr::expressions::Column; + use insta::assert_snapshot; + + mod physical_limit_expr_tests { + use super::*; + + #[test] + fn test_new() { + let expr = Arc::new(Column::new("value", 2)) as PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(0.0)); + let limit_expr = PhysicalLimitExpr::new(expr, true, default_value.clone()); + + assert!(limit_expr.ignore_nulls); + assert_eq!(limit_expr.default_value, default_value); + } + + #[test] + fn test_display_ignore_nulls() { + let expr = Arc::new(Column::new("value", 2)) as PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(0.0)); + let limit_expr = PhysicalLimitExpr::new(expr, true, default_value); + + let display_str = format!("{}", limit_expr); + assert!(display_str.contains("IGNORE NULLS")); + assert!(display_str.contains("default:")); + assert!(display_str.contains("0")); + } + + #[test] + fn test_display_respect_nulls() { + let expr = Arc::new(Column::new("value", 2)) as PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(99.9)); + let limit_expr = PhysicalLimitExpr::new(expr, false, default_value); + + let display_str = format!("{}", limit_expr); + assert!(display_str.contains("RESPECT NULLS")); + assert!(display_str.contains("default:")); + assert!(display_str.contains("99.9")); + } + + #[test] + fn test_clone() { + let expr = Arc::new(Column::new("value", 2)) as 
PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(0.0)); + let limit_expr = PhysicalLimitExpr::new(expr, true, default_value.clone()); + + let cloned = limit_expr.clone(); + assert_eq!(cloned.ignore_nulls, limit_expr.ignore_nulls); + assert_eq!(cloned.default_value, limit_expr.default_value); + } + + #[test] + fn test_debug() { + let expr = Arc::new(Column::new("value", 2)) as PhysicalExprRef; + let default_value = ScalarValue::Float64(Some(0.0)); + let limit_expr = PhysicalLimitExpr::new(expr, true, default_value); + + let debug_str = format!("{:?}", limit_expr); + assert!(!debug_str.is_empty()); + assert!(debug_str.contains("PhysicalLimitExpr")); + } + } + + mod series_limit_exec_tests { + use super::*; + use arrow::array::Float64Array; + use arrow::compute::SortOptions; + use arrow::datatypes::{Field, Schema}; + use datafusion::common::test_util::batches_to_string; + use datafusion::physical_expr::LexOrdering; + use datafusion::physical_plan::display::DisplayableExecutionPlan; + use datafusion::{ + datasource::{memory::MemorySourceConfig, source::DataSourceExec}, + execution::context::SessionContext, + physical_plan::sorts::sort::SortExec, + }; + use futures::StreamExt; + + fn string_array(vals: I) -> ArrayRef + where + I: IntoIterator, + O: Into>, + S: AsRef, + { + Arc::new(StringArray::from_iter(vals.into_iter().map(|v| v.into()))) + } + + fn int_array(vals: impl IntoIterator>>) -> ArrayRef { + Arc::new(Int64Array::from_iter(vals.into_iter().map(|v| v.into()))) + } + + fn float_array(vals: impl IntoIterator>>) -> ArrayRef { + Arc::new(Float64Array::from_iter(vals.into_iter().map(|v| v.into()))) + } + + fn input_plan( + arrs: impl IntoIterator, impl Into)>, + sort: impl IntoIterator, impl Into>)>, + ) -> Arc { + let columns: Vec<(String, ArrayRef)> = arrs + .into_iter() + .map(|(name, arr)| (name.into(), arr.into())) + .collect(); + let fields = columns + .iter() + .map(|(name, arr)| Field::new(name, arr.data_type().clone(), arr.null_count() > 0)) + .collect::>(); + let schema = Arc::new(Schema::new(fields)); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + columns.into_iter().map(|(_, arr)| arr).collect(), + ) + .unwrap(); + let empty = batch.num_rows() == 0; + let mut plan: Arc = Arc::new(DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(&[vec![batch]], schema, None).unwrap(), + ))); + + let mut sort_it = sort.into_iter().peekable(); + if !empty && sort_it.peek().is_some() { + let sort_expr = LexOrdering::new(sort_it.map(|(name, opts)| { + let name = name.into(); + PhysicalSortExpr::new( + Arc::new(Column::new(&name, plan.schema().index_of(&name).unwrap())), + opts.into().unwrap_or_default(), + ) + })) + .unwrap(); + plan = Arc::new(SortExec::new(sort_expr, plan)); + } + + plan + } + + fn test_input_plan( + tag: I, + time: impl IntoIterator>>, + value: impl IntoIterator>>, + ) -> Arc + where + I: IntoIterator, + O: Into>, + S: AsRef, + { + input_plan( + [ + ("tag", string_array(tag)), + ("time", int_array(time)), + ("value", float_array(value)), + ], + [("tag", None), ("time", None)], + ) + } + + /// Helper to collect all batches from a stream + async fn collect_stream( + mut stream: Pin>, + ) -> Result> { + let mut batches = vec![]; + while let Some(batch) = stream.next().await { + batches.push(batch?); + } + Ok(batches) + } + + #[tokio::test] + async fn test_basic_limit() { + // Test basic LIMIT functionality - limit 2 rows per series + let input = test_input_plan( + ["a", "a", "a", "a", "b", "b", "b", "b"], + [1, 2, 3, 4, 1, 2, 3, 4], + [1.0, 
2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 1 | 1.0 | + | a | 2 | 2.0 | + | b | 1 | 5.0 | + | b | 2 | 6.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_basic_offset() { + // Test basic OFFSET functionality - skip first 2 rows per series + let input = test_input_plan(["a", "a", "a", "a"], [1, 2, 3, 4], [1.0, 2.0, 3.0, 4.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 2, None) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 3 | 3.0 | + | a | 4 | 4.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_limit_and_offset() { + // Test combined LIMIT and OFFSET + let input = test_input_plan( + ["a", "a", "a", "a", "a", "a"], + [1, 2, 3, 4, 5, 6], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + // Skip 2, take 2 + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 2, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 3 | 3.0 | + | a | 4 | 4.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_multiple_series() { + // Test that limits apply independently to each series + let input = test_input_plan( + ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + [1, 2, 3, 1, 2, 3, 1, 2, 3], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = 
vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + // Limit 1 row per series + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(1)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 1 | 1.0 | + | b | 1 | 4.0 | + | c | 1 | 7.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_empty_batch() { + // Test handling of empty batches + let input = test_input_plan(Vec::::new(), Vec::::new(), Vec::::new()); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_with_nulls() { + // Test handling of null values with ignore_nulls = false + let input = test_input_plan( + ["a", "a", "a", "a"], + [1, 2, 3, 4], + [Some(1.0), None, Some(3.0), Some(4.0)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, // RESPECT NULLS + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 1 | 1.0 | + | a | 2 | | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_with_nulls_ignore() { + // Test handling of null values with ignore_nulls = true + let input = test_input_plan( + ["a", "a", "a", "a", "a"], + [1, 2, 3, 4, 5], + [Some(1.0), None, Some(3.0), None, Some(5.0)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + true, // 
IGNORE NULLS + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 1 | 1.0 | + | a | 3 | 3.0 | + +-----+------+-------+ + "#); + } + + #[test] + fn test_execute_invalid_partition() { + // Test that execute returns an error for invalid partition number + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, None) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + + // Try to execute with invalid partition number (only partition 0 exists) + let result = exec.execute(999, task_ctx); + assert!(result.is_err()); + let err_string = result.err().unwrap().to_string(); + assert!(err_string.contains("invalid partition")); + } + + #[test] + fn test_with_new_children_wrong_count_zero() { + // Test with_new_children with 0 children + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = Arc::new( + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, None) + .unwrap(), + ); + + // Try with 0 children + let result = exec.with_new_children(vec![]); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("wrong number of children") + ); + } + + #[test] + fn test_with_new_children_wrong_count_two() { + // Test with_new_children with 2 children + let input1 = test_input_plan(["a"], [1], [1.0]); + let input2 = Arc::clone(&input1); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = Arc::new( + SeriesLimitExec::try_new(input1, series_expr, order_expr, limit_expr, 0, None) + .unwrap(), + ); + + let input3 = + test_input_plan(Vec::::new(), Vec::::new(), Vec::::new()); + // Try with 2 children + let result = exec.with_new_children(vec![input2, input3]); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("wrong number of children") + ); + } + + #[tokio::test] + async fn test_preserve_schema() { + // Test that limits apply independently to each series + let input = input_plan( + [ + ( 
+ "value1", + float_array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]), + ), + ( + "tag1", + string_array(["a", "a", "a", "b", "b", "b", "c", "c", "c"]), + ), + ( + "value2", + float_array([10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0]), + ), + ("time", int_array([1, 2, 3, 1, 2, 3, 1, 2, 3])), + ( + "tag2", + string_array(["A", "A", "B", "B", "B", "C", "C", "C", "D"]), + ), + ], + [("tag1", None), ("tag2", None), ("time", None)], + ); + + let series_expr = vec![ + Arc::new(Column::new("tag1", 1)) as PhysicalExprRef, + Arc::new(Column::new("tag2", 4)) as PhysicalExprRef, + ]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 3)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 0)) as PhysicalExprRef, + false, + ScalarValue::Float64(None), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(None), + ), + ]; + + // Limit 1 row per series + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(1)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +--------+------+--------+------+------+ + | value1 | tag1 | value2 | time | tag2 | + +--------+------+--------+------+------+ + | 1.0 | a | 10.0 | 1 | A | + | 3.0 | a | 30.0 | 3 | B | + | 4.0 | b | 40.0 | 1 | B | + | 6.0 | b | 60.0 | 3 | C | + | 7.0 | c | 70.0 | 1 | C | + | 9.0 | c | 90.0 | 3 | D | + +--------+------+--------+------+------+ + "); + } + + #[test] + fn test_display_as() { + // Test DisplayAs formatting + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 5, Some(10)) + .unwrap(); + + assert_snapshot!(DisplayableExecutionPlan::new(&exec).indent(false), @r" + SeriesLimitExec: series=[tag@0], order=[time@1 ASC], limit_expr=[value@2 RESPECT NULLS (default: 0)], skip=5, fetch=10 + SortExec: expr=[tag@0 ASC, time@1 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } + + #[test] + fn test_empty_series_expr_distribution() { + // Test UnspecifiedDistribution when series_expr is empty + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![]; // Empty! 
+ let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + let distributions = exec.required_input_distribution(); + assert_eq!(distributions.len(), 1); + matches!(distributions[0], Distribution::UnspecifiedDistribution); + } + + #[tokio::test] + async fn test_descending_time_ordering() { + // Test with descending time ordering + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a"])), + ("time", int_array([3, 2, 1])), // Descending time + ("value", float_array([30.0, 20.0, 10.0])), + ], + [ + ("tag", None), + ( + "time", + Some(SortOptions { + descending: true, + nulls_first: true, + }), + ), + ], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: true, + }, + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+-------+ + | tag | time | value | + +-----+------+-------+ + | a | 3 | 30.0 | + | a | 2 | 20.0 | + +-----+------+-------+ + "#); + } + + #[tokio::test] + async fn test_multiple_limited_expressions() { + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a"])), + ("time", int_array([1, 2, 3, 4])), + ("value1", float_array([1.0, 2.0, 3.0, 4.0])), + ("value2", float_array([10.0, 20.0, 30.0, 40.0])), + ], + [("tag", None), ("time", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + // TWO limited expressions + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + ), + ]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r#" + +-----+------+--------+--------+ + | tag | time | value1 | value2 | + +-----+------+--------+--------+ + | a | 1 | 1.0 | 10.0 | + | a | 2 | 2.0 | 20.0 | + +-----+------+--------+--------+ + "#); + } + + #[tokio::test] + async fn test_multiple_limited_expressions_ignore_nulls() { + // Test with multiple limited expressions ignoring nulls + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a"])), 
+ ("time", int_array([1, 2, 3, 4])), + ( + "value1", + float_array([Some(1.0), None, Some(3.0), Some(4.0)]), + ), + ( + "value2", + float_array([Some(10.0), Some(20.0), None, Some(40.0)]), + ), + ], + [("tag", None), ("time", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + // TWO limited expressions + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 2)) as PhysicalExprRef, + true, + ScalarValue::Float64(Some(0.0)), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + true, + ScalarValue::Float64(Some(0.0)), + ), + ]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+--------+--------+ + | tag | time | value1 | value2 | + +-----+------+--------+--------+ + | a | 1 | 1.0 | 10.0 | + | a | 2 | 0.0 | 20.0 | + | a | 3 | 3.0 | 0.0 | + +-----+------+--------+--------+ + "); + } + + #[tokio::test] + async fn test_multiple_limited_expressions_ignore_nulls_with_offset() { + // Test with multiple limited expressions ignoring nulls and with offset + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a"])), + ("time", int_array([1, 2, 3, 4])), + ( + "value1", + float_array([Some(1.0), None, Some(3.0), Some(4.0)]), + ), + ( + "value2", + float_array([Some(10.0), Some(20.0), None, Some(40.0)]), + ), + ], + [("tag", None), ("time", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + // TWO limited expressions + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 2)) as PhysicalExprRef, + true, + ScalarValue::Float64(Some(0.0)), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + true, + ScalarValue::Float64(Some(0.0)), + ), + ]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 1, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+--------+--------+ + | tag | time | value1 | value2 | + +-----+------+--------+--------+ + | a | 2 | 0.0 | 20.0 | + | a | 3 | 3.0 | 0.0 | + | a | 4 | 4.0 | 40.0 | + +-----+------+--------+--------+ + "); + } + + #[tokio::test] + async fn test_multiple_order_expressions() { + // Test with multiple order expressions (e.g., ORDER BY time, value) + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a", "a", "a"])), + ("time", int_array([1, 1, 1, 2, 2, 2])), // Same time values to test secondary ordering + ("tag2", string_array(["c", "c", "c", "c", "c", "c"])), + ("value", float_array([30.0, 20.0, 10.0, 60.0, 50.0, 40.0])), + ], + [ + ("tag", None), + ("time", None), + ("tag2", None), + ( + "value", + Some(SortOptions { + descending: true, + nulls_first: false, + 
}), + ), + ], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("tag2", 2)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+------+-------+ + | tag | time | tag2 | value | + +-----+------+------+-------+ + | a | 1 | c | 30.0 | + | a | 1 | c | 20.0 | + +-----+------+------+-------+ + "); + } + + #[tokio::test] + async fn test_multiple_order_with_multiple_series() { + // Test multiple order expressions with multiple series + let input = input_plan( + [ + ( + "tag", + string_array(["a", "a", "a", "a", "b", "b", "b", "b"]), + ), + ("time", int_array([1, 1, 2, 2, 1, 1, 2, 2])), + ( + "tag2", + string_array(["c", "c", "c", "c", "c", "c", "c", "c"]), + ), + ( + "value", + float_array([20.0, 10.0, 40.0, 30.0, 70.0, 60.0, 90.0, 80.0]), + ), + ], + [("tag", None), ("time", None), ("tag2", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("tag2", 2)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + // Limit to 3 rows per series + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(3)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+------+-------+ + | tag | time | tag2 | value | + +-----+------+------+-------+ + | a | 1 | c | 20.0 | + | a | 1 | c | 10.0 | + | a | 2 | c | 40.0 | + | b | 1 | c | 70.0 | + | b | 1 | c | 60.0 | + | b | 2 | c | 90.0 | + +-----+------+------+-------+ + "); + } + + #[tokio::test] + async fn test_multiple_order_with_offset() { + // Test multiple order expressions with OFFSET + let input = input_plan( + [ + ("tag", string_array(["a", "a", "a", "a", "a"])), + ("time", int_array([1, 1, 2, 2, 3])), + ("tag2", string_array(["c", "c", "c", "c", "c"])), + ("value", float_array([10.0, 20.0, 30.0, 40.0, 50.0])), + ], + [("tag", None), ("time", None), ("tag2", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("tag2", 2)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + 
let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + // Skip 2, take 2 + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 2, Some(2)) + .unwrap(); + + let session_ctx = SessionContext::new(); + let task_ctx = session_ctx.task_ctx(); + let stream = exec.execute(0, task_ctx).unwrap(); + let batches = collect_stream(stream).await.unwrap(); + + assert_snapshot!(batches_to_string(&batches), @r" + +-----+------+------+-------+ + | tag | time | tag2 | value | + +-----+------+------+-------+ + | a | 2 | c | 30.0 | + | a | 2 | c | 40.0 | + +-----+------+------+-------+ + "); + } + + #[test] + fn test_required_ordering_multiple_order() { + // Test that required_input_ordering includes all order expressions + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value", 2)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + let expect = OrderingRequirements::new( + LexRequirement::new(vec![ + PhysicalSortRequirement { + expr: Arc::new(Column::new("tag", 0)), + options: None, + }, + PhysicalSortRequirement { + expr: Arc::new(Column::new("time", 1)), + options: Some(SortOptions::default()), + }, + PhysicalSortRequirement { + expr: Arc::new(Column::new("value", 2)), + options: Some(SortOptions { + descending: true, + nulls_first: false, + }), + }, + ]) + .unwrap(), + ); + + let required_ordering = exec.required_input_ordering(); + assert_eq!(required_ordering, vec![Some(expect)]); + } + + #[test] + fn test_display_as_multiple_order() { + // Test DisplayAs formatting with multiple order expressions + let input = test_input_plan(["a"], [1], [1.0]); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value", 2)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 5, Some(10)) + .unwrap(); + + assert_snapshot!(DisplayableExecutionPlan::new(&exec).indent(false), @r" + SeriesLimitExec: series=[tag@0], order=[time@1 ASC, value@2 DESC NULLS LAST], limit_expr=[value@2 RESPECT NULLS (default: 0)], skip=5, fetch=10 + SortExec: expr=[tag@0 ASC, time@1 ASC], preserve_partitioning=[false] + DataSourceExec: partitions=1, partition_sizes=[1] + "); + } + + #[test] + fn test_output_ordering_preserved_when_limited_column_not_in_ordering() { + // Test that output ordering is fully preserved when the limited column + // doesn't appear in the input ordering + let input = input_plan( + [ + ("tag", 
string_array(["a"])), + ("time", int_array([1])), + ("value", float_array([1.0])), + ("other", float_array([2.0])), + ], + [("tag", None), ("time", None), ("other", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("other", 3)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + // Limit "value" column which is NOT in the input ordering + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should be fully preserved: tag, time, other + assert_snapshot!( + exec.properties().output_ordering().unwrap(), + @"tag@0 ASC, time@1 ASC, other@3 ASC", + ); + } + + #[test] + fn test_output_ordering_truncated_when_limited_column_in_ordering() { + // Test that output ordering is truncated when a limited column appears + // in the input ordering + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value", float_array([1.0])), + ], + [("tag", None), ("time", None), ("value", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value", 2)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + // Limit "value" column which IS in the input ordering + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should be truncated before "value": only tag, time + assert_snapshot!( + exec.properties().output_ordering().unwrap(), + @"tag@0 ASC, time@1 ASC", + ); + } + + #[test] + fn test_output_ordering_empty_when_first_column_limited() { + // Test that output ordering becomes empty when the first order column + // is a limited column + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value", float_array([1.0])), + ], + [("tag", None), ("value", None)], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("value", 2)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + // Limit "value" column which is the FIRST in the input ordering + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should only have tag (series expr), not value + assert_snapshot!( + exec.properties().output_ordering().unwrap(), + @"tag@0 ASC", + ); + } + + #[test] + fn test_output_ordering_with_no_input_ordering() { + // Test behavior when input has no ordering + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value", 
float_array([1.0])), + ], + Vec::<(&str, Option)>::new(), // No sorting + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should be None since input has no ordering + assert!(exec.properties().output_ordering().is_none()); + } + + #[test] + fn test_output_ordering_with_multiple_limited_columns() { + // Test ordering when multiple columns are limited + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value1", float_array([1.0])), + ("value2", float_array([2.0])), + ("value3", float_array([3.0])), + ], + [ + ("tag", None), + ("time", None), + ("value2", None), + ("value3", None), + ], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + options: SortOptions::default(), + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("value3", 4)) as PhysicalExprRef, + options: SortOptions::default(), + }, + ]; + // Limit multiple columns + let limit_expr = vec![ + PhysicalLimitExpr::new( + Arc::new(Column::new("value1", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + ), + PhysicalLimitExpr::new( + Arc::new(Column::new("value2", 3)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + ), + ]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + // Output ordering should be truncated right before value2 (first limited column in ordering) + // Should include: tag, time (but not value2 or value3) + assert_snapshot!( + exec.properties().output_ordering().unwrap(), + @"tag@0 ASC, time@1 ASC", + ); + } + + #[test] + fn test_output_ordering_preserves_sort_options() { + // Test that sort options (descending, nulls_first) are preserved + let input = input_plan( + [ + ("tag", string_array(["a"])), + ("time", int_array([1])), + ("value", float_array([1.0])), + ("other", float_array([2.0])), + ], + [ + ( + "tag", + Some(SortOptions { + descending: false, + nulls_first: true, + }), + ), + ( + "time", + Some(SortOptions { + descending: true, + nulls_first: false, + }), + ), + ( + "other", + Some(SortOptions { + descending: false, + nulls_first: false, + }), + ), + ], + ); + + let series_expr = vec![Arc::new(Column::new("tag", 0)) as PhysicalExprRef]; + let order_expr = vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new("time", 1)) as PhysicalExprRef, + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: Arc::new(Column::new("other", 3)) as PhysicalExprRef, + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]; + let limit_expr = vec![PhysicalLimitExpr::new( + Arc::new(Column::new("value", 2)) as PhysicalExprRef, + false, + ScalarValue::Float64(Some(0.0)), + )]; + + let exec = + SeriesLimitExec::try_new(input, series_expr, order_expr, limit_expr, 0, Some(10)) + .unwrap(); + + 
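// --- Editor's aside (illustrative sketch; not part of this patch) ---
// The output-ordering tests in this module all exercise one rule: the output
// ordering of `SeriesLimitExec` is the input ordering truncated right before
// the first column that is being limited, because nothing after a limited
// column can still be guaranteed sorted. A stand-alone model of that
// truncation over column names; `truncate_ordering` is a hypothetical helper,
// not an IOx API.
fn truncate_ordering<'a>(input_ordering: &[&'a str], limited: &[&str]) -> Vec<&'a str> {
    input_ordering
        .iter()
        .copied()
        // Keep the ordering prefix up to (but excluding) the first limited column.
        .take_while(|col| !limited.contains(col))
        .collect()
}

fn main() {
    // Limited column not in the ordering: ordering fully preserved.
    assert_eq!(
        truncate_ordering(&["tag", "time", "other"], &["value"]),
        vec!["tag", "time", "other"]
    );
    // Limited column inside the ordering: truncated right before it.
    assert_eq!(
        truncate_ordering(&["tag", "time", "value"], &["value"]),
        vec!["tag", "time"]
    );
    // Limited column immediately after the series column: only the prefix remains.
    assert_eq!(truncate_ordering(&["tag", "value"], &["value"]), vec!["tag"]);
}
// --- End editor's aside ---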
// Output ordering should preserve sort options + assert_snapshot!(exec.properties().output_ordering().unwrap(), + @"tag@0 ASC, time@1 DESC NULLS LAST, other@3 ASC NULLS LAST", + ); + } + } + + mod row_number_tests { + use super::*; + + #[test] + fn test_row_number_simple() { + // Single partition - all rows in same group + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, false, &partitions); + + assert_eq!(result, PrimitiveArray::from_iter_values([1_u64, 2, 3])); + assert_eq!(final_count, 3); + } + + #[test] + fn test_row_number_with_start() { + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 10, false, &partitions); + + assert_eq!(result, PrimitiveArray::from_iter_values([11_u64, 12, 13])); + assert_eq!(final_count, 13); + } + + #[test] + fn test_row_number_with_nulls_respect() { + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, false, &partitions); + + // Respect nulls means nulls still get row numbers + assert_eq!(result, PrimitiveArray::from_iter_values([1_u64, 2, 3])); + assert_eq!(final_count, 3); + } + + #[test] + fn test_row_number_with_nulls_ignore() { + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, true, &partitions); + + // Ignore nulls means nulls are skipped in numbering + assert_eq!( + result, + PrimitiveArray::from_iter(vec![Some(1u64), None, Some(2u64)]) + ); + assert_eq!(final_count, 2); + } + + #[test] + fn test_row_number_multiple_partitions() { + // Two partitions - rows grouped by "a" and "b" + let group_arr = + Arc::new(StringArray::from(vec!["a", "a", "a", "b", "b", "b"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, false, &partitions); + + // Numbers start at one for each partition + assert_eq!( + result, + PrimitiveArray::from_iter_values([1_u64, 2, 3, 1, 2, 3]) + ); + assert_eq!(final_count, 3); + } + + #[test] + fn test_row_number_empty() { + let group_arr = Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(Vec::::new())) as ArrayRef; + let partitions = partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 0, false, &partitions); + + assert_eq!(result, PrimitiveArray::::from_iter_values([])); + assert_eq!(final_count, 0); + } + + #[test] + fn test_row_number_single_partition_with_start_and_nulls() { + let group_arr = Arc::new(StringArray::from(vec!["a", "a", "a", "a", "a"])) as ArrayRef; + let value_arr = Arc::new(Int64Array::from(vec![ + Some(1), + None, + Some(3), + None, + Some(5), + ])) as ArrayRef; + let partitions = 
partition(&[group_arr]).unwrap(); + + let (result, final_count) = row_number(&value_arr, 5, true, &partitions); + + assert_eq!( + result, + PrimitiveArray::from_iter([Some(6_u64), None, Some(7), None, Some(8)]) + ); + assert_eq!(final_count, 8); + } + } +} diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs index 4e860d23..9bb66dcb 100644 --- a/iox_query/src/lib.rs +++ b/iox_query/src/lib.rs @@ -6,8 +6,9 @@ use arrow::{ record_batch::RecordBatch, }; use async_trait::async_trait; -use data_types::{ChunkId, ChunkOrder, TransitionPartitionId}; +use data_types::{ChunkId, ChunkOrder, Namespace, TransitionPartitionId}; use datafusion::{ + common::not_impl_err, error::DataFusionError, physical_plan::{SendableRecordBatchStream, Statistics}, prelude::SessionContext, @@ -167,6 +168,14 @@ pub trait QueryDatabase: Debug + Send + Sync + 'static { include_debug_info_tables: bool, ) -> Result>, DataFusionError>; + /// List all namespaces + async fn list_namespaces( + &self, + _span: Option, + ) -> Result, DataFusionError> { + not_impl_err!("QueryDatabase::list_namespaces is only used in InfluxDB 3 Core/Enterprise") + } + /// Acquire concurrency-limiting semapahore async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit; diff --git a/iox_query/src/physical_optimizer/dedup/split.rs b/iox_query/src/physical_optimizer/dedup/split.rs index 55aa9c58..607f28ac 100644 --- a/iox_query/src/physical_optimizer/dedup/split.rs +++ b/iox_query/src/physical_optimizer/dedup/split.rs @@ -212,7 +212,6 @@ mod tests { test::TestChunk, util::arrow_sort_key_exprs, }; - use data_types::{PartitionHashId, PartitionId, TransitionPartitionId}; use datafusion::{ physical_plan::{expressions::Literal, filter::FilterExec}, scalar::ScalarValue, @@ -335,59 +334,6 @@ mod tests { ); } - #[test] - fn test_different_partitions_with_and_without_hash_ids() { - // Partition without hash ID in the catalog - let legacy_partition_id = 1; - let legacy_transition_partition_id = - TransitionPartitionId::Catalog(PartitionId::new(legacy_partition_id)); - - // Partition with hash ID in the catalog - let transition_partition_id = - TransitionPartitionId::Hash(PartitionHashId::arbitrary_for_testing()); - - let chunk1 = chunk(1).with_partition_id(legacy_transition_partition_id.clone()); - let chunk2 = chunk(2).with_partition_id(transition_partition_id.clone()); - - let chunk3 = chunk(3) - .with_dummy_parquet_file() - .with_partition_id(legacy_transition_partition_id.clone()); - let chunk4 = chunk(4) - .with_dummy_parquet_file() - .with_partition_id(transition_partition_id.clone()); - let chunk5 = chunk(5) - .with_dummy_parquet_file() - .with_partition_id(legacy_transition_partition_id.clone()); - let chunk6 = chunk(6) - .with_dummy_parquet_file() - .with_partition_id(legacy_transition_partition_id.clone()); - let schema = chunk1.schema().clone(); - let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3, chunk4, chunk5, chunk6]); - let mut config = ConfigOptions::default(); - config.execution.target_partitions = 2; - insta::assert_yaml_snapshot!( - OptimizationTest::new_with_config(plan, SplitDedup, &config), - @r#" - input: - - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " UnionExec" - - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - - " DataSourceExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time], file_type=parquet" - output: - Ok: - - " UnionExec" - - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " 
UnionExec" - - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - - " DataSourceExec: file_groups={2 groups: [[3.parquet, 6.parquet], [5.parquet]]}, projection=[field, tag1, tag2, time], file_type=parquet" - - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " UnionExec" - - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - - " DataSourceExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time], file_type=parquet" - "# - ); - } - #[test] fn test_max_split() { let chunk1 = chunk(1) diff --git a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs index 8d934af7..df77132e 100644 --- a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs +++ b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs @@ -1,11 +1,14 @@ use std::sync::Arc; use datafusion::{ - common::tree_node::{Transformed, TreeNode}, + common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}, config::ConfigOptions, error::Result, physical_optimizer::PhysicalOptimizerRule, - physical_plan::{ExecutionPlan, sorts::sort_preserving_merge::SortPreservingMergeExec}, + physical_plan::{ + ExecutionPlan, Partitioning, repartition::RepartitionExec, + sorts::sort_preserving_merge::SortPreservingMergeExec, + }, }; use itertools::Itertools; @@ -146,21 +149,26 @@ fn swap_spm_for_progeval( ) -> Result>> { let ordering_req = original_spm.expr(); - // Step 1: Split and regroup partitioned file scans. Also re-orders the scan partitions. - // This step maximizes our chances of getting a disjoint, nonoverlapping lexical ranges. + // Step 1: Remove any RoundRobin repartition nodes that may interfere with optimization let input = Arc::clone(original_spm.input()) + .transform_down(remove_rr_repartition_if_exists) + .map(|t| t.data)?; + + // Step 2: Split and regroup partitioned file scans. Also re-orders the scan partitions. + // This step maximizes our chances of getting a disjoint, nonoverlapping lexical ranges. + let input = input .transform_down(|plan| split_and_regroup_parquet_files(plan, ordering_req)) .map(|t| t.data)?; - // Step 2: compensate for previous redistribution (for parallelized sorting) passes. + // Step 3: compensate for previous redistribution (for parallelized sorting) passes. let input = merge_partitions_after_parallelized_sorting(input, ordering_req)?; - // Step 3: try to extract the lexical ranges for the input partitions + // Step 4: try to extract the lexical ranges for the input partitions let Some(lexical_ranges) = extract_disjoint_ranges_from_plan(ordering_req, &input)? else { return Ok(Transformed::no(return_unaltered_plan)); }; - // Step 4: if needed, re-order the partitions + // Step 5: if needed, re-order the partitions let ordered_input = if lexical_ranges.indices().is_sorted() { input } else { @@ -171,7 +179,7 @@ fn swap_spm_for_progeval( )?) as Arc }; - // Step 5: Replace SortPreservingMergeExec with ProgressiveEvalExec + // Step 6: Replace SortPreservingMergeExec with ProgressiveEvalExec let progresive_eval_exec = Arc::new(ProgressiveEvalExec::new( ordered_input, Some(lexical_ranges.ordered_ranges().cloned().collect_vec()), @@ -181,6 +189,34 @@ fn swap_spm_for_progeval( Ok(Transformed::yes(progresive_eval_exec)) } +/// Remove any RoundRobin repartition nodes that may interfere with optimization. +/// +/// If the current node is a RepartitionExec with Partitioning::RoundRobinBatch, +/// then remove that node and return its child. 
+fn remove_rr_repartition_if_exists( + plan: Arc, +) -> Result>> { + if let Some(repartition_exec) = plan.as_any().downcast_ref::() + && matches!( + repartition_exec.partitioning(), + Partitioning::RoundRobinBatch(_) + ) + { + // Remove the RoundRobin repartition node and return its child + Ok(Transformed::new( + Arc::clone(repartition_exec.input()), + true, + TreeNodeRecursion::Continue, + )) + } else if plan.as_any().is::() { + // halt at the next SPM. + // that will be considered separately at the root PhysicalOptimizer::optimize(), as it checks per SPM found + Ok(Transformed::new(plan, false, TreeNodeRecursion::Jump)) + } else { + Ok(Transformed::no(plan)) + } +} + #[cfg(test)] mod test { use std::sync::Arc; @@ -424,7 +460,7 @@ mod test { ); } - // No limit & but the input is in the right sort preserving merge struct --> optimize + // No limit & the input is in the right sort preserving merge struct --> optimize #[test] fn test_spm_time_desc() { test_helpers::maybe_start_logging(); @@ -488,6 +524,74 @@ mod test { ); } + // No limit & the input is in the right sort preserving merge struct + // has a rr repartitoning --> should remove + // then --> optimize + #[test] + fn test_spm_time_desc_rr_repartition() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let sort_exprs = [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ]; + + let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); + let plan_parquet2 = PlanBuilder::data_source_exec_parquet(&schema, 2001, 3000); + let plan_batches = PlanBuilder::record_batches_exec(2, 2500, 3500); + + let plan_sort1 = plan_batches.sort(sort_exprs); + let plan_union_1 = plan_sort1.union(plan_parquet2); + let plan_spm_for_dedupe = plan_union_1.sort_preserving_merge(sort_exprs); + let plan_dedupe = plan_spm_for_dedupe.deduplicate(sort_exprs, false); + + let sort_exprs = [("time", SortOp::Desc)]; + let plan_sort1 = plan_parquet.sort(sort_exprs); + let plan_sort2 = plan_dedupe.sort(sort_exprs); + + let plan_union_2 = plan_sort1.union(plan_sort2); + let repartioned = plan_union_2.round_robin_repartition(4); + + let plan_spm = repartioned.sort_preserving_merge(sort_exprs); + + // Output plan: rr Repartition will be removed + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm.build(), opt), + @r#" + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(2001)->(3500), 
(1000)->(2000)]" + - " ReorderPartitionsExec: mapped_partition_indices=[1, 0]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + "# + ); + } + // No limit & but the input is in the right sort preserving merge struct --> optimize #[test] fn test_spm_non_time_desc() { @@ -1206,13 +1310,9 @@ mod test { ); } - // ------------------------------------------------------------------ - // Negative tests: the right structure not found -> nothing optimized - // ------------------------------------------------------------------ - - // Right stucture but sort on 2 columns --> plan stays the same + // Right stucture and sort on 2 columns --> optimize #[test] - fn test_negative_spm_2_column_sort_desc() { + fn test_spm_2_column_sort_desc() { test_helpers::maybe_start_logging(); // plan: @@ -1272,104 +1372,10 @@ mod test { ); } - // No limit & random plan --> plan stay the same - #[test] - fn test_negative_no_limit() { - test_helpers::maybe_start_logging(); - - let schema = schema(); - let sort_exprs = [ - ("col2", SortOp::Asc), - ("col1", SortOp::Asc), - ("time", SortOp::Asc), - ]; - - let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); - let plan_batches = PlanBuilder::record_batches_exec(2, 1500, 2500); - - let plan = plan_batches - .union(plan_parquet) - .round_robin_repartition(8) - .hash_repartition(vec!["col2", "col1", "time"], 8) - .sort(sort_exprs) - .deduplicate(sort_exprs, true); - - // input and output are the same - let opt = OrderUnionSortedInputs; - insta::assert_yaml_snapshot!( - OptimizationTest::new(plan.build(), opt), - @r#" - input: - - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" - - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" - - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" - - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" - - " UnionExec" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - output: - Ok: - - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" - - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" - - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" - - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" - - " 
UnionExec" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - "# - ); - } - - // has limit but no sort preserving merge --> plan stay the same - #[test] - fn test_negative_limit_no_preserving_merge() { - test_helpers::maybe_start_logging(); - - let plan_batches1 = PlanBuilder::record_batches_exec(1, 1000, 2000); - let plan_batches2 = PlanBuilder::record_batches_exec(3, 2001, 3000); - let plan_batches3 = PlanBuilder::record_batches_exec(2, 2500, 3500); - - let plan_union_1 = plan_batches2.union(plan_batches3); - - let sort_exprs = [("time", SortOp::Desc)]; - let plan_sort1 = plan_batches1.sort(sort_exprs); - let plan_sort2 = plan_union_1.sort(sort_exprs); - - let plan_union_2 = plan_sort1.union(plan_sort2); - - let plan_limit = plan_union_2.limit(0, Some(1)); - - // input and output are the same - let opt = OrderUnionSortedInputs; - insta::assert_yaml_snapshot!( - OptimizationTest::new(plan_limit.build(), opt), - @r#" - input: - - " GlobalLimitExec: skip=0, fetch=1" - - " UnionExec" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " UnionExec" - - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - output: - Ok: - - " GlobalLimitExec: skip=0, fetch=1" - - " UnionExec" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " UnionExec" - - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - "# - ); - } - - // right structure and same sort order but inputs of uion overlap --> plan stay the same + // right structure and same sort order + // inputs of union touch, but do not overlap --> optimize #[test] - fn test_negative_overlap() { + fn test_touching_ranges() { test_helpers::maybe_start_logging(); // Input plan: @@ -1442,64 +1448,10 @@ mod test { ); } - // No limit & but the input is in the right union struct --> plan stay the same + // Projection expression (field + field) + // but the sort order is not on field, only time ==> optimize #[test] - fn test_negative_no_sortpreservingmerge_input_union() { - test_helpers::maybe_start_logging(); - - // plan: - // UnionExec - // SortExec: expr=[time@2 DESC] - // DataSourceExec - // SortExec: expr=[time@2 DESC] - // UnionExec - // RecordBatchesExec - // DataSourceExec - - let schema = schema(); - - let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); - let plan_parquet2 = PlanBuilder::data_source_exec_parquet(&schema, 2001, 3000); - let plan_batches = PlanBuilder::record_batches_exec(2, 2500, 3500); - - let plan_union_1 = plan_batches.union(plan_parquet2); - - let sort_exprs = [("time", SortOp::Desc)]; - - let plan_sort1 = plan_parquet.sort(sort_exprs); - let plan_sort2 = plan_union_1.sort(sort_exprs); - - let plan_union_2 = plan_sort1.union(plan_sort2); - - // input and output are the 
same - let opt = OrderUnionSortedInputs; - insta::assert_yaml_snapshot!( - OptimizationTest::new(plan_union_2.build(), opt), - @r#" - input: - - " UnionExec" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " UnionExec" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - output: - Ok: - - " UnionExec" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" - - " UnionExec" - - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" - "# - ); - } - - // Projection expression (field + field) ==> not optimze. Plan stays the same - #[test] - fn test_negative_spm_time_desc_with_dedupe_and_proj_on_expr() { + fn test_spm_time_desc_with_dedupe_and_proj_on_expr() { test_helpers::maybe_start_logging(); // plan: @@ -1629,6 +1581,164 @@ mod test { ); } + // ------------------------------------------------------------------ + // Negative tests: the right structure not found -> nothing optimized + // ------------------------------------------------------------------ + + // No limit & random plan --> plan stay the same + #[test] + fn test_negative_no_limit() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let sort_exprs = [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ]; + + let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); + let plan_batches = PlanBuilder::record_batches_exec(2, 1500, 2500); + + let plan = plan_batches + .union(plan_parquet) + .round_robin_repartition(8) + .hash_repartition(vec!["col2", "col1", "time"], 8) + .sort(sort_exprs) + .deduplicate(sort_exprs, true); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan.build(), opt), + @r#" + input: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS LAST], preserve_partitioning=[false]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST, col1@0 ASC NULLS LAST, time@3 ASC NULLS 
LAST], preserve_partitioning=[false]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + "# + ); + } + + // has limit but no sort preserving merge --> plan stay the same + #[test] + fn test_negative_limit_no_preserving_merge() { + test_helpers::maybe_start_logging(); + + let plan_batches1 = PlanBuilder::record_batches_exec(1, 1000, 2000); + let plan_batches2 = PlanBuilder::record_batches_exec(3, 2001, 3000); + let plan_batches3 = PlanBuilder::record_batches_exec(2, 2500, 3500); + + let plan_union_1 = plan_batches2.union(plan_batches3); + + let sort_exprs = [("time", SortOp::Desc)]; + let plan_sort1 = plan_batches1.sort(sort_exprs); + let plan_sort2 = plan_union_1.sort(sort_exprs); + + let plan_union_2 = plan_sort1.union(plan_sort2); + + let plan_limit = plan_union_2.limit(0, Some(1)); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit.build(), opt), + @r#" + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "# + ); + } + + // No limit & but the input is in the right union struct --> plan stay the same + #[test] + fn test_negative_no_sortpreservingmerge_input_union() { + test_helpers::maybe_start_logging(); + + // plan: + // UnionExec + // SortExec: expr=[time@2 DESC] + // DataSourceExec + // SortExec: expr=[time@2 DESC] + // UnionExec + // RecordBatchesExec + // DataSourceExec + + let schema = schema(); + + let plan_parquet = PlanBuilder::data_source_exec_parquet(&schema, 1000, 2000); + let plan_parquet2 = PlanBuilder::data_source_exec_parquet(&schema, 2001, 3000); + let plan_batches = PlanBuilder::record_batches_exec(2, 2500, 3500); + + let plan_union_1 = plan_batches.union(plan_parquet2); + + let sort_exprs = [("time", SortOp::Desc)]; + + let plan_sort1 = plan_parquet.sort(sort_exprs); + let plan_sort2 = plan_union_1.sort(sort_exprs); + + let plan_union_2 = plan_sort1.union(plan_sort2); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_union_2.build(), opt), + @r#" + input: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " 
DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + output: + Ok: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + - " SortExec: expr=[time@3 DESC NULLS LAST], preserve_partitioning=[false]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC], file_type=parquet" + "# + ); + } + + // ------------------------------------------------------------------ + // Many partitioned files tests + // ------------------------------------------------------------------ + // Reproduce of https://github.com/influxdata/influxdb_iox/issues/12461#issuecomment-2430196754 // The reproducer needs big non-overlapped files so its first physical plan will have DataSourceExec with multiple // file groups, each file group has multiple partitioned files. diff --git a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs_for_constants.rs b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs_for_constants.rs index 1ece2ce9..5237b4da 100644 --- a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs_for_constants.rs +++ b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs_for_constants.rs @@ -327,6 +327,52 @@ mod test { ); } + // Under sort preserving merge is not UnionExec, + // although the new optimizer can handle it. 
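// --- Editor's aside (illustrative sketch; not part of this patch) ---
// The positive tests around here rely on the same precondition: a
// SortPreservingMergeExec can be swapped for ProgressiveEvalExec only when its
// input partitions cover non-overlapping value ranges that are already emitted
// in order, so "merging" degenerates into concatenating the inputs. The real
// optimizer extracts lexical ranges over the full (possibly descending,
// multi-column) sort key and may reorder partitions first; this is a
// simplified ascending, single-column model, and
// `ranges_allow_progressive_eval` is a hypothetical helper, not an IOx API.
fn ranges_allow_progressive_eval(ranges: &[(i64, i64)]) -> bool {
    // Every adjacent pair must be disjoint and already in ascending order.
    // A single input (as in the test directly below) trivially qualifies.
    ranges.windows(2).all(|w| {
        let (_, prev_max) = w[0];
        let (next_min, _) = w[1];
        prev_max < next_min
    })
}

fn main() {
    // Touching but non-overlapping inputs can be concatenated.
    assert!(ranges_allow_progressive_eval(&[(1000, 2000), (2001, 3000)]));
    // Overlapping inputs still need a real merge.
    assert!(!ranges_allow_progressive_eval(&[(1000, 2500), (2000, 3000)]));
}
// --- End editor's aside ---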
+ #[test] + fn test_replace_spm_with_no_union_under_spm() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let sort_order = sort_order_for_sort(); + + // First sort on parquet file + let plan_parquet = data_source_exec_parquet_with_value_range(&schema, 1000, 2000); + let plan_projection_1 = Arc::new( + ProjectionExec::try_new( + projection_expr_with_2_constants("m1", "tag0", &schema), + plan_parquet, + ) + .unwrap(), + ); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_projection_1)); + + // add sort preserving merge on top + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order_for_sort_preserving_merge(), + plan_sort1, + )); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r#" + input: + - " SortPreservingMergeExec: [iox::measurement@0 ASC NULLS LAST, key@1 ASC NULLS LAST, value@2 ASC NULLS LAST]" + - " SortExec: expr=[value@2 ASC NULLS LAST], preserve_partitioning=[false]" + - " ProjectionExec: expr=[m1 as iox::measurement, tag0 as key, tag0@1 as value]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[tag2, tag0, tag1, field1, time, __chunk_order], output_ordering=[__chunk_order@5 ASC], file_type=parquet" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(m1,tag0)->(m1,tag0)]" + - " SortExec: expr=[value@2 ASC NULLS LAST], preserve_partitioning=[false]" + - " ProjectionExec: expr=[m1 as iox::measurement, tag0 as key, tag0@1 as value]" + - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[tag2, tag0, tag1, field1, time, __chunk_order], output_ordering=[__chunk_order@5 ASC], file_type=parquet" + "# + ); + } + // ------------------------------------------------------------------ // Negative tests: wrong structure -> not optimized // ------------------------------------------------------------------ @@ -395,52 +441,6 @@ mod test { ); } - // Under sort preserving merge is not UnionExec, - // altho the new optimizer can handle it. 
- #[test] - fn test_replace_spm_with_no_union_under_spm() { - test_helpers::maybe_start_logging(); - - let schema = schema(); - let sort_order = sort_order_for_sort(); - - // First sort on parquet file - let plan_parquet = data_source_exec_parquet_with_value_range(&schema, 1000, 2000); - let plan_projection_1 = Arc::new( - ProjectionExec::try_new( - projection_expr_with_2_constants("m1", "tag0", &schema), - plan_parquet, - ) - .unwrap(), - ); - let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_projection_1)); - - // add sort preserving merge on top - let plan_spm = Arc::new(SortPreservingMergeExec::new( - sort_order_for_sort_preserving_merge(), - plan_sort1, - )); - - // input and output are the same - let opt = OrderUnionSortedInputs; - insta::assert_yaml_snapshot!( - OptimizationTest::new(plan_spm, opt), - @r#" - input: - - " SortPreservingMergeExec: [iox::measurement@0 ASC NULLS LAST, key@1 ASC NULLS LAST, value@2 ASC NULLS LAST]" - - " SortExec: expr=[value@2 ASC NULLS LAST], preserve_partitioning=[false]" - - " ProjectionExec: expr=[m1 as iox::measurement, tag0 as key, tag0@1 as value]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[tag2, tag0, tag1, field1, time, __chunk_order], output_ordering=[__chunk_order@5 ASC], file_type=parquet" - output: - Ok: - - " ProgressiveEvalExec: input_ranges=[(m1,tag0)->(m1,tag0)]" - - " SortExec: expr=[value@2 ASC NULLS LAST], preserve_partitioning=[false]" - - " ProjectionExec: expr=[m1 as iox::measurement, tag0 as key, tag0@1 as value]" - - " DataSourceExec: file_groups={1 group: [[0.parquet]]}, projection=[tag2, tag0, tag1, field1, time, __chunk_order], output_ordering=[__chunk_order@5 ASC], file_type=parquet" - "# - ); - } - // Under Union is not all SortExec #[test] fn test_negative_not_all_sorts_under_union() { diff --git a/iox_query/src/query_log.rs b/iox_query/src/query_log.rs index 929b26c3..cfd24b35 100644 --- a/iox_query/src/query_log.rs +++ b/iox_query/src/query_log.rs @@ -8,7 +8,7 @@ use datafusion::physical_plan::{ ExecutionPlan, metrics::{MetricValue, MetricsSet}, }; -use influxdb_iox_client::write::Client as WriteClient; +use influxdb_iox_client::batched_write::MaybeBatchedWriteClient as WriteClient; use influxdb_line_protocol::LineProtocolBuilder; use iox_query_params::StatementParams; use iox_time::{Time, TimeProvider}; @@ -230,6 +230,7 @@ impl QueryLogEntryState { let mut lp = builder .measurement(measurement_name) + .tag("id", &self.id.to_string()) .tag("namespace_id", &self.namespace_id.get().to_string()) .tag("namespace_name", &self.namespace_name) .tag("query_type", self.query_type) @@ -1477,7 +1478,7 @@ mod test_super { insta::assert_snapshot!( format_line_protocol(&lp), - @r#"query_log_test,namespace_id=1,namespace_name=ns,query_type=sql,phase=cancel running="false",success="false",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i,end_to_end_duration_ns=0u 1000000000000000000"# + @r#"query_log_test,id=00000000-0000-0000-0000-000000000001,namespace_id=1,namespace_name=ns,query_type=sql,phase=cancel running="false",success="false",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i,end_to_end_duration_ns=0u 1000000000000000000"# ); } @@ -1510,7 +1511,7 @@ mod test_super { let lp = lp_builder.build(); insta::assert_snapshot!( format_line_protocol(&lp), - @r#"query_log_test,namespace_id=1,namespace_name=ns,query_type=sql,phase=success running="false",success="true",query_text="SELECT 1",query_params="Params { 
}",query_issue_time_ns=100000000i,partition_count=0u,parquet_file_count=0u,permit_duration_ns=2000000u,plan_duration_ns=1000000u,execute_duration_ns=5000000u,end_to_end_duration_ns=8000000u,compute_duration_ns=1337000000u,max_memory_bytes=0i,ingester_latency_to_plan_ns=0u,ingester_latency_to_full_data_ns=0u,ingester_response_row_count=0u,ingester_response_size_bytes=0u,ingester_partition_count=0u 1000000000000000000"#); + @r#"query_log_test,id=00000000-0000-0000-0000-000000000001,namespace_id=1,namespace_name=ns,query_type=sql,phase=success running="false",success="true",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i,partition_count=0u,parquet_file_count=0u,permit_duration_ns=2000000u,plan_duration_ns=1000000u,execute_duration_ns=5000000u,end_to_end_duration_ns=8000000u,compute_duration_ns=1337000000u,max_memory_bytes=0i,ingester_latency_to_plan_ns=0u,ingester_latency_to_full_data_ns=0u,ingester_response_row_count=0u,ingester_response_size_bytes=0u,ingester_partition_count=0u 1000000000000000000"#); } #[test] @@ -1550,7 +1551,7 @@ mod test_super { insta::assert_snapshot!( format_line_protocol(&lp), - @r#"query_log_test,namespace_id=1,namespace_name=ns,query_type=sql,phase=received,auth_id=user123,trace_id=42 running="true",success="false",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i 1000000000000000000"# + @r#"query_log_test,id=00000000-0000-0000-0000-000000000001,namespace_id=1,namespace_name=ns,query_type=sql,phase=received,auth_id=user123,trace_id=42 running="true",success="false",query_text="SELECT 1",query_params="Params { }",query_issue_time_ns=100000000i 1000000000000000000"# ); } diff --git a/iox_query/src/statistics/partition_statistics/mod.rs b/iox_query/src/statistics/partition_statistics/mod.rs index 92371a0b..a2369506 100644 --- a/iox_query/src/statistics/partition_statistics/mod.rs +++ b/iox_query/src/statistics/partition_statistics/mod.rs @@ -11,6 +11,7 @@ use datafusion::{ coalesce_partitions::CoalescePartitionsExec, coop::CooperativeExec, empty::EmptyExec, + expressions::Column, filter::FilterExec, limit::{GlobalLimitExec, LocalLimitExec}, placeholder_row::PlaceholderRowExec, @@ -157,16 +158,19 @@ impl PartitionStatistics for ProjectionExec { |mut acc, child| { let child_stats = statistics_by_partition(child.as_ref())?; - let child_stats_with_project_exec_projected = - child_stats.into_iter().map(|stats| { - proj_exec_stats( - Arc::unwrap_or_clone(stats), - self.expr().iter(), - &self.schema(), - ) - }); - - acc.extend(child_stats_with_project_exec_projected); + let child_stats_with_project_exec_projected: Result, DataFusionError> = + child_stats + .into_iter() + .map(|stats| { + proj_exec_stats( + Arc::unwrap_or_clone(stats), + self.expr().iter(), + &self.schema(), + ) + }) + .collect(); + + acc.extend(child_stats_with_project_exec_projected?); Ok::(acc) }, )?; @@ -270,27 +274,52 @@ impl PartitionStatistics for AggregateExec { fn statistics_by_partition(&self) -> Result { if self.aggr_expr().is_empty() { let inner_stats_per_partition = statistics_by_partition(self.input.as_ref())?; + let input_schema = self.input.schema(); - Ok(inner_stats_per_partition + inner_stats_per_partition .iter() .map(|stats| { - // only retain the min/max per column - // whereas the remaining stats can be changed by the grouping - Arc::new(Statistics { + // Create column statistics for each output GROUP BY expression + let column_statistics: Result, DataFusionError> = self + .output_group_expr() + .iter() + .map(|group_expr| { + 
// Check if this group expression corresponds to an input column + if let Some(input_col_idx) = group_expr + .as_any() + .downcast_ref::() + .and_then(|col| input_schema.index_of(col.name()).ok()) + { + // This is a direct column reference, use existing statistics + if input_col_idx < stats.column_statistics.len() { + let col_stats = &stats.column_statistics[input_col_idx]; + Ok(ColumnStatistics { + min_value: col_stats.min_value.clone(), + max_value: col_stats.max_value.clone(), + ..Default::default() + }) + } else { + // Input column index out of bounds - this should not happen + Err(internal_datafusion_err!( + "Column index {input_col_idx} out of bounds in partition statistics (available columns: {}, column found in schema)", + stats.column_statistics.len() + )) + } + } else { + // This is a computed expression (like date_part), return unknown stats + Ok(ColumnStatistics::default()) + } + }) + .collect(); + + let column_statistics = column_statistics?; + Ok(Arc::new(Statistics { num_rows: Precision::Absent, total_byte_size: Precision::Absent, - column_statistics: stats - .column_statistics - .iter() - .map(|col_stats| ColumnStatistics { - min_value: col_stats.min_value.clone(), - max_value: col_stats.max_value.clone(), - ..Default::default() - }) - .collect(), - }) + column_statistics, + })) }) - .collect()) + .collect() } else { // if aggr expr is not empty, then the projected values (per column) could be different Ok(unknown_statistics_by_partition(self)) diff --git a/iox_query/src/statistics/partition_statistics/project_schema.rs b/iox_query/src/statistics/partition_statistics/project_schema.rs index 22d199d6..d235b7d5 100644 --- a/iox_query/src/statistics/partition_statistics/project_schema.rs +++ b/iox_query/src/statistics/partition_statistics/project_schema.rs @@ -232,14 +232,26 @@ pub(super) fn proj_exec_stats<'a>( mut stats: Statistics, exprs: impl Iterator, String)>, projexec_schema: &SchemaRef, -) -> Arc { +) -> Result> { let mut primitive_row_size = 0; let mut primitive_row_size_possible = true; let mut column_statistics = vec![]; for (expr, _) in exprs { let col_stats = if let Some(col) = expr.as_any().downcast_ref::() { // handle columns in schema - stats.column_statistics[col.index()].clone() + let col_index = col.index(); + if col_index >= stats.column_statistics.len() { + return Err(internal_datafusion_err!( + "Column index {} out of bounds in partition statistics projection \ + (available columns: {}, column name: '{}'). 
\ + This indicates a schema mismatch between projection expressions and input statistics.", + col_index, + stats.column_statistics.len(), + col.name() + )); + } else { + stats.column_statistics[col_index].clone() + } } else if let Some(lit_expr) = expr.as_any().downcast_ref::() { // handle constants match lit_expr.value() { @@ -277,7 +289,7 @@ pub(super) fn proj_exec_stats<'a>( stats.total_byte_size = Precision::Exact(primitive_row_size).multiply(&stats.num_rows); } stats.column_statistics = column_statistics; - Arc::new(stats) + Ok(Arc::new(stats)) } #[cfg(test)] @@ -387,7 +399,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to project all columns" @@ -457,7 +470,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to remove and re-order columns" @@ -549,7 +563,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to handle schema with aliases" @@ -717,7 +732,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to handle schema with same-named fields" @@ -789,7 +805,8 @@ mod tests { Arc::unwrap_or_clone(src_stats), exprs.iter(), &project_schema, - ); + ) + .unwrap(); assert_eq!( actual, expected_stats, "should be able to handle schema with same-named fields, reversed ordering" @@ -816,7 +833,8 @@ mod tests { Arc::unwrap_or_clone(Arc::clone(&src_stats)), exprs.iter(), &src_schema, - ); + ) + .unwrap(); assert_eq!( actual, src_stats, "proj_exec_stats should extract the proper columns from the physical exprs" @@ -831,7 +849,8 @@ mod tests { Arc::unwrap_or_clone(Arc::clone(&src_stats)), exprs.iter(), &src_schema, - ); + ) + .unwrap(); // min/max are the constants assert_eq!( actual.column_statistics[0].min_value.get_value(), @@ -881,7 +900,8 @@ mod tests { (lit(ScalarValue::Null), "col_a".to_string()), (lit(ScalarValue::Null), "col_b".to_string()), ]; - let actual = proj_exec_stats(Arc::unwrap_or_clone(src_stats), exprs.iter(), &src_schema); + let actual = + proj_exec_stats(Arc::unwrap_or_clone(src_stats), exprs.iter(), &src_schema).unwrap(); // min/max are the constants assert_eq!( actual.column_statistics[0].min_value.get_value(), diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs index fa516714..5b140500 100644 --- a/iox_query/src/test.rs +++ b/iox_query/src/test.rs @@ -19,7 +19,9 @@ use arrow::{ record_batch::RecordBatch, }; use async_trait::async_trait; -use data_types::{ChunkId, ChunkOrder, NamespaceId, PartitionKey, TableId, TransitionPartitionId}; +use data_types::{ + ChunkId, ChunkOrder, Namespace, NamespaceId, PartitionKey, TableId, TransitionPartitionId, +}; use datafusion::error::DataFusionError; use datafusion::logical_expr::Expr; use datafusion::physical_plan::ExecutionPlan; @@ -114,6 +116,28 @@ impl QueryDatabase for TestDatabaseStore { Ok(databases.get(name).cloned().map(|ns| ns as _)) } + async fn list_namespaces( + &self, + _span: Option, + ) -> Result, DataFusionError> { + Ok(self + .databases + .lock() + .iter() + .enumerate() + .map(|(i, (name, db))| Namespace { + id: NamespaceId::new(i as i64), + name: name.to_owned(), + retention_period_ns: db.retention_time_ns, + max_tables: Default::default(), + max_columns_per_table: Default::default(), + deleted_at: 
Default::default(), + partition_template: Default::default(), + router_version: Default::default(), + }) + .collect()) + } + async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit { Arc::clone(&self.query_semaphore) .acquire_owned(span) diff --git a/iox_query_influxql/Cargo.toml b/iox_query_influxql/Cargo.toml index b24ede58..faf44fdf 100644 --- a/iox_query_influxql/Cargo.toml +++ b/iox_query_influxql/Cargo.toml @@ -11,6 +11,7 @@ workspace = true [dependencies] arrow = { workspace = true } assert_matches = "1" +async-trait = { version = "0.1.89", default-features = false } chrono-tz = { version = "0.10" } datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } diff --git a/iox_query_influxql/src/lib.rs b/iox_query_influxql/src/lib.rs index 0d0d8fc8..478f4b5b 100644 --- a/iox_query_influxql/src/lib.rs +++ b/iox_query_influxql/src/lib.rs @@ -11,6 +11,8 @@ mod aggregate; mod error; pub mod frontend; pub mod plan; +pub mod show_databases; +pub mod show_retention_policies; mod window; /// A list of the numeric types supported by InfluxQL that can be be used diff --git a/iox_query_influxql/src/plan/planner.rs b/iox_query_influxql/src/plan/planner.rs index 0d37e757..f95dfdd4 100644 --- a/iox_query_influxql/src/plan/planner.rs +++ b/iox_query_influxql/src/plan/planner.rs @@ -100,7 +100,7 @@ use iox_query::analyzer::default_return_value_for_aggr_fn; use iox_query::analyzer::range_predicate::find_time_range; use iox_query::config::{IoxConfigExt, MetadataCutoff}; use iox_query::exec::IOxSessionContext; -use iox_query::exec::gapfill::{FillStrategy, GapFill, GapFillParams}; +use iox_query::exec::gapfill::{FillExpr, FillStrategy, GapFill}; use iox_query_params::StatementParams; use itertools::Itertools; use query_functions::date_bin_wallclock::DateBinWallclockUDF; @@ -1580,7 +1580,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { ctx.group_by.and_then(|gb| gb.time_dimension()), fill_strategy, ) { - build_gap_fill_node(plan, time_column, fill_strategy, &ctx.projection_type)? + build_gap_fill_node(plan, fill_strategy, &ctx.projection_type)? } else { plan }; @@ -2291,106 +2291,10 @@ impl<'a> InfluxQLToLogicalPlan<'a> { iql: &IQLExpr, schema: &IQLSchema<'_>, ) -> Result { - let df_schema = &schema.df_schema; match iql { // rewriter is expected to expand wildcard expressions IQLExpr::Wildcard(_) => error::internal("unexpected wildcard in projection"), - IQLExpr::VarRef(VarRef { - name, - data_type: opt_dst_type, - }) => { - Ok(match (scope, name.as_str()) { - // Per the Go implementation, the time column is case-insensitive in the - // `WHERE` clause and disregards any postfix type cast operator. 
- // - // See: https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L5751-L5753 - (ExprScope::Where, name) if name.eq_ignore_ascii_case("time") => { - "time".as_expr() - } - (ExprScope::Projection, "time") => "time".as_expr(), - (_, name) => match df_schema - .fields_with_unqualified_name(name) - .first() - .map(|f| f.data_type().clone()) - { - Some(src_type) => { - let column = name.as_expr(); - - match opt_dst_type.and_then(var_ref_data_type_to_data_type) { - Some(dst_type) => { - fn is_numeric(dt: &DataType) -> bool { - matches!( - dt, - DataType::Int64 | DataType::Float64 | DataType::UInt64 - ) - } - - if src_type == dst_type { - column - } else if is_numeric(&src_type) && is_numeric(&dst_type) { - // InfluxQL only allows casting between numeric types, - // and it is safe to unconditionally unwrap, as the - // `is_numeric_type` call guarantees it can be mapped to - // an Arrow DataType - column.cast_to(&dst_type, &schema.df_schema)? - } else { - // If the cast is incompatible, evaluates to NULL - Expr::Literal(ScalarValue::Null, None) - } - } - None => column, - } - } - _ => { - // For non-existent columns, we need to check if the user specified a gap-filling value. - // See [`VirtualColumnFillConfig`] for more details. - match fill_config { - Some(VirtualColumnFillConfig { - fill_clause: Some(FillClause::Value(n)), - data_type, - }) => { - // The user specified a gap-filling value - match data_type { - Some(InfluxColumnType::Field(InfluxFieldType::Integer)) => { - Expr::Literal( - number_to_scalar(n, &DataType::Int64)?, - None, - ) - } - Some(InfluxColumnType::Field(InfluxFieldType::Float)) => { - Expr::Literal( - number_to_scalar(n, &DataType::Float64)?, - None, - ) - } - Some(InfluxColumnType::Tag) => { - // Do not gap-fill tags - Expr::Literal(ScalarValue::Null, None) - } - _ => { - match n { - // Default to the data type of the gap-filling value - Number::Integer(_) => Expr::Literal( - number_to_scalar(n, &DataType::Int64)?, - None, - ), - Number::Float(_) => Expr::Literal( - number_to_scalar(n, &DataType::Float64)?, - None, - ), - } - } - } - } - _ => { - // No gap-filling config or value, return NULL - Expr::Literal(ScalarValue::Null, None) - } - } - } - }, - }) - } + IQLExpr::VarRef(varref) => self.varref_to_df_expr(fill_config, scope, varref, schema), IQLExpr::BindParameter(id) => { let err = BindParameterError::NotDefined(id.to_string()); error::params(err.to_string()) @@ -2425,6 +2329,101 @@ impl<'a> InfluxQLToLogicalPlan<'a> { } } + /// Map an InfluxQL variable reference to a DataFusion expression. + fn varref_to_df_expr( + &self, + fill_config: &Option, + scope: ExprScope, + varref: &VarRef, + schema: &IQLSchema<'_>, + ) -> Result { + let df_schema = &schema.df_schema; + let VarRef { + name, + data_type: opt_dst_type, + } = varref; + Ok(match (scope, name.as_str()) { + // Per the Go implementation, the time column is case-insensitive in the + // `WHERE` clause and disregards any postfix type cast operator. 
+ // + // See: https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L5751-L5753 + (ExprScope::Where, name) if name.eq_ignore_ascii_case("time") => "time".as_expr(), + (ExprScope::Projection, "time") => "time".as_expr(), + (_, name) => match df_schema + .fields_with_unqualified_name(name) + .first() + .map(|f| f.data_type().clone()) + { + Some(src_type) => { + let column = name.as_expr(); + + match opt_dst_type.and_then(var_ref_data_type_to_data_type) { + Some(dst_type) => { + fn is_numeric(dt: &DataType) -> bool { + matches!(dt, DataType::Int64 | DataType::Float64 | DataType::UInt64) + } + + if src_type == dst_type { + column + } else if is_numeric(&src_type) && is_numeric(&dst_type) { + // InfluxQL only allows casting between numeric types, + // and it is safe to unconditionally unwrap, as the + // `is_numeric_type` call guarantees it can be mapped to + // an Arrow DataType + column.cast_to(&dst_type, &schema.df_schema)? + } else { + // If the cast is incompatible, evaluates to NULL + Expr::Literal(ScalarValue::Null, None) + } + } + None => column, + } + } + _ => { + // For non-existent columns, we need to check if the user specified a gap-filling value. + // See [`VirtualColumnFillConfig`] for more details. + match fill_config { + Some(VirtualColumnFillConfig { + fill_clause: Some(FillClause::Value(n)), + data_type, + }) => { + // The user specified a gap-filling value + match data_type { + Some(InfluxColumnType::Field(InfluxFieldType::Integer)) => { + Expr::Literal(number_to_scalar(n, &DataType::Int64)?, None) + } + Some(InfluxColumnType::Field(InfluxFieldType::Float)) => { + Expr::Literal(number_to_scalar(n, &DataType::Float64)?, None) + } + Some(InfluxColumnType::Tag) => { + // Do not gap-fill tags + Expr::Literal(ScalarValue::Null, None) + } + _ => { + match n { + // Default to the data type of the gap-filling value + Number::Integer(_) => Expr::Literal( + number_to_scalar(n, &DataType::Int64)?, + None, + ), + Number::Float(_) => Expr::Literal( + number_to_scalar(n, &DataType::Float64)?, + None, + ), + } + } + } + } + _ => { + // No gap-filling config or value, return NULL + Expr::Literal(ScalarValue::Null, None) + } + } + } + }, + }) + } + /// Map an InfluxQL function call to a DataFusion expression. /// /// A full list of supported functions available via the [InfluxQL documentation][docs]. @@ -3891,37 +3890,64 @@ impl<'a> InfluxQLToLogicalPlan<'a> { /// /// # Arguments /// -/// * `input` - An aggregate plan which requires gap-filling. -/// * `time_column` - The `date_bin` expression. -/// * `fill_strategy` - The strategy used to fill gaps in the data. Should be equal in length to -/// `input.aggr_exprs`, where fill_strategy\[n\] is the strategy for aggr_exprs\[n\] +/// * `input` - A plan which requires gap-filling, it is required that +/// the input plan includes an Aggregate node. +/// * `fill_strategy` - The strategy used to fill gaps in the data. +/// Should be equal in length to `input.aggr_exprs`, where +/// fill_strategy\[n\] is the strategy for aggr_exprs\[n\]. +/// * `projection_type` - The type of projection being performed. 
fn build_gap_fill_node( input: LogicalPlan, - time_column: &Expr, fill_strategy: Vec<FillStrategy>, projection_type: &ProjectionType, ) -> Result<LogicalPlan> { - let (expr, alias) = match time_column { - Expr::Alias(Alias { - expr, - relation: None, - name: alias, - metadata: _, - }) => (expr.as_ref(), alias), - _ => return error::internal("expected time column to have an alias function"), + let mut aggr = None; + input.apply(|expr| { + if let LogicalPlan::Aggregate(a) = expr { + aggr = Some(a.clone()); + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + })?; + let Some(aggr) = aggr else { + return error::internal("GapFill requires an Aggregate ancestor"); }; - let (date_bin_udf, date_bin_args) = match expr { - Expr::ScalarFunction(ScalarFunction { func: udf, args }) - if udf.inner().as_any().is::() - || udf.inner().as_any().is::() => - { - (Arc::<str>::from(udf.name()), args) - } + let group_expr = aggr.group_expr; + + // Extract the DATE_BIN expression from the aggregate's group + // expressions. + let (time_column_idx, time_column_alias, date_bin_udf, date_bin_args) = match group_expr + .iter() + .enumerate() + .filter_map(|(idx, expr)| match expr { + Expr::Alias(alias) => { + if let Expr::ScalarFunction(fun) = alias.expr.as_ref() { + if fun.func.inner().as_any().is::() + || fun.func.inner().as_any().is::() + { + Some(( + idx, + alias.name.clone(), + Arc::clone(&fun.func), + fun.args.clone(), + )) + } else { + None + } + } else { + None + } + } + _ => None, + }) + .collect::<Vec<_>>() + .as_slice() + { + [(idx, alias, udf, args)] => (*idx, alias.to_owned(), Arc::clone(udf), args.to_owned()), + _ => { - // The InfluxQL planner adds the `date_bin` function, - // so this condition represents an internal failure. - return error::internal("expected exactly one DATE_BIN in Aggregate group expressions"); } }; @@ -3995,26 +4021,43 @@ fn build_gap_fill_node( let aggr_expr = new_group_expr.split_off(aggr.group_expr.len()); // The fill strategy for InfluxQL is specified at the query level - let fill_strategy = aggr_expr.iter().cloned().zip(fill_strategy).collect(); + let fill_expr = aggr_expr + .iter() + .cloned() + .zip(fill_strategy) + .map(|(e, s)| FillExpr { + expr: e, + strategy: s, + }) + .collect(); - let time_column = col(input - .schema() - .qualified_field_with_unqualified_name(alias) - .map(Column::from)?); + let series_expr = group_expr + .iter() + .enumerate() + .filter_map(|(i, e)| { + if i != time_column_idx { + Some(e.clone()) + } else { + None + } + }) + .collect(); + + let time_expr = Expr::ScalarFunction(ScalarFunction { + func: date_bin_udf, + args: vec![stride.clone(), col(time_column_alias)] + .into_iter() + .chain(origin.clone()) + .collect(), + }); Ok(LogicalPlan::Extension(Extension { node: Arc::new(GapFill::try_new( Arc::new(input), - new_group_expr, - aggr_expr, - GapFillParams { - date_bin_udf, - stride: stride.clone(), - time_column, - origin, - time_range, - fill_strategy, - }, + series_expr, + time_expr, + fill_expr, + time_range, )?), })) } @@ -5296,7 +5339,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, difference(avg(cpu.usage_idle)) AS difference [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N] Filter: difference(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, difference(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[difference(avg(cpu.usage_idle)) ORDER BY [time ASC
NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS difference(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, difference(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5322,12 +5365,25 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_difference(avg(cpu.usage_idle)) AS non_negative_difference [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_difference:Float64;N] Filter: non_negative_difference(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, non_negative_difference(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[non_negative_difference(avg(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_difference(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, non_negative_difference(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, 
None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "#); + + // aggregate SUM regex + assert_snapshot!(plan("SELECT NON_NEGATIVE_DIFFERENCE(SUM(/usage_.*/)) FROM cpu GROUP BY time(10s)"), @r#" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_difference_usage_idle:Float64;N, non_negative_difference_usage_system:Float64;N, non_negative_difference_usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_difference(sum(cpu.usage_idle)) AS non_negative_difference_usage_idle, non_negative_difference(sum(cpu.usage_system)) AS non_negative_difference_usage_system, non_negative_difference(sum(cpu.usage_user)) AS non_negative_difference_usage_user [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_difference_usage_idle:Float64;N, non_negative_difference_usage_system:Float64;N, non_negative_difference_usage_user:Float64;N] + Filter: non_negative_difference(sum(cpu.usage_idle)) IS NOT NULL OR non_negative_difference(sum(cpu.usage_system)) IS NOT NULL OR non_negative_difference(sum(cpu.usage_user)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, sum(cpu.usage_idle):Float64;N, sum(cpu.usage_system):Float64;N, sum(cpu.usage_user):Float64;N, non_negative_difference(sum(cpu.usage_idle)):Float64;N, non_negative_difference(sum(cpu.usage_system)):Float64;N, non_negative_difference(sum(cpu.usage_user)):Float64;N] + WindowAggr: windowExpr=[[non_negative_difference(sum(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_difference(sum(cpu.usage_idle)), non_negative_difference(sum(cpu.usage_system)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_difference(sum(cpu.usage_system)), non_negative_difference(sum(cpu.usage_user)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_difference(sum(cpu.usage_user))]] [time:Timestamp(Nanosecond, None);N, sum(cpu.usage_idle):Float64;N, sum(cpu.usage_system):Float64;N, sum(cpu.usage_user):Float64;N, non_negative_difference(sum(cpu.usage_idle)):Float64;N, non_negative_difference(sum(cpu.usage_system)):Float64;N, non_negative_difference(sum(cpu.usage_user)):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[sum(cpu.usage_idle), sum(cpu.usage_system), sum(cpu.usage_user)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, sum(cpu.usage_idle):Float64;N, sum(cpu.usage_system):Float64;N, sum(cpu.usage_user):Float64;N] + Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 
10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[sum(cpu.usage_idle), sum(cpu.usage_system), sum(cpu.usage_user)]] [time:Timestamp(Nanosecond, None);N, sum(cpu.usage_idle):Float64;N, sum(cpu.usage_system):Float64;N, sum(cpu.usage_user):Float64;N] + Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: cpu.usage_idle IS NOT NULL OR cpu.usage_system IS NOT NULL OR cpu.usage_user IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + "#); } #[test] @@ -5348,7 +5404,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, moving_average(avg(cpu.usage_idle),Int64(3)) AS moving_average [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, moving_average:Float64;N] Filter: moving_average(avg(cpu.usage_idle),Int64(3)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, moving_average(avg(cpu.usage_idle),Int64(3)):Float64;N] WindowAggr: windowExpr=[[moving_average(avg(cpu.usage_idle), Int64(3)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS moving_average(avg(cpu.usage_idle),Int64(3))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, moving_average(avg(cpu.usage_idle),Int64(3)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5383,7 +5439,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, derivative(avg(cpu.usage_idle)) AS derivative [iox::measurement:Dictionary(Int32, Utf8), 
time:Timestamp(Nanosecond, None);N, derivative:Float64;N] Filter: derivative(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, derivative(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[derivative(avg(cpu.usage_idle), IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS derivative(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, derivative(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5409,7 +5465,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_derivative(avg(cpu.usage_idle)) AS non_negative_derivative [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_derivative:Float64;N] Filter: non_negative_derivative(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, non_negative_derivative(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[non_negative_derivative(avg(cpu.usage_idle), IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_derivative(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, non_negative_derivative(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], 
range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5422,7 +5478,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]) AS non_negative_derivative [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, non_negative_derivative:Float64;N] Filter: non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]) IS NOT NULL [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N, non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]):Float64;N] WindowAggr: windowExpr=[[non_negative_derivative(get_field(selector_last(cpu.usage_idle,cpu.time), Utf8("value")), IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value])]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N, non_negative_derivative(selector_last(cpu.usage_idle,cpu.time)[value]):Float64;N] - GapFill: groupBy=[time], aggr=[[selector_last(cpu.usage_idle,cpu.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_last(cpu.usage_idle,cpu.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, 
selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, cpu.time)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5448,7 +5504,7 @@ mod tests { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, cumulative_sum(avg(cpu.usage_idle)) AS cumulative_sum [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cumulative_sum:Float64;N] Filter: cumulative_sum(avg(cpu.usage_idle)) IS NOT NULL [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, cumulative_sum(avg(cpu.usage_idle)):Float64;N] WindowAggr: windowExpr=[[cumumlative_sum(avg(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS cumulative_sum(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, cumulative_sum(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, 
usage_system:Float64;N, usage_user:Float64;N] @@ -5462,7 +5518,7 @@ mod tests { Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N, mean:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, difference(avg(cpu.usage_idle)) AS difference, avg(cpu.usage_idle) AS mean [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N, mean:Float64;N] WindowAggr: windowExpr=[[difference(avg(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS difference(avg(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N, difference(avg(cpu.usage_idle)):Float64;N] - GapFill: groupBy=[time], aggr=[[avg(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[avg(cpu.usage_idle)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[avg(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, avg(cpu.usage_idle):Float64;N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5598,7 +5654,7 @@ mod tests { assert_snapshot!(plan("SELECT LAST(usage_idle) FROM cpu GROUP BY TIME(5s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, get_field(selector_last(cpu.usage_idle,cpu.time), Utf8("value")) AS last [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_last(cpu.usage_idle,cpu.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), time, TimestampNanosecond(0, None)), 
fill=[selector_last(cpu.usage_idle,cpu.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, cpu.time)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5611,7 +5667,7 @@ mod tests { assert_snapshot!(plan("SELECT FIRST(usage_idle) FROM cpu GROUP BY TIME(5s) FILL(0)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, get_field(coalesce_struct(selector_first(cpu.usage_idle,cpu.time), Struct({value:0.0,time:1970-01-01T00:00:00})), Utf8("value")) AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] - GapFill: groupBy=[time], aggr=[[COALESCE({value:0.0,time:1970-01-01T00:00:00}, selector_first(cpu.usage_idle,cpu.time))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(selector_first(cpu.usage_idle,cpu.time), {value:0.0,time:1970-01-01T00:00:00})], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 
5000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_first(cpu.usage_idle, cpu.time)]] [time:Timestamp(Nanosecond, None);N, selector_first(cpu.usage_idle,cpu.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5658,7 +5714,7 @@ mod tests { assert_snapshot!(plan("SELECT LAST(usage_idle), usage_system FROM cpu GROUP BY TIME(5s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N, usage_system:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, get_field(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), Utf8("value")) AS last, get_field(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), Utf8("other_1")) AS usage_system [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N, usage_system:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_last(cpu.usage_idle,cpu.time,cpu.usage_system)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_last(cpu.usage_idle,cpu.time,cpu.usage_system)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, cpu.time, cpu.usage_system)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, 
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL OR cpu.usage_system IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5670,7 +5726,7 @@ mod tests { assert_snapshot!(plan("SELECT LAST(usage_idle), usage_system FROM cpu GROUP BY TIME(5s) FILL(0)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N, usage_system:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, get_field(coalesce_struct(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), Struct({value:0.0,time:1970-01-01T00:00:00,other_1:0.0})), Utf8("value")) AS last, get_field(coalesce_struct(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), Struct({value:0.0,time:1970-01-01T00:00:00,other_1:0.0})), Utf8("other_1")) AS usage_system [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, last:Float64;N, usage_system:Float64;N] - GapFill: groupBy=[time], aggr=[[COALESCE({value:0.0,time:1970-01-01T00:00:00,other_1:0.0}, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(selector_last(cpu.usage_idle,cpu.time,cpu.usage_system), {value:0.0,time:1970-01-01T00:00:00,other_1:0.0})], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 5000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_last(cpu.usage_idle, 
cpu.time, cpu.usage_system)]] [time:Timestamp(Nanosecond, None);N, selector_last(cpu.usage_idle,cpu.time,cpu.usage_system):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "other_1", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL OR cpu.usage_system IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -5787,7 +5843,7 @@ mod tests { assert_snapshot!(plan("SELECT percentile(usage_idle,50), percentile(usage_idle,90) FROM cpu WHERE time >= 0 AND time < 60000000000 GROUP BY time(10s), cpu"), @r#" Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, percentile:Float64;N, percentile_1:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, cpu.cpu AS cpu, percentile(cpu.usage_idle,Int64(50)) AS percentile, percentile(cpu.usage_idle,Int64(90)) AS percentile_1 [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, percentile:Float64;N, percentile_1:Float64;N] - GapFill: groupBy=[time, cpu.cpu], aggr=[[percentile(cpu.usage_idle,Int64(50)), percentile(cpu.usage_idle,Int64(90))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Included(Literal(TimestampNanosecond(0, None), None))..Included(Literal(TimestampNanosecond(59999999999, None), None)) [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, percentile(cpu.usage_idle,Int64(50)):Float64;N, percentile(cpu.usage_idle,Int64(90)):Float64;N] + GapFill: series=[cpu.cpu], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[percentile(cpu.usage_idle,Int64(50)), percentile(cpu.usage_idle,Int64(90))], range=Included(Literal(TimestampNanosecond(0, None), None))..Included(Literal(TimestampNanosecond(59999999999, None), None)) [cpu:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None);N, percentile(cpu.usage_idle,Int64(50)):Float64;N, percentile(cpu.usage_idle,Int64(90)):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), cpu.time, TimestampNanosecond(0, None)) AS time, cpu.cpu]], aggr=[[percentile(cpu.usage_idle, Int64(50)), percentile(cpu.usage_idle, Int64(90))]] [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, percentile(cpu.usage_idle,Int64(50)):Float64;N, percentile(cpu.usage_idle,Int64(90)):Float64;N] Filter: cpu.time >= TimestampNanosecond(0, None) AND cpu.time <= TimestampNanosecond(59999999999, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), 
usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] Filter: cpu.usage_idle IS NOT NULL [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] @@ -6681,7 +6737,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6695,7 +6751,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data WHERE time < '2022-10-31T02:02:00Z' GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1667181719999999999, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1667181719999999999, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: 
groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1667181719999999999, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6709,7 +6765,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data WHERE time >= '2022-10-31T02:00:00Z' GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Included(Literal(TimestampNanosecond(1667181600000000000, None), None))..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Included(Literal(TimestampNanosecond(1667181600000000000, None), None))..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time >= TimestampNanosecond(1667181600000000000, None) AND data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6723,7 +6779,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data WHERE time >= '2022-10-31T02:00:00Z' AND time < '2022-10-31T02:02:00Z' GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), 
time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Included(Literal(TimestampNanosecond(1667181600000000000, None), None))..Included(Literal(TimestampNanosecond(1667181719999999999, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Included(Literal(TimestampNanosecond(1667181600000000000, None), None))..Included(Literal(TimestampNanosecond(1667181719999999999, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time >= TimestampNanosecond(1667181600000000000, None) AND data.time <= TimestampNanosecond(1667181719999999999, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6736,7 +6792,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with 
space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6749,7 +6805,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(null)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6762,7 +6818,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(previous)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[LOCF(count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[LOCF(count(data.f64_field))], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { 
months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6775,7 +6831,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(0)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(count(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N] - GapFill: groupBy=[time], aggr=[[COALESCE(0, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 0)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6788,7 +6844,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(linear)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, count(data.f64_field) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64] - GapFill: groupBy=[time], aggr=[[INTERPOLATE(count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) 
[time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[INTERPOLATE(count(data.f64_field))], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6804,7 +6860,7 @@ mod tests { assert_snapshot!(plan("SELECT first(f64_field) FROM data GROUP BY TIME(10s) FILL(null)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, get_field(selector_first(data.f64_field,data.time), Utf8("value")) AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_first(data.f64_field,data.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_first(data.f64_field,data.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_first(data.f64_field, data.time)]] [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, 
None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6814,7 +6870,7 @@ mod tests { assert_snapshot!(plan("SELECT first(f64_field) * 1 FROM data GROUP BY TIME(10s) FILL(null)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, get_field(selector_first(data.f64_field,data.time), Utf8("value")) * Int64(1) AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_first(data.f64_field,data.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_first(data.f64_field,data.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_first(data.f64_field, data.time)]] [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with 
space:Float64;N] @@ -6824,7 +6880,7 @@ mod tests { assert_snapshot!(plan("SELECT first(f64_field) / 1 FROM data GROUP BY TIME(10s) FILL(null)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, CASE WHEN Int64(1) = Float64(0) AND get_field(selector_first(data.f64_field,data.time), Utf8("value")) IS NOT NULL THEN Float64(0) ELSE get_field(selector_first(data.f64_field,data.time), Utf8("value")) / Int64(1) END AS first [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, first:Float64;N] - GapFill: groupBy=[time], aggr=[[selector_first(data.f64_field,data.time)]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[selector_first(data.f64_field,data.time)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[selector_first(data.f64_field, data.time)]] [time:Timestamp(Nanosecond, None);N, selector_first(data.f64_field,data.time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6838,7 +6894,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) FROM data GROUP BY TIME(10s) FILL(3.2)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(count(data.f64_field), Int64(3)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count:Int64;N] - 
GapFill: groupBy=[time], aggr=[[COALESCE(3, count(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 3)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] @@ -6852,7 +6908,7 @@ mod tests { assert_snapshot!(plan("SELECT count(f64_field) + MEAN(f64_field) FROM data GROUP BY TIME(10s) FILL(3.2)"), @r#" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count_mean:Float64;N] Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, time, coalesce_struct(count(data.f64_field), Int64(3)) + coalesce_struct(avg(data.f64_field), Float64(3.2)) AS count_mean [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, count_mean:Float64;N] - GapFill: groupBy=[time], aggr=[[COALESCE(3, count(data.f64_field)), COALESCE(3.2, avg(data.f64_field))]], time_column=time, stride=IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64, avg(data.f64_field):Float64;N] + GapFill: series=[], time=date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), time, TimestampNanosecond(0, None)), fill=[COALESCE(count(data.f64_field), 3), COALESCE(avg(data.f64_field), 3.2)], range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None), None)) [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64, avg(data.f64_field):Float64;N] Aggregate: groupBy=[[date_bin_wallclock(IntervalMonthDayNano("IntervalMonthDayNano { months: 0, days: 0, nanoseconds: 10000000000 }"), data.time, TimestampNanosecond(0, None)) AS time]], aggr=[[count(data.f64_field), avg(data.f64_field)]] [time:Timestamp(Nanosecond, None);N, count(data.f64_field):Int64, avg(data.f64_field):Float64;N] Filter: data.time <= TimestampNanosecond(1672531200000000000, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, 
mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] Filter: data.f64_field IS NOT NULL [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] diff --git a/iox_query_influxql/src/plan/rewriter.rs b/iox_query_influxql/src/plan/rewriter.rs index f34d3d08..db57ef4b 100644 --- a/iox_query_influxql/src/plan/rewriter.rs +++ b/iox_query_influxql/src/plan/rewriter.rs @@ -730,18 +730,16 @@ fn fields_expand_wildcards( } Expr::Call(Call { name, args }) => { - let mut name = name; - let mut args = args; + let mut stack = vec![(name, args)]; // Search for the call with a wildcard by continuously descending until // we no longer have a call. while let Some(Expr::Call(Call { name: inner_name, args: inner_args, - })) = args.first() + })) = stack.last().unwrap().1.first() { - name = inner_name; - args = inner_args; + stack.push((inner_name, inner_args)); } // a list of supported types that may be selected from the var_refs @@ -753,7 +751,7 @@ fn fields_expand_wildcards( ]); // Modify the supported types for certain functions. - match name.as_str() { + match stack.last().unwrap().0.as_str() { "count" | "first" | "last" | "distinct" | "elapsed" | "mode" | "sample" => { supported_types .extend([Some(VarRefDataType::String), Some(VarRefDataType::Boolean)]); @@ -767,21 +765,38 @@ fn fields_expand_wildcards( _ => {} } + // Reverse the stack so that new fields can be added by + // applying the required function calls starting with + // the inner-most. + stack.reverse(); let add_field = |v: &VarRef| { - let mut args = args.clone(); - args[0] = Expr::VarRef(v.clone()); + let var_ref_name = v.name.clone(); + let mut e = Expr::VarRef(v.clone()); + for (name, args) in &stack { + e = Expr::Call(Call { + name: (*name).clone(), + // The first argument is always e as it is + // either the new field reference or the + // next inner function call. Any remaining + // arguments are appended. 
+ args: vec![e] + .into_iter() + .chain(args.iter().skip(1).cloned()) + .collect(), + }) + } new_fields.push(influxdb_influxql_parser::select::Field { - expr: Expr::Call(Call { - name: name.clone(), - args, - }), - alias: Some(format!("{}_{}", field_name(&f), v.name).into()), + expr: e, + alias: Some(format!("{}_{}", field_name(&f), var_ref_name).into()), }) }; - match args.first() { + match stack.first().unwrap().1.first() { Some(Expr::Wildcard(Some(WildcardType::Tag))) => { - return error::query(format!("unable to use tag as wildcard in {name}()")); + return error::query(format!( + "unable to use tag as wildcard in {}()", + stack.first().unwrap().0 + )); } Some(Expr::Wildcard(_)) => { var_refs @@ -1670,16 +1685,19 @@ fn select_statement_info( #[cfg(test)] mod test { - use super::Result; + use super::{Result, VarRef}; use crate::plan::ir::{Field, Select}; use crate::plan::rewriter::{ - ProjectionType, SelectStatementInfo, find_table_names, has_wildcards, rewrite_select, - rewrite_statement, + ProjectionType, SelectStatementInfo, fields_expand_wildcards, find_table_names, + has_wildcards, rewrite_select, rewrite_statement, }; use crate::plan::test_utils::{MockSchemaProvider, parse_select}; use assert_matches::assert_matches; use datafusion::error::DataFusionError; + use influxdb_influxql_parser::expression::VarRefDataType; + use influxdb_influxql_parser::identifier::Identifier; use influxdb_influxql_parser::select::SelectStatement; + use influxdb_influxql_parser::string::Regex; use test_helpers::{assert_contains, assert_error}; #[test] @@ -2812,4 +2830,232 @@ mod test { assert!(!res.0); assert!(!res.1); } + + #[test] + fn test_nested_function_wildcard_expansion() { + // Test that wildcards in nested functions are properly expanded + // This tests the fix for expanding regular expressions in nested functions + + let _namespace = MockSchemaProvider::default(); + + // Create var_refs for cpu table (based on database::schemas()) + // Tags: host, region, cpu + // Fields: usage_user, usage_system, usage_idle (all Float) + let var_refs = vec![ + VarRef { + name: Identifier::new("host".to_string()), + data_type: Some(VarRefDataType::Tag), + }, + VarRef { + name: Identifier::new("region".to_string()), + data_type: Some(VarRefDataType::Tag), + }, + VarRef { + name: Identifier::new("cpu".to_string()), + data_type: Some(VarRefDataType::Tag), + }, + VarRef { + name: Identifier::new("usage_user".to_string()), + data_type: Some(VarRefDataType::Float), + }, + VarRef { + name: Identifier::new("usage_system".to_string()), + data_type: Some(VarRefDataType::Float), + }, + VarRef { + name: Identifier::new("usage_idle".to_string()), + data_type: Some(VarRefDataType::Float), + }, + ]; + + // Test difference(sum(*)) - a nested function combination with wildcard + let fields = vec![influxdb_influxql_parser::select::Field { + expr: influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "difference".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "sum".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Wildcard(None)], + }, + )], + }, + ), + alias: None, + }]; + + // Expand wildcards + let expanded_fields = fields_expand_wildcards(fields, var_refs.clone()).unwrap(); + + // Check that wildcards were expanded to actual fields (only numeric fields) + assert!( + expanded_fields.len() == 3, + "Expected 3 numeric fields after expansion" + ); + + // Verify each field has 
proper nested structure difference(sum(field)) + for field in &expanded_fields { + // Should have an alias like difference_ (outermost function name + field) + assert!(field.alias.is_some(), "Field should have an alias"); + let alias = field.alias.as_ref().unwrap(); + assert!( + alias.starts_with("difference_"), + "Alias should start with difference_" + ); + + // Verify it's a difference(sum(field)) structure + match &field.expr { + influxdb_influxql_parser::expression::Expr::Call(outer_call) => { + assert_eq!(outer_call.name, "difference"); + assert_eq!(outer_call.args.len(), 1); + + match &outer_call.args[0] { + influxdb_influxql_parser::expression::Expr::Call(inner_call) => { + assert_eq!(inner_call.name, "sum"); + assert_eq!(inner_call.args.len(), 1); + + // Should be a VarRef to an actual field + assert_matches!( + &inner_call.args[0], + influxdb_influxql_parser::expression::Expr::VarRef(_) + ); + } + _ => panic!("Expected inner call to be sum()"), + } + } + _ => panic!("Expected outer call to be difference()"), + } + } + + // Test with regex pattern in nested function - difference(sum(/usage.*/)) + let fields_regex = vec![influxdb_influxql_parser::select::Field { + expr: influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "difference".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "sum".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Literal( + influxdb_influxql_parser::literal::Literal::Regex(Regex::new( + "usage.*".to_string(), + )), + )], + }, + )], + }, + ), + alias: None, + }]; + + // Expand regex pattern + let expanded_regex = fields_expand_wildcards(fields_regex, var_refs.clone()).unwrap(); + + // Should expand to fields matching the pattern + assert_eq!( + expanded_regex.len(), + 3, + "Expected exactly 3 fields matching 'usage.*'" + ); + + for field in &expanded_regex { + assert!(field.alias.is_some()); + let alias = field.alias.as_ref().unwrap(); + assert!( + alias.starts_with("difference_usage"), + "Expanded field should start with 'difference_usage'" + ); + + // Verify it's a difference(sum(field)) structure + match &field.expr { + influxdb_influxql_parser::expression::Expr::Call(outer_call) => { + assert_eq!(outer_call.name, "difference"); + match &outer_call.args[0] { + influxdb_influxql_parser::expression::Expr::Call(inner_call) => { + assert_eq!(inner_call.name, "sum"); + } + _ => panic!("Expected inner call to be sum()"), + } + } + _ => panic!("Expected outer call to be difference()"), + } + } + + // Test that the stack-based traversal correctly handles deeply nested functions + // This is the core of the fix - ensuring we properly rebuild the nested structure + let deep_nested = vec![influxdb_influxql_parser::select::Field { + expr: influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "non_negative_difference".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "mean".to_string(), + args: vec![influxdb_influxql_parser::expression::Expr::Call( + influxdb_influxql_parser::expression::Call { + name: "sum".to_string(), + args: vec![ + influxdb_influxql_parser::expression::Expr::Wildcard(None), + ], + }, + )], + }, + )], + }, + ), + alias: None, + }]; + + // This should expand the wildcard while preserving the full nested structure + let expanded_deep = 
fields_expand_wildcards(deep_nested, var_refs).unwrap(); + assert_eq!( + expanded_deep.len(), + 3, + "Deep nested functions should expand to 3 numeric fields" + ); + + // Verify the structure is preserved: non_negative_difference(mean(sum(field))) + for field in &expanded_deep { + assert!(field.alias.is_some()); + let alias = field.alias.as_ref().unwrap(); + assert!( + alias.starts_with("non_negative_difference_usage"), + "Deep nested alias should start with non_negative_difference_usage" + ); + + match &field.expr { + influxdb_influxql_parser::expression::Expr::Call(outer) => { + assert_eq!(outer.name, "non_negative_difference"); + assert_eq!( + outer.args.len(), + 1, + "non_negative_difference should have 1 arg" + ); + + // First arg should be mean(sum(field)) + match &outer.args[0] { + influxdb_influxql_parser::expression::Expr::Call(middle) => { + assert_eq!(middle.name, "mean"); + assert_eq!(middle.args.len(), 1); + + // Inner should be sum(field) + match &middle.args[0] { + influxdb_influxql_parser::expression::Expr::Call(inner) => { + assert_eq!(inner.name, "sum"); + assert_eq!(inner.args.len(), 1); + + // Should be a VarRef + assert_matches!( + &inner.args[0], + influxdb_influxql_parser::expression::Expr::VarRef(_) + ); + } + _ => panic!("Expected innermost call to be sum()"), + } + } + _ => panic!("Expected middle call to be mean()"), + } + } + _ => panic!("Expected outer call to be non_negative_difference()"), + } + } + } } diff --git a/iox_query_influxql/src/show_databases.rs b/iox_query_influxql/src/show_databases.rs new file mode 100644 index 00000000..66531f39 --- /dev/null +++ b/iox_query_influxql/src/show_databases.rs @@ -0,0 +1,99 @@ +use std::collections::HashMap; +use std::fmt::Debug; + +use arrow::datatypes::SchemaRef; +use datafusion::common::Result; +use datafusion::execution::SendableRecordBatchStream; +use generated_types::influxdata::iox::querier::v1::InfluxQlMetadata; +use schema::INFLUXQL_METADATA_KEY; + +/// Trait for handling the `SHOW DATABASES` query +/// +/// This allows for optional `SHOW DATABASES` handling for systems such as +/// InfluxDB3 Enterprise, without requiring it to be implemented on all systems +#[async_trait::async_trait] +pub trait InfluxQlShowDatabases: Debug + Send + Sync + 'static { + /// Produce the Arrow schema for the `SHOW DATABASES` InfluxQL query + fn schema(&self) -> SchemaRef; + /// Produce a record batch stream containing the results for the `SHOW DATABASES` query + /// + /// Accepts `database_names` which represents the list of databases the requestor is + /// authorized to read. The underlying implementation should only produce the databases listed + /// in the resulting record batch stream. + async fn show_databases( + &self, + database_names: Vec, + ) -> Result; +} + +/// Generate the default InfluxQL metadata map for producing a `Schema` for the `SHOW DATABASES` +/// query. 
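// [Editorial sketch, not part of the diff] A minimal illustration of the
// `InfluxQlShowDatabases` contract described above, using the `MockShowDatabases`
// defined later in this file: `database_names` is the list the requestor is
// authorized to read, and only those databases appear in the returned stream.
// Assumes a tokio test runtime and the `futures` crate are available as dev
// dependencies; the test name and database names here are hypothetical.
#[tokio::test]
async fn show_databases_filters_to_authorized_names() {
    use futures::TryStreamExt;

    let mock = mock::MockShowDatabases::new(["foo", "bar", "baz"]);
    // The requestor is authorized to read only "foo" and "baz".
    let stream = mock
        .show_databases(vec!["foo".to_string(), "baz".to_string()])
        .await
        .unwrap();
    let batches: Vec<_> = stream.try_collect().await.unwrap();
    // One record batch with two rows ("foo" and "baz") is expected; "bar" is
    // dropped because it was not in the authorized list.
    assert_eq!(batches.len(), 1);
    assert_eq!(batches[0].num_rows(), 2);
}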
+pub fn generate_metadata(measurement_column_index: u32) -> HashMap { + let md = serde_json::to_string(&InfluxQlMetadata { + measurement_column_index, + tag_key_columns: vec![], + }) + .expect("metadata should serialize as JSON"); + [(INFLUXQL_METADATA_KEY.to_string(), md)] + .into_iter() + .collect() +} + +pub mod mock { + use std::sync::Arc; + + use arrow::{ + array::{Array, RecordBatch, StringArray}, + datatypes::{DataType, Field, Schema}, + }; + use datafusion_util::MemoryStream; + use schema::INFLUXQL_MEASUREMENT_COLUMN_NAME; + + use super::*; + + #[derive(Debug)] + pub struct MockShowDatabases { + database_names: Vec, + } + + impl MockShowDatabases { + pub fn new(database_names: impl IntoIterator>) -> Self { + Self { + database_names: database_names.into_iter().map(Into::into).collect(), + } + } + } + + #[async_trait::async_trait] + impl InfluxQlShowDatabases for MockShowDatabases { + fn schema(&self) -> SchemaRef { + Arc::new( + Schema::new(vec![ + Field::new(INFLUXQL_MEASUREMENT_COLUMN_NAME, DataType::Utf8, false), + Field::new("name", arrow::datatypes::DataType::Utf8, false), + ]) + .with_metadata(generate_metadata(0)), + ) + } + + async fn show_databases( + &self, + database_names: Vec, + ) -> Result { + let names = self + .database_names + .iter() + .filter(|n| database_names.contains(*n)) + .map(String::as_str) + .collect::>(); + let measurement_array: StringArray = vec!["databases"; names.len()].into(); + let names_array: StringArray = names.into(); + let arrays = vec![ + Arc::new(measurement_array) as Arc, + Arc::new(names_array) as Arc, + ]; + let batch = RecordBatch::try_new(self.schema(), arrays)?; + Ok(Box::pin(MemoryStream::new(vec![batch]))) + } + } +} diff --git a/iox_query_influxql/src/show_retention_policies.rs b/iox_query_influxql/src/show_retention_policies.rs new file mode 100644 index 00000000..2097dc8a --- /dev/null +++ b/iox_query_influxql/src/show_retention_policies.rs @@ -0,0 +1,227 @@ +use std::collections::HashMap; +use std::fmt::Debug; + +use arrow::datatypes::SchemaRef; +use datafusion::common::Result; +use datafusion::execution::SendableRecordBatchStream; +use generated_types::influxdata::iox::querier::v1::InfluxQlMetadata; +use schema::INFLUXQL_METADATA_KEY; + +/// Trait for handling the `SHOW RETENTION POLICIES` query +/// +/// This allows for optional `SHOW RETENTION POLICIES` handling for systems such as +/// InfluxDB3 Enterprise, without requiring it to be implemented on all systems +#[async_trait::async_trait] +pub trait InfluxQlShowRetentionPolicies: Debug + Send + Sync + 'static { + /// Produce the Arrow schema for the `SHOW RETENTION POLICIES` InfluxQL query + fn schema(&self) -> SchemaRef; + /// Produce a record batch stream containing the results for the `SHOW RETENTION POLICIES` query + async fn show_retention_policies(&self, db_name: String) -> Result; +} + +/// Generate the default InfluxQL metadata map for producing a `Schema` for the +/// `SHOW RETENTION POLICIES` query. 
+pub fn generate_metadata(measurement_column_index: u32) -> HashMap { + let md = serde_json::to_string(&InfluxQlMetadata { + measurement_column_index, + tag_key_columns: vec![], + }) + .expect("metadata should serialize as JSON"); + [(INFLUXQL_METADATA_KEY.to_string(), md)] + .into_iter() + .collect() +} + +pub mod mock { + use std::{collections::BTreeMap, sync::Arc, time::Duration}; + + use arrow::{ + array::{Array, BooleanArray, RecordBatch, StringArray, UInt64Array}, + datatypes::{DataType, Field, Schema}, + }; + use datafusion_util::MemoryStream; + use schema::INFLUXQL_MEASUREMENT_COLUMN_NAME; + + use super::*; + + #[derive(Debug)] + pub struct MockRetentionPolicy { + name: String, + duration: Duration, + shard_group_duration: Duration, + replica_n: u64, + future_write_limit: Duration, + past_write_limit: Duration, + default: bool, + } + + impl Default for MockRetentionPolicy { + fn default() -> Self { + Self { + name: "autogen".to_string(), + duration: Duration::ZERO, + shard_group_duration: Duration::from_secs(7 * 60 * 60 * 24), // default is 7 days + replica_n: 1, + future_write_limit: Duration::ZERO, + past_write_limit: Duration::ZERO, + default: true, + } + } + } + + impl MockRetentionPolicy { + /// Create a named policy that is not the default policy + pub fn new(name: impl Into) -> Self { + Self::default().with_name(name).with_default(false) + } + + fn with_name(mut self, name: impl Into) -> Self { + self.name = name.into(); + self + } + + fn with_default(mut self, default: bool) -> Self { + self.default = default; + self + } + + pub fn with_duration(mut self, duration: Duration) -> Self { + self.duration = duration; + self + } + } + + #[derive(Debug, Default)] + pub struct MockShowRetentionPolicies { + retention_policies: BTreeMap>, + } + + impl MockShowRetentionPolicies { + pub fn new() -> Self { + Self::default() + } + + pub fn with_default_retention_policy(mut self, db_name: impl Into) -> Self { + self.retention_policies + .entry(db_name.into()) + .or_insert_with(|| vec![MockRetentionPolicy::default()]); + self + } + + pub fn with_retention_policy( + mut self, + db_name: impl Into, + policy: MockRetentionPolicy, + ) -> Self { + self.retention_policies + .entry(db_name.into()) + .or_default() + .push(policy); + self + } + } + + /// The implementation of this follows that of InfluxDB v1.12.2's /query API with respect to + /// the field names provided in the `SHOW RETENTION POLICIES` response schema. As such, this + /// would be a good reference point for implementing this interface in production. + /// + /// One distinction with the v1.12.2 response is that durations are reported there using a more + /// human-friendly format. For example, 1 hour and 30 minutes would be displayed as "1h30m", + /// whereas in this implementation, which uses `std::time::Duration`'s pretty formatting, the + /// same would be displayed as "5400s". 
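// [Editorial sketch, not part of the diff] Building the mock described above:
// one default "autogen" policy plus a named, non-default policy for the same
// database. The database name "weather", the policy name "short_lived", and the
// helper function name are hypothetical. Per the doc comment, durations are
// rendered with `Duration`'s Debug formatting, so the 90-minute duration below
// is reported as "5400s" rather than the "1h30m" style used by InfluxDB v1.12.2.
fn example_mock_retention_policies() -> MockShowRetentionPolicies {
    use std::time::Duration;

    MockShowRetentionPolicies::new()
        .with_default_retention_policy("weather")
        .with_retention_policy(
            "weather",
            MockRetentionPolicy::new("short_lived").with_duration(Duration::from_secs(90 * 60)),
        )
}
// Calling `show_retention_policies("weather".to_string())` on the result streams
// a two-row batch (one row per policy); an unknown database name instead yields
// the "database not found" plan error, as implemented below.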
+ #[async_trait::async_trait] + impl InfluxQlShowRetentionPolicies for MockShowRetentionPolicies { + fn schema(&self) -> SchemaRef { + Arc::new( + Schema::new(vec![ + Field::new(INFLUXQL_MEASUREMENT_COLUMN_NAME, DataType::Utf8, false), + Field::new("name", arrow::datatypes::DataType::Utf8, false), + Field::new("duration", arrow::datatypes::DataType::Utf8, false), + Field::new( + "shardGroupDuration", + arrow::datatypes::DataType::Utf8, + false, + ), + Field::new("replicaN", arrow::datatypes::DataType::UInt64, false), + Field::new("futureWriteLimit", arrow::datatypes::DataType::Utf8, false), + Field::new("pastWriteLimit", arrow::datatypes::DataType::Utf8, false), + Field::new("default", arrow::datatypes::DataType::Boolean, false), + ]) + .with_metadata(generate_metadata(0)), + ) + } + + async fn show_retention_policies( + &self, + db_name: String, + ) -> Result { + let Some(db) = self.retention_policies.get(&db_name) else { + return Err(datafusion::error::DataFusionError::Plan(format!( + "database not found: {db_name}" + ))); + }; + let measurement_array: StringArray = vec!["retention_policies"; db.len()].into(); + let names_array: StringArray = db + .iter() + .map(|p| p.name.as_str()) + .collect::>() + .into(); + let durations_array: StringArray = db + .iter() + .map(|MockRetentionPolicy { duration, .. }| format!("{duration:#?}")) + .collect::>() + .into(); + let shard_group_durations_array: StringArray = db + .iter() + .map( + |MockRetentionPolicy { + shard_group_duration, + .. + }| format!("{shard_group_duration:#?}"), + ) + .collect::>() + .into(); + let replica_n_array: UInt64Array = db + .iter() + .map(|MockRetentionPolicy { replica_n, .. }| *replica_n) + .collect::>() + .into(); + let future_write_limit_array: StringArray = db + .iter() + .map( + |MockRetentionPolicy { + future_write_limit, .. + }| format!("{future_write_limit:#?}"), + ) + .collect::>() + .into(); + let past_write_limit_array: StringArray = db + .iter() + .map( + |MockRetentionPolicy { + past_write_limit, .. + }| format!("{past_write_limit:#?}"), + ) + .collect::>() + .into(); + let default_array: BooleanArray = db + .iter() + .map(|MockRetentionPolicy { default, .. 
}| *default) + .collect::>() + .into(); + + let arrays = vec![ + Arc::new(measurement_array) as Arc, + Arc::new(names_array) as Arc, + Arc::new(durations_array) as Arc, + Arc::new(shard_group_durations_array) as Arc, + Arc::new(replica_n_array) as Arc, + Arc::new(future_write_limit_array) as Arc, + Arc::new(past_write_limit_array) as Arc, + Arc::new(default_array) as Arc, + ]; + let batch = RecordBatch::try_new(self.schema(), arrays)?; + Ok(Box::pin(MemoryStream::new(vec![batch]))) + } + } +} diff --git a/iox_query_influxql_rewrite/Cargo.toml b/iox_query_influxql_rewrite/Cargo.toml new file mode 100644 index 00000000..3e39fddf --- /dev/null +++ b/iox_query_influxql_rewrite/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "iox_query_influxql_rewrite" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# workspace dependencies: +influxdb_influxql_parser = { path = "../influxdb_influxql_parser" } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +# crates.io dependencies: +thiserror = "2.0" + +[lints] +workspace = true diff --git a/iox_query_influxql_rewrite/src/lib.rs b/iox_query_influxql_rewrite/src/lib.rs new file mode 100644 index 00000000..731468c4 --- /dev/null +++ b/iox_query_influxql_rewrite/src/lib.rs @@ -0,0 +1,572 @@ +use workspace_hack as _; + +use std::collections::HashSet; + +use influxdb_influxql_parser::{ + common::ParseError, + explain::ExplainStatement, + identifier::Identifier, + parse_statements as parse_internal, + select::{MeasurementSelection, SelectStatement}, + show_measurements::ExtendedOnClause, + statement::Statement, +}; + +/// Rewritten takes a Statement and attempts to derive the db/rp +/// from the body of the statement. 
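// [Editorial sketch, not part of the diff] Intended use of the rewrite, mirroring
// the "SELECT * FROM foo.bar.cpu" case in the tests at the end of this file and
// using the crate's public `parse_statements` entry point: the fully-qualified
// measurement is reduced to its bare name, while the database and retention
// policy are captured on the `Rewritten` wrapper. The function name is
// hypothetical.
fn example_rewrite() {
    let mut statements =
        iox_query_influxql_rewrite::parse_statements("SELECT * FROM foo.bar.cpu").unwrap();
    let rewritten = statements.pop().unwrap();

    assert_eq!(rewritten.database().map(|db| db.as_str()), Some("foo"));
    assert_eq!(rewritten.retention_policy().map(|rp| rp.as_str()), Some("bar"));
    // "bar" is neither "autogen" nor "default", so both parts appear in the dbrp.
    assert_eq!(rewritten.resolve_dbrp(), Some("foo/bar".to_string()));
    assert_eq!(rewritten.to_statement().to_string(), "SELECT * FROM cpu");
}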
+#[derive(Debug)] +pub struct Rewritten { + database: Option, + retention_policy: Option, + statement: S, +} + +impl Rewritten { + fn new(statement: S) -> Self { + Self { + database: None, + retention_policy: None, + statement, + } + } + + fn with_database(mut self, db: Option) -> Self { + self.database = db; + self + } + + fn with_retention_policy(mut self, rp: Option) -> Self { + self.retention_policy = rp; + self + } + + pub fn set_retention_policy(&mut self, rp: String) { + self.retention_policy = Some(Identifier::from(rp.as_str())); + } + + pub fn database(&self) -> Option<&Identifier> { + self.database.as_ref() + } + + pub fn retention_policy(&self) -> Option<&Identifier> { + self.retention_policy.as_ref() + } + + pub fn statement(&self) -> &S { + &self.statement + } + + pub fn to_statement(self) -> S { + self.statement + } + + pub fn resolve_dbrp(&self) -> Option { + // We use `as_str().to_owned()` to avoid the + // quoting logic that is part of Identifier's `Display` implementation + match (&self.database, &self.retention_policy) { + (None, None) | (None, Some(_)) => None, + (Some(db), None) => Some(db.as_str().to_owned()), + (Some(db), Some(rp)) => { + if rp.as_str() != "autogen" && rp.as_str() != "default" { + Some(format!("{}/{rp}", db.as_str())) + } else { + Some(db.as_str().to_owned()) + } + } + } + } +} + +impl From> for Statement { + fn from(r: Rewritten) -> Self { + r.to_statement() + } +} + +impl TryFrom for Rewritten { + type Error = Error; + + fn try_from(statement: Statement) -> Result { + match statement { + Statement::ShowMeasurements(mut s) => { + if let Some(on) = s.on.take() { + let (db, rp) = match on { + ExtendedOnClause::Database(db) => (Some(db), None), + ExtendedOnClause::DatabaseRetentionPolicy(db, rp) => (Some(db), Some(rp)), + ExtendedOnClause::AllDatabases + | ExtendedOnClause::AllDatabasesAndRetentionPolicies => { + return Err(Error::MultiDatabase); + } + }; + Ok(Self::new(Statement::ShowMeasurements(s)) + .with_database(db) + .with_retention_policy(rp)) + } else { + Ok(Self::new(Statement::ShowMeasurements(s))) + } + } + Statement::ShowRetentionPolicies(mut s) => { + let identifier = s.database.take().map(Into::into); + Ok(Self::new(Statement::ShowRetentionPolicies(s)).with_database(identifier)) + } + Statement::ShowTagKeys(mut s) => { + let identifier = s.database.take().map(Into::into); + Ok(Self::new(Statement::ShowTagKeys(s)).with_database(identifier)) + } + Statement::ShowTagValues(mut s) => { + let identifier = s.database.take().map(Into::into); + Ok(Self::new(Statement::ShowTagValues(s)).with_database(identifier)) + } + Statement::ShowFieldKeys(mut s) => { + let identifier = s.database.take().map(Into::into); + Ok(Self::new(Statement::ShowFieldKeys(s)).with_database(identifier)) + } + Statement::Select(s) => { + let ss = Rewritten::::try_from(*s)?; + let db = ss.database.to_owned(); + let rp = ss.retention_policy.to_owned(); + Ok(Self::new(Statement::Select(Box::new(ss.to_statement()))) + .with_database(db) + .with_retention_policy(rp)) + } + Statement::Explain(mut s) => { + let options = s.options.take(); + let s = Self::try_from(*s.statement)?; + let db = s.database.to_owned(); + let rp = s.retention_policy.to_owned(); + Ok(Self::new(Statement::Explain(Box::new(ExplainStatement { + options, + statement: Box::new(s.to_statement()), + }))) + .with_database(db) + .with_retention_policy(rp)) + } + // For all other statements, we just pass them through. 
Explicitly + // do not use a catch-all match arm here in the event that new variants + // are added to the Statement enum, we want the compiler to direct us + // here to handle, if relevant. + Statement::CreateDatabase(_) + | Statement::Delete(_) + | Statement::DropMeasurement(_) + | Statement::ShowDatabases(_) => Ok(Self::new(statement)), + } + } +} + +impl TryFrom for Rewritten { + type Error = Error; + + fn try_from(mut select_statement: SelectStatement) -> Result { + let mut db_rp_set = HashSet::new(); + let from_clause = select_statement + .from + .take() + .into_iter() + .map(|ms| { + let (db, rp, ms) = match ms { + MeasurementSelection::Name(mut qn) => { + let db = qn.database.take(); + let rp = qn.retention_policy.take(); + (db, rp, MeasurementSelection::Name(qn)) + } + // Recursively call try_from on nested sub-queries, and compare their + // resulting db/rp to the same at this level. Sub-queries that have + // multiple db/rp in them will throw the MultiDatabase error. + MeasurementSelection::Subquery(s) => { + let ss = Self::try_from(*s)?; + ( + ss.database.to_owned(), + ss.retention_policy.to_owned(), + MeasurementSelection::Subquery(Box::new(ss.to_statement())), + ) + } + }; + if db_rp_set.insert((db, rp)) && db_rp_set.len() > 1 { + Err(Error::MultiDatabase) + } else { + Ok(ms) + } + }) + .collect::, Error>>()?; + select_statement.from.replace(from_clause); + let mut result = Self::new(select_statement); + if let Some((db, rp)) = db_rp_set.into_iter().next() { + result = result.with_database(db).with_retention_policy(rp); + } + Ok(result) + } +} + +#[derive(Debug, thiserror::Error, Clone, PartialEq, Eq)] +pub enum Error { + #[error("can only perform queries on a single database")] + MultiDatabase, + #[error("parsing error: {0}")] + Parse(ParseError), +} + +pub fn parse_statements(input: &str) -> Result>, Error> { + parse_internal(input) + .map_err(Error::Parse)? 
+ .into_iter() + .map(Rewritten::::try_from) + .collect::>, Error>>() +} + +#[cfg(test)] +mod tests { + use influxdb_influxql_parser::statement::Statement; + + use crate::{Error, Rewritten, parse_statements}; + + fn parse_single(input: &str) -> Rewritten { + parse_statements(input).unwrap().pop().unwrap() + } + + fn parse_single_failure(input: &str) -> Error { + parse_statements(input).unwrap_err() + } + + struct TestCase { + input: &'static str, + expected: &'static str, + db: Option<&'static str>, + rp: Option<&'static str>, + } + + impl TestCase { + fn assert(&self) { + let s = parse_single(self.input); + assert_eq!(s.database().map(|db| db.as_str()), self.db); + assert_eq!(s.retention_policy().map(|rp| rp.as_str()), self.rp); + assert_eq!(self.expected, s.to_statement().to_string()); + } + } + + struct TestFailure { + input: &'static str, + expected: Error, + } + + impl TestFailure { + fn assert(&self) { + let e = parse_single_failure(self.input); + assert_eq!(self.expected, e, "input: {}", self.input); + } + } + + #[test] + fn show_measurements() { + TestCase { + input: "SHOW MEASUREMENTS", + expected: "SHOW MEASUREMENTS", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW MEASUREMENTS ON foo", + expected: "SHOW MEASUREMENTS", + db: Some("foo"), + rp: None, + } + .assert(); + TestCase { + input: "SHOW MEASUREMENTS ON foo.bar", + expected: "SHOW MEASUREMENTS", + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + } + + #[test] + fn show_measurements_failure_modes() { + TestFailure { + input: "SHOW MEASUREMENTS ON *.*", + expected: Error::MultiDatabase, + } + .assert(); + TestFailure { + input: r#"SHOW MEASUREMENTS ON *"#, + expected: Error::MultiDatabase, + } + .assert(); + } + + #[test] + fn show_retention_policies() { + TestCase { + input: "SHOW RETENTION POLICIES", + expected: "SHOW RETENTION POLICIES", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW RETENTION POLICIES ON foo", + expected: "SHOW RETENTION POLICIES", + db: Some("foo"), + rp: None, + } + .assert(); + } + + #[test] + fn show_tag_keys() { + TestCase { + input: "SHOW TAG KEYS", + expected: "SHOW TAG KEYS", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG KEYS FROM cpu", + expected: "SHOW TAG KEYS FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG KEYS ON foo", + expected: "SHOW TAG KEYS", + db: Some("foo"), + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG KEYS ON foo FROM cpu", + expected: "SHOW TAG KEYS FROM cpu", + db: Some("foo"), + rp: None, + } + .assert(); + } + + #[test] + fn show_tag_values() { + TestCase { + input: "SHOW TAG VALUES WITH KEY = host", + expected: "SHOW TAG VALUES WITH KEY = host", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG VALUES FROM cpu WITH KEY = host", + expected: "SHOW TAG VALUES FROM cpu WITH KEY = host", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG VALUES ON foo WITH KEY = host", + expected: "SHOW TAG VALUES WITH KEY = host", + db: Some("foo"), + rp: None, + } + .assert(); + TestCase { + input: "SHOW TAG VALUES ON foo FROM cpu WITH KEY = host", + expected: "SHOW TAG VALUES FROM cpu WITH KEY = host", + db: Some("foo"), + rp: None, + } + .assert(); + } + + #[test] + fn show_field_keys() { + TestCase { + input: "SHOW FIELD KEYS", + expected: "SHOW FIELD KEYS", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SHOW FIELD KEYS FROM cpu", + expected: "SHOW FIELD KEYS FROM cpu", + db: None, + rp: None, + } + 
.assert(); + TestCase { + input: "SHOW FIELD KEYS ON foo", + expected: "SHOW FIELD KEYS", + db: Some("foo"), + rp: None, + } + .assert(); + TestCase { + input: "SHOW FIELD KEYS ON foo FROM cpu", + expected: "SHOW FIELD KEYS FROM cpu", + db: Some("foo"), + rp: None, + } + .assert(); + } + + #[test] + fn select() { + TestCase { + input: "SELECT * FROM cpu", + expected: "SELECT * FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "SELECT * FROM bar.cpu", + expected: "SELECT * FROM cpu", + db: None, + rp: Some("bar"), + } + .assert(); + TestCase { + input: "SELECT * FROM foo.bar.cpu", + expected: "SELECT * FROM cpu", + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + TestCase { + input: r#"SELECT * FROM (SELECT * FROM cpu)"#, + expected: r#"SELECT * FROM (SELECT * FROM cpu)"#, + db: None, + rp: None, + } + .assert(); + TestCase { + input: r#"SELECT * FROM (SELECT * FROM bar.cpu), bar.mem"#, + expected: r#"SELECT * FROM (SELECT * FROM cpu), mem"#, + db: None, + rp: Some("bar"), + } + .assert(); + TestCase { + input: r#"SELECT * FROM (SELECT * FROM foo.bar.cpu), foo.bar.mem"#, + expected: r#"SELECT * FROM (SELECT * FROM cpu), mem"#, + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + TestCase { + input: "SELECT * FROM \"5318725357728643_8729387113858758\".bar.cpu", + expected: "SELECT * FROM cpu", + db: Some("5318725357728643_8729387113858758"), + rp: Some("bar"), + } + .assert(); + } + + #[test] + fn select_failure_modes() { + TestFailure { + input: r#"SELECT * FROM foo.bar.cpu, baz.bop.cpu"#, + expected: Error::MultiDatabase, + } + .assert(); + TestFailure { + input: r#"SELECT * FROM cpu, baz.bop.cpu"#, + expected: Error::MultiDatabase, + } + .assert(); + TestFailure { + input: r#"SELECT * FROM bar.cpu, baz.bop.cpu"#, + expected: Error::MultiDatabase, + } + .assert(); + TestFailure { + input: r#"SELECT * FROM foo.bar.cpu, (SELECT * FROM mem)"#, + expected: Error::MultiDatabase, + } + .assert(); + } + + #[test] + fn explain() { + TestCase { + input: "EXPLAIN SELECT * FROM cpu", + expected: "EXPLAIN SELECT * FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "EXPLAIN SELECT * FROM bar.cpu", + expected: "EXPLAIN SELECT * FROM cpu", + db: None, + rp: Some("bar"), + } + .assert(); + TestCase { + input: "EXPLAIN SELECT * FROM foo.bar.cpu", + expected: "EXPLAIN SELECT * FROM cpu", + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + TestCase { + input: r#"EXPLAIN SELECT * FROM (SELECT * FROM cpu)"#, + expected: r#"EXPLAIN SELECT * FROM (SELECT * FROM cpu)"#, + db: None, + rp: None, + } + .assert(); + TestCase { + input: r#"EXPLAIN SELECT * FROM (SELECT * FROM bar.cpu), bar.mem"#, + expected: r#"EXPLAIN SELECT * FROM (SELECT * FROM cpu), mem"#, + db: None, + rp: Some("bar"), + } + .assert(); + TestCase { + input: r#"EXPLAIN SELECT * FROM (SELECT * FROM foo.bar.cpu), foo.bar.mem"#, + expected: r#"EXPLAIN SELECT * FROM (SELECT * FROM cpu), mem"#, + db: Some("foo"), + rp: Some("bar"), + } + .assert(); + } + + #[test] + fn noop_rewrites() { + TestCase { + input: "CREATE DATABASE foo", + expected: "CREATE DATABASE foo", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "DELETE FROM cpu", + expected: "DELETE FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "DROP MEASUREMENT cpu", + expected: "DROP MEASUREMENT cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: "EXPLAIN SELECT * FROM cpu", + expected: "EXPLAIN SELECT * FROM cpu", + db: None, + rp: None, + } + .assert(); + TestCase { + input: 
"SHOW DATABASES", + expected: "SHOW DATABASES", + db: None, + rp: None, + } + .assert(); + } +} diff --git a/iox_time/Cargo.toml b/iox_time/Cargo.toml index 70a240c2..8d946cfa 100644 --- a/iox_time/Cargo.toml +++ b/iox_time/Cargo.toml @@ -12,7 +12,7 @@ workspace = true [dependencies] chrono = { version = "0.4.42", default-features = false, features = ["clock", "std"] } parking_lot = "0.12" -tokio = { version = "1.47", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/iox_v1_query_api/Cargo.toml b/iox_v1_query_api/Cargo.toml new file mode 100644 index 00000000..3425b5f0 --- /dev/null +++ b/iox_v1_query_api/Cargo.toml @@ -0,0 +1,55 @@ +[package] +name = "iox_v1_query_api" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] # In alphabetical order +anyhow = "1.0.99" +arrow = { workspace = true } +authz = { path = "../authz", features = ["http"] } +bytes = "1.10.1" +chrono = "0.4.42" +datafusion = { workspace = true } +futures = "0.3" +generated_types = { path = "../generated_types" } +http = { workspace = true } +http-body-util = { workspace = true } +iox_http_util = { path = "../iox_http_util" } +iox_query = { path = "../iox_query" } +iox_query_params = { path = "../iox_query_params" } +iox_query_influxql = { path = "../iox_query_influxql" } +iox_query_influxql_rewrite = { path = "../iox_query_influxql_rewrite" } +mime = "0.3.16" +multer = "3.1.0" +trace_http = { path = "../trace_http" } +tracing = { workspace = true } +rmp-serde = "1.3.0" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0.145" +serde_urlencoded = "0.7.0" +schema = { path = "../schema" } +thiserror = "2.0.16" +trace = { path = "../trace" } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] # In alphabetical order +async-trait = "0.1" +data_types = { path = "../data_types" } +datafusion_util = { path = "../datafusion_util" } +insta = { version = "1.43.2", features = ["json", "redactions"] } +iox_time = { path = "../iox_time" } +metric = { path = "../metric" } +tokio = { version = "1.48", features = [ + "macros", + "net", + "parking_lot", + "rt-multi-thread", + "signal", + "sync", + "time", +] } diff --git a/iox_v1_query_api/src/error.rs b/iox_v1_query_api/src/error.rs new file mode 100644 index 00000000..916c1cd8 --- /dev/null +++ b/iox_v1_query_api/src/error.rs @@ -0,0 +1,122 @@ +use std::fmt::Debug; + +use datafusion::error::DataFusionError; +use iox_query_influxql_rewrite as rewrite; +use thiserror::Error; + +/// Error type for the v1 API +/// +/// This is used to catch errors that occur during the streaming process. +/// [`anyhow::Error`] is used as a catch-all because if anything fails during +/// that process it will result in a 500 INTERNAL ERROR. +#[derive(Debug, thiserror::Error)] +#[error("unexpected query error: {0}")] +pub struct QueryError(#[from] pub anyhow::Error); + +#[derive(Debug, Error)] +pub enum Error { + /// The requested path has no registered handler. 
+ #[error("not found: {0}")] + NoHandler(String), + + #[error("authorization failure: {0}")] + AuthorizationFailure(String), + + #[error("invalid mime type ({0})")] + InvalidMimeType(String), + + /// Missing parameters for query + #[error("missing query parameters 'db' and 'q'")] + MissingQueryParams, + + #[error("error decoding multipart file upload: {0}")] + MultipartFile(String), + + #[error("Invalid UTF8: {message} {error}")] + Utf8 { + message: &'static str, + error: String, + }, + + /// Serde decode error + #[error("error decoding params from url: {0}")] + SerdeUrlDecoding(#[from] serde_urlencoded::de::Error), + + // SerdeJsonError + #[error("error decoding query body: {0}")] + SerdeJson(#[from] serde_json::Error), + + #[error("datafusion error: {0}")] + Datafusion(#[from] DataFusionError), + + #[error("error in InfluxQL statement: {0}")] + InfluxqlRewrite(#[from] rewrite::Error), + + #[error("must provide only one InfluxQl statement per query")] + InfluxqlSingleStatement, + + #[error("must specify a 'db' parameter, or provide the database in the InfluxQL query")] + InfluxqlNoDatabase, + + #[error( + "provided a database in both the parameters ({param_db}) and \ + query string ({query_db}) that do not match, if providing a query \ + that specifies the database, you can omit the 'database' parameter \ + from your request" + )] + InfluxqlDatabaseMismatch { param_db: String, query_db: String }, + + #[error( + "provided a retention policy in both the parameters ({param_rp}) and \ + query string ({query_rp}) that do not match, if providing a query \ + that specifies the retention_policy, you can omit the 'rp' parameter \ + from your request" + )] + InfluxqlRetentionPolicyMismatch { param_rp: String, query_rp: String }, + + #[error("error reading field from body: {name} -- {error}")] + FieldRead { name: &'static str, error: String }, + + #[error("Cannot retrieve database: {0}")] + Database(DataFusionError), + + #[error("Database {0} not found")] + DatabaseNotFound(String), + + #[error("v1 query API error: {0}")] + V1Query(#[from] QueryError), +} + +#[derive(Debug, Clone)] +pub enum HttpError { + NotFound(String), + Unauthorized(String), + Invalid(String), + InternalError(String), +} + +impl From for HttpError { + fn from(e: Error) -> Self { + use Error::*; + use HttpError::*; + match e { + NoHandler(_) => NotFound(e.to_string()), + InvalidMimeType(_) + | MissingQueryParams + | InfluxqlSingleStatement + | InfluxqlNoDatabase + | Database(_) + | DatabaseNotFound(_) + | InfluxqlDatabaseMismatch { .. } + | InfluxqlRetentionPolicyMismatch { .. } + | MultipartFile(_) + | SerdeUrlDecoding(_) + | SerdeJson(_) + | Utf8 { .. } + | FieldRead { .. 
} + | InfluxqlRewrite(_) => Invalid(e.to_string()), + Datafusion(_) | V1Query(_) => InternalError(e.to_string()), + AuthorizationFailure(_) => Unauthorized(e.to_string()), + } + } +} diff --git a/iox_v1_query_api/src/handler.rs b/iox_v1_query_api/src/handler.rs new file mode 100644 index 00000000..0afd9857 --- /dev/null +++ b/iox_v1_query_api/src/handler.rs @@ -0,0 +1,963 @@ +use std::{collections::HashMap, sync::Arc}; + +use arrow::datatypes::Schema; +use authz::{Authorization, Authorizer, Permission, http::AuthorizationHeaderExtension}; +use bytes::Bytes; +use datafusion::{ + execution::SendableRecordBatchStream, parquet::data_type::AsBytes, physical_plan::ExecutionPlan, +}; +use futures::{StreamExt, stream::BoxStream}; +use http::{ + HeaderValue, Method, + header::{ACCEPT, CONTENT_TYPE}, + status::StatusCode, +}; +use http_body_util::BodyExt; +use iox_http_util::{ + Request, Response, ResponseBuilder, empty_response_body, stream_bytes_to_response_body, +}; +use iox_query::{ + QueryDatabase, + exec::IOxSessionContext, + query_log::{PermitAndToken, QueryCompletedToken, StatePlanned}, +}; +use iox_query_influxql::{ + frontend::planner::InfluxQLQueryPlanner, show_databases::InfluxQlShowDatabases, + show_retention_policies::InfluxQlShowRetentionPolicies, +}; +use iox_query_influxql_rewrite::{self as rewrite}; +use iox_query_params::StatementParams; +use mime::Mime; +use multer::Multipart; +use serde::Deserialize; +use serde_json::ser::{CompactFormatter, PrettyFormatter}; +use trace::{TraceCollector, ctx::SpanContext, span::SpanExt}; +use trace_http::{ + ctx::{RequestLogContext, RequestLogContextExt}, + query_variant::QueryVariant, +}; +use tracing::{info, warn}; + +use super::{ + DEFAULT_CHUNK_SIZE, Error, QueryFormat, QueryParams, Result, StatementFuture, types::Precision, +}; +use crate::{ + HttpError, + response::{ + buffered::BufferedResponseStream, + chunked::ChunkedResponseStream, + csv::CsvStream, + json::{BufferedJsonStream, ChunkedJsonStream}, + msgpack::{BufferedMessagePackStream, ChunkedMessagePackStream}, + }, + types::Statement, +}; + +#[derive(Debug)] +struct QueryPlan { + physical_plan: Arc, + schema: Arc, + query_completed_token: QueryCompletedToken, + context: IOxSessionContext, +} + +#[derive(Debug, Clone)] +pub struct V1HttpHandler { + database: Arc, + authz: Option>, + trace_collector: Option>, + iox_version: String, + show_databases: Option>, + show_retention_policies: Option>, +} + +impl V1HttpHandler { + pub fn new( + database: Arc, + authz: Option>, + trace_collector: Option>, + iox_version: String, + ) -> Self { + Self { + database, + authz, + trace_collector, + iox_version, + show_databases: None, + show_retention_policies: None, + } + } + + /// Add a `InfluxQlShowDatabases` to the handler + /// + /// This allows the implemention of `InfluxQlShowDatabases` to be added optionally so that + /// systems that support `SHOW DATABASES` queries (i.e., Core and Enterprise) can opt-in to that + /// functonality when constructing the `V1HttpHandler`. + pub fn with_show_databases(mut self, show_databases: Arc) -> Self { + self.show_databases = Some(show_databases); + self + } + + /// Add a `InfluxQlShowRetentionPolicies` to the handler + /// + /// This allows the implemention of `InfluxQlShowRetentionPolicies` to be added optionally so + /// that systems that support `SHOW RETENTION POLICIES` queries (i.e., Core and Enterprise) can + /// opt-in to that functonality when constructing the `V1HttpHandler`. 
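    ///
    /// A minimal construction sketch (illustrative only; the mock types come from the
    /// tests at the bottom of this file, and `db` is assumed to be an
    /// `Arc<dyn QueryDatabase>` such as the test database store used there):
    ///
    /// ```ignore
    /// let handler = V1HttpHandler::new(db, None, None, "test".to_string())
    ///     .with_show_databases(Arc::new(MockShowDatabases::new(["foo", "bar"])))
    ///     .with_show_retention_policies(Arc::new(MockShowRetentionPolicies::new()));
    /// ```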
+ pub fn with_show_retention_policies( + mut self, + show_retention_policies: Arc, + ) -> Self { + self.show_retention_policies = Some(show_retention_policies); + self + } + + pub async fn route_request(&self, req: Request) -> Result { + match (req.method(), req.uri().path()) { + (&Method::GET | &Method::POST, "/query") => self + .handle_parameterized_query(req) + .await + .inspect_err(|e| warn!("error encountered while handling /query: {:?}", e)), + (&Method::GET | &Method::HEAD, "/ping") => self.ping(req).await, + _ => Err(HttpError::NotFound(req.uri().path().to_owned())), + } + } + + async fn ping(&self, _req: Request) -> Result { + ResponseBuilder::new() + .status(StatusCode::NO_CONTENT) + // This is important for backwards compat with one of the clients + .header("X-Influxdb-Build", "cloud2") + .header("X-Influxdb-Version", self.iox_version.clone()) + .body(empty_response_body()) + .map_err(|e| HttpError::InternalError(e.to_string())) + } + + async fn handle_parameterized_query(&self, mut req: Request) -> Result { + let span_ctx = Some(SpanContext::new_with_optional_collector( + self.trace_collector.as_ref().map(Arc::clone), + )); + + // Go ahead and get the token before we consume the body, + // but we can't use it until later once we know the database. + let token = self.get_token_from_request(&mut req)?; + + let (params, format) = extract_request(req).await?; + + let QueryParams { + chunk_size, + chunked, + database, + retention_policy, + epoch, + pretty: _, + query, + params, + } = params; + let chunk_size = + chunked.and_then(|chunked| chunked.then(|| chunk_size.unwrap_or(DEFAULT_CHUNK_SIZE))); + + // Make a provided but empty db param None for better error messaging. + let database = if let Some("") = database.as_deref() { + None + } else { + database + }; + + if query.is_none() { + return Err(HttpError::Invalid( + "expected a query to be provided in the query string or body".to_owned(), + )); + } + + let query = query.unwrap(); + + let sp: StatementParams = params + .map(|s| serde_json::from_str(&s)) + .transpose() + .map_err(Error::from)? + .unwrap_or_default(); + + let statements = rewrite::parse_statements(query.as_str()).map_err(Error::from); + + let statements = match statements { + Ok(statements) => statements, + Err(e) => { + let statement = error_statement(e); + let response = statements_to_response(vec![statement], chunk_size, epoch, format); + + return ResponseBuilder::new() + .status(200) + .header(CONTENT_TYPE, format.as_content_type()) + .body(stream_bytes_to_response_body(response)) + .map_err(|e| HttpError::InternalError(e.to_string())); + } + }; + + let resolve_db = |request_db: Option, query_db: Option| { + match (request_db, query_db) { + (None, None) => None, + (None, Some(db)) | (Some(db), None) => Some(db), + (Some(_), Some(q)) => { + // Influxqlbridge prioritizes the embedded dp/rp in the query + // over the params if both are specified. 
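                    // For reference, the full resolution table implemented by this match:
                    //   (request db, query db)  -> resolved db
                    //   (None,       None)      -> None (reported later as InfluxqlNoDatabase)
                    //   (Some(p),    None)      -> Some(p)
                    //   (None,       Some(q))   -> Some(q)
                    //   (Some(_),    Some(q))   -> Some(q), the query's database wins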
+ Some(q) + } + } + }; + + let executing_statements = statements + .into_iter() + .map(|mut statement| { + let fut = async { + if statement.statement().is_show_databases() + && let Some(sd) = self.show_databases.as_ref() + { + let namespaces = match self + .database + .list_namespaces(span_ctx.child_span("list_namespaces")) + .await + .map_err(Error::Datafusion) + { + Ok(n) => n, + Err(e) => return Ok::<_, Error>(error_statement(e)), + }; + let permissions = namespaces + .into_iter() + .map(|n| { + Permission::ResourceAction( + authz::Resource::Database(authz::Target::ResourceName(n.name)), + authz::Action::Read, + ) + }) + .collect::>(); + + let authorized = self + .authz + .authorize(token.clone(), &permissions) + .await + .map_err(|error| Error::AuthorizationFailure(error.to_string()))?; + + let db_names = authorized + .permissions() + .iter() + .filter_map(|p| match p { + Permission::ResourceAction( + authz::Resource::Database(authz::Target::ResourceName(db_name)), + _, + ) => Some(db_name.to_owned()), + _ => None, + }) + .collect::>(); + + match sd.show_databases(db_names).await { + Ok(stream) => Ok(get_executing_statement_from_stream(stream)), + Err(error) => Ok(error_statement(error.into())), + } + } else if statement.statement().is_show_retention_policies() + && let Some(srp) = self.show_retention_policies.as_ref() + { + // Resolve database + let Some(database) = resolve_db(database.clone(), statement.resolve_dbrp()) + else { + return Ok::<_, Error>(error_statement(Error::InfluxqlNoDatabase)); + }; + + self.authz + .authorize( + token.clone(), + &[Permission::ResourceAction( + authz::Resource::Database(authz::Target::ResourceName( + database.clone(), + )), + authz::Action::Read, + )], + ) + .await + .map_err(|error| Error::AuthorizationFailure(error.to_string()))?; + + match srp.show_retention_policies(database).await { + Ok(stream) => Ok(get_executing_statement_from_stream(stream)), + Err(error) => Ok(error_statement(error.into())), + } + } else { + // Handle retention policy + match (retention_policy.clone(), statement.retention_policy()) { + (None, None) | (None, Some(_)) => {} + (Some(rp), None) => { + statement.set_retention_policy(rp); + } + (Some(_), Some(_)) => { + // Influxqlbridge prioritizes the embedded dp/rp in the query + // over the params if both are specified. + } + }; + + // Resolve database + let Some(database) = resolve_db(database.clone(), statement.resolve_dbrp()) + else { + return Ok::<_, Error>(error_statement(Error::InfluxqlNoDatabase)); + }; + + // Authorize request + let authz = self.authorize_request(token.clone(), &database).await?; + + // Generate the query + let query = statement.to_statement().to_string(); + + // Plan the query + let sp_clone = sp.clone(); + let span_ctx = span_ctx.clone(); + let query_plan = self + .plan_query( + query, + database, + sp_clone, + authz.into_subject(), + span_ctx, + None, + ) + .await; + + match query_plan { + Ok(query_plan) => { + // Get executing statement + let query_statement_result_stream = + get_executing_statement_from_plan( + query_plan, + Arc::clone(&self.database), + ); + + Ok(query_statement_result_stream) + } + Err(err) => Ok(error_statement(err)), + } + } + }; + + Ok::<_, Error>(fut) + }) + .collect::, _>>()?; + + // Execute these futures + let executing_statements = futures::future::try_join_all(executing_statements) + .await? 
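            // Failures inside an individual statement (planning, catalog lookups) were
            // converted into `error_statement` futures above, so the only `Err` that can
            // surface from `try_join_all` is an authorization failure, which aborts the
            // entire request.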
+ .into_iter() + .collect::>(); + + let response = statements_to_response(executing_statements, chunk_size, epoch, format); + + ResponseBuilder::new() + .status(200) + .header(CONTENT_TYPE, format.as_content_type()) + .body(stream_bytes_to_response_body(response)) + .map_err(|e| HttpError::InternalError(e.to_string())) + } + + async fn plan_query( + &self, + query: String, + database: String, + params: StatementParams, + authz_id: Option, + span_ctx: Option, + external_span_ctx: Option, + ) -> Result { + let namespace: Arc = database.into(); + let namespace_name = Arc::clone(&namespace); + let namespace_name = namespace_name.as_ref(); + + let db = self + .database + .namespace(namespace_name, span_ctx.child_span("get_namespace"), false) + .await + .map_err(Error::Database)? + .ok_or(Error::DatabaseNotFound(namespace_name.to_string()))?; + + let query_completed_token = db.record_query( + external_span_ctx.as_ref().map(RequestLogContext::ctx), + QueryVariant::InfluxQl.str(), + Box::new(query.to_string()), + params.clone(), + authz_id, + ); + + // Log after we acquire the permit and are about to start execution + info!( + %namespace_name, + %query, + trace=external_span_ctx.format_jaeger().as_str(), + variant=QueryVariant::InfluxQl.str(), + request_protocol="v1_http_query", + "InfluxQL request planning", + ); + + let context = db.new_query_context(span_ctx, None); + + let planner_ctx = context.child_ctx("v1 query planner"); + // Run planner on a separate threadpool, rather than the IO pool that is servicing this request + let physical_plan_res = + context + .run(async move { + InfluxQLQueryPlanner::query(query.as_ref(), params, &planner_ctx).await + }) + .await; + + let (physical_plan, query_completed_token) = match physical_plan_res { + Ok(physical_plan) => { + let query_completed_token = + query_completed_token.planned(&context, Arc::clone(&physical_plan)); + (physical_plan, query_completed_token) + } + Err(e) => { + query_completed_token.fail(); + Err(Error::from(e))? + } + }; + + let schema = Arc::clone(&physical_plan.schema()); + Ok(QueryPlan { + physical_plan, + schema, + query_completed_token, + context, + }) + } + + fn get_token_from_request(&self, req: &mut Request) -> Result>, Error> { + let token = if let Some(p) = extract_v1_auth_token(req) { + Some(p) + } else { + let auth_header = req.extensions().get::(); + auth_header + .and_then(|auth_header| { + let header_value = &**auth_header; + header_value.as_ref().map(validate_auth_header) + }) + .transpose()? 
+ }; + + Ok(token) + } + + async fn authorize_request( + &self, + token: Option>, + database: &str, + ) -> Result { + let required_permission = authz::Permission::ResourceAction( + authz::Resource::Database(authz::Target::ResourceName(database.to_string())), + authz::Action::Read, + ); + + self.authz + .authorize(token, &[required_permission]) + .await + .map_err(|e| Error::AuthorizationFailure(e.to_string())) + } +} + +fn statements_to_response( + executing_statements: Vec, + chunk_size: Option, + epoch: Option, + format: QueryFormat, +) -> BoxStream<'static, Bytes> { + match format { + QueryFormat::Csv => CsvStream::new(executing_statements) + .with_epoch(epoch) + .boxed(), + QueryFormat::Json => match chunk_size { + Some(chunk_size) => { + let response_stream = ChunkedResponseStream::new(executing_statements, chunk_size); + ChunkedJsonStream::new(response_stream, || CompactFormatter, epoch).boxed() + } + None => { + let response_stream = BufferedResponseStream::new(executing_statements); + BufferedJsonStream::new(response_stream, || CompactFormatter, epoch).boxed() + } + }, + QueryFormat::JsonPretty => match chunk_size { + Some(chunk_size) => { + let response_stream = ChunkedResponseStream::new(executing_statements, chunk_size); + ChunkedJsonStream::new(response_stream, PrettyFormatter::new, epoch).boxed() + } + None => { + let response_stream = BufferedResponseStream::new(executing_statements); + BufferedJsonStream::new(response_stream, PrettyFormatter::new, epoch).boxed() + } + }, + QueryFormat::MsgPack => match chunk_size { + Some(chunk_size) => { + let response_stream = ChunkedResponseStream::new(executing_statements, chunk_size); + ChunkedMessagePackStream::new(response_stream, epoch).boxed() + } + None => { + let response_stream = BufferedResponseStream::new(executing_statements); + BufferedMessagePackStream::new(response_stream, epoch).boxed() + } + }, + } +} + +fn get_executing_statement_from_stream(stream: SendableRecordBatchStream) -> StatementFuture { + Box::new(async move { Ok(Statement::new(stream.schema(), None, stream)) }) +} + +fn get_executing_statement_from_plan( + query_plan: QueryPlan, + database: Arc, +) -> StatementFuture { + let QueryPlan { + physical_plan, + schema, + query_completed_token, + context, + } = query_plan; + + let fut = async move { + let permit_span = context.child_span("query_rate_limit_semaphore"); + let permit = database.acquire_semaphore(permit_span).await; + let query_completed_token: iox_query::query_log::QueryCompletedToken< + iox_query::query_log::StatePermit, + > = query_completed_token.permit(); + + context + .execute_stream(physical_plan) + .await + .map(|stream| { + Statement::new( + Arc::clone(&schema), + Some(PermitAndToken { + permit, + query_completed_token, + }), + stream, + ) + }) + .map_err(Error::from) + }; + + Box::new(fut) +} + +fn error_statement(error: Error) -> StatementFuture { + Box::new(futures::future::err(error)) +} + +#[derive(Debug, Deserialize)] +struct V1AuthParameters { + #[serde(rename = "p")] + password: Option, +} + +fn extract_v1_auth_token(req: &mut Request) -> Option> { + req.uri() + .path_and_query() + .and_then(|pq| match pq.path() { + "/query" => pq.query(), + _ => None, + }) + .map(serde_urlencoded::from_str::) + .transpose() + .ok() + .flatten() + .and_then(|params| params.password) + .map(String::into_bytes) +} + +fn validate_auth_header(header: &HeaderValue) -> Result> { + let header = header.to_str().map_err(|e| Error::Utf8 { + message: "auth header", + error: e.to_string(), + })?; + 
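    // If no token can be extracted from the header, the request is rejected as an
    // authorization failure rather than being treated as anonymous.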
authz::extract_token(Some(header)).ok_or(Error::AuthorizationFailure( + "failed to extract token from header".to_owned(), + )) +} + +enum SupportedContentType { + ApplicationInfluxql, + FormUrlEncoded, + MultipartFormData, +} + +impl SupportedContentType { + fn from_request(req: &Request) -> Result { + if let Some(ct) = req.headers().get("Content-Type") { + let ct = std::str::from_utf8(ct.as_bytes()).map_err(|e| Error::Utf8 { + message: "mime type", + error: e.to_string(), + })?; + let mime: Mime = ct + .parse() + .map_err(|x: mime::FromStrError| Error::InvalidMimeType(x.to_string()))?; + + match (mime.type_(), mime.subtype()) { + (mime::APPLICATION, mime::WWW_FORM_URLENCODED) => Ok(Self::FormUrlEncoded), + (mime::APPLICATION, subtype) if subtype.as_str() == "vnd.influxql" => { + Ok(Self::ApplicationInfluxql) + } + (mime::MULTIPART, mime::FORM_DATA) => Ok(Self::MultipartFormData), + _ => Err(Error::InvalidMimeType(mime.to_string())), + } + } else { + // Default to assuming an influxql POST body + Ok(Self::ApplicationInfluxql) + } + } +} + +async fn influxql_body(req: Request) -> Result { + let mut params = QueryParams::from_request_query_string(&req)?; + // We support a "q" query string for POST too. + // If empty, check the content-type and parse the body appropriately. + if params.query.as_ref().is_none_or(|x| x.is_empty()) { + let bytes = req + .into_body() + .collect() + .await + .map_err(|_| { + HttpError::Invalid("Error retrieving bytes from response body".to_owned()) + })? + .to_bytes(); + params.query = Some(String::from_utf8(bytes.to_vec()).map_err(|_| { + HttpError::Invalid("Error retrieving query from request body".to_owned()) + })?); + }; + + Ok(params) +} + +async fn form_urlencoded(req: Request) -> Result { + let (body_params, _) = form_urlencoded_inner(req).await?; + Ok(body_params) +} + +async fn form_urlencoded_inner(req: Request) -> Result<(QueryParams, Bytes), HttpError> { + // The 1.x implementation uses [FormValue](https://pkg.go.dev/net/http#Request.FormValue) + // which relies on [ParseForm](https://pkg.go.dev/net/http#Request.ParseForm). + // + // This will always parse the URL query string as well as parsing the form body when required. + // Request body parameters take precedence over URL query string values. + + // It is okay to swallow the error here, since a query string is not mandatory. + let query_string_params = QueryParams::from_request_query_string(&req).unwrap_or_default(); + + let bytes = req + .into_body() + .collect() + .await + .map_err(|e| Error::FieldRead { + name: "body", + error: e.to_string(), + })? 
+ .to_bytes(); + let mut body_params = QueryParams::from_bytes_form_urlencoded(&bytes)?; + + body_params.merge(query_string_params); + + Ok((body_params, bytes)) +} + +async fn multipart_upload(req: Request) -> Result { + let boundary = req + .headers() + .get(CONTENT_TYPE) + .and_then(|ct| ct.to_str().ok()) + .and_then(|ct| multer::parse_boundary(ct).ok()); + + if boundary.is_none() { + return Err(HttpError::Invalid( + "A boundary header is required for multipart upload".to_owned(), + )); + } + + let (lower_precedence_params, bytes) = form_urlencoded_inner(req).await?; + + let mut fields: HashMap = HashMap::new(); + // all the fields are strings, + // then parse in a method on QueryParams + let mut multipart = Multipart::new( + futures::stream::once(async { Ok::<_, Error>(bytes) }), + boundary.unwrap(), // safe due to is_none check above + ); + while let Some(mut field) = multipart + .next_field() + .await + .map_err(|e| Error::MultipartFile(e.to_string()))? + { + if let Some(name) = field.name() { + let name = name.to_owned(); + let mut value = Vec::new(); + while let Some(field_chunk) = field + .chunk() + .await + .map_err(|e| Error::MultipartFile(e.to_string()))? + { + value.extend_from_slice(field_chunk.as_bytes()); + } + + let value = String::from_utf8(value).map_err(|e| Error::Utf8 { + message: "multipart field", + error: e.to_string(), + })?; + fields.insert(name, value); + } + } + + let mut body_params = QueryParams::from_hashmap_multipart(fields)?; + body_params.merge(lower_precedence_params); + + Ok(body_params) +} + +async fn extract_request(req: Request) -> Result<(QueryParams, QueryFormat), HttpError> { + // Pull the mime_type out before we consume the body in the match + let accept = req.headers().get(ACCEPT).cloned(); + let mime_type = accept.as_ref().map(HeaderValue::as_bytes); + + let qp = match *req.method() { + Method::GET => Ok(QueryParams::from_request_query_string(&req)?), + Method::POST => { + let content_type = SupportedContentType::from_request(&req)?; + match content_type { + SupportedContentType::ApplicationInfluxql => influxql_body(req).await, + SupportedContentType::FormUrlEncoded => form_urlencoded(req).await, + SupportedContentType::MultipartFormData => multipart_upload(req).await, + } + } + _ => Err(HttpError::Invalid("Invalid request method".to_owned())), + }?; + + let qf = QueryFormat::from_bytes(mime_type, qp.pretty.unwrap_or_default())?; + Ok((qp, qf)) +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use authz::{Action, Authorization, Authorizer, Error, Permission, Target}; + use iox_http_util::{RequestBuilder, empty_request_body, read_body_bytes_for_tests}; + use iox_query::{QueryDatabase, test::TestDatabaseStore}; + use iox_query_influxql::{ + show_databases::mock::MockShowDatabases, + show_retention_policies::mock::{MockRetentionPolicy, MockShowRetentionPolicies}, + }; + + use crate::V1HttpHandler; + + #[derive(Debug)] + struct MockAuthorizer { + authorized_databases: Vec, + } + + impl MockAuthorizer { + fn new(databases: impl IntoIterator>) -> Self { + Self { + authorized_databases: databases.into_iter().map(Into::into).collect(), + } + } + } + + #[async_trait::async_trait] + impl Authorizer for MockAuthorizer { + async fn authorize( + &self, + _token: Option>, + _perms: &[Permission], + ) -> Result { + let permissions = self + .authorized_databases + .iter() + .map(|n| { + Permission::ResourceAction( + authz::Resource::Database(Target::ResourceName(n.to_string())), + Action::Read, + ) + }) + .collect::>(); + 
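            // The mock ignores the permissions that were actually requested and simply
            // grants read access to every database it was configured with.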
Ok(Authorization::new(None, permissions)) + } + } + + #[tokio::test] + async fn test_show_databases() { + let db = Arc::new(TestDatabaseStore::default()); + db.db_or_create("foo").await; + db.db_or_create("bar").await; + let show_databases = Arc::new(MockShowDatabases::new(["foo", "bar"])); + let handler = V1HttpHandler::new(db, None, None, "test".to_string()) + .with_show_databases(show_databases); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?q=show%20databases") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW DATABASES -- on handler with SHOW DATABASES enabled", + },{ + insta::assert_snapshot!(res); + }); + } + + #[tokio::test] + async fn test_show_databases_with_no_impl() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let handler = V1HttpHandler::new(db, None, None, "test".to_string()); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?q=show%20databases") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW DATABASES -- on handler with SHOW DATABASES _not_ enabled", + },{ + insta::assert_snapshot!(res); + }); + } + + #[tokio::test] + async fn test_show_databases_with_authz() { + let db = Arc::new(TestDatabaseStore::default()); + db.db_or_create("foo").await; + db.db_or_create("bar").await; + db.db_or_create("mop").await; + { + let authz = Arc::new(MockAuthorizer::new(["foo", "bar"])); + // The show databases has databases foo, bar, and mop, but only foo and bar will be returned + // due to the mock authorizer... + let show_databases = Arc::new(MockShowDatabases::new(["foo", "bar", "mop"])); + let handler = + V1HttpHandler::new(Arc::clone(&db) as _, Some(authz), None, "test".to_string()) + .with_show_databases(show_databases); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?q=show%20databases") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW DATABASES -- should not return mop database due to authz", + },{ + insta::assert_snapshot!(res); + }); + } + { + let authz = Arc::new(MockAuthorizer::new(["foo", "bar", "mop"])); + // The show databases has databases foo, bar, and mop, but only foo and bar will be returned + // due to the mock authorizer... 
+ let show_databases = Arc::new(MockShowDatabases::new(["foo", "bar", "mop"])); + let handler = V1HttpHandler::new(db, Some(authz), None, "test".to_string()) + .with_show_databases(show_databases); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?q=show%20databases") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW DATABASES -- should return mop database after adding to authz", + },{ + insta::assert_snapshot!(res); + }); + } + } + + #[tokio::test] + async fn test_show_retention_policies() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let show_retention_policies = Arc::new( + MockShowRetentionPolicies::new() + .with_default_retention_policy("foo") + .with_default_retention_policy("bar") + .with_retention_policy( + "bar", + MockRetentionPolicy::new("short").with_duration(Duration::from_secs(100)), + ), + ); + let handler = V1HttpHandler::new(db, None, None, "test".to_string()) + .with_show_retention_policies(show_retention_policies); + // on foo db: + { + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?db=foo&q=show%20retention%20policies") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW RETENTION POLICIES -- on `foo` database which contains one \ + default policy", + },{ + insta::assert_snapshot!(res); + }); + } + // on bar db: + { + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?db=bar&q=show%20retention%20policies") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW RETENTION POLICIES -- on `bar` database which contains one \ + default policy and one non-default policy", + },{ + insta::assert_snapshot!(res); + }); + } + // on non-existent db: + { + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?db=frodo&q=show%20retention%20policies") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW RETENTION POLICIES -- on `frodo` database which does not \ + exist", + },{ + insta::assert_snapshot!(res); + }); + } + } + + #[tokio::test] + async fn test_show_retention_policies_with_no_impl() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let handler = V1HttpHandler::new(db, None, None, "test".to_string()); + let req = RequestBuilder::new() + .method("GET") + .uri("http://foo.bar/query?db=foo&q=show%20retention%20policies") + .body(empty_request_body()) + .unwrap(); + let res = handler.route_request(req).await.unwrap(); + let res = read_body_bytes_for_tests(res.into_body()).await; + let res = String::from_utf8(Vec::::from(res)).unwrap(); + insta::with_settings!({ + description => "SHOW RETENTION POLICIES -- on handler with SHOW RETENTION POLICIES \ + _not_ enabled", + },{ + insta::assert_snapshot!(res); + 
}); + } +} diff --git a/iox_v1_query_api/src/lib.rs b/iox_v1_query_api/src/lib.rs new file mode 100644 index 00000000..b099c875 --- /dev/null +++ b/iox_v1_query_api/src/lib.rs @@ -0,0 +1,216 @@ +use workspace_hack as _; + +// A large majority of the code in this file was copied from the Monolith +// project and then adapted to fit our format/needs. +use std::{collections::HashMap, fmt::Debug, num::ParseIntError, str::ParseBoolError}; + +use bytes::Bytes; +use error::Error; +use iox_http_util::Request; +use serde::Deserialize; +use types::Precision; +use types::Statement; + +mod error; +pub use error::HttpError; +mod handler; +pub use handler::V1HttpHandler; +mod response; +mod types; +mod value; + +const DEFAULT_CHUNK_SIZE: usize = 10_000; + +type Result = std::result::Result; +type StatementFuture = Box> + Send>; +/// Enum representing the query format for the v1/query API. +/// +/// The original API supports CSV, JSON, and "pretty" JSON formats. +#[derive(Debug, Default, Deserialize, Clone, Copy, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum QueryFormat { + Csv, + #[default] + Json, + JsonPretty, + MsgPack, +} + +impl QueryFormat { + /// Returns the content type as a string slice for the query format. + /// + /// Maps the `QueryFormat` variants to their corresponding MIME types as strings. + /// This is useful for setting the `Content-Type` header in HTTP responses. + pub fn as_content_type(&self) -> &str { + match self { + Self::Csv => "application/csv", + Self::Json | Self::JsonPretty => "application/json", + Self::MsgPack => "application/x-msgpack", + } + } + + /// Extracts the [`QueryFormat`] from an HTTP [`Request`]. + /// + /// Parses the HTTP request to determine the desired query format. The `pretty` + /// parameter indicates if the pretty format is requested via a query parameter. + /// The function inspects the `Accept` header of the request to determine the + /// format, defaulting to JSON if no specific format is requested. If the format + /// is invalid or non-UTF8, an error is returned. + pub fn from_bytes(mime_type: Option<&[u8]>, pretty: bool) -> Result { + match mime_type { + Some(b"application/csv" | b"text/csv") => Ok(Self::Csv), + Some(b"application/x-msgpack") => Ok(Self::MsgPack), + Some(b"application/json" | b"*/*") | None => { + // If no specific format is requested via the Accept header, + // and the 'pretty' parameter is true, use the pretty JSON format. + // Otherwise, default to the regular JSON format. + if pretty { + Ok(Self::JsonPretty) + } else { + Ok(Self::Json) + } + } + Some(mime_type) => match std::str::from_utf8(mime_type) { + Ok(s) => Err(Error::InvalidMimeType(s.to_owned())), + Err(e) => Err(Error::Utf8 { + message: "mime type", + error: e.to_string(), + }), + }, + } + } +} + +/// Query parameters for the v1/query API +/// +/// The original API supports a `u` parameter, for "username", as well as a `p`, +/// for "password". The password is extracted upstream, and username is ignored. 
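///
/// A hedged example of how a request's query string maps onto these fields
/// (values are illustrative only):
///
/// ```ignore
/// // GET /query?db=mydb&rp=autogen&q=SELECT%20*%20FROM%20cpu&chunked=true&chunk_size=500
/// //   database:         Some("mydb")
/// //   retention_policy: Some("autogen")
/// //   query:            Some("SELECT * FROM cpu")
/// //   chunked:          Some(true)
/// //   chunk_size:       Some(500)
/// ```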
+#[derive(Debug, Default, Deserialize)] +pub struct QueryParams { + /// Chunk the response into chunks of size `chunk_size`, or 10,000, or by series + pub chunked: Option, + /// Define the number of records that will go into a chunk + pub chunk_size: Option, + /// Database to perform the query against + /// + /// This is optional because the query string may specify the database + #[serde(rename = "db")] + pub database: Option, + /// Retention Policy to perform the query against + /// + /// This is optional because the query string may specify the rp + #[serde(rename = "rp")] + pub retention_policy: Option, + /// Map timestamps to UNIX epoch time, with the given precision + pub epoch: Option, + /// Format the JSON outputted in pretty format + pub pretty: Option, + /// The InfluxQL query string + #[serde(rename = "q")] + pub query: Option, + /// Params for parameterized queries + pub params: Option, +} + +impl QueryParams { + /// Extract [`QueryParams`] from an HTTP [`Request`] + pub fn from_request_query_string(req: &Request) -> Result { + let query = req.uri().query().ok_or(Error::MissingQueryParams)?; + let mut params: Self = serde_urlencoded::from_str(query).map_err(Error::from)?; + + // For other request types we need to know if the value was set or not, + // so we have to unwrap_or_default here rather than on QueryParams directly. + params.chunked = Some(params.chunked.unwrap_or_default()); + params.pretty = Some(params.pretty.unwrap_or_default()); + + Ok(params) + } + + pub fn from_bytes_form_urlencoded(bytes: &Bytes) -> Result { + serde_urlencoded::from_bytes(bytes).map_err(Into::into) + } + + pub fn from_hashmap_multipart(fields: HashMap) -> Result { + let mut this = Self::default(); + + if let Some(chunked) = fields.get("chunked") { + let b = chunked + .trim() + .parse() + .map_err(|e: ParseBoolError| Error::FieldRead { + name: "chunked", + error: e.to_string(), + })?; + this.chunked = Some(b); + } + + if let Some(chunk_size) = fields.get("chunk_size") { + let u = chunk_size + .trim() + .parse() + .map_err(|e: ParseIntError| Error::FieldRead { + name: "chunk_size", + error: e.to_string(), + })?; + this.chunk_size = Some(u); + } + + if let Some(epoch) = fields.get("epoch") { + let e = epoch.trim(); + let e = serde_json::from_str(e).map_err(Error::from)?; + this.epoch = Some(e) + } + + if let Some(pretty) = fields.get("pretty") { + let p = pretty + .trim() + .parse() + .map_err(|e: ParseBoolError| Error::FieldRead { + name: "pretty", + error: e.to_string(), + })?; + this.pretty = Some(p); + } + + this.database = fields.get("db").cloned(); + this.retention_policy = fields.get("rp").cloned(); + this.query = fields.get("q").cloned(); + this.params = fields.get("params").cloned(); + + Ok(this) + } + + pub fn merge(&mut self, lower_precedence: Self) { + if self.chunked.is_none() { + self.chunked = lower_precedence.chunked; + } + + if self.chunk_size.is_none() { + self.chunk_size = lower_precedence.chunk_size; + } + + if self.database.is_none() { + self.database = lower_precedence.database; + } + + if self.retention_policy.is_none() { + self.retention_policy = lower_precedence.retention_policy; + } + + if self.epoch.is_none() { + self.epoch = lower_precedence.epoch; + } + + if self.pretty.is_none() { + self.pretty = lower_precedence.pretty; + } + + if self.query.is_none() { + self.query = lower_precedence.query; + } + + if self.params.is_none() { + self.params = lower_precedence.params; + } + } +} diff --git a/iox_v1_query_api/src/response.rs b/iox_v1_query_api/src/response.rs new file 
mode 100644 index 00000000..0b7c4fc8 --- /dev/null +++ b/iox_v1_query_api/src/response.rs @@ -0,0 +1,634 @@ +use arrow::array::RecordBatch; +use serde::{ + Serialize, Serializer, + ser::{SerializeSeq, SerializeStruct}, +}; +use std::{ + collections::BTreeMap, + fmt::{Debug, Formatter}, + sync::Arc, +}; + +pub(super) mod buffered; +pub(super) mod chunked; +pub(super) mod csv; +pub(super) mod json; +pub(super) mod msgpack; +use buffered::BufferedResponseStream; +use chunked::ChunkedResponseStream; +mod stream; +use stream::{SeriesChunkMergeStream, SeriesChunkStream}; + +use super::{ + types::Precision, + value::{Value, ValueSerializer}, +}; + +#[derive(Debug, PartialEq)] +pub(super) struct Series { + measurement: String, + tags: BTreeMap, String>, +} + +impl Series { + fn new(measurement: String, tags: BTreeMap, String>) -> Self { + Self { measurement, tags } + } +} + +/// This represents a discrete chunk of data as defined by the V1 query API. +/// This can be a complete Series when: +/// 1. We are not running in chunked mode +/// 2. We are in chunked mode, and the series is smaller than the chunk size +/// +/// or this will be a Chunk (a subset of a Series) as defined by the chunk size. +/// +/// The intention is that this type can be easily converted to our output formats +/// and returned/streamed by another stream. +#[derive(PartialEq)] +pub(crate) struct SeriesChunk { + measurement_column: usize, + + tag_columns: Arc, usize>>, + + value_columns: Arc<[(Arc, usize)]>, + + /// SeriesChunks may contain one or more record batches. + data: Vec, + /// If this chunk is a partial chunk. + partial: bool, +} + +impl SeriesChunk { + /// Create a new SeriesChunk from a RecordBatch. + fn new( + measurement_column: usize, + tag_columns: Arc, usize>>, + value_columns: Arc<[(Arc, usize)]>, + batch: RecordBatch, + ) -> Self { + Self { + measurement_column, + tag_columns, + value_columns, + data: vec![batch], + partial: false, + } + } + + /// Get the measurement name this series belongs to. + fn measurement(&self) -> String { + assert!( + !self.data.is_empty(), + "SeriesChunk should have at least one record batch" + ); + Value::new(self.data[0].column(self.measurement_column), 0).to_string() + } + + /// Get the tags that define this series. + fn tags(&self) -> BTreeMap, String> { + assert!( + !self.data.is_empty(), + "SeriesChunk should have at least one record batch" + ); + self.tag_columns + .iter() + .map(|(name, idx)| (Arc::clone(name), Value::new(self.data[0].column(*idx), 0))) + .map(|(name, value)| (name, value.to_string())) + .collect() + } + + /// Get the definition of the Series this chunk is part of. + fn series(&self) -> Series { + Series::new(self.measurement(), self.tags()) + } + + // Get the names of the columns in this SeriesChunk. + fn columns(&self) -> Vec> { + self.value_columns + .iter() + .map(|(name, _)| Arc::clone(name)) + .collect() + } + + /// Get the total number of rows in this SeriesChunk. + fn num_rows(&self) -> usize { + self.data.iter().map(|x| x.num_rows()).sum() + } + + /// Get the values at the given row. + fn row(&self, row: usize) -> Option> { + if row >= self.num_rows() { + return None; + } + + // Calculate the index of the record batch, and the index of the row within that batch. + // + // For example, if we have 3 batches that look like this: + // [ + // [a, b, c], + // [d, e, f, g], + // [h, i] + // ] + // and we want to get the value of row 5 (rows starting from 0), which is "f", + // the index of the batch is 1, and the index of the row within that batch is 2. 
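        //
        // Walking the loop below with those batch sizes [3, 4, 2] and row = 5:
        //   start:     batch_idx = 0, row_idx = 5
        //   5 >= 3  -> batch_idx = 1, row_idx = 2
        //   2 <  4  -> stop; read row 2 of batch 1, which is "f"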
+ let mut batch_idx = 0; + let mut row_idx = row; + while row_idx >= self.data[batch_idx].num_rows() { + row_idx -= self.data[batch_idx].num_rows(); + batch_idx += 1; + } + + if batch_idx > self.data.len() { + return None; + } + + let mut values = Vec::new(); + for (_, idx) in self.value_columns.iter() { + values.push(Value::new(self.data[batch_idx].column(*idx), row_idx)); + } + Some(values) + } + + /// Split this SeriesChunk into two SeriesChunks at the given size. + fn split_at(self, mut size: usize) -> (Self, Self) { + let mut left = Self { + measurement_column: self.measurement_column, + tag_columns: Arc::clone(&self.tag_columns), + value_columns: Arc::clone(&self.value_columns), + data: Vec::new(), + partial: self.partial, + }; + let mut right = Self { + measurement_column: self.measurement_column, + tag_columns: self.tag_columns, + value_columns: self.value_columns, + data: Vec::new(), + partial: self.partial, + }; + let it = self.data.into_iter(); + for batch in it { + if size > 0 { + if batch.num_rows() > size { + left.data.push(batch.slice(0, size)); + right.data.push(batch.slice(size, batch.num_rows() - size)); + size = 0; + } else { + size -= batch.num_rows(); + left.data.push(batch); + } + } else { + right.data.push(batch); + } + } + (left, right) + } + + /// Merge another SeriesChunk into this one. + fn merge(&mut self, other: Self) { + assert_eq!(self.series(), other.series()); + self.data.extend(other.data); + } +} + +struct SeriesChunkSerializer<'a> { + chunk: &'a SeriesChunk, + epoch: Option, + /// Allow infinite values + allow_inf: bool, +} + +impl<'a> SeriesChunkSerializer<'a> { + fn new(chunk: &'a SeriesChunk, epoch: Option, allow_inf: bool) -> Self { + Self { + chunk, + epoch, + allow_inf, + } + } +} + +impl Serialize for SeriesChunkSerializer<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut fields = 0; + let name = self.chunk.measurement(); + if !name.is_empty() { + fields += 1; + } + let tags = self.chunk.tags(); + if !tags.is_empty() { + fields += 1; + } + let columns = self.chunk.columns(); + if !columns.is_empty() { + fields += 1; + } + if self.chunk.num_rows() > 0 { + fields += 1; + } + if self.chunk.partial { + fields += 1; + } + + let mut obj = serializer.serialize_struct("", fields)?; + if !name.is_empty() { + obj.serialize_field("name", &name)?; + } + if !tags.is_empty() { + obj.serialize_field("tags", &tags)?; + } + if !columns.is_empty() { + obj.serialize_field("columns", &self.chunk.columns())?; + } + if self.chunk.num_rows() > 0 { + obj.serialize_field( + "values", + &SeriesValues { + chunk: self.chunk, + epoch: self.epoch, + allow_inf: self.allow_inf, + }, + )?; + } + if self.chunk.partial { + obj.serialize_field("partial", &self.chunk.partial)?; + } + obj.end() + } +} + +impl Debug for SeriesChunk { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Chunk") + .field("measurement", &self.measurement()) + .field("tags", &self.tags()) + .field("columns", &self.columns()) + .finish_non_exhaustive() + } +} + +impl<'a> IntoIterator for &'a SeriesChunk { + type Item = Vec; + type IntoIter = SeriesChunkIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + SeriesChunkIter { + chunk: self, + row: 0, + } + } +} + +pub(crate) struct SeriesChunkIter<'a> { + chunk: &'a SeriesChunk, + row: usize, +} + +impl Iterator for SeriesChunkIter<'_> { + type Item = Vec; + + fn next(&mut self) -> Option { + let row = self.chunk.row(self.row); + if row.is_some() { + self.row += 1; + } + row + } + + fn 
size_hint(&self) -> (usize, Option) { + let remaining = self.chunk.num_rows() - self.row; + (remaining, Some(remaining)) + } +} + +/// This is a helper struct to serialize a SeriesChunk into the JSON +/// format +struct SeriesValues<'a> { + chunk: &'a SeriesChunk, + epoch: Option, + /// Allow infinite values + allow_inf: bool, +} + +impl Serialize for SeriesValues<'_> { + fn serialize( + &self, + serializer: S, + ) -> std::result::Result { + let mut seq = serializer.serialize_seq(Some(self.chunk.num_rows()))?; + for row in self.chunk.into_iter() { + let row = row + .iter() + .map(|e| ValueSerializer::new(e, self.epoch, self.allow_inf)) + .collect::>(); + seq.serialize_element(&row)?; + } + seq.end() + } +} + +/// The result of a single InfluxQL statement. This is equivalent to +/// [query.Result](https://github.com/influxdata/influxdb/blob/master-1.x/query/result.go#L86) +/// from InfluxDB v1. +/// +/// N.B. This doesn't support the messages field, as we have no use for +/// it. +#[derive(Debug, PartialEq)] +pub(crate) struct StatementResult { + statement_id: usize, + series: Vec, + partial: bool, + error: String, +} + +impl StatementResult { + fn new(statement_id: usize) -> Self { + Self { + statement_id, + series: Vec::new(), + partial: false, + error: String::new(), + } + } + + fn add_series(&mut self, series: SeriesChunk) { + self.series.push(series); + } + + fn set_partial(&mut self, partial: bool) { + self.partial = partial; + } + + fn set_error(&mut self, error: String) { + self.error = error; + } + + fn is_error(&self) -> bool { + !self.error.is_empty() + } +} + +struct StatementResultSerializer<'a> { + result: &'a StatementResult, + epoch: Option, + /// Allow infinite values + allow_inf: bool, +} + +impl Serialize for StatementResultSerializer<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut fields = 1; + if !self.result.series.is_empty() { + fields += 1; + } + if self.result.partial { + fields += 1; + } + if !self.result.error.is_empty() { + fields += 1; + } + let mut obj = serializer.serialize_struct("", fields)?; + obj.serialize_field("statement_id", &self.result.statement_id)?; + if !self.result.series.is_empty() { + let series = self + .result + .series + .iter() + .map(|s| SeriesChunkSerializer::new(s, self.epoch, self.allow_inf)) + .collect::>(); + obj.serialize_field("series", &series)?; + } + if self.result.partial { + obj.serialize_field("partial", &self.result.partial)?; + } + if !self.result.error.is_empty() { + obj.serialize_field("error", &self.result.error)?; + } + obj.end() + } +} + +#[derive(Debug, Default, PartialEq)] +pub(crate) struct Response(Vec); + +impl Response { + pub(crate) fn add_result(&mut self, result: StatementResult) { + self.0.push(result); + } +} + +struct ResponseSerializer<'a> { + response: &'a Response, + epoch: Option, + /// Allow infinite values + allow_inf: bool, +} + +impl<'a> ResponseSerializer<'a> { + fn new(response: &'a Response, epoch: Option, allow_inf: bool) -> Self { + Self { + response, + epoch, + allow_inf, + } + } +} + +impl Serialize for ResponseSerializer<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let fields = if self.response.0.is_empty() { 0 } else { 1 }; + let mut obj = serializer.serialize_struct("", fields)?; + if !self.response.0.is_empty() { + let result = self + .response + .0 + .iter() + .map(|r| StatementResultSerializer { + result: r, + epoch: self.epoch, + allow_inf: self.allow_inf, + }) + .collect::>(); + 
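            // Writes the v1 envelope, e.g. `{"results":[{"statement_id":0,...}]}`; when
            // there are no statement results the "results" key is omitted entirely,
            // matching the field count computed above.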
obj.serialize_field("results", &result)?; + } + obj.end() + } +} + +#[cfg(test)] +mod tests { + use crate::StatementFuture; + use crate::error::Error; + use crate::types::Statement; + use arrow::{ + array::{ArrayRef, RecordBatch}, + datatypes::{DataType, Field, Schema, SchemaRef}, + }; + use data_types::NamespaceId; + use datafusion::physical_plan::{ExecutionPlan, test::exec::MockExec}; + use generated_types::influxdata::iox::querier::v1::{ + InfluxQlMetadata, influx_ql_metadata::TagKeyColumn, + }; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::{PermitAndToken, QueryLog}; + use iox_query_params::StatementParams; + use schema::{INFLUXQL_MEASUREMENT_COLUMN_NAME, INFLUXQL_METADATA_KEY, TIME_COLUMN_NAME}; + use std::{collections::HashMap, sync::Arc}; + + #[derive(Clone)] + pub(super) enum Column { + Measurement, + Tag { + name: &'static str, + group_by: bool, + projected: bool, + }, + Time, + Field { + name: &'static str, + }, + } + + pub(super) fn make_statement( + database: &Arc, + ctx: &Arc, + log: &Arc, + columns: impl IntoIterator, + data: Vec, + ) -> StatementFuture { + let (schema, batches) = make_schema_and_batches(columns, vec![data]); + let exec: Arc = Arc::new( + MockExec::new( + batches.into_iter().map(Ok).collect(), + SchemaRef::clone(&schema), + ) + .with_use_task(false), + ); + + let database = Arc::clone(database); + let ctx = Arc::clone(ctx); + let log = Arc::clone(log); + let fut = async move { + let token = log.push( + NamespaceId::new(0), + Arc::from("test"), + "test_query", + Box::new("test_query".to_string()), + StatementParams::new(), + None, + None, + ); + let token = token.planned(ctx.as_ref(), Arc::clone(&exec)); + let permit = database.acquire_semaphore(None).await; + let query_completed_token = token.permit(); + let permit_state = Some(PermitAndToken { + permit, + query_completed_token, + }); + ctx.execute_stream(exec) + .await + .map(|stream| Statement { + schema, + permit_state, + stream, + }) + .map_err(Error::from) + }; + + Box::new(fut) + } + + pub(super) fn make_schema_and_batches( + columns: impl IntoIterator, + data: Vec>, + ) -> (SchemaRef, Vec) { + let mut measurement_column_index = None; + let mut tag_key_columns = vec![]; + let fields = columns + .into_iter() + .enumerate() + .inspect(|(i, column)| { + if let Column::Tag { + name, + group_by: true, + projected, + } = column + { + tag_key_columns.push(TagKeyColumn { + tag_key: name.to_string(), + column_index: *i as u32, + is_projected: *projected, + }); + } + if let Column::Measurement = column { + measurement_column_index = Some(*i as u32); + } + }) + .map(|(i, column)| match column { + Column::Measurement => Field::new( + INFLUXQL_MEASUREMENT_COLUMN_NAME, + data.first() + .and_then(|batch| batch.get(i)) + .map(|arr| arr.data_type().clone()) + .unwrap_or(DataType::Utf8), + false, + ), + Column::Tag { name, .. 
} => Field::new( + name, + data.first() + .and_then(|batch| batch.get(i)) + .map(|arr| arr.data_type().clone()) + .unwrap_or(DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + )), + true, + ), + Column::Time => Field::new( + TIME_COLUMN_NAME, + data.first() + .and_then(|batch| batch.get(i)) + .map(|arr| arr.data_type().clone()) + .unwrap_or(DataType::Timestamp( + arrow::datatypes::TimeUnit::Nanosecond, + None, + )), + false, + ), + Column::Field { name } => Field::new( + name, + data.first() + .and_then(|batch| batch.get(i)) + .map(|arr| arr.data_type().clone()) + .unwrap_or(DataType::Float64), + true, + ), + }) + .collect::>(); + let md = InfluxQlMetadata { + measurement_column_index: measurement_column_index.unwrap(), + tag_key_columns, + }; + let md = serde_json::to_string(&md).unwrap(); + let schema = + Schema::new(fields).with_metadata(HashMap::from([(INFLUXQL_METADATA_KEY.into(), md)])); + let schema = SchemaRef::new(schema); + + let batches = data + .into_iter() + .map(|d| RecordBatch::try_new(SchemaRef::clone(&schema), d).unwrap()) + .collect(); + + (schema, batches) + } +} diff --git a/iox_v1_query_api/src/response/buffered.rs b/iox_v1_query_api/src/response/buffered.rs new file mode 100644 index 00000000..8447e9ec --- /dev/null +++ b/iox_v1_query_api/src/response/buffered.rs @@ -0,0 +1,402 @@ +//! Streams for producing responses where chunking is not enabled. + +use super::{Response, SeriesChunk, SeriesChunkMergeStream, SeriesChunkStream, StatementResult}; +use crate::{Result, types::Statement}; +use datafusion::physical_plan::SendableRecordBatchStream; +use futures::{Stream, ready}; +use iox_query::query_log::PermitAndToken; +use std::{ + future::Future, + pin::Pin, + task::{Context, Poll}, +}; + +/// A stream of one [Response] value where the response contains the +/// result data for all the corresponding statements. 
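///
/// A minimal consumption sketch (illustrative only; `statements` is assumed to be the
/// vector of statement futures produced by the handler, `next()` comes from
/// `futures::StreamExt`, and serialization mirrors the `collect_output` helper in the
/// tests below):
///
/// ```ignore
/// let mut stream = BufferedResponseStream::new(statements);
/// // At most one buffered `Response` is yielded, covering every statement.
/// while let Some(response) = stream.next().await {
///     let ser = ResponseSerializer::new(&response, None, true);
///     // serialize `ser` with a serde Serializer and write the bytes out
/// }
/// ```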
+pub(crate) struct BufferedResponseStream { + statements: Vec> + Send>>>, + statement_id: usize, + current_statement: Option<( + Option, + BufferedResultStream>, + )>, + // Buffer the statment results for all the statements in one response + response: Option, +} + +impl BufferedResponseStream { + pub(crate) fn new(statements: Vec> + Send>>) -> Self { + let response = (!statements.is_empty()).then_some(Response::default()); + + let statements = statements.into_iter().map(Box::into_pin).collect(); + Self { + statements, + statement_id: 0, + current_statement: None, + response, + } + } + + fn poll_statement(&mut self, cx: &mut Context<'_>) -> Poll> { + self.statements[self.statement_id].as_mut().poll(cx) + } + + fn poll_next_result(&mut self, cx: &mut Context<'_>) -> Poll> { + let (_, stream) = self + .current_statement + .as_mut() + .expect("current_statement is None"); + Pin::new(stream).poll_next(cx) + } + + fn poll_next_unpin(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.statement_id >= self.statements.len() { + return Poll::Ready(self.response.take()); + } + + if self.current_statement.is_some() { + match ready!(self.poll_next_result(cx)) { + Some(result) => { + if result.is_error() { + let (permit_state, _) = self.current_statement.take().unwrap(); // safe to unwrap because we just checked above + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + self.statement_id += 1; + } + self.response.as_mut().unwrap().add_result(result); // safe to unwrap because we just checked above + } + None => { + let (permit_state, _) = self.current_statement.take().unwrap(); // safe to unwrap because we just checked above + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.success(); + } + self.statement_id += 1; + } + } + } else { + match ready!(self.poll_statement(cx)) { + Ok(Statement { + schema, + permit_state, + stream, + }) => match SeriesChunkStream::try_new(stream, schema) { + Ok(stream) => { + self.current_statement = Some(( + permit_state, + BufferedResultStream::new(stream, self.statement_id), + )); + } + Err(e) => { + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + let mut result = StatementResult::new(self.statement_id); + result.set_error(e.to_string()); + self.statement_id += 1; + self.response.as_mut().unwrap().add_result(result); // safe to unwrap because we just checked above + } + }, + Err(e) => { + let mut result = StatementResult::new(self.statement_id); + result.set_error(e.to_string()); + self.statement_id += 1; + self.response.as_mut().unwrap().add_result(result); // safe to unwrap because we just checked above + } + } + } + self.poll_next_unpin(cx) + } +} + +impl Stream for BufferedResponseStream { + type Item = Response; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_unpin(cx) + } +} + +pub(super) struct BufferedResultStream { + inner: SeriesChunkMergeStream, + result: Option, +} + +impl BufferedResultStream { + pub(super) fn new(stream: S, statement_id: usize) -> Self { + let inner = SeriesChunkMergeStream::new(stream, None); + Self { + inner, + result: Some(StatementResult::new(statement_id)), + } + } +} + +impl BufferedResultStream +where + S: Stream> + Unpin, +{ + fn poll_next_unpin(&mut self, cx: &mut Context<'_>) -> Poll> { + loop { + if self.result.is_none() { + return Poll::Ready(None); + } + + match ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(chunk)) => { + 
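                    // Unlike the chunked variant, every series chunk is accumulated into
                    // the single `StatementResult`; nothing is yielded until the inner
                    // stream is exhausted or errors.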
self.result.as_mut().unwrap().add_series(chunk); // safe to unwrap because we just checked above + } + Some(Err(e)) => { + let mut result = self.result.take().unwrap(); // safe to unwrap because we just checked above + result.set_error(e.to_string()); + return Poll::Ready(Some(result)); + } + None => return Poll::Ready(self.result.take()), + } + } + } +} + +impl Stream for BufferedResultStream +where + S: Stream> + Unpin, +{ + type Item = StatementResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_unpin(cx) + } +} + +#[cfg(test)] +mod tests { + use crate::response::ResponseSerializer; + use crate::types::Precision; + + use super::super::tests::{Column, make_statement}; + use super::*; + use arrow::array::{DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray}; + use arrow::datatypes::Int32Type; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + use serde::ser::Serialize; + use serde_json::ser::Serializer; + use std::sync::Arc; + + #[tokio::test] + async fn no_statements() { + let mut stream = BufferedResponseStream::new(vec![]); + assert!(stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_statement_single_series() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ], + ); + insta::assert_snapshot!(collect_output(BufferedResponseStream::new(vec![statement])).await); + } + + #[tokio::test] + async fn single_statement_multi_series() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ], + ); + + insta::assert_snapshot!(collect_output(BufferedResponseStream::new(vec![statement])).await); + } + + #[tokio::test] + async fn many_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement1 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + 
}, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + let statement2 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + + insta::assert_snapshot!( + collect_output(BufferedResponseStream::new(vec![statement1, statement2])).await + ); + } + + #[tokio::test] + async fn test_epoch() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ], + ); + + insta::assert_snapshot!( + collect_output_epoch( + BufferedResponseStream::new(vec![statement]), + Some(Precision::Milliseconds) + ) + .await + ); + } + + async fn collect_output(stream: impl Stream + Send) -> String { + collect_output_epoch(stream, None).await + } + + async fn collect_output_epoch( + stream: impl Stream + Send, + epoch: Option, + ) -> String { + stream + .map(|r| { + let mut w = vec![]; + let mut ser = Serializer::new(&mut w); + let r = ResponseSerializer::new(&r, epoch, true); + r.serialize(&mut ser).unwrap(); + w.push(b'\n'); + String::from_utf8(w).unwrap() + }) + .collect() + .await + } +} diff --git a/iox_v1_query_api/src/response/chunked.rs b/iox_v1_query_api/src/response/chunked.rs new file mode 100644 index 00000000..7700b2a8 --- /dev/null +++ b/iox_v1_query_api/src/response/chunked.rs @@ -0,0 +1,465 @@ +//! Streams for producing responses where chunking is enabled. + +use super::{Response, SeriesChunk, SeriesChunkMergeStream, SeriesChunkStream, StatementResult}; +use crate::{Result, StatementFuture, error::Error, types::Statement}; +use datafusion::physical_plan::SendableRecordBatchStream; +use futures::{Stream, ready}; +use iox_query::query_log::PermitAndToken; +use std::{ + num::NonZero, + pin::Pin, + task::{Context, Poll}, +}; + +/// A stream of [Response] values where each response contains a single +/// chunk of result data. 
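+///
+/// A rough sketch of the intended use; `statements`, `chunk_size` and
+/// `write_chunk` are illustrative names, not items defined in this module:
+///
+/// ```ignore
+/// // `statements` is the Vec of planned statement futures for the request and
+/// // `chunk_size` is the requested chunk size.
+/// let mut responses = ChunkedResponseStream::new(statements, chunk_size);
+/// while let Some(response) = responses.next().await {
+///     // Each Response holds one StatementResult; every chunk except the
+///     // last one for a statement is flagged as partial.
+///     write_chunk(response).await;
+/// }
+/// ```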
+pub(crate) struct ChunkedResponseStream { + statements: Vec>, + chunk_size: usize, + statement_id: usize, + current_statement: Option<( + Option, + ChunkedResultStream>, + )>, +} + +impl ChunkedResponseStream { + pub(crate) fn new(statements: Vec, chunk_size: usize) -> Self { + let statements = statements.into_iter().map(Box::into_pin).collect(); + Self { + statements, + chunk_size, + statement_id: 0, + current_statement: None, + } + } + + fn poll_statement(&mut self, cx: &mut Context<'_>) -> Poll> { + self.statements[self.statement_id] + .as_mut() + .poll(cx) + .map_err(Error::from) + } + + fn poll_next_result(&mut self, cx: &mut Context<'_>) -> Poll> { + let (_, stream) = self + .current_statement + .as_mut() + .expect("current_statement is None"); + Pin::new(stream).poll_next(cx) + } + + fn poll_next_unpin(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.statement_id >= self.statements.len() { + return Poll::Ready(None); + } + if self.current_statement.is_some() { + match ready!(self.poll_next_result(cx)) { + Some(result) => { + if result.is_error() { + let (permit_state, _) = self.current_statement.take().unwrap(); // safe to unwrap because we just checked above + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + self.statement_id += 1; + } + let mut resp = Response::default(); + resp.add_result(result); + Poll::Ready(Some(resp)) + } + None => { + let (permit_state, _) = self.current_statement.take().unwrap(); // safe to unwrap because we just checked above + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.success(); + } + self.statement_id += 1; + self.poll_next_unpin(cx) + } + } + } else { + match ready!(self.poll_statement(cx)) { + Ok(Statement { + schema, + permit_state, + stream, + }) => match SeriesChunkStream::try_new(stream, schema) { + Ok(stream) => { + self.current_statement = Some(( + permit_state, + ChunkedResultStream::new(stream, self.chunk_size, self.statement_id), + )); + self.poll_next_unpin(cx) + } + Err(e) => { + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + let mut result = StatementResult::new(self.statement_id); + result.set_error(e.to_string()); + self.statement_id += 1; + let mut resp = Response::default(); + resp.add_result(result); + Poll::Ready(Some(resp)) + } + }, + Err(e) => { + let mut result = StatementResult::new(self.statement_id); + result.set_error(e.to_string()); + self.statement_id += 1; + let mut resp = Response::default(); + resp.add_result(result); + Poll::Ready(Some(resp)) + } + } + } + } +} + +impl Stream for ChunkedResponseStream { + type Item = Response; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_unpin(cx) + } +} + +pub(super) struct ChunkedResultStream { + inner: SeriesChunkMergeStream, + statement_id: usize, + buffered: Option, + done: bool, +} + +impl ChunkedResultStream { + pub(super) fn new(stream: S, chunk_size: usize, statement_id: usize) -> Self { + let inner = SeriesChunkMergeStream::new(stream, NonZero::new(chunk_size)); + Self { + inner, + statement_id, + buffered: None, + done: false, + } + } +} + +impl ChunkedResultStream +where + S: Stream> + Unpin, +{ + fn poll_next_unpin(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.done { + return Poll::Ready(None); + } + match ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(chunk)) => { + let mut result = StatementResult::new(self.statement_id); + 
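+                // Mark this chunk as partial and buffer it, emitting the
+                // previously buffered chunk if there is one. Holding one chunk
+                // back allows the final chunk to be emitted with `partial`
+                // cleared once the inner stream is exhausted.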
result.add_series(chunk); + result.set_partial(true); + match self.buffered.replace(result) { + Some(result) => Poll::Ready(Some(result)), + None => self.poll_next_unpin(cx), + } + } + Some(Err(e)) => { + let mut result = self + .buffered + .take() + .unwrap_or_else(|| StatementResult::new(self.statement_id)); + result.set_error(e.to_string()); + Poll::Ready(Some(result)) + } + None => { + self.done = true; + match self.buffered.take() { + Some(mut result) => { + result.set_partial(false); + Poll::Ready(Some(result)) + } + None => Poll::Ready(None), + } + } + } + } +} + +impl Stream for ChunkedResultStream +where + S: Stream> + Unpin, +{ + type Item = StatementResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_unpin(cx) + } +} + +#[cfg(test)] +mod tests { + use crate::response::ResponseSerializer; + use crate::types::Precision; + + use super::super::tests::{Column, make_statement}; + use super::*; + use arrow::array::{DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray}; + use arrow::datatypes::Int32Type; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + use serde::ser::Serialize; + use serde_json::ser::Serializer; + use std::sync::Arc; + + #[tokio::test] + async fn no_statements() { + let mut stream = ChunkedResponseStream::new(vec![], 3); + assert!(stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_chunk() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ], + ); + insta::assert_snapshot!( + collect_output(ChunkedResponseStream::new(vec![statement], 2)).await + ); + } + + #[tokio::test] + async fn many_chunks() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ], + ); + insta::assert_snapshot!( + collect_output(ChunkedResponseStream::new(vec![statement], 2)).await + ); + } + + #[tokio::test] + async fn many_chunks_with_groups() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + 
&metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + insta::assert_snapshot!( + collect_output(ChunkedResponseStream::new(vec![statement], 2)).await + ); + } + + #[tokio::test] + async fn many_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement1 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + let statement2 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ], + ); + insta::assert_snapshot!( + collect_output(ChunkedResponseStream::new(vec![statement1, statement2], 2)).await + ); + } + + #[tokio::test] + async fn test_epoch() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ], + ); + + insta::assert_snapshot!( + collect_output_epoch( + ChunkedResponseStream::new(vec![statement], 2), + Some(Precision::Milliseconds) + ) + .await + ); + } + + async fn collect_output(stream: impl Stream + Send) -> String { + collect_output_epoch(stream, None).await + } + + async fn collect_output_epoch( + stream: impl Stream + Send, + epoch: Option, + ) -> String { + stream + .map(|r| { + let mut w = vec![]; + let mut ser = Serializer::new(&mut w); + let r = ResponseSerializer::new(&r, epoch, true); + r.serialize(&mut ser).unwrap(); + w.push(b'\n'); + String::from_utf8(w).unwrap() + }) + .collect() + .await + } +} diff --git 
a/iox_v1_query_api/src/response/csv.rs b/iox_v1_query_api/src/response/csv.rs new file mode 100644 index 00000000..7c4dcf73 --- /dev/null +++ b/iox_v1_query_api/src/response/csv.rs @@ -0,0 +1,824 @@ +//! InfluxDB v1 compatible CSV streaming output for InfluxQL queries. + +use super::{SeriesChunk, SeriesChunkStream}; +use crate::error::Error; +use crate::types::Precision; +use crate::{Result, StatementFuture, types::Statement, value::ValueType}; +use bytes::{Bytes, BytesMut}; +use datafusion::execution::SendableRecordBatchStream; +use futures::{Stream, ready}; +use iox_query::query_log::PermitAndToken; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tracing::warn; + +/// A stream of CSV data produced by executing InfluxQL statements. +pub(crate) struct CsvStream { + statements: Vec>, + statement_id: usize, + current_statement: Option<( + Option, + SeriesChunkStream, + )>, + add_headers: bool, + add_newline: bool, + epoch: Precision, +} + +impl CsvStream { + pub(crate) fn new(statements: Vec) -> Self { + let statements = statements.into_iter().map(Box::into_pin).collect(); + Self { + statements, + statement_id: 0, + current_statement: None, + add_headers: false, + add_newline: false, + epoch: Precision::Nanoseconds, + } + } + + pub(crate) fn with_epoch(mut self, epoch: Option) -> Self { + self.epoch = epoch.unwrap_or(Precision::Nanoseconds); + self + } + + fn poll_statement(&mut self, cx: &mut Context<'_>) -> Poll> { + self.statements[self.statement_id].as_mut().poll(cx) + } + + fn poll_next_chunk(&mut self, cx: &mut Context<'_>) -> Poll>> { + let (_, stream) = self + .current_statement + .as_mut() + .expect("no active SeriesChunkStream"); + Pin::new(stream).poll_next(cx).map_err(Error::from) + } +} + +impl Stream for CsvStream { + type Item = Bytes; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + if this.statement_id >= this.statements.len() { + return Poll::Ready(None); + } + + if this.current_statement.is_none() { + let res = ready!(this.poll_statement(cx)); + let Statement { + schema, + permit_state, + stream, + } = match res { + Ok(v) => v, + Err(e) => { + warn!(error=%e, "Error executing query"); + this.statement_id += 1; + cx.waker().wake_by_ref(); + return Poll::Pending; + } + }; + let stream = match SeriesChunkStream::try_new(stream, schema) { + Ok(v) => v, + Err(e) => { + warn!(error=%e, "Error creating SeriesChunk stream"); + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.fail(); + } + this.statement_id += 1; + cx.waker().wake_by_ref(); + return Poll::Pending; + } + }; + this.add_headers = true; + this.add_newline = this.statement_id != 0; + this.current_statement = Some((permit_state, stream)); + } + assert!(this.current_statement.is_some()); + match ready!(this.poll_next_chunk(cx)) { + None => { + let (permit_state, _) = this.current_statement.take().unwrap(); + if let Some(permit_state) = permit_state { + permit_state.query_completed_token.success(); + } + this.statement_id += 1; + cx.waker().wake_by_ref(); + Poll::Pending + } + Some(Ok(chunk)) => { + let mut chunk = CsvSeriesChunk::new(chunk, this.epoch); + if this.add_newline { + chunk = chunk.with_newline(); + this.add_newline = false; + } + if this.add_headers { + chunk = chunk.with_headers(); + this.add_headers = false; + } + Poll::Ready(Some(chunk.into())) + } + Some(Err(e)) => { + warn!(error=%e, "Error streaming SeriesChunk"); + let (permit_state, _) = this.current_statement.take().unwrap(); + if let Some(permit_state) = 
permit_state { + permit_state.query_completed_token.fail(); + } + this.statement_id += 1; + cx.waker().wake_by_ref(); + Poll::Pending + } + } + } +} + +/// A chunk of CSV data which represents part of the result of executing +/// an InfluxQL statement. Each chunk is for a single series, but may +/// not contain a complete series. +#[derive(Debug)] +pub(crate) struct CsvSeriesChunk { + chunk: SeriesChunk, + emit_headers: bool, + emit_newline: bool, + epoch: Precision, +} + +impl CsvSeriesChunk { + fn new(chunk: SeriesChunk, epoch: Precision) -> Self { + Self { + chunk, + emit_headers: false, + emit_newline: false, + epoch, + } + } + + fn with_headers(mut self) -> Self { + self.emit_headers = true; + self + } + + fn with_newline(mut self) -> Self { + self.emit_newline = true; + self + } +} + +impl From for Bytes { + fn from(value: CsvSeriesChunk) -> Self { + let mut bytes = BytesMut::new(); + let epoch = value.epoch; + if value.emit_newline { + bytes.extend_from_slice(b"\n"); + } + if value.emit_headers { + // Measurement name and tag headers are always present. + bytes.extend_from_slice(b"name,tags"); + for column in value.chunk.columns() { + bytes.extend_from_slice(format!(",{column}").as_bytes()); + } + bytes.extend_from_slice(b"\n"); + } + let measurement = csv_escape(value.chunk.measurement()); + let mut tags = String::new(); + for (k, v) in value.chunk.tags() { + tags.push_str(k.as_ref()); + tags.push('='); + tags.push_str(&v); + tags.push(','); + } + if !tags.is_empty() { + tags.pop(); // Remove trailing comma. + } + let tags = csv_escape(tags); + + for row in value.chunk.into_iter() { + bytes.extend_from_slice(measurement.as_bytes()); + bytes.extend_from_slice(b","); + bytes.extend_from_slice(tags.as_bytes()); + for value in row.into_iter() { + bytes.extend_from_slice(b","); + match value.value_type() { + // NOTE(hiltontj): legacy /query API respects the `epoch` parameter, but only + // returns timestamps in epoch format, not in RFC3339. 
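+                    //
+                    // For example, the timestamp 1970-01-01T00:00:02Z is written
+                    // as 2000000000 under the default nanosecond precision and
+                    // as 2000 when millisecond precision is requested.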
+ // + // See: + ValueType::Timestamp(_) => { + let ts = value.as_timestamp_opt().unwrap_or_default(); + let ts = match epoch { + Precision::Nanoseconds => ts.timestamp_nanos_opt().unwrap_or_default(), + Precision::Microseconds => ts.timestamp_micros(), + Precision::Milliseconds => ts.timestamp_millis(), + Precision::Seconds => ts.timestamp(), + Precision::Minutes => ts.timestamp() / 60, + Precision::Hours => ts.timestamp() / (60 * 60), + Precision::Days => ts.timestamp() / (60 * 60 * 24), + Precision::Weeks => ts.timestamp() / (60 * 60 * 24 * 7), + }; + bytes.extend_from_slice(ts.to_string().as_bytes()); + } + _ => { + bytes.extend_from_slice(csv_escape(format!("{value}")).as_bytes()); + } + } + } + bytes.extend_from_slice(b"\n"); + } + + bytes.into() + } +} + +fn csv_escape(s: String) -> String { + if s.contains(',') || s.contains('"') { + format!("\"{}\"", s.replace("\"", "\"\"")) + } else { + s + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::super::tests::{Column, make_statement}; + use super::*; + use arrow::{ + array::{DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray}, + datatypes::Int32Type, + }; + use datafusion::error::DataFusionError; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + + #[tokio::test] + async fn no_statements() { + let mut stream = CsvStream::new(vec![]); + assert!(stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_statement_no_group() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,1 + m1,,2000000000,t1a,2 + m1,,3000000000,t1a,3 + m2,,1000000000,t1a,4 + "); + } + + #[tokio::test] + async fn single_statement_tag_group() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let 
stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,f1 + m1,t1=t1a,1000000000,1 + m1,t1=t1a,2000000000,2 + m1,t1=t1a,3000000000,3 + m2,t1=t1a,1000000000,4 + "); + } + + #[tokio::test] + async fn single_statement_tag_group_projected() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: true, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,t1=t1a,1000000000,t1a,1 + m1,t1=t1a,2000000000,t1a,2 + m1,t1=t1a,3000000000,t1a,3 + m2,t1=t1a,1000000000,t1a,4 + "); + } + + #[tokio::test] + async fn single_statement_tag_group_multiple_tags() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Tag { + name: "t2", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "\"t2a\"", "\"t2a\"", "\"t2b\"", "\"t2a\"", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,f1 + m1,"t1=t1a,t2=""t2a""",1000000000,1 + m1,"t1=t1a,t2=""t2a""",2000000000,2 + m1,"t1=t1a,t2=""t2b""",3000000000,3 + m2,"t1=t1a,t2=""t2a""",1000000000,4 + "#); + } + + #[tokio::test] + async fn single_statement_infinite_value() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + 
Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![ + f64::NEG_INFINITY, + f64::INFINITY, + f64::NEG_INFINITY, + f64::NAN, + ])), + ], + ); + let stream = CsvStream::new(vec![statement]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,-inf + m1,,2000000000,t1a,inf + m1,,3000000000,t1a,-inf + m2,,1000000000,t1a,NaN + "); + } + + #[tokio::test] + async fn multiple_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement1 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let statement2 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t2", + group_by: false, + projected: false, + }, + Column::Field { name: "f2" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m3", "m3", "m3", "m3"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 4000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t2a", "t2a", "t2a", "t2a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![statement1, statement2]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,1 + m1,,2000000000,t1a,2 + m1,,3000000000,t1a,3 + m2,,1000000000,t1a,4 + + name,tags,time,t2,f2 + m3,,1000000000,t2a,1 + m3,,2000000000,t2a,2 + m3,,3000000000,t2a,3 + m3,,4000000000,t2a,4 + "); + } + + #[tokio::test] + async fn multiple_statements_with_error() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement1 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let statement2 = make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t2", + group_by: false, + projected: false, + }, + Column::Field { name: "f2" }, + ], + vec![ + 
Arc::new(StringArray::from(vec!["m3", "m3", "m3", "m3"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 4000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t2a", "t2a", "t2a", "t2a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ); + let stream = CsvStream::new(vec![ + statement1, + Box::new(async { Err(DataFusionError::Internal("test error".to_string()))? }), + statement2, + ]); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,1 + m1,,2000000000,t1a,2 + m1,,3000000000,t1a,3 + m2,,1000000000,t1a,4 + + name,tags,time,t2,f2 + m3,,1000000000,t2a,1 + m3,,2000000000,t2a,2 + m3,,3000000000,t2a,3 + m3,,4000000000,t2a,4 + "); + } + + #[tokio::test] + async fn test_csv_epoch_handling() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let statement = || { + make_statement( + &db, + &ctx, + &log, + [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ], + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1_000_000_000_000_000, + 2_000_000_000_000_000, + 3_000_000_000_000_000, + 4_000_000_000_000_000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0])), + ], + ) + }; + { + let stream = CsvStream::new(vec![statement()]).with_epoch(None); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000000000,t1a,1 + m1,,2000000000000000,t1a,2 + m1,,3000000000000000,t1a,3 + m2,,4000000000000000,t1a,4 + "); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Nanoseconds)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000000000,t1a,1 + m1,,2000000000000000,t1a,2 + m1,,3000000000000000,t1a,3 + m2,,4000000000000000,t1a,4 + "); + } + { + let stream = + CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Microseconds)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000000,t1a,1 + m1,,2000000000000,t1a,2 + m1,,3000000000000,t1a,3 + m2,,4000000000000,t1a,4 + "); + } + { + let stream = + CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Milliseconds)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r" + name,tags,time,t1,f1 + m1,,1000000000,t1a,1 + m1,,2000000000,t1a,2 + m1,,3000000000,t1a,3 + m2,,4000000000,t1a,4 + "); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Seconds)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + 
insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,1000000,t1a,1 + m1,,2000000,t1a,2 + m1,,3000000,t1a,3 + m2,,4000000,t1a,4 + "#); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Minutes)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,16666,t1a,1 + m1,,33333,t1a,2 + m1,,50000,t1a,3 + m2,,66666,t1a,4 + "#); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Hours)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,277,t1a,1 + m1,,555,t1a,2 + m1,,833,t1a,3 + m2,,1111,t1a,4 + "#); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Days)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,11,t1a,1 + m1,,23,t1a,2 + m1,,34,t1a,3 + m2,,46,t1a,4 + "#); + } + { + let stream = CsvStream::new(vec![statement()]).with_epoch(Some(Precision::Weeks)); + let output = stream.map(Bytes::from).map(Vec::::from).concat().await; + let output = String::from_utf8(output).unwrap(); + insta::assert_snapshot!(output, @r#" + name,tags,time,t1,f1 + m1,,1,t1a,1 + m1,,3,t1a,2 + m1,,4,t1a,3 + m2,,6,t1a,4 + "#); + } + } +} diff --git a/iox_v1_query_api/src/response/json.rs b/iox_v1_query_api/src/response/json.rs new file mode 100644 index 00000000..7e13331f --- /dev/null +++ b/iox_v1_query_api/src/response/json.rs @@ -0,0 +1,461 @@ +//! JSON encoding of InfluxQL query results. +use crate::types::Precision; + +use super::{BufferedResponseStream, ChunkedResponseStream, Response, ResponseSerializer}; +use bytes::buf::BufMut; +use bytes::{Bytes, BytesMut}; +use futures::Stream; +use serde::Serialize; +use serde_json::ser::Serializer; +use std::{ + pin::Pin, + task::{Context, Poll}, +}; +use tracing::warn; + +/// A generic JSON-encoded [Response] stream. 
+pub(crate) struct JsonStream { + stream: S, + formatter_fn: F, + epoch: Option, +} + +impl JsonStream { + pub(crate) fn new(stream: S, formatter_fn: F, epoch: Option) -> Self { + Self { + stream, + formatter_fn, + epoch, + } + } + + fn poll_next_inner(&mut self, cx: &mut Context<'_>) -> Poll> + where + S: Stream + Unpin, + { + Pin::new(&mut self.stream).poll_next(cx) + } +} + +impl Stream for JsonStream +where + S: Stream + Unpin, + F: Fn() -> Fmt + Unpin, + Fmt: serde_json::ser::Formatter, +{ + type Item = Bytes; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_inner(cx).map(|opt| { + opt.and_then(|resp| { + let mut w = BytesMut::new().writer(); + let formatter = (this.formatter_fn)(); + let mut serializer = Serializer::with_formatter(&mut w, formatter); + let resp = ResponseSerializer::new(&resp, this.epoch, false); + if let Err(e) = resp.serialize(&mut serializer) { + warn!(error = %e, "failed to serialize response"); + let mut w = BytesMut::new().writer(); + let formatter = (this.formatter_fn)(); + let mut serializer = Serializer::with_formatter(&mut w, formatter); + let error_field = serde_json::json!({ + "error": format!("{}", e), + }); + if let Err(e) = error_field.serialize(&mut serializer) { + warn!(error = %e, "failed to serialize error field"); + return None; + } + Some(w.into_inner().freeze()) + } else { + Some(w.into_inner().freeze()) + } + }) + }) + } +} + +pub(crate) type ChunkedJsonStream = JsonStream; +pub(crate) type BufferedJsonStream = JsonStream; + +#[cfg(test)] +mod tests { + use super::super::tests::{Column, make_statement}; + use super::*; + + use arrow::array::{ + ArrayRef, DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray, + }; + use arrow::datatypes::Int32Type; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + use serde_json::ser::{CompactFormatter, PrettyFormatter}; + use std::sync::Arc; + + #[tokio::test] + async fn empty_stream() { + let stream = ChunkedResponseStream::new(vec![], 2); + let mut chunked_json_stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + assert!(chunked_json_stream.next().await.is_none()); + + let stream = BufferedResponseStream::new(vec![]); + let mut buffered_json_stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + assert!(buffered_json_stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_chunk() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + + let 
statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn single_chunk_pretty() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, PrettyFormatter::new, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, PrettyFormatter::new, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn single_chunk_exponential_value() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![ + 73070599793680680000000000000000000000000000.0, + 73070599793680670000000000000000000000000000.0, + ])), + ]; + + // Only test for buffered since the purpose is to test the formatting of the exponential value + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn single_chunk_infinite_value() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![f64::INFINITY, f64::NEG_INFINITY])), + ]; + + let statement = make_statement(&db, &ctx, &log, 
columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, PrettyFormatter::new, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, PrettyFormatter::new, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_chunks() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_chunks_many_measurements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.5])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns1 = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + 
group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data1: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + let columns2 = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data2: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + + let statement1 = make_statement(&db, &ctx, &log, columns1.clone(), data1.clone()); + let statement2 = make_statement(&db, &ctx, &log, columns2.clone(), data2.clone()); + let stream = ChunkedResponseStream::new(vec![statement1, statement2], 2); + let stream = ChunkedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement1 = make_statement(&db, &ctx, &log, columns1.clone(), data1.clone()); + let statement2 = make_statement(&db, &ctx, &log, columns2.clone(), data2.clone()); + let stream = BufferedResponseStream::new(vec![statement1, statement2]); + let stream = BufferedJsonStream::new(stream, || CompactFormatter, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn test_epoch() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = + ChunkedJsonStream::new(stream, || CompactFormatter, Some(Precision::Nanoseconds)); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = + BufferedJsonStream::new(stream, || CompactFormatter, Some(Precision::Nanoseconds)); + insta::assert_snapshot!(collect_output(stream).await); + } + + async fn collect_output + Send>(stream: S) -> String { + String::from_utf8( + stream + .map(Vec::::from) + .map(|mut v| { + v.push(b'\n'); + v + }) + .concat() + .await, + ) + .unwrap() + } +} diff --git a/iox_v1_query_api/src/response/msgpack.rs b/iox_v1_query_api/src/response/msgpack.rs new file mode 100644 index 00000000..089fc772 --- /dev/null +++ b/iox_v1_query_api/src/response/msgpack.rs @@ -0,0 +1,337 @@ +//! Message Pack encoding of InfluxQL query results. 
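+//!
+//! `MessagePackStream` wraps either the chunked or the buffered `Response`
+//! stream and serializes each emitted response with `rmp_serde`; unlike the
+//! JSON stream there is no pluggable formatter. A rough sketch of the wiring
+//! (the `inner`, `statements` and `epoch` bindings are illustrative):
+//!
+//! ```ignore
+//! let inner = BufferedResponseStream::new(statements);
+//! let body = BufferedMessagePackStream::new(inner, epoch);
+//! ```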
+use crate::types::Precision; + +use super::{BufferedResponseStream, ChunkedResponseStream, Response, ResponseSerializer}; +use bytes::buf::BufMut; +use bytes::{Bytes, BytesMut}; +use futures::Stream; +use rmp_serde::Serializer; +use serde::Serialize; +use std::{ + pin::Pin, + task::{Context, Poll}, +}; +use tracing::warn; + +/// A generic Message Pack-encoded [Response] stream. +pub(crate) struct MessagePackStream { + stream: S, + epoch: Option, +} + +impl MessagePackStream { + pub(crate) fn new(stream: S, epoch: Option) -> Self { + Self { stream, epoch } + } + + fn poll_next_inner(&mut self, cx: &mut Context<'_>) -> Poll> + where + S: Stream + Unpin, + { + Pin::new(&mut self.stream).poll_next(cx) + } +} + +impl Stream for MessagePackStream +where + S: Stream + Unpin, +{ + type Item = Bytes; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + this.poll_next_inner(cx).map(|opt| { + opt.and_then(|resp| { + let mut w = BytesMut::new().writer(); + let mut serializer = Serializer::new(&mut w); + let resp = ResponseSerializer::new(&resp, this.epoch, true); + if let Err(e) = resp.serialize(&mut serializer) { + warn!(error = %e, "failed to serialize response"); + None + } else { + Some(w.into_inner().freeze()) + } + }) + }) + } +} + +pub(crate) type ChunkedMessagePackStream = MessagePackStream; +pub(crate) type BufferedMessagePackStream = MessagePackStream; + +#[cfg(test)] +mod tests { + use crate::response::buffered::BufferedResponseStream; + + use super::super::tests::{Column, make_statement}; + use super::*; + use arrow::array::{ + ArrayRef, DictionaryArray, Float64Array, StringArray, TimestampNanosecondArray, + }; + use arrow::datatypes::Int32Type; + use futures::StreamExt; + use iox_query::QueryDatabase; + use iox_query::exec::IOxSessionContext; + use iox_query::query_log::QueryLog; + use iox_query::test::TestDatabaseStore; + use iox_time::SystemProvider; + use rmp_serde::Deserializer; + use serde::Deserialize; + use serde_json::Value; + use std::sync::Arc; + + #[tokio::test] + async fn empty_stream() { + let stream = ChunkedResponseStream::new(vec![], 2); + let mut stream = ChunkedMessagePackStream::new(stream, None); + assert!(stream.next().await.is_none()); + + let stream = BufferedResponseStream::new(vec![]); + let mut stream = BufferedMessagePackStream::new(stream, None); + assert!(stream.next().await.is_none()); + } + + #[tokio::test] + async fn single_chunk() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = 
BufferedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_chunks() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_chunks_many_measurments() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn many_statements() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns1 = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data1: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1a", "t1a", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + + let columns2 = [ + Column::Measurement, + 
Column::Time, + Column::Tag { + name: "t1", + group_by: true, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data2: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m2"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, 1000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "t1a", "t1a", "t1b", "t1b", + ])), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 1.0])), + ]; + + let statement1 = make_statement(&db, &ctx, &log, columns1.clone(), data1.clone()); + let statement2 = make_statement(&db, &ctx, &log, columns2.clone(), data2.clone()); + let stream = ChunkedResponseStream::new(vec![statement1, statement2], 2); + let stream = ChunkedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + + let statement1 = make_statement(&db, &ctx, &log, columns1.clone(), data1.clone()); + let statement2 = make_statement(&db, &ctx, &log, columns2.clone(), data2.clone()); + let stream = BufferedResponseStream::new(vec![statement1, statement2]); + let stream = BufferedMessagePackStream::new(stream, None); + insta::assert_snapshot!(collect_output(stream).await); + } + + #[tokio::test] + async fn test_epoch() { + let db: Arc = Arc::new(TestDatabaseStore::default()); + let ctx = Arc::new(IOxSessionContext::with_testing()); + let log = Arc::new(QueryLog::new( + 1, + Arc::new(SystemProvider::new()), + &metric::Registry::new(), + None, + )); + let columns = [ + Column::Measurement, + Column::Time, + Column::Tag { + name: "t1", + group_by: false, + projected: false, + }, + Column::Field { name: "f1" }, + ]; + let data: Vec = vec![ + Arc::new(StringArray::from(vec!["m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![1000000000, 2000000000])), + Arc::new(DictionaryArray::::from_iter(vec!["t1a", "t1a"])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + ]; + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = ChunkedResponseStream::new(vec![statement], 2); + let stream = ChunkedMessagePackStream::new(stream, Some(Precision::Microseconds)); + insta::assert_snapshot!(collect_output(stream).await); + + let statement = make_statement(&db, &ctx, &log, columns.clone(), data.clone()); + let stream = BufferedResponseStream::new(vec![statement]); + let stream = BufferedMessagePackStream::new(stream, Some(Precision::Microseconds)); + insta::assert_snapshot!(collect_output(stream).await); + } + + async fn collect_output + Send>(stream: S) -> String { + String::from_utf8( + stream + .map(Vec::::from) + .map(|v| { + // Docode the msgpack and recode as JSON to make it + // easier to validate. 
+ let mut de = Deserializer::new(&v[..]); + let value = Value::deserialize(&mut de).unwrap(); + let mut v = serde_json::to_vec(&value).unwrap(); + v.push(b'\n'); + v + }) + .concat() + .await, + ) + .unwrap() + } +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__epoch.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__epoch.snap new file mode 100644 index 00000000..fde51ebd --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__epoch.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/buffered.rs +expression: "collect_output_epoch(BufferedResponseStream::new(vec![statement]),\nSome(Precision::Milliseconds)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[[1000,1],[2000,2]]},{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[[3000,3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__many_statements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__many_statements.snap new file mode 100644 index 00000000..4bc626e9 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__many_statements.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/buffered.rs +expression: "collect_output(BufferedResponseStream::new(vec![statement1,\nstatement2])).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]},{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]},{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]},{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]},{"name":"m2","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_multi_series.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_multi_series.snap new file mode 100644 index 00000000..e2d64bff --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_multi_series.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/buffered.rs +expression: "collect_output(BufferedResponseStream::new(vec![statement])).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]},{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_single_series.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_single_series.snap new file mode 100644 index 00000000..c9be940f --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__buffered__tests__single_statement_single_series.snap @@ -0,0 +1,5 @@ +--- +source: 
iox_v1_query_api/src/response/buffered.rs +expression: "collect_output(BufferedResponseStream::new(vec![statement])).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__epoch.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__epoch.snap new file mode 100644 index 00000000..a3fa491d --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__epoch.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output_epoch(ChunkedResponseStream::new(vec![statement], 2),\nSome(Precision::Milliseconds)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[[1000,"t1a",1],[2000,"t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks.snap new file mode 100644 index 00000000..9743f3b7 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output(ChunkedResponseStream::new(vec![statement], 2)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks_with_groups.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks_with_groups.snap new file mode 100644 index 00000000..40c3f1a1 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_chunks_with_groups.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output(ChunkedResponseStream::new(vec![statement], 2)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_statements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_statements.snap new file mode 100644 index 00000000..140954f4 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__many_statements.snap @@ -0,0 +1,10 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output(ChunkedResponseStream::new(vec![statement1, statement2],\n2)).await" +--- 
+{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]}]} +{"results":[{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]}],"partial":true}]} +{"results":[{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]}],"partial":true}]} +{"results":[{"statement_id":1,"series":[{"name":"m2","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__single_chunk.snap new file mode 100644 index 00000000..dc6cdf9e --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__chunked__tests__single_chunk.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/chunked.rs +expression: "collect_output(ChunkedResponseStream::new(vec![statement], 2)).await" +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch-2.snap new file mode 100644 index 00000000..0064e3bb --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[[1000000000,"t1a",1],[2000000000,"t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch.snap new file mode 100644 index 00000000..0064e3bb --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__epoch.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[[1000000000,"t1a",1],[2000000000,"t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks-2.snap new file mode 100644 index 00000000..55e5b019 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- 
+{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks.snap new file mode 100644 index 00000000..cb7718e5 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements-2.snap new file mode 100644 index 00000000..b39ce63c --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]},{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1.5]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements.snap new file mode 100644 index 00000000..bcf2cc05 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_chunks_many_measurements.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1.5]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements-2.snap new file mode 100644 index 00000000..202f297f --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- 
+{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]},{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]},{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]},{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]},{"name":"m2","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements.snap new file mode 100644 index 00000000..d3367a69 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__many_statements.snap @@ -0,0 +1,10 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],"partial":true}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:03Z","t1a",3]]}],"partial":true}]} +{"results":[{"statement_id":0,"series":[{"name":"m2","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1]]}]}]} +{"results":[{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1a"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]}],"partial":true}]} +{"results":[{"statement_id":1,"series":[{"name":"m1","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:03Z",3]]}],"partial":true}]} +{"results":[{"statement_id":1,"series":[{"name":"m2","tags":{"t1":"t1b"},"columns":["time","f1"],"values":[["1970-01-01T00:00:01Z",1]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk-2.snap new file mode 100644 index 00000000..407c983b --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk.snap new file mode 100644 index 00000000..407c983b --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_exponential_value.snap 
b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_exponential_value.snap new file mode 100644 index 00000000..9c665f3c --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_exponential_value.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{"results":[{"statement_id":0,"series":[{"name":"m1","columns":["time","t1","f1"],"values":[["1970-01-01T00:00:01Z","t1a",7.307059979368068e43],["1970-01-01T00:00:02Z","t1a",7.307059979368067e43]]}]}]} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value-2.snap new file mode 100644 index 00000000..b984a48e --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value-2.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{ + "error": "json: unsupported value: +Inf" +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value.snap new file mode 100644 index 00000000..b984a48e --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_infinite_value.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{ + "error": "json: unsupported value: +Inf" +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty-2.snap new file mode 100644 index 00000000..17146b1b --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty-2.snap @@ -0,0 +1,33 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{ + "results": [ + { + "statement_id": 0, + "series": [ + { + "name": "m1", + "columns": [ + "time", + "t1", + "f1" + ], + "values": [ + [ + "1970-01-01T00:00:01Z", + "t1a", + 1 + ], + [ + "1970-01-01T00:00:02Z", + "t1a", + 2 + ] + ] + } + ] + } + ] +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty.snap new file mode 100644 index 00000000..17146b1b --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__json__tests__single_chunk_pretty.snap @@ -0,0 +1,33 @@ +--- +source: iox_v1_query_api/src/response/json.rs +expression: collect_output(stream).await +--- +{ + "results": [ + { + "statement_id": 0, + "series": [ + { + "name": "m1", + "columns": [ + "time", + "t1", + "f1" + ], + "values": [ + [ + "1970-01-01T00:00:01Z", + "t1a", + 1 + ], + [ + "1970-01-01T00:00:02Z", + "t1a", + 2 + ] + ] + } + ] + } + ] +} diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch-2.snap 
new file mode 100644 index 00000000..5793540f --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[[1000000,"t1a",1],[2000000,"t1a",2]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch.snap new file mode 100644 index 00000000..5793540f --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__epoch.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[[1000000,"t1a",1],[2000000,"t1a",2]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks-2.snap new file mode 100644 index 00000000..4434440c --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks.snap new file mode 100644 index 00000000..1fd528b3 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],true]],true]]] +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:03Z","t1a",3]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments-2.snap new file mode 100644 index 00000000..ce6925f1 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]],["m2",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments.snap new file mode 100644 index 00000000..7e040072 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_chunks_many_measurments.snap @@ -0,0 +1,7 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: 
collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],true]],true]]] +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:03Z","t1a",3]]]],true]]] +[[[0,[["m2",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements-2.snap new file mode 100644 index 00000000..86899e59 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2],["1970-01-01T00:00:03Z","t1a",3]]],["m2",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1]]]]],[1,[["m1",{"t1":"t1a"},["time","f1"],[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]],["m1",{"t1":"t1b"},["time","f1"],[["1970-01-01T00:00:03Z",3]]],["m2",{"t1":"t1b"},["time","f1"],[["1970-01-01T00:00:01Z",1]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements.snap new file mode 100644 index 00000000..45c35345 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__many_statements.snap @@ -0,0 +1,10 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]],true]],true]]] +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:03Z","t1a",3]]]],true]]] +[[[0,[["m2",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1]]]]]]] +[[[1,[["m1",{"t1":"t1a"},["time","f1"],[["1970-01-01T00:00:01Z",1],["1970-01-01T00:00:02Z",2]]]],true]]] +[[[1,[["m1",{"t1":"t1b"},["time","f1"],[["1970-01-01T00:00:03Z",3]]]],true]]] +[[[1,[["m2",{"t1":"t1b"},["time","f1"],[["1970-01-01T00:00:01Z",1]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk-2.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk-2.snap new file mode 100644 index 00000000..15a643dc --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk-2.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]]]]]] diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk.snap new file mode 100644 index 00000000..15a643dc --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__msgpack__tests__single_chunk.snap @@ -0,0 +1,5 @@ +--- +source: iox_v1_query_api/src/response/msgpack.rs +expression: collect_output(stream).await +--- +[[[0,[["m1",["time","t1","f1"],[["1970-01-01T00:00:01Z","t1a",1],["1970-01-01T00:00:02Z","t1a",2]]]]]]] diff --git 
a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_eq.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_eq.snap new file mode 100644 index 00000000..05cfdbb0 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_eq.snap @@ -0,0 +1,57 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_gt.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_gt.snap new file mode 100644 index 00000000..3f8eacb9 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_gt.snap @@ -0,0 +1,75 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + partial: true +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:05Z" + - 5 +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_lt.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_lt.snap new file mode 100644 index 00000000..1d149291 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_multi_chunk_lt.snap @@ -0,0 +1,48 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - 
"1970-01-01T00:00:05Z" + - 5 +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_single_chunk.snap new file mode 100644 index 00000000..1d149291 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_multi_series_single_chunk.snap @@ -0,0 +1,48 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 +- name: m1 + tags: + tag0: b + tag1: "2" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_eq.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_eq.snap new file mode 100644 index 00000000..746c873e --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_eq.snap @@ -0,0 +1,59 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + - - "1970-01-01T00:00:03Z" + - 3 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 + - - "1970-01-01T00:00:06Z" + - 6 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_gt.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_gt.snap new file mode 100644 index 00000000..369c05e1 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_gt.snap @@ -0,0 +1,68 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - 
"1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:05Z" + - 5 + - - "1970-01-01T00:00:06Z" + - 6 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_lt.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_lt.snap new file mode 100644 index 00000000..65d7b7f2 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_multi_chunk_lt.snap @@ -0,0 +1,32 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_single_chunk.snap new file mode 100644 index 00000000..65d7b7f2 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__many_batches_single_series_single_chunk.snap @@ -0,0 +1,32 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + - - "1970-01-01T00:00:03Z" + - 3 + - - "1970-01-01T00:00:04Z" + - 4 + - - "1970-01-01T00:00:05Z" + - 5 + - - "1970-01-01T00:00:06Z" + - 6 + - - "1970-01-01T00:00:07Z" + - 7 + - - "1970-01-01T00:00:08Z" + - 8 + - - "1970-01-01T00:00:09Z" + - 9 + - - "1970-01-01T00:00:10Z" + - 10 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_multi_chunks.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_multi_chunks.snap new file mode 100644 index 00000000..6da6f7de --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_multi_chunks.snap @@ -0,0 +1,31 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + columns: + - time + - tag0 + - tag1 + - val + values: + - - "1970-01-01T00:00:01Z" + - a + - "1" + - 1 + - - "1970-01-01T00:00:02Z" + - a + - "1" + - 2 + partial: true +- name: m1 + columns: + - time + - tag0 + - tag1 + - val + values: + - - "1970-01-01T00:00:03Z" + - a + - "1" + - 3 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_single_chunk.snap 
b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_single_chunk.snap new file mode 100644 index 00000000..5ccf7d20 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_single_chunk.snap @@ -0,0 +1,23 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + columns: + - time + - tag0 + - tag1 + - val + values: + - - "1970-01-01T00:00:01Z" + - a + - "1" + - 1 + - - "1970-01-01T00:00:02Z" + - a + - "1" + - 2 + - - "1970-01-01T00:00:03Z" + - a + - "1" + - 3 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_zero_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_zero_chunk.snap new file mode 100644 index 00000000..5ccf7d20 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__no_group_by_zero_chunk.snap @@ -0,0 +1,23 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + columns: + - time + - tag0 + - tag1 + - val + values: + - - "1970-01-01T00:00:01Z" + - a + - "1" + - 1 + - - "1970-01-01T00:00:02Z" + - a + - "1" + - 2 + - - "1970-01-01T00:00:03Z" + - a + - "1" + - 3 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_multi_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_multi_chunk.snap new file mode 100644 index 00000000..fb810fba --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_multi_chunk.snap @@ -0,0 +1,27 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + partial: true +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:03Z" + - 3 diff --git a/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_single_chunk.snap b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_single_chunk.snap new file mode 100644 index 00000000..5da76922 --- /dev/null +++ b/iox_v1_query_api/src/response/snapshots/iox_v1_query_api__response__stream__tests__single_batch_single_series_single_chunk.snap @@ -0,0 +1,18 @@ +--- +source: iox_v1_query_api/src/response/stream.rs +expression: chunks +--- +- name: m1 + tags: + tag0: a + tag1: "1" + columns: + - time + - val + values: + - - "1970-01-01T00:00:01Z" + - 1 + - - "1970-01-01T00:00:02Z" + - 2 + - - "1970-01-01T00:00:03Z" + - 3 diff --git a/iox_v1_query_api/src/response/stream.rs b/iox_v1_query_api/src/response/stream.rs new file mode 100644 index 00000000..04ed9b27 --- /dev/null +++ b/iox_v1_query_api/src/response/stream.rs @@ -0,0 +1,980 @@ +use super::SeriesChunk; +use crate::Result; +use crate::error::Error; +use arrow::array::RecordBatch; +use arrow::compute::partition; +use arrow::datatypes::SchemaRef; +use futures::{Stream, ready}; +use generated_types::influxdata::iox::querier::v1::InfluxQlMetadata; +use schema::INFLUXQL_METADATA_KEY; +use std::{ + collections::{BTreeMap, BTreeSet}, + num::NonZeroUsize, + ops::Range, + pin::Pin, + sync::Arc, + 
task::{Context, Poll},
+};
+
+/// Stream that processes a stream of [SeriesChunk]s, merging
+/// subsequent chunks that are for the same series, and splitting chunks
+/// that are larger than the specified chunk size.
+pub(crate) struct SeriesChunkMergeStream<S> {
+    input: S,
+    chunk_size: Option<NonZeroUsize>,
+
+    current: Option<SeriesChunk>,
+}
+
+impl<S> SeriesChunkMergeStream<S> {
+    pub(crate) fn new(input: S, chunk_size: Option<NonZeroUsize>) -> Self {
+        Self {
+            input,
+            chunk_size,
+            current: None,
+        }
+    }
+}
+
+impl<S> Stream for SeriesChunkMergeStream<S>
+where
+    S: Stream<Item = Result<SeriesChunk>> + Unpin,
+{
+    type Item = Result<SeriesChunk>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.get_mut();
+        let mut input = Pin::new(&mut this.input);
+        loop {
+            if let Some(chunk_size) = this.chunk_size {
+                let chunk_size: usize = chunk_size.into();
+                if let Some(current) = this.current.take() {
+                    if current.num_rows() > chunk_size {
+                        let (mut left, right) = current.split_at(chunk_size);
+                        left.partial = true;
+                        this.current = Some(right);
+                        return Poll::Ready(Some(Ok(left)));
+                    } else {
+                        this.current = Some(current);
+                    }
+                }
+            }
+
+            match ready!(input.as_mut().poll_next(cx)) {
+                None => {
+                    return if let Some(current) = this.current.take() {
+                        Poll::Ready(Some(Ok(current)))
+                    } else {
+                        Poll::Ready(None)
+                    };
+                }
+                Some(Err(e)) => return Poll::Ready(Some(Err(e))),
+                Some(Ok(chunk)) => {
+                    match this.current {
+                        Some(ref mut current) => {
+                            if current.series() == chunk.series() {
+                                current.merge(chunk);
+                            } else {
+                                let current = this.current.take().unwrap(); // safe unwrap due to check above
+                                this.current = Some(chunk);
+                                return Poll::Ready(Some(Ok(current)));
+                            }
+                        }
+                        None => {
+                            this.current = Some(chunk);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// SeriesChunkStream processes a stream of [RecordBatch]es, breaking
+/// each one into [SeriesChunk]s. Each [SeriesChunk] contains the data
+/// from a single series.
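+///
+/// A rough usage sketch of how the two adapters compose; the
+/// `record_batch_stream`, `schema`, and `chunk_size` bindings below are
+/// illustrative placeholders, not part of this module's API:
+///
+/// ```ignore
+/// // Break each RecordBatch into per-series chunks...
+/// let chunks = SeriesChunkStream::try_new(record_batch_stream, schema)?;
+/// // ...then merge adjacent chunks of the same series and re-split them
+/// // to the requested chunk size (`None` disables splitting).
+/// let chunks = SeriesChunkMergeStream::new(chunks, NonZeroUsize::new(chunk_size));
+/// ```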
+pub(crate) struct SeriesChunkStream { + /// A stream that returns [RecordBatch]es + record_batch_stream: S, + + measurement: usize, + tag_columns: Arc, usize>>, + value_columns: Arc<[(Arc, usize)]>, + + batch: RecordBatch, + partitions: Vec>, + current_partition: usize, +} + +impl SeriesChunkStream { + pub(crate) fn try_new(record_batch_stream: S, schema: SchemaRef) -> Result { + let md = schema.metadata.get(INFLUXQL_METADATA_KEY).ok_or( + datafusion::error::DataFusionError::Internal( + "Missing INFLUXQL_METADATA in RecordBatch schema".to_owned(), + ), + )?; + let iox_metadata: InfluxQlMetadata = serde_json::from_str(md.as_str()) + .map_err(|x| datafusion::error::DataFusionError::Internal(x.to_string()))?; + + let measurement = iox_metadata.measurement_column_index as usize; + let mut elided_columns = BTreeSet::new(); + elided_columns.insert(measurement); + let tags = iox_metadata + .tag_key_columns + .iter() + .inspect(|x| { + if !x.is_projected { + elided_columns.insert(x.column_index as usize); + } + }) + .map(|x| (Arc::from(x.tag_key.as_str()), x.column_index as usize)) + .collect::>(); + let mut columns = Vec::new(); + + schema.fields().iter().enumerate().for_each(|(i, f)| { + if !elided_columns.contains(&i) { + columns.push((Arc::from(f.name().as_str()), i)); + } + }); + + Ok(Self { + record_batch_stream, + measurement, + tag_columns: Arc::from(tags), + value_columns: Arc::from(columns), + batch: RecordBatch::new_empty(schema), + partitions: Vec::new(), + current_partition: 0, + }) + } + + fn chunk(&self, range: &Range, batch: &RecordBatch) -> SeriesChunk { + let batch = batch.slice(range.start, range.end - range.start); + SeriesChunk::new( + self.measurement, + Arc::clone(&self.tag_columns), + Arc::clone(&self.value_columns), + batch, + ) + } + + fn get_partitions_from_record_batch( + &self, + batch: &RecordBatch, + ) -> datafusion::common::Result>> { + let mut tag_keys_columns = Vec::with_capacity(self.tag_columns.len() + 1); + tag_keys_columns.push(Arc::clone(batch.column(self.measurement))); + for (_, idx) in self.tag_columns.iter() { + tag_keys_columns.push(Arc::clone(batch.column(*idx))); + } + Ok(partition(tag_keys_columns.as_slice())?.ranges()) + } +} + +impl SeriesChunkStream +where + S: Stream> + Unpin, +{ + fn poll_next_inner(&mut self, cx: &mut Context<'_>) -> Poll>> { + Pin::new(&mut self.record_batch_stream) + .poll_next(cx) + .map_err(Error::from) + } +} + +impl Stream for SeriesChunkStream +where + S: Stream> + Unpin, +{ + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + while this.current_partition >= this.partitions.len() { + match ready!(this.poll_next_inner(cx)) { + None => return Poll::Ready(None), + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + Some(Ok(batch)) => { + this.partitions = this.get_partitions_from_record_batch(&batch)?; + this.batch = batch; + this.current_partition = 0; + } + } + } + let chunk = this.chunk(&this.partitions[this.current_partition], &this.batch); + this.current_partition += 1; + Poll::Ready(Some(Ok(chunk))) + } +} + +#[cfg(test)] +mod tests { + use crate::response::SeriesChunkSerializer; + + use super::super::tests::{Column, make_schema_and_batches}; + use super::*; + use arrow::array::{Array, DictionaryArray, Int64Array, StringArray, TimestampNanosecondArray}; + use arrow::datatypes::Int32Type; + use datafusion_util::MemoryStream; + use futures::TryStreamExt; + + macro_rules! 
insta_assert_yaml_snapshot { + ($INPUT:expr) => { + let chunks = $INPUT + .iter() + .map(|s| SeriesChunkSerializer::new(s, None, true)) + .collect::>(); + + insta::assert_yaml_snapshot!(chunks); + }; + } + + #[tokio::test] + async fn test_no_batch() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: false, + projected: false, + }, + Column::Field { name: "val" }, + ], + + data: vec![], // no record batch + chunk_size: None, + }) + .await; + + assert!(chunks.is_empty()); + } + + #[tokio::test] + async fn test_no_group_by_single_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: false, // no group by + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: false, // no group by + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: None, // single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_no_group_by_multi_chunks() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: false, // no group by + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: false, // no group by + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: Some(2), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_no_group_by_zero_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: false, // no group by + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: false, // no group by + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: Some(0), // this is equivalent to None, i.e. 
single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_single_batch_single_series_single_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, // group by tag0 + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, // group by tag1 + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: None, // single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_single_batch_single_series_multi_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, // group by tag0 + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, // group by tag1 + projected: false, + }, + Column::Field { name: "val" }, + ], + // single record batch + data: vec![vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ]], + chunk_size: Some(2), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_single_series_single_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "a", "a", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "1", "1", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: None, // single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_single_series_multi_chunk_gt() { + // Testing batch 
size > chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "a", "a", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "1", "1", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(2), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_single_series_multi_chunk_eq() { + // Testing batch size = chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "a", "a", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "1", "1", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(3), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_single_series_multi_chunk_lt() { + // Testing batch size < chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + 
projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // single series: a1 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "a", "a", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "1", "1", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(12), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_multi_series_single_chunk() { + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // multi series: a1, b2 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "b"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "2"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "b", "b", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "2", "2", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: None, // single chunk + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_multi_series_multi_chunk_lt() { + // Testing batch size < chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // multi series: a1, b2 + 
Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "b"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "2"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "b", "b", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "2", "2", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(12), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_multi_series_multi_chunk_eq() { + // Testing batch size = chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // multi series: a1, b2 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "b"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "2"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "b", "b", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "2", "2", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(3), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + #[tokio::test] + async fn test_many_batches_multi_series_multi_chunk_gt() { + // Testing batch size > chunk size + + let chunks = make_chunks(TestParams { + columns: vec![ + Column::Measurement, + Column::Time, + Column::Tag { + name: "tag0", + group_by: true, + projected: false, + }, + Column::Tag { + name: "tag1", + group_by: true, + projected: false, + }, + Column::Field { name: "val" }, + ], + data: vec![ + // record batch 1 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 1000000000, 2000000000, 3000000000, + ])), + // multi series: a1, b2 + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "b"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "2"])), + Arc::new(Int64Array::from(vec![1, 2, 3])), + ], + // record batch 2 + vec![ + Arc::new(StringArray::from(vec!["m1", 
"m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 4000000000, 5000000000, 6000000000, 7000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "b", "b", "a", "a", + ])), + Arc::new(DictionaryArray::::from_iter(vec![ + "2", "2", "1", "1", + ])), + Arc::new(Int64Array::from(vec![4, 5, 6, 7])), + ], + // record batch 3 + vec![ + Arc::new(StringArray::from(vec!["m1", "m1", "m1"])), + Arc::new(TimestampNanosecondArray::from(vec![ + 8000000000, + 9000000000, + 10000000000, + ])), + Arc::new(DictionaryArray::::from_iter(vec!["a", "a", "a"])), + Arc::new(DictionaryArray::::from_iter(vec!["1", "1", "1"])), + Arc::new(Int64Array::from(vec![8, 9, 10])), + ], + ], + chunk_size: Some(2), // multi chunks + }) + .await; + + insta_assert_yaml_snapshot!(chunks); + } + + struct TestParams { + columns: Vec, + data: Vec>>, + chunk_size: Option, + } + + async fn make_chunks(params: TestParams) -> Vec { + let TestParams { + columns, + data, + chunk_size, + } = params; + + let (schema, batches) = make_schema_and_batches(columns, data); + + let stream = MemoryStream::new_with_schema(batches, Arc::clone(&schema)); + let stream = SeriesChunkStream::try_new(stream, schema).unwrap(); + let stream = SeriesChunkMergeStream::new(stream, chunk_size.and_then(NonZeroUsize::new)); + + let chunks: Result> = stream.try_collect().await; + chunks.unwrap() + } +} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases.snap new file mode 100644 index 00000000..f3788aa2 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW DATABASES -- on handler with SHOW DATABASES enabled +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"databases","columns":["name"],"values":[["foo"],["bar"]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz-2.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz-2.snap new file mode 100644 index 00000000..f968d2cc --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz-2.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW DATABASES -- should return mop database after adding to authz +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"databases","columns":["name"],"values":[["foo"],["bar"],["mop"]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz.snap new file mode 100644 index 00000000..2dd37332 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_authz.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW DATABASES -- should not return mop database due to authz +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"databases","columns":["name"],"values":[["foo"],["bar"]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_no_impl.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_no_impl.snap new file mode 100644 index 00000000..150825be --- /dev/null 
+++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_databases_with_no_impl.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW DATABASES -- on handler with SHOW DATABASES _not_ enabled +expression: res +--- +{"results":[{"statement_id":0,"error":"must specify a 'db' parameter, or provide the database in the InfluxQL query"}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-2.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-2.snap new file mode 100644 index 00000000..5a2e5ab8 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-2.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: "SHOW RETENTION POLICIES -- on `bar` database which contains one default policy and one non-default policy" +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"retention_policies","columns":["name","duration","shardGroupDuration","replicaN","futureWriteLimit","pastWriteLimit","default"],"values":[["autogen","0ns","604800s",1,"0ns","0ns",true],["short","100s","604800s",1,"0ns","0ns",false]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-3.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-3.snap new file mode 100644 index 00000000..d17c3991 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies-3.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: "SHOW RETENTION POLICIES -- on `frodo` database which does not exist" +expression: res +--- +{"results":[{"statement_id":0,"error":"datafusion error: Error during planning: database not found: frodo"}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies.snap new file mode 100644 index 00000000..07dbc525 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: "SHOW RETENTION POLICIES -- on `foo` database which contains one default policy" +expression: res +--- +{"results":[{"statement_id":0,"series":[{"name":"retention_policies","columns":["name","duration","shardGroupDuration","replicaN","futureWriteLimit","pastWriteLimit","default"],"values":[["autogen","0ns","604800s",1,"0ns","0ns",true]]}]}]} diff --git a/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies_with_no_impl.snap b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies_with_no_impl.snap new file mode 100644 index 00000000..cdc81d88 --- /dev/null +++ b/iox_v1_query_api/src/snapshots/iox_v1_query_api__handler__tests__show_retention_policies_with_no_impl.snap @@ -0,0 +1,6 @@ +--- +source: iox_v1_query_api/src/handler.rs +description: SHOW RETENTION POLICIES -- on handler with SHOW RETENTION POLICIES _not_ enabled +expression: res +--- +{"results":[{"statement_id":0,"error":"Database foo not found"}]} diff --git a/iox_v1_query_api/src/types.rs b/iox_v1_query_api/src/types.rs new file mode 100644 index 00000000..04821d34 --- /dev/null +++ b/iox_v1_query_api/src/types.rs @@ -0,0 +1,131 @@ +use arrow::datatypes::SchemaRef; 
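// [Illustrative aside, not part of the patch] The handler snapshots above all use the
// InfluxDB v1 `/query` JSON response shape. A minimal sketch of consuming that shape
// with plain `serde_json` (the function name is invented; the literal is copied from
// the `show_databases` snapshot):
fn _v1_response_shape_sketch() {
    let res: serde_json::Value = serde_json::from_str(
        r#"{"results":[{"statement_id":0,"series":[{"name":"databases","columns":["name"],"values":[["foo"],["bar"]]}]}]}"#,
    )
    .unwrap();
    // Pick fields out of the nested results/series structure.
    assert_eq!(res["results"][0]["series"][0]["name"], "databases");
    assert_eq!(res["results"][0]["series"][0]["values"][1][0], "bar");
}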
+use datafusion::physical_plan::SendableRecordBatchStream; +use serde::{Deserialize, Serialize}; + +use iox_query::query_log::PermitAndToken; + +/// UNIX epoch precision. +/// Doc: +#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)] +pub enum Precision { + #[serde(rename = "ns")] + Nanoseconds, + #[serde(rename = "u", alias = "µ")] + Microseconds, + #[serde(rename = "ms")] + Milliseconds, + #[serde(rename = "s")] + Seconds, + #[serde(rename = "m")] + Minutes, + #[serde(rename = "h")] + Hours, + #[serde(rename = "d")] + Days, + #[serde(rename = "w")] + Weeks, +} + +/// An executing InfluxQL statement that produces results that can be +/// streamed as CSV. +pub(crate) struct Statement { + pub schema: SchemaRef, + /// Optional Permit/Token to support commands such as `SHOW DATABASES`, + /// which do not go through the query planner/executor. + pub permit_state: Option<PermitAndToken>, + pub stream: SendableRecordBatchStream, +} + +impl Statement { + pub(crate) fn new( + schema: SchemaRef, + permit_state: Option<PermitAndToken>, + stream: SendableRecordBatchStream, + ) -> Self { + Self { + schema, + permit_state, + stream, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use serde_json::{from_str, to_string}; + + #[test] + fn test_precision_deserialize() { + // Test standard rename attributes + assert_eq!( + from_str::<Precision>(r#""ns""#).unwrap(), + Precision::Nanoseconds + ); + assert_eq!( + from_str::<Precision>(r#""u""#).unwrap(), + Precision::Microseconds + ); + assert_eq!( + from_str::<Precision>(r#""ms""#).unwrap(), + Precision::Milliseconds + ); + assert_eq!(from_str::<Precision>(r#""s""#).unwrap(), Precision::Seconds); + assert_eq!(from_str::<Precision>(r#""m""#).unwrap(), Precision::Minutes); + assert_eq!(from_str::<Precision>(r#""h""#).unwrap(), Precision::Hours); + assert_eq!(from_str::<Precision>(r#""d""#).unwrap(), Precision::Days); + assert_eq!(from_str::<Precision>(r#""w""#).unwrap(), Precision::Weeks); + + // Test the alias for Microseconds + assert_eq!( + from_str::<Precision>(r#""µ""#).unwrap(), + Precision::Microseconds + ); + } + + #[test] + fn test_precision_serialize() { + // Test that each enum variant serializes to the expected string + assert_eq!(to_string(&Precision::Nanoseconds).unwrap(), r#""ns""#); + assert_eq!(to_string(&Precision::Microseconds).unwrap(), r#""u""#); // Note: serializes to "u", not "µ" + assert_eq!(to_string(&Precision::Milliseconds).unwrap(), r#""ms""#); + assert_eq!(to_string(&Precision::Seconds).unwrap(), r#""s""#); + assert_eq!(to_string(&Precision::Minutes).unwrap(), r#""m""#); + assert_eq!(to_string(&Precision::Hours).unwrap(), r#""h""#); + assert_eq!(to_string(&Precision::Days).unwrap(), r#""d""#); + assert_eq!(to_string(&Precision::Weeks).unwrap(), r#""w""#); + } + + #[test] + fn test_precision_error_cases() { + // Test invalid inputs + let invalid_result = from_str::<Precision>(r#""invalid""#); + assert!(invalid_result.is_err()); + + // Test case sensitivity (serde is case-sensitive by default) + let uppercase_result = from_str::<Precision>(r#""NS""#); + assert!(uppercase_result.is_err()); + } + + #[test] + fn test_precision_roundtrip() { + // Test serialization and deserialization roundtrip for all variants + let variants = vec![ + Precision::Nanoseconds, + Precision::Microseconds, + Precision::Milliseconds, + Precision::Seconds, + Precision::Minutes, + Precision::Hours, + Precision::Days, + Precision::Weeks, + ]; + + for variant in variants { + let serialized = to_string(&variant).unwrap(); + let deserialized = from_str::<Precision>(&serialized).unwrap(); + assert_eq!(variant, deserialized); + } + } +} diff --git a/iox_v1_query_api/src/value.rs
b/iox_v1_query_api/src/value.rs new file mode 100644 index 00000000..1686fb9c --- /dev/null +++ b/iox_v1_query_api/src/value.rs @@ -0,0 +1,303 @@ +//! Types to represent values produced by the InfluxQL queries in +//! InfluxDB. These types are used to serialize the results for the +//! v1 API. +use arrow::array::timezone::Tz; +use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray}; +use arrow::datatypes::{ + DataType, Float64Type, Int32Type, Int64Type, TimeUnit, TimestampNanosecondType, UInt64Type, +}; +use arrow::temporal_conversions::timestamp_ns_to_datetime; +use chrono::{DateTime, SecondsFormat, Utc}; +use serde::Serialize; +use std::fmt; +use std::str::FromStr; +use std::sync::Arc; + +use crate::types::Precision; + +/// The InfluxQL type of a value. +#[derive(Debug, PartialEq)] +pub(crate) enum ValueType { + Boolean, + Integer, + Float, + String, + Timestamp(Option<Arc<str>>), + Unsigned, + Null, +} + +/// A value produced by an InfluxQL query. This is a reference to a +/// single element of an arrow array. +pub(crate) struct Value { + arr: ArrayRef, + row: usize, +} + +impl Value { + /// Create a new value wrapping the element at `row` in the `arr` + /// array. + pub(crate) fn new(arr: &ArrayRef, row: usize) -> Self { + Self { + arr: ArrayRef::clone(arr), + row, + } + } + + /// Return the InfluxQL type of the value. + pub(crate) fn value_type(&self) -> ValueType { + match self.arr.data_type() { + DataType::Boolean => ValueType::Boolean, + DataType::Int64 => ValueType::Integer, + DataType::Float64 | DataType::Float32 | DataType::Float16 => ValueType::Float, + DataType::Utf8 => ValueType::String, + DataType::Dictionary(k, v) + if k.equals_datatype(&DataType::Int32) && v.equals_datatype(&DataType::Utf8) => + { + ValueType::String + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => ValueType::Timestamp(tz.clone()), + DataType::UInt64 => ValueType::Unsigned, + DataType::Null => ValueType::Null, + dt => panic!("Unsupported InfluxQL data type: {dt}"), + } + } + + /// Return the value as a boolean, if it is one. + pub(crate) fn as_boolean_opt(&self) -> Option<bool> { + if self.arr.is_valid(self.row) { + self.arr.as_boolean_opt().map(|a| a.value(self.row)) + } else { + None + } + } + + /// Return the value as an integer, if it is one. + pub(crate) fn as_integer_opt(&self) -> Option<i64> { + self.as_primitive_opt::<Int64Type>() + } + + /// Return the value as a float, if it is one. + pub(crate) fn as_float_opt(&self) -> Option<f64> { + self.as_primitive_opt::<Float64Type>() + } + + fn as_primitive_opt<T: ArrowPrimitiveType>(&self) -> Option<T::Native> { + if self.arr.is_valid(self.row) { + self.arr.as_primitive_opt::<T>().map(|a| a.value(self.row)) + } else { + None + } + } + + /// Return the value as a string, if it is one. + pub(crate) fn as_string_opt(&self) -> Option<&str> { + if self.arr.is_valid(self.row) { + let (arr, idx) = match self.arr.as_dictionary_opt::<Int32Type>() { + Some(a) => (a.values(), a.key(self.row)), + None => (&self.arr, Some(self.row)), + }; + idx.and_then(|idx| arr.as_string_opt::<i32>().map(|a| a.value(idx))) + } else { + None + } + } + + /// Return the value as a timestamp, if it is one. + pub(crate) fn as_timestamp_opt(&self) -> Option<DateTime<Utc>> { + self.as_primitive_opt::<TimestampNanosecondType>() + .and_then(timestamp_ns_to_datetime) + .map(|t| t.and_utc()) + } + + /// Return the value as an unsigned integer, if it is one.
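// [Illustrative aside, not part of the patch] How the accessors above behave for a plain
// Int64 column, including null handling via `Array::is_valid` (values are arbitrary):
//
//     let arr: ArrayRef = Arc::new(arrow::array::Int64Array::from(vec![Some(7), None]));
//     assert_eq!(Value::new(&arr, 0).value_type(), ValueType::Integer);
//     assert_eq!(Value::new(&arr, 0).as_integer_opt(), Some(7));
//     assert_eq!(Value::new(&arr, 1).as_integer_opt(), None);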
+ pub(crate) fn as_unsigned_opt(&self) -> Option { + self.as_primitive_opt::() + } +} + +impl fmt::Debug for Value { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.value_type() { + ValueType::Boolean => write!(f, "{:?}", self.as_boolean_opt()), + ValueType::Integer => write!(f, "{:?}", self.as_integer_opt()), + ValueType::Float => write!(f, "{:?}", self.as_float_opt()), + ValueType::String => write!(f, "{:?}", self.as_string_opt()), + ValueType::Timestamp(_) => write!(f, "{:?}", self.as_timestamp_opt()), + ValueType::Unsigned => write!(f, "{:?}", self.as_unsigned_opt()), + ValueType::Null => write!(f, "null"), + } + } +} + +impl fmt::Display for Value { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.value_type() { + ValueType::Boolean => self + .as_boolean_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::Integer => self + .as_integer_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::Float => self + .as_float_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::String => self + .as_string_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::Timestamp(tz) => { + let tz = tz + .and_then(|tz| Tz::from_str(&tz).ok()) + .unwrap_or_else(|| Tz::from_str("UTC").unwrap()); + self.as_timestamp_opt() + .map(|t| write!(f, "{}", t.with_timezone(&tz).to_rfc3339())) + .unwrap_or(Ok(())) + } + ValueType::Unsigned => self + .as_unsigned_opt() + .map(|v| write!(f, "{v}")) + .unwrap_or(Ok(())), + ValueType::Null => Ok(()), + } + } +} + +impl PartialEq for Value { + fn eq(&self, other: &Self) -> bool { + if self.value_type() != other.value_type() { + return false; + } + match self.value_type() { + ValueType::Boolean => self.as_boolean_opt() == other.as_boolean_opt(), + ValueType::Integer => self.as_integer_opt() == other.as_integer_opt(), + ValueType::Float => self.as_float_opt() == other.as_float_opt(), + ValueType::String => self.as_string_opt() == other.as_string_opt(), + ValueType::Timestamp(_) => self.as_timestamp_opt() == other.as_timestamp_opt(), + ValueType::Unsigned => self.as_unsigned_opt() == other.as_unsigned_opt(), + ValueType::Null => true, + } + } +} + +pub(crate) struct ValueSerializer<'a> { + value: &'a Value, + epoch: Option, + // Allow infinite values + allow_inf: bool, +} + +impl<'a> ValueSerializer<'a> { + pub(crate) fn new(value: &'a Value, epoch: Option, allow_inf: bool) -> Self { + Self { + value, + epoch, + allow_inf, + } + } +} + +impl Serialize for ValueSerializer<'_> { + fn serialize(&self, serializer: S) -> Result { + match self.value.value_type() { + ValueType::Boolean => { + if let Some(v) = self.value.as_boolean_opt() { + serializer.serialize_bool(v) + } else { + serializer.serialize_none() + } + } + ValueType::Integer => { + if let Some(v) = self.value.as_integer_opt() { + serializer.serialize_i64(v) + } else { + serializer.serialize_none() + } + } + ValueType::Float => { + if let Some(v) = self.value.as_float_opt() { + if v.fract() == 0.0 && (v.abs() < (i64::MAX as f64)) { + // Only turn x.0 into x if it is small enough to fit in an i64. + // For example, 100.0 becomes 100. + // But 1_000_000_000_000_000_000_000.0 still stays as 1_000_000_000_000_000_000_000.0 + // because it is too large to fit in an i64. 
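// [Added note] `i64::MAX as f64` rounds up to 2^63, so the strict `<` above effectively
// checks `|v| < 2^63`; and because Rust float-to-int casts saturate, the `as i64`
// conversion below stays well-defined even at the boundary.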
+ serializer.serialize_i64(v as i64) + } else if v.is_infinite() && !self.allow_inf { + // JSON and /query 1.x doesn't support infinite values + // + // https://www.rfc-editor.org/rfc/rfc4627#:~:text=Numeric%20values%20that%20cannot%20be%20represented%20as%20sequences%20of%20digits%0A%20%20%20(such%20as%20Infinity%20and%20NaN)%20are%20not%20permitted. + if v > 0.0 { + Err(serde::ser::Error::custom("json: unsupported value: +Inf")) + } else { + Err(serde::ser::Error::custom("json: unsupported value: -Inf")) + } + } else if v.is_nan() { + // /query 1.x serilizes NaN as null for json and msgpack + serializer.serialize_none() + } else { + serializer.serialize_f64(v) + } + } else { + serializer.serialize_none() + } + } + ValueType::String => { + if let Some(v) = self.value.as_string_opt() { + serializer.serialize_str(v) + } else { + serializer.serialize_none() + } + } + ValueType::Timestamp(tz) => { + if let Some(v) = self.value.as_timestamp_opt() { + match self.epoch { + Some(Precision::Nanoseconds) => { + chrono::serde::ts_nanoseconds::serialize(&v, serializer) + } + Some(Precision::Microseconds) => { + chrono::serde::ts_microseconds::serialize(&v, serializer) + } + Some(Precision::Milliseconds) => { + chrono::serde::ts_milliseconds::serialize(&v, serializer) + } + Some(Precision::Seconds) => { + chrono::serde::ts_seconds::serialize(&v, serializer) + } + Some(Precision::Minutes) => serializer.serialize_i64(v.timestamp() / 60), + Some(Precision::Hours) => { + serializer.serialize_i64(v.timestamp() / (60 * 60)) + } + Some(Precision::Days) => { + serializer.serialize_i64(v.timestamp() / (60 * 60 * 24)) + } + Some(Precision::Weeks) => { + serializer.serialize_i64(v.timestamp() / (60 * 60 * 24 * 7)) + } + None => match tz.and_then(|tz| Tz::from_str(tz.as_ref()).ok()) { + Some(tz) => v + .with_timezone(&tz) + .to_rfc3339_opts(SecondsFormat::AutoSi, true) + .serialize(serializer), + None => v + .to_rfc3339_opts(SecondsFormat::AutoSi, true) + .serialize(serializer), + }, + } + } else { + serializer.serialize_none() + } + } + ValueType::Unsigned => { + if let Some(v) = self.value.as_unsigned_opt() { + serializer.serialize_u64(v) + } else { + serializer.serialize_none() + } + } + ValueType::Null => serializer.serialize_none(), + } + } +} diff --git a/jemalloc_stats/Cargo.toml b/jemalloc_stats/Cargo.toml index 44918295..54d7313d 100644 --- a/jemalloc_stats/Cargo.toml +++ b/jemalloc_stats/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true [dependencies] snafu = "0.8" tikv-jemalloc-ctl = { version = "0.5.4", features = ["use_std"] } -tokio = { version = "1.47.1", features = ["rt", "sync", "time"] } +tokio = { version = "1.48.0", features = ["rt", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [lints] @@ -18,4 +18,4 @@ workspace = true tikv-jemallocator = { version = "0.5", features = [ "unprefixed_malloc_on_supported_platforms", ] } -tokio = { version = "1.47.1", features = ["macros", "rt-multi-thread"] } +tokio = { version = "1.48.0", features = ["macros", "rt-multi-thread"] } diff --git a/jemalloc_stats/src/lib.rs b/jemalloc_stats/src/lib.rs index 38b3e87c..88a17358 100644 --- a/jemalloc_stats/src/lib.rs +++ b/jemalloc_stats/src/lib.rs @@ -1,6 +1,6 @@ #![expect(missing_copy_implementations)] -use std::{sync::LazyLock, time::Duration}; +use std::{sync::OnceLock, time::Duration}; use tikv_jemalloc_ctl::{epoch as epoch_ctl, stats}; use tokio::{sync::watch, task::JoinHandle}; @@ -17,8 +17,9 @@ pub use monitor::{AllocationMonitor, AllocationMonitorError}; /// 
[`Refresher::handle()`] to obtain periodic updates. /// /// The first reference to [`STATS`] MUST be made from within an async tokio -/// runtime. -pub static STATS: LazyLock = LazyLock::new(Refresher::new); +/// runtime because a background tokio task is spawned by the initialised +/// [`Refresher`]. +pub static STATS: OnceLock = OnceLock::new(); /// Defines the frequency at which updated [`Stats`] are obtained and published. /// @@ -58,7 +59,7 @@ impl Refresher { /// Construct a new [`Stats`]. /// /// Intentionally non-pub to enforce a singleton exposed via [`STATS`]. - fn new() -> Self { + pub fn new(tick_duration: Duration) -> Self { let (tx, rx) = watch::channel(Stats::default()); Self { @@ -66,7 +67,7 @@ impl Refresher { // Spawn a background task to ask jemalloc to refresh the statistics // periodically, and publish the result. - refresh_task: tokio::task::spawn(refresh(tx)), + refresh_task: tokio::task::spawn(refresh(tx, tick_duration)), } } @@ -82,8 +83,8 @@ impl Refresher { /// ```rust /// # fn do_slow_thing() {} /// # let _guard = tokio::runtime::Runtime::new().unwrap().enter(); - /// # - /// let handle = jemalloc_stats::STATS.handle(); + /// # let REFRESH_INTERVAL = std::time::Duration::from_millis(9100); + /// let handle = jemalloc_stats::STATS.get_or_init(|| jemalloc_stats::Refresher::new(REFRESH_INTERVAL)).handle(); /// /// // Good: /// let stats = handle.borrow().clone(); @@ -104,7 +105,7 @@ impl Drop for Refresher { } } -async fn refresh(tx: watch::Sender) { +async fn refresh(tx: watch::Sender, tick_duration: Duration) { let epoch = epoch_ctl::mib().unwrap(); let active = stats::active::mib().unwrap(); let allocated = stats::allocated::mib().unwrap(); @@ -135,7 +136,7 @@ async fn refresh(tx: watch::Sender) { return; } - tokio::time::sleep(REFRESH_INTERVAL).await; + tokio::time::sleep(tick_duration).await; } } @@ -147,7 +148,8 @@ mod tests { /// reported. #[tokio::test] async fn test_stats() { - let handle = STATS.handle(); + let stats = STATS.get_or_init(|| Refresher::new(REFRESH_INTERVAL)); + let handle = stats.handle(); tokio::time::timeout(Duration::from_secs(10), async move { loop { diff --git a/linear_buffer/Cargo.toml b/linear_buffer/Cargo.toml new file mode 100644 index 00000000..188fdadf --- /dev/null +++ b/linear_buffer/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "linear_buffer" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +# avoid compiling all the workspace-hack dependencies for MIRI tests +[target.'cfg(not(miri))'.dependencies] +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] diff --git a/linear_buffer/src/allocation.rs b/linear_buffer/src/allocation.rs new file mode 100644 index 00000000..0e7a5881 --- /dev/null +++ b/linear_buffer/src/allocation.rs @@ -0,0 +1,77 @@ +//! Allocation-related tools. + +use std::{ + alloc::Layout, + mem::MaybeUninit, + num::NonZeroUsize, + ops::{Deref, DerefMut}, + ptr::NonNull, +}; + +/// An allocation of potentially uninitialized memory. +/// +/// This is basically `Box<[MaybeUninit]>` but allows us to control the alignment as well. +pub(crate) struct Allocation { + layout: Layout, + ptr: NonNull, +} + +impl Allocation { + /// Create new allocation with given size and alignment. 
+ pub(crate) fn new(size: usize, alignment: NonZeroUsize) -> Self { + let layout = Layout::array::(size) + .expect("size fits `isize`") + .align_to(alignment.get()) + .expect("valid alignment"); + + let ptr = if size == 0 { + // That's basically what the standard library does for empty `Vec`s. We are allowed to create an empty + // slice based on this pointer. + NonNull::::without_provenance(alignment) + } else { + // SAFETY: we made sure that the size is non-zero + let ptr = unsafe { std::alloc::alloc(layout) }; + + match NonNull::new(ptr) { + Some(ptr) => ptr, + None => { + panic!("cannot allocate {size} bytes with alignment {alignment}") + } + } + }; + + Self { layout, ptr } + } + + /// Correctly typed pointer. + fn ptr(&self) -> NonNull> { + self.ptr.cast() + } +} + +impl Drop for Allocation { + fn drop(&mut self) { + let Self { layout, ptr } = self; + + if layout.size() != 0 { + // SAFETY: this is a valid pointer and there are no dangling references + unsafe { std::alloc::dealloc(ptr.as_ptr(), *layout) }; + } + } +} + +impl Deref for Allocation { + type Target = [MaybeUninit]; + + fn deref(&self) -> &Self::Target { + // SAFETY: this is a valid pointer + unsafe { std::slice::from_raw_parts(self.ptr().as_ptr(), self.layout.size()) } + } +} + +impl DerefMut for Allocation { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: this is a valid pointer + unsafe { std::slice::from_raw_parts_mut(self.ptr().as_ptr(), self.layout.size()) } + } +} diff --git a/linear_buffer/src/extend.rs b/linear_buffer/src/extend.rs new file mode 100644 index 00000000..64b406b2 --- /dev/null +++ b/linear_buffer/src/extend.rs @@ -0,0 +1,243 @@ +//! Extensions traits for [`LinearBuffer`] to simplify common tasks. + +use std::mem::MaybeUninit; + +use crate::LinearBuffer; + +/// Extension methods for [`LinearBuffer`] that are a safe combination of [`tail`](LinearBuffer::tail) and +/// [`bump`](LinearBuffer::bump). +pub trait LinearBufferExtend { + /// Append data to buffer. + /// + /// # Panic + /// There must be enough space left. In case of a panic, the buffer will be left untouched. + /// + /// # Example + /// ``` + /// # use linear_buffer::{LinearBuffer, LinearBufferExtend}; + /// let mut buffer = LinearBuffer::new(6); + /// + /// buffer.append(b"foo"); + /// buffer.append(b"bar"); + /// + /// assert_eq!( + /// buffer.slice_initialized_part(..).as_ref(), + /// b"foobar", + /// ); + /// ``` + fn append(&mut self, data: &[u8]); + + /// Extend buffer with constant value. + /// + /// This can be used for example to zero-extend the buffer without allocating a temporary slice for + /// [`append`](Self::append). + /// + /// # Panic + /// There must be enough space left. In case of a panic, the buffer will be left untouched. 
+ /// + /// # Example + /// ``` + /// # use linear_buffer::{LinearBuffer, LinearBufferExtend}; + /// let mut buffer = LinearBuffer::new(6); + /// + /// buffer.fill(0, 2); + /// buffer.fill(0xff, 4); + /// + /// assert_eq!( + /// buffer.slice_initialized_part(..).as_ref(), + /// [0, 0, 0xff, 0xff, 0xff, 0xff], + /// ); + /// ``` + fn fill(&mut self, value: u8, n: usize); +} + +impl LinearBufferExtend for LinearBuffer { + fn append(&mut self, data: &[u8]) { + let space_left = self.space_left(); + assert!( + data.len() <= space_left, + "want to append {} bytes but buffer only has {space_left} bytes left", + data.len(), + ); + + let tail = self.tail(); + + // SAFETY: we've just checked that there is enough space left + let target = unsafe { tail.get_unchecked_mut(0..data.len()) }; + + // there is no good stable way to write a slice, see: + // https://github.com/rust-lang/rust/issues/79995 + // so we gonna hand-roll that + + // SAFETY: &[T] and &[MaybeUninit] have the same layout + let uninit_src: &[MaybeUninit] = unsafe { std::mem::transmute(data) }; + target.copy_from_slice(uninit_src); + + // SAFETY: we just wrote that data + unsafe { self.bump(data.len()) }; + } + + fn fill(&mut self, value: u8, n: usize) { + let space_left = self.space_left(); + assert!( + n <= space_left, + "want to fill {n} bytes but buffer only has {space_left} bytes left", + ); + + let tail = self.tail(); + + // SAFETY: we've just checked that there is enough space left + let target = unsafe { tail.get_unchecked_mut(0..n) }; + + // filling `MaybeUninit` is currently not simple on stable, see + // https://github.com/rust-lang/rust/issues/117428 + // + // So we just hand-roll it. In contrast to the stdlib implementation though, we don't need to care about `Drop` because `u8` doesn't need it. 
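// [Added note] A one-line alternative (sketch, not part of the patch) would be
//     unsafe { target.as_mut_ptr().write_bytes(value, n) };
// but the explicit loop below keeps this function free of additional `unsafe`.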
+ for x in target.iter_mut() { + x.write(value); + } + + // SAFETY: we just wrote that data + unsafe { self.bump(n) }; + } +} + +#[cfg(test)] +mod test { + use std::panic::AssertUnwindSafe; + + use super::*; + + #[test] + fn append() { + let mut buffer = LinearBuffer::new(5); + buffer.append(b"foo"); + buffer.append(b"ba"); + assert_eq!(buffer.slice_initialized_part(..).as_ref(), b"fooba"); + } + + #[test] + fn append_empty() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + + buffer.append(b""); + + // buffer init position didn't change + assert_eq!(buffer.space_left(), 3); + + // our pre-initialized tail wasn't overridden + unsafe { + buffer.bump(3); + } + assert_eq!( + buffer.slice_initialized_part(0..3).as_ref(), + [0xff, 0xff, 0xff] + ); + } + + #[test] + fn panic_append_to_much() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + + buffer.append(b"ab"); + assert_eq!(buffer.slice_initialized_part(0..2).as_ref(), b"ab"); + + let err = std::panic::catch_unwind(AssertUnwindSafe(|| { + buffer.append(b"cd"); + })) + .unwrap_err(); + assert_eq!( + err.downcast_ref::().unwrap(), + "want to append 2 bytes but buffer only has 1 bytes left", + ); + + // buffer init position didn't change + assert_eq!(buffer.space_left(), 1); + + // our pre-initialized tail wasn't overridden + unsafe { + buffer.bump(1); + } + assert_eq!(buffer.slice_initialized_part(0..3).as_ref(), b"ab\xff"); + } + + #[test] + fn fill() { + let mut buffer = LinearBuffer::new(5); + buffer.fill(1, 3); + buffer.fill(42, 2); + assert_eq!( + buffer.slice_initialized_part(..).as_ref(), + [1, 1, 1, 42, 42] + ); + } + + #[test] + fn fill_n_0() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + + buffer.fill(1, 0); + + // buffer init position didn't change + assert_eq!(buffer.space_left(), 3); + + // our pre-initialized tail wasn't overridden + unsafe { + buffer.bump(3); + } + assert_eq!( + buffer.slice_initialized_part(0..3).as_ref(), + [0xff, 0xff, 0xff] + ); + } + + #[test] + fn panic_fill_to_much() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + + buffer.fill(0, 2); + assert_eq!(buffer.slice_initialized_part(0..2).as_ref(), [0, 0]); + + let err = std::panic::catch_unwind(AssertUnwindSafe(|| { + buffer.fill(0, 2); + })) + .unwrap_err(); + assert_eq!( + err.downcast_ref::().unwrap(), + "want to fill 2 bytes but buffer only has 1 bytes left", + ); + + // buffer init position didn't change + assert_eq!(buffer.space_left(), 1); + + // our pre-initialized tail wasn't overridden + unsafe { + buffer.bump(1); + } + assert_eq!(buffer.slice_initialized_part(0..3).as_ref(), [0, 0, 0xff]); + } + + #[test] + fn test_fill_buffer_with_ff() { + let mut buffer = LinearBuffer::new(3); + fill_buffer_with_ff(&mut buffer); + unsafe { + buffer.bump(3); + } + assert_eq!( + buffer.slice_initialized_part(0..3).as_ref(), + [0xff, 0xff, 0xff] + ); + } + + /// Fill buffer with pattern `0xff` without advancing the "initialized" position so we can check certain behavior. + fn fill_buffer_with_ff(buffer: &mut LinearBuffer) { + for x in buffer.tail().iter_mut() { + x.write(0xff); + } + } +} diff --git a/linear_buffer/src/lib.rs b/linear_buffer/src/lib.rs new file mode 100644 index 00000000..758fc18b --- /dev/null +++ b/linear_buffer/src/lib.rs @@ -0,0 +1,12 @@ +//! Crate that implements [`LinearBuffer`]. +mod allocation; +mod extend; +mod linear_buffer; + +// Workaround for "unused crate" lint false positives. 
+// This is only done if we do NOT run under MIRI to avoid costlly compliation of a lot of unused dependencies. +#[cfg(not(miri))] +use workspace_hack as _; + +pub use extend::LinearBufferExtend; +pub use linear_buffer::{LinearBuffer, Slice}; diff --git a/linear_buffer/src/linear_buffer.rs b/linear_buffer/src/linear_buffer.rs new file mode 100644 index 00000000..21631afc --- /dev/null +++ b/linear_buffer/src/linear_buffer.rs @@ -0,0 +1,529 @@ +//! Implementation of the buffer construct itself. +use std::{ + cell::UnsafeCell, + mem::MaybeUninit, + num::NonZeroUsize, + ops::{Bound, Deref, Range, RangeBounds}, + sync::Arc, +}; + +use crate::allocation::Allocation; + +/// Fixed-size buffer that supports [append] and +/// [reading initialized parts](Self::slice_initialized_part) at the same time. +/// +/// # Use Case +/// This construct allows you to [append] data to a buffer but at the same time hand out slices to the +/// already-initialized part of it. This is normally not possible with Rust's borrowing rules. An example is when you +/// receive data from a network and want to cache data in-memory (like an entire file), but also want to run write +/// operations for the already-received data to disk (e.g. for caching). +/// +/// Furthermore, the buffer can be [initialized with a desired alignment](Self::with_alignment). +/// +/// Neither of this is possible with purely safe standard library tooling nor with the famous [`bytes`] crate. +/// +/// # Implementation +/// The data layout looks like this: +/// +/// ```text +/// |<-----------------total_size------------------------------>| +/// | | +/// [============== allocation =================================] +/// [✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓????????????????????????????????????????] +/// | | | +/// |<---initialized--->| | +/// |<---unitialized / space left / tail--->| +/// ^ +/// | +/// first_uninit_element +/// ``` +/// +/// The _allocation_ is held as a [`MaybeUninit`] slice to avoid zeroing the buffer just to overwrite the data shortly +/// after. The _allocation_ NEVER moves and is only dropped when the [`LinearBuffer`] and all [`Slice`]s are dropped. +/// +/// The user can [get slices of the _initialized_ part](Self::slice_initialized_part). At the same time there exists +/// only at max one [`LinearBuffer`] which acts as a mutable reference to the uninitialized part: +/// +/// ```text +/// |<----------------LinearBuffer----------------------------->| +/// | | +/// | | +/// V V +/// [============== allocation =================================] +/// [✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓✓??????????????????????????????????] +/// ^ ^ ^ ^ +/// | | | | +/// | | | | +/// | |<---Slice 1--->| +/// | | +/// | | +/// |<--Slice 2-->| +/// ``` +/// +/// +/// [append]: crate::LinearBufferExtend::append +/// [`bytes`]: https://crates.io/crates/bytes +#[derive(Debug)] +pub struct LinearBuffer { + data: SharedAllocation, + first_uninit_element: usize, +} + +impl LinearBuffer { + /// Allocate new buffer of given size in bytes. + /// + /// # Panic + /// If we cannot allocate the buffer, we panic. + pub fn new(len: usize) -> Self { + Self::with_alignment(len, NonZeroUsize::MIN) + } + + /// Allocate new buffer of given size in bytes and alignment. + /// + /// # Panic + /// If we cannot allocate the buffer, we panic. + /// + /// Alignment must be a power of 2. + /// + /// # Alignment Rust Type + /// Once is closed and we have a proper stable `Alignment` type, + /// we should use that. For now we only enforce "not zero" on the type level and the rest during runtime. 
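// [Illustrative aside, not part of the patch] Typical use of `with_alignment`, e.g. for an
// I/O buffer that must start on a 512-byte boundary (the sizes here are arbitrary):
//
//     use std::num::NonZeroUsize;
//     let buffer = LinearBuffer::with_alignment(4096, NonZeroUsize::new(512).unwrap());
//     assert_eq!(buffer.total_size(), 4096);
//     assert_eq!(buffer.space_left(), 4096);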
+ #[expect(clippy::arc_with_non_send_sync)] + pub fn with_alignment(len: usize, alignment: NonZeroUsize) -> Self { + Self { + data: SharedAllocation(Arc::new(UnsafeCell::new(Allocation::new(len, alignment)))), + first_uninit_element: 0, + } + } + + /// Size of the entire buffer, including the initialized part and the uninitialized part. + /// + /// Also see [`space_left`](Self::space_left) and [`initialized_bytes`](Self::initialized_bytes). + pub fn total_size(&self) -> usize { + self.data.total_size() + } + + /// How much space is left. + /// + /// This is identical to the length of the [`tail`](Self::tail), but does not require a mutable reference to obtain. + pub fn space_left(&self) -> usize { + self.total_size() - self.first_uninit_element + } + + /// Number of initialized bytes. + pub fn initialized_bytes(&self) -> usize { + self.first_uninit_element + } + + /// Number of references that point to the allocation. + pub fn strong_count(&self) -> usize { + self.data.strong_count() + } + + /// The uninitialized part of the buffer. + /// + /// This can be used as a target for I/O operations. After writing data in, call [`bump`](Self::bump) to specify + /// the amount of data written to the START of the tail. + /// + /// If you want to append data from an existing slice or a constant value, it is easier to use + /// [`LinearBufferExtend`]. However, using this low-level interface might work better if you have I/O operations + /// that can read into a pre-allocated buffer. + /// + /// # Example + /// ``` + /// # use linear_buffer::LinearBuffer; + /// let mut buffer = LinearBuffer::new(3); + /// + /// let tail = buffer.tail(); + /// tail[0].write(b'f'); + /// tail[1].write(b'o'); + /// tail[2].write(b'o'); + /// + /// unsafe { buffer.bump(3) }; + /// + /// assert_eq!( + /// buffer.slice_initialized_part(0..3).as_ref(), + /// b"foo", + /// ); + /// ``` + /// + /// + /// [`LinearBufferExtend`]: crate::LinearBufferExtend + pub fn tail(&mut self) -> &mut [MaybeUninit] { + let data_ptr = self.data.0.get(); + + // SAFETY: there can only be one caller that accesses the tail due to Rust's borrowing rules + let partially_initialized_buffer = unsafe { &mut *data_ptr }; + + // SAFETY: first_uninit_element is always in bounds because we reject "overshooting" in `bump` + unsafe { partially_initialized_buffer.get_unchecked_mut(self.first_uninit_element..) } + } + + /// Bump initialized part of the buffer by given amount of bytes (= delta). + /// + /// # Panic + /// There must be enough space left in buffer. + /// + /// # Safety + /// The caller must ensure that they initialized the respective portion of the buffer using [`tail`](Self::tail). + pub unsafe fn bump(&mut self, initialized: usize) { + let space_left = self.space_left(); + assert!( + initialized <= space_left, + "buffer only has {space_left} bytes left but initialized part should be bumped by {initialized} bytes", + ); + + self.first_uninit_element += initialized; + } + + /// Get a slice of the initialized portion of the buffer. + /// + /// You may hold multiple overlapping slices to the same initialized memory. + /// + /// # Panic + /// The range must be well-formed and within the range of the initialized part. 
+ #[track_caller] + pub fn slice_initialized_part(&self, range: impl RangeBounds) -> Slice { + let len = self.total_size(); + + let begin = match range.start_bound() { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n.checked_add(1).expect("out of range"), + Bound::Unbounded => 0, + }; + + let end = match range.end_bound() { + Bound::Included(&n) => n.checked_add(1).expect("out of range"), + Bound::Excluded(&n) => n, + Bound::Unbounded => len, + }; + + assert!( + begin <= end, + "range start must not be greater than end: {begin} <= {end}", + ); + assert!( + end <= self.first_uninit_element, + "range end out of bounds: {end} <= {}", + self.first_uninit_element, + ); + + Slice { + data: self.data.clone(), + range: begin..end, + } + } +} + +/// Wrapper around the half-initialized buffer. +#[derive(Debug, Clone)] +struct SharedAllocation(Arc>); + +// SAFETY: We manually make sure that: +// - the inner allocation never changes +// - there is only at max one mutable reference to the tail part of the buffer +// - there is NO mutable reference to the initialized part of the buffer +// - slices (i.e. non-mut references) only exist to the initialized part of the buffer +unsafe impl Send for SharedAllocation {} +unsafe impl Sync for SharedAllocation {} + +impl SharedAllocation { + /// Size of the entire buffer, including the initialized part and the uninitialized part. + fn total_size(&self) -> usize { + let data_ptr = self.0.get(); + + // SAFETY: we NEVER change the underlying allocation + let allocation = unsafe { &*data_ptr }; + + allocation.len() + } + + /// Number of references that point to the allocation. + fn strong_count(&self) -> usize { + Arc::strong_count(&self.0) + } +} + +/// A slice of initialized data from a [`LinearBuffer`]. +#[derive(Clone)] +pub struct Slice { + data: SharedAllocation, + range: Range, +} + +impl Slice { + /// Size of the underlying allocation in bytes. + pub fn allocation_size(&self) -> usize { + self.data.total_size() + } + + /// Number of references that point to the allocation. 
+ pub fn strong_count(&self) -> usize { + self.data.strong_count() + } +} + +impl std::fmt::Debug for Slice { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.deref().fmt(f) + } +} + +impl Deref for Slice { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + let data_ptr = self.data.0.get(); + + // SAFETY: the actual allocation is never changed + let partially_initialized_buffer = unsafe { &*data_ptr }; + + // SAFETY: we've check the bounds in LinearBuffer::slice_initialized_part + let init_part = unsafe { partially_initialized_buffer.get_unchecked(self.range.clone()) }; + + // SAFETY: this is only the initialized part + unsafe { + // Slice methods of "assume init" aren't stable yet, see + // https://github.com/rust-lang/rust/issues/63569 + // + // So we just use the code from the stdlib + &*(init_part as *const [MaybeUninit] as *const [u8]) + } + } +} + +impl AsRef<[u8]> for Slice { + fn as_ref(&self) -> &[u8] { + self.deref() + } +} + +#[cfg(test)] +mod test { + use crate::LinearBufferExtend; + + use super::*; + + #[test] + #[should_panic( + expected = "buffer only has 2 bytes left but initialized part should be bumped by 3 bytes" + )] + fn panic_bump_too_much() { + let mut buffer = LinearBuffer::new(3); + + buffer.tail()[0].write(1); + unsafe { buffer.bump(1) }; + + buffer.tail()[0].write(1); + buffer.tail()[1].write(1); + unsafe { buffer.bump(3) }; + } + + #[test] + #[should_panic(expected = "out of range")] + fn panic_slice_begin_usize_out_of_range() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + buffer.slice_initialized_part((Bound::Excluded(usize::MAX), Bound::Unbounded)); + } + + #[test] + #[should_panic(expected = "out of range")] + fn panic_slice_end_usize_out_of_range() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + buffer.slice_initialized_part(..=usize::MAX); + } + + #[test] + #[should_panic(expected = "range start must not be greater than end: 2 <= 1")] + #[expect(clippy::reversed_empty_ranges)] + fn panic_slice_begin_past_end() { + let mut buffer = LinearBuffer::new(10); + buffer.append(b"foo"); + + buffer.slice_initialized_part(2..1); + } + + #[test] + #[should_panic(expected = "range end out of bounds: 4 <= 3")] + fn panic_slice_end_past_init_part() { + let mut buffer = LinearBuffer::new(10); + buffer.append(b"foo"); + + buffer.slice_initialized_part(..4); + } + + #[test] + fn empty_slice() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + let bytes = buffer.slice_initialized_part(0..0); + assert_eq!(bytes.as_ref(), b""); + } + + #[test] + fn slices_are_zero_copy() { + let mut buffer = LinearBuffer::new(10); + buffer.append(b"foo"); + + let bytes_1 = buffer.slice_initialized_part(..3); + let ptr_1 = bytes_1.as_ptr().expose_provenance(); + assert_eq!(bytes_1.as_ref(), b"foo".as_slice()); + + buffer.append(b"bar"); + + let bytes_2 = buffer.slice_initialized_part(..6); + let ptr_2 = bytes_1.as_ptr().expose_provenance(); + assert_eq!(bytes_2.as_ref(), b"foobar".as_slice()); + assert_eq!(ptr_1, ptr_2); + + buffer.append(b"xxxx"); + + let data = buffer.slice_initialized_part(..); + let data_ptr = data.as_ptr().expose_provenance(); + assert_eq!(data_ptr, ptr_1); + assert_eq!(data.as_ref(), b"foobarxxxx".as_slice()); + } + + #[test] + fn can_read_slice_after_buffer_drop() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + let bytes = buffer.slice_initialized_part(..); + drop(buffer); + + assert_eq!(bytes.as_ref(), b"foo".as_slice()); + } + + #[test] 
+ fn slice_clone() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + let bytes = buffer.slice_initialized_part(..); + assert_eq!(bytes.as_ref(), b"foo".as_slice()); + assert_eq!(bytes.strong_count(), 2); + + drop(buffer); + assert_eq!(bytes.strong_count(), 1); + + let bytes2 = bytes.clone(); + assert_eq!(bytes2.as_ref(), b"foo".as_slice()); + assert_eq!(bytes.strong_count(), 2); + assert_eq!(bytes2.strong_count(), 2); + assert_eq!( + bytes.as_ptr().expose_provenance(), + bytes2.as_ptr().expose_provenance(), + "slice cloning MUST NOT clone the actual data", + ); + + drop(bytes); + assert_eq!(bytes2.strong_count(), 1); + assert_eq!(bytes2.as_ref(), b"foo".as_slice()); + } + + #[test] + fn slice_debug() { + let mut buffer = LinearBuffer::new(3); + buffer.append(b"foo"); + + let slice = buffer.slice_initialized_part(..); + assert_eq!(format!("{slice:?}"), "[102, 111, 111]"); + assert_eq!(format!("{slice:x?}"), "[66, 6f, 6f]"); + } + + #[test] + fn empty_buffer() { + let buffer = LinearBuffer::new(0); + let slice = buffer.slice_initialized_part(..); + assert_eq!(slice.as_ref(), b""); + } + + #[test] + #[should_panic(expected = "size fits `isize`")] + fn new_panics_larger_than_isize() { + LinearBuffer::new(usize::MAX); + } + + #[test] + #[should_panic(expected = "cannot allocate 9223372036854775807 bytes with alignment 1")] + #[cfg(not(miri))] // MIRI cannot handle this + fn new_panics_out_of_memory() { + LinearBuffer::new(isize::MAX as usize); + } + + #[test] + #[should_panic(expected = "valid alignment")] + fn new_panics_if_alignment_is_not_power_of_two() { + LinearBuffer::with_alignment(1, NonZeroUsize::new(3).unwrap()); + } + + #[test] + fn alignment() { + for size in [0, 13] { + for shift in 0..13 { + let alignment = NonZeroUsize::new(1 << shift).unwrap(); + println!("size={size} alignment={alignment}"); + + let mut buffer = LinearBuffer::with_alignment(size, alignment); + assert_eq!(buffer.total_size(), size); + + let slice = buffer.slice_initialized_part(0..0); + assert_eq!(slice.as_ptr().align_offset(alignment.get()), 0); + + buffer.fill(0, size); + let slice = buffer.slice_initialized_part(..size); + assert_eq!(slice.as_ptr().align_offset(alignment.get()), 0); + } + } + } + + #[test] + fn strong_count() { + let buffer = LinearBuffer::new(3); + assert_eq!(buffer.strong_count(), 1); + + let slice_1 = buffer.slice_initialized_part(..0); + assert_eq!(buffer.strong_count(), 2); + assert_eq!(slice_1.strong_count(), 2); + + let slice_2 = buffer.slice_initialized_part(..0); + assert_eq!(buffer.strong_count(), 3); + assert_eq!(slice_1.strong_count(), 3); + assert_eq!(slice_2.strong_count(), 3); + + drop(slice_1); + assert_eq!(buffer.strong_count(), 2); + assert_eq!(slice_2.strong_count(), 2); + + drop(buffer); + assert_eq!(slice_2.strong_count(), 1); + } + + #[test] + #[ignore = "this is unsound, it just demonstrates that MIRI will find out about it"] + fn miri_finds_it() { + let mut buffer = LinearBuffer::new(3); + + buffer.tail()[0].write(1); + + // we lie about the amount of data written + unsafe { buffer.bump(3) }; + + let bytes = buffer.slice_initialized_part(..); + assert_ne!(bytes.as_ref(), b"xxx".as_slice()); + } + + const fn assert_send() {} + const fn assert_sync() {} + + const _: () = assert_send::(); + const _: () = assert_sync::(); + const _: () = assert_send::(); + const _: () = assert_sync::(); +} diff --git a/meta_data_cache/Cargo.toml b/meta_data_cache/Cargo.toml index d87af7a7..1166d667 100644 --- a/meta_data_cache/Cargo.toml +++ 
b/meta_data_cache/Cargo.toml @@ -22,7 +22,7 @@ futures = { version = "0.3.31" } [dev-dependencies] arrow_util = { path = "../arrow_util" } bytes = "1.10" -tokio = { version = "1.47.1", default-features = false } +tokio = { version = "1.48.0", default-features = false } uuid = { version = "1", features = ["v4"] } [lints] diff --git a/object_store_mem_cache/Cargo.toml b/object_store_mem_cache/Cargo.toml index be94e16b..28b4fa91 100644 --- a/object_store_mem_cache/Cargo.toml +++ b/object_store_mem_cache/Cargo.toml @@ -12,15 +12,17 @@ bytes = { version = "1.10.1", default-features = false } dashmap = "6.1.0" data_types = { path = "../data_types" } futures = { version = "0.3.31" } -indexmap = { version = "2.11", features = ["std"] } +http = { workspace = true } +indexmap = { version = "2.12", features = ["std"] } iox_time = { path = "../iox_time" } +linear_buffer = { path = "../linear_buffer" } metric = { path = "../metric" } object_store.workspace = true object_store_metrics = { path = "../object_store_metrics" } object_store_mock = { path = "../object_store_mock" } object_store_size_hinting = { path = "../object_store_size_hinting" } tracing = { workspace = true } -tokio = { version = "1.47.1", default-features = false } +tokio = { version = "1.48.0", default-features = false } tracker = { path = "../tracker" } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/object_store_mem_cache/src/buffer_channel.rs b/object_store_mem_cache/src/buffer_channel.rs new file mode 100644 index 00000000..d451beb3 --- /dev/null +++ b/object_store_mem_cache/src/buffer_channel.rs @@ -0,0 +1,175 @@ +//! Channel to hand a buffer from an inner store to the in-mem cache. +//! +//! Normally we would just use [`bytes`], however the crate suffers from gate-keeping and even though many users would +//! like to see it, there is currently no proper way to build [`bytes`]-based buffers with proper alignment or a +//! custom vtable. So we work around it. + +use std::{ + pin::Pin, + sync::{ + Arc, Mutex, + atomic::{AtomicBool, Ordering}, + }, + task::{Context, Poll}, +}; + +use futures::FutureExt; +use linear_buffer::Slice; +use tokio::sync::oneshot::{Receiver, Sender, error::RecvError}; + +/// Create channel that can be used ONCE to send a [`Slice`]. +/// +/// The sender may choose not to accept the transfer (by not calling [`accept`](BufferSender::accept)), i.e. if it does +/// not implement buffer handling. +pub fn channel() -> (BufferSender, BufferReceiver) { + let accepted = Arc::new(AtomicBool::new(false)); + let (sender, receiver) = tokio::sync::oneshot::channel(); + let sender = BufferSender { + accepted: Arc::clone(&accepted), + sender: Arc::new(Mutex::new(Some(sender))), + }; + let receiver = BufferReceiver { accepted, receiver }; + (sender, receiver) +} + +/// Sender-side for a [`Slice`]. +/// +/// The sender is clonable so it can be used with [`http::Extensions`], but you must only call [`accept`](Self::accept) +/// at most once. +#[derive(Debug, Clone)] +pub struct BufferSender { + accepted: Arc, + sender: Arc>>>, +} + +impl BufferSender { + /// Accept that we will have a [`Slice`] available at some point. + /// + /// After calling this function, the sender MUST provide a slice at some point. Dropping the returned + /// [handle](BufferSenderAccepted) without doing so will result in an error on the + /// [receiver side](BufferReceiverAccepted). + /// + /// # Panic + /// Across all clones, this method must only be called at most once. 
+ pub fn accept(self) -> BufferSenderAccepted { + let Self { accepted, sender } = self; + let maybe_sender = { + let mut guard = sender.lock().unwrap(); + guard.take() + }; + let sender = maybe_sender.expect("can only accept once"); + accepted.store(true, Ordering::SeqCst); + BufferSenderAccepted { sender } + } +} + +/// Sender-side in an [accepted](BufferSender::accept) state. +#[derive(Debug)] +pub struct BufferSenderAccepted { + sender: Sender, +} + +impl BufferSenderAccepted { + /// Send slice. + pub fn send(self, buffer: Slice) { + let Self { sender } = self; + sender.send(buffer).ok(); + } +} + +/// Receiver side of a [`Slice`]. +#[derive(Debug)] +pub struct BufferReceiver { + accepted: Arc, + receiver: Receiver, +} + +impl BufferReceiver { + pub fn accepted(self) -> Option { + let Self { accepted, receiver } = self; + accepted + .load(Ordering::SeqCst) + .then_some(BufferReceiverAccepted { receiver }) + } +} + +/// Receiver side of the [`Slice`] for which the sender has accepted the transfer. +#[derive(Debug)] +pub struct BufferReceiverAccepted { + receiver: Receiver, +} + +impl Future for BufferReceiverAccepted { + type Output = Result; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.receiver.poll_unpin(cx) + } +} + +#[cfg(test)] +mod test { + use linear_buffer::LinearBuffer; + + use super::*; + + #[test] + #[should_panic(expected = "can only accept once")] + fn panic_accept_twice() { + let (tx, _rx) = channel(); + let tx2 = tx.clone(); + + tx.accept(); + tx2.accept(); + } + + #[tokio::test] + async fn err_accepted_sender_dropped() { + let (tx, rx) = channel(); + let tx = tx.accept(); + let rx = rx.accepted().unwrap(); + drop(tx); + rx.await.unwrap_err(); + } + + #[tokio::test] + async fn accept_accepted_send_receive() { + let buffer = LinearBuffer::new(0); + let slice = buffer.slice_initialized_part(..); + + let (tx, rx) = channel(); + let tx = tx.accept(); + let rx = rx.accepted().unwrap(); + tx.send(slice.clone()); + let slice2 = rx.await.unwrap(); + + assert_eq!( + slice.as_ptr().expose_provenance(), + slice2.as_ptr().expose_provenance(), + ); + } + + #[tokio::test] + async fn accept_send_accepted_receive() { + let buffer = LinearBuffer::new(0); + let slice = buffer.slice_initialized_part(..); + + let (tx, rx) = channel(); + let tx = tx.accept(); + tx.send(slice.clone()); + let rx = rx.accepted().unwrap(); + let slice2 = rx.await.unwrap(); + + assert_eq!( + slice.as_ptr().expose_provenance(), + slice2.as_ptr().expose_provenance(), + ); + } + + #[tokio::test] + async fn not_accepted() { + let (tx, rx) = channel(); + drop(tx); + assert!(rx.accepted().is_none()); + } +} diff --git a/object_store_mem_cache/src/cache_system/mod.rs b/object_store_mem_cache/src/cache_system/mod.rs index cbe76ac8..2a3f841b 100644 --- a/object_store_mem_cache/src/cache_system/mod.rs +++ b/object_store_mem_cache/src/cache_system/mod.rs @@ -86,6 +86,12 @@ where } } +impl HasSize for () { + fn size(&self) -> usize { + 0 + } +} + /// Dynamic error type. pub type DynError = Arc; diff --git a/object_store_mem_cache/src/cache_system/s3_fifo_cache/fifo.rs b/object_store_mem_cache/src/cache_system/s3_fifo_cache/fifo.rs index 9fb36b28..88a4a3b5 100644 --- a/object_store_mem_cache/src/cache_system/s3_fifo_cache/fifo.rs +++ b/object_store_mem_cache/src/cache_system/s3_fifo_cache/fifo.rs @@ -15,10 +15,21 @@ impl Fifo where T: HasSize, { + /// Create a new Fifo from a VecDeque. 
+ pub(crate) fn new(queue: VecDeque<T>) -> Self { + let memory_size = queue.iter().map(|o| o.size()).sum(); + Self { queue, memory_size } + } + pub(crate) fn memory_size(&self) -> usize { self.memory_size } + /// Return the number of items in the queue. + pub(crate) fn len(&self) -> usize { + self.queue.len() + } + pub(crate) fn iter(&self) -> vec_deque::Iter<'_, T> { self.queue.iter() } @@ -37,6 +48,15 @@ where None => None, } } + + /// Drain all elements from the queue, consuming the underlying VecDeque + /// and returning an iterator over the items. + /// + /// This preserves the ordering of elements and avoids re-allocation. + pub(crate) fn drain(&mut self) -> impl Iterator<Item = T> { + self.memory_size = 0; + std::mem::take(&mut self.queue).into_iter() + } } impl Default for Fifo diff --git a/object_store_mem_cache/src/cache_system/s3_fifo_cache/mod.rs b/object_store_mem_cache/src/cache_system/s3_fifo_cache/mod.rs index 1ab24109..ddf11079 100644 --- a/object_store_mem_cache/src/cache_system/s3_fifo_cache/mod.rs +++ b/object_store_mem_cache/src/cache_system/s3_fifo_cache/mod.rs @@ -15,8 +15,8 @@ use tracker::{ AsyncSemaphoreMetrics, InstrumentedAsyncOwnedSemaphorePermit, InstrumentedAsyncSemaphore, }; -// for benchmarks -pub use s3_fifo::{S3Config, S3Fifo}; +// for benchmarks and tests +pub use s3_fifo::{S3Config, S3Fifo, s3_fifo_entry_overhead_size}; use crate::cache_system::{AsyncDrop, DynError, InUse}; @@ -264,9 +264,27 @@ where /// /// Note that the keys listed in the cache are those which have returned from the /// [`CacheFn`] function, i.e. they are the keys that have been successfully fetched. + /// + /// These keys do not have any guaranteed ordering. pub fn list(&self) -> impl Iterator<Item = Arc<K>> { self.cache.keys() } + + /// Evict multiple keys from the S3FifoCache in a blocking manner. + /// + /// This method directly removes entries from the cache without going through + /// the normal eviction process, where the S3-Fifo algorithm decides what to evict. + /// This is useful for cache management operations like repair/validation. + /// + /// This method is blocking, and holds a mutex so that the [`S3Fifo`] state can be + /// replaced in a single operation. + /// + /// Returns the number of keys that were successfully evicted. If a key does not + /// exist in the cache and cannot be evicted, it will be ignored (and the + /// returned count of evicted items will be lower).
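Since keys that are not currently resident (for example, keys that only live in the ghost queue) are skipped rather than reported as errors, callers should treat the return value as the number of entries actually dropped. A small usage sketch; the concrete `S3FifoCache<Arc<str>, Arc<str>, ()>` instantiation mirrors the tests further down, and the import path plus the `repair` helper name are assumptions for illustration:

```rust
use std::sync::Arc;

// `cache` is an already-populated S3FifoCache<Arc<str>, Arc<str>, ()>, as
// constructed in the tests below; the exact module path is assumed here.
fn repair(cache: &S3FifoCache<Arc<str>, Arc<str>, ()>, suspect_keys: Vec<Arc<str>>) {
    let requested = suspect_keys.len();

    // Blocking call: takes the internal mutex once and rebuilds the queues.
    let evicted = cache.evict_keys(suspect_keys.into_iter());

    if evicted < requested {
        // The remaining keys were not resident (e.g. ghost-only) and were ignored.
        println!("evicted {evicted} of {requested} suspect keys");
    }
}
```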
+ pub fn evict_keys(&self, keys: impl Iterator) -> usize { + self.cache.remove_keys(keys) + } } #[async_trait] @@ -1135,4 +1153,256 @@ mod tests { let result2 = res2.await.unwrap(); assert_eq!(result2, Arc::from("value2")); } + + #[tokio::test] + async fn test_evict_keys_small_queue() { + let hook = Arc::new(TestHook::default()); + let cache = S3FifoCache::, Arc, ()>::new( + S3Config { + max_memory_size: 1000, + max_ghost_memory_size: 500, + move_to_main_threshold: 0.1, + hook: Arc::clone(&hook) as _, + inflight_bytes: 250, + }, + &metric::Registry::new(), + ); + + // Insert 5 keys in order: key1, key2, key3, key4, key5 + let keys = vec!["key1", "key2", "key3", "key4", "key5"]; + let mut inserted_keys = Vec::new(); + + for key_str in &keys { + let key = Arc::from(*key_str); + let value = Arc::from(format!("value_{}", key_str)); + inserted_keys.push(Arc::clone(&key)); + + let (res, _, state) = cache.get_or_fetch( + &key, + Box::new({ + let value = Arc::clone(&value); + move || futures::future::ready(Ok(value)).boxed() + }), + (), + Some(value.size()), + ); + assert_eq!(state, CacheState::NewEntry); + res.await.unwrap(); + } + + // Verify all keys are in the cache + assert_eq!(cache.len(), 5); + for key in &inserted_keys { + assert!(cache.get(key).is_some(), "Key {key:?} should be in cache"); + } + + // Get list of keys before eviction to verify ordering preservation + let keys_before: Vec> = cache + .cache + .small_queue_keys() + .into_iter() + .map(Arc::unwrap_or_clone) + .collect(); + assert_eq!(keys_before.len(), 5, "Should have 5 keys before eviction"); + + // Confirm have empty ghost queue + assert_eq!(cache.cache.ghost_len(), 0, "Ghost queue should be empty"); + + // Evict only key2 and key4 (selective eviction) + let keys_to_evict = vec![ + Arc::clone(&inserted_keys[1]), // key2 + Arc::clone(&inserted_keys[3]), // key4 + ]; + let evicted_count = cache.evict_keys(keys_to_evict.clone().into_iter()); + assert_eq!(evicted_count, 2, "Should have evicted exactly 2 keys"); + + // Verify cache size is reduced + assert_eq!( + cache.len(), + 3, + "Cache should contain 3 entries after eviction" + ); + + // Get list of keys after eviction + let keys_after: Vec> = cache + .cache + .small_queue_keys() + .into_iter() + .map(Arc::unwrap_or_clone) + .collect(); + let expected_remaining_keys: Vec> = keys_before + .into_iter() + .filter(|key| !keys_to_evict.contains(key)) + .collect(); + assert_eq!( + keys_after, expected_remaining_keys, + "Remaining keys should match expected keys, and retain the same ordering" + ); + + // Check that evicted keys are removed from S3Fifo::entries + for evicted_key in &keys_to_evict { + assert!( + !cache.cache.contains_key_in_entries(evicted_key), + "Evicted key {evicted_key:?} should be removed from entries" + ); + } + + // Check that remaining keys are still in S3Fifo::entries + for remaining_key in &expected_remaining_keys { + assert!( + cache.cache.contains_key_in_entries(remaining_key), + "Remaining key {remaining_key:?} should still be in entries" + ); + } + + // Check ghost queue is still empty + assert_eq!( + cache.cache.ghost_len(), + 0, + "Ghost queue should remain empty" + ); + } + + #[tokio::test] + async fn test_evict_keys_main_queue_and_ghost() { + let hook = Arc::new(TestHook::default()); + let cache = S3FifoCache::, Arc, ()>::new( + S3Config { + max_memory_size: 150 + 100, + max_ghost_memory_size: 150, + move_to_main_threshold: 0.3, + hook: Arc::clone(&hook) as _, + inflight_bytes: 50, + }, + &metric::Registry::new(), + ); + + // Insert 6 keys in order: 
key1, key2, key3, key4, key5, key6 + let keys = vec!["key1", "key2", "key3", "key4", "key5", "key6"]; + let mut inserted_keys = Vec::new(); + + for key_str in &keys { + let key = Arc::from(*key_str); + let value = Arc::from(format!("value_{}", key_str)); + inserted_keys.push(Arc::clone(&key)); + + let (res, _, state) = cache.get_or_fetch( + &key, + Box::new({ + let value = Arc::clone(&value); + move || futures::future::ready(Ok(value)).boxed() + }), + (), + Some(value.size()), + ); + assert_eq!(state, CacheState::NewEntry); + res.await.unwrap(); + } + + // Verify cache only has the last 3 keys (key4, key5, key6) due to eviction + assert_eq!(cache.len(), 3, "Cache should contain exactly 3 entries"); + + // The first 3 keys should have been evicted and logged in the ghost + assert_eq!(cache.cache.ghost_len(), 3, "Ghost should have 3 entries"); + + // Re-insert the first 3 keys (key1, key2, key3) + for i in 0..3 { + let key = Arc::clone(&inserted_keys[i]); + let value = Arc::from(format!("value_{}", keys[i])); + + let (res, _, state) = cache.get_or_fetch( + &key, + Box::new({ + let value = Arc::clone(&value); + move || futures::future::ready(Ok(value)).boxed() + }), + (), + Some(value.size()), + ); + assert_eq!(state, CacheState::NewEntry); + res.await.unwrap(); + } + + // Verify the first 3 keys are now in the main queue (since they were in ghost) + let main_queue_keys = cache.cache.main_queue_keys(); + assert_eq!( + main_queue_keys, + vec![ + Arc::new(Arc::clone(&inserted_keys[0])), + Arc::new(Arc::clone(&inserted_keys[1])), + Arc::new(Arc::clone(&inserted_keys[2])), + ] + ); + + // Verify they are no longer in the ghost + assert!( + !cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[0]))), + "key1 should no longer be in ghost" + ); + assert!( + !cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[1]))), + "key2 should no longer be in ghost" + ); + assert!( + !cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[2]))), + "key3 should no longer be in ghost" + ); + // Instead, we have key4 & key5 & key6 in the ghost + assert_eq!( + cache.cache.ghost_len(), + 3, + "Ghost should have 3 NEW entries" + ); + assert!( + cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[3]))), + "key4 should be in ghost" + ); + + // Evict key1 (main queue) and key4 (ghost) from the cache + let keys_to_evict = vec![Arc::clone(&inserted_keys[0]), Arc::clone(&inserted_keys[3])]; // key1, key4 + let evicted_count = cache.evict_keys(keys_to_evict.clone().into_iter()); + assert_eq!( + evicted_count, 1, + "Should have evicted exactly 1 key -- since only 1 is currently in the queue" + ); + + // Verify key1 is removed from main queue + let main_queue_keys_after = cache.cache.main_queue_keys(); + assert!( + !main_queue_keys_after.contains(&Arc::new(Arc::clone(&inserted_keys[0]))), + "key1 should be removed from main queue" + ); + + // Verify key1 is removed from entries (should not be in cache anymore) + assert!( + !cache.cache.contains_key_in_entries(&inserted_keys[0]), + "key1 should be removed from entries" + ); + + // Verify key2 & key 3 are still in main queue, as well as the ordering is retained. 
+ assert_eq!( + main_queue_keys_after, + vec![ + Arc::new(Arc::clone(&inserted_keys[1])), + Arc::new(Arc::clone(&inserted_keys[2])), + ], + "key2 & key3 should still be in main queue" + ); + + // Verify key4 is still in ghost queue (should remain there) + assert!( + cache + .cache + .contains_key_in_ghost(&Arc::new(Arc::clone(&inserted_keys[3]))), + "key4 should still be in ghost after eviction" + ); + } } diff --git a/object_store_mem_cache/src/cache_system/s3_fifo_cache/ordered_set.rs b/object_store_mem_cache/src/cache_system/s3_fifo_cache/ordered_set.rs index a4dfc6ce..85eea7da 100644 --- a/object_store_mem_cache/src/cache_system/s3_fifo_cache/ordered_set.rs +++ b/object_store_mem_cache/src/cache_system/s3_fifo_cache/ordered_set.rs @@ -139,6 +139,11 @@ where pub(crate) fn len(&self) -> usize { self.set.len() } + + #[cfg(test)] + pub(crate) fn contains(&self, o: &T) -> bool { + self.set.contains(&Entry::Data(o)) + } } /// Encode implementation, with trait bounds for `T`. diff --git a/object_store_mem_cache/src/cache_system/s3_fifo_cache/s3_fifo.rs b/object_store_mem_cache/src/cache_system/s3_fifo_cache/s3_fifo.rs index bd923005..78c82672 100644 --- a/object_store_mem_cache/src/cache_system/s3_fifo_cache/s3_fifo.rs +++ b/object_store_mem_cache/src/cache_system/s3_fifo_cache/s3_fifo.rs @@ -1,6 +1,7 @@ use bincode::{Decode, Encode}; use dashmap::DashMap; use std::{ + collections::{HashSet, VecDeque}, fmt::{Debug, Formatter}, hash::Hash, sync::{ @@ -123,6 +124,23 @@ where } } +/// Returns the overhead size of the [`S3FifoEntry`] +/// placed into the S3 Fifo cache manager. +/// +/// This is useful for testing, since it's the size used +/// for eviction decisions. +pub fn s3_fifo_entry_overhead_size() -> usize { + // The overhead size is the size of the S3FifoEntry struct, + // which is used to store the cache entry in the S3 FIFO cache manager. + Arc::new(S3FifoEntry { + key: Arc::new(()), + value: Arc::new(()), + generation: 0, + freq: AtomicU8::new(0), + }) + .size() +} + pub(crate) type CacheEntry = Arc>; type Entries = DashMap, CacheEntry>; pub(crate) type Evicted = Vec>>; @@ -356,6 +374,33 @@ where self.entries.iter().map(|entry| Arc::clone(entry.key())) } + /// Remove multiple keys from the cache, in a blocking manner. + /// + /// This method directly removes entries from the cache without going through + /// the normal eviction process. This is useful for cache management operations + /// like repair/validation. + /// + /// Returns the number of keys that were successfully removed. If a key does not + /// exist in the cache and cannot be removed, it will be ignored (and the returned count + /// of removed items will be lower). + pub fn remove_keys(&self, keys: impl Iterator) -> usize + where + K: Sized + Clone + Debug, + { + let mut guard = self.locked_state.lock(); + + // Remove keys from the entries map + let to_remove_from_state: HashSet = keys + .filter_map(|k| self.entries.remove(&k).map(|_| k)) + .collect(); + + // Remove from locked state. + let count_removed = guard.remove_keys(&to_remove_from_state); + drop(guard); + + count_removed + } + /// Create a snapshot of the locked state. 
/// /// This function serializes the [`S3Fifo`] inner state using bincode, allowing for @@ -423,6 +468,38 @@ where let guard = self.locked_state.lock(); guard.ghost.len() } + + #[cfg(test)] + pub(crate) fn small_queue_keys(&self) -> Vec> { + let guard = self.locked_state.lock(); + guard + .small + .iter() + .map(|entry| Arc::clone(&entry.key)) + .collect() + } + + #[cfg(test)] + pub(crate) fn main_queue_keys(&self) -> Vec> { + let guard = self.locked_state.lock(); + guard + .main + .iter() + .map(|entry| Arc::clone(&entry.key)) + .collect() + } + + #[cfg(test)] + pub(crate) fn contains_key_in_entries(&self, key: &K) -> bool { + self.entries.contains_key(key) + } + + #[cfg(test)] + pub(crate) fn contains_key_in_ghost(&self, key: &Arc) -> bool { + let guard = self.locked_state.lock(); + // The ghost stores Arc, so we need to check by content + guard.ghost.contains(key) + } } /// Calls [`drop`] but isn't inlined, so it is easier to see on profiles. @@ -734,6 +811,41 @@ where } } } + + /// Remove multiple keys from the small and main queues. + /// + /// This method efficiently removes multiple keys by iterating through each queue once. + /// It first checks the small queue for all keys, then checks the main queue for any + /// remaining keys that weren't found in the small queue. + /// + /// Returns the number of keys that were successfully removed. If a key does not + /// exist in the cache and cannot be removed, it will be ignored (and the returned count + /// of removed items will be lower). + fn remove_keys(&mut self, keys_to_remove: &HashSet) -> usize + where + K: Sized + Clone + Debug, + { + let initial_count = self.small.len() + self.main.len(); + + // Remove from small queue + let filtered_small: VecDeque<_> = self + .small + .drain() + .filter(|entry| !keys_to_remove.contains(entry.key.as_ref())) + .collect(); + self.small = Fifo::new(filtered_small); + + // Remove from main queue + let filtered_main: VecDeque<_> = self + .main + .drain() + .filter(|entry| !keys_to_remove.contains(entry.key.as_ref())) + .collect(); + self.main = Fifo::new(filtered_main); + + // Return the number of keys that were actually removed + initial_count - (self.small.len() + self.main.len()) + } } #[cfg(test)] diff --git a/object_store_mem_cache/src/lib.rs b/object_store_mem_cache/src/lib.rs index faefcf54..90e7094f 100644 --- a/object_store_mem_cache/src/lib.rs +++ b/object_store_mem_cache/src/lib.rs @@ -7,6 +7,7 @@ use clap as _; use rand as _; use workspace_hack as _; +pub mod buffer_channel; pub mod cache_system; pub mod object_store_cache_tests; pub mod object_store_helpers; diff --git a/object_store_mem_cache/src/object_store_cache_tests.rs b/object_store_mem_cache/src/object_store_cache_tests.rs index 13af15aa..7bf18d3c 100644 --- a/object_store_mem_cache/src/object_store_cache_tests.rs +++ b/object_store_mem_cache/src/object_store_cache_tests.rs @@ -2,8 +2,10 @@ use std::sync::Arc; use bytes::Bytes; use futures::future::BoxFuture; +use http::Extensions; use object_store::{ - DynObjectStore, Error, GetResult, GetResultPayload, ObjectMeta, PutPayload, path::Path, + DynObjectStore, Error, GetOptions, GetResult, GetResultPayload, ObjectMeta, PutPayload, + path::Path, }; /// Abstract test setup. @@ -25,6 +27,11 @@ pub trait Setup: Send { /// /// This store MUST reject writes. fn outer(&self) -> &Arc; + + /// Extensions used by the store. 
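Test setups that want the zero-copy buffer path exercised can override this hook so that every mocked `get_opts` call carries a `BufferSender`. A sketch of such an override, assuming the `buffer_channel` API from this diff (the mem-cache test setup further down does essentially the same; the function name here is just illustrative):

```rust
use http::Extensions;
use object_store_mem_cache::buffer_channel;

// Hypothetical override for a test `Setup` implementation: attach a
// BufferSender so the shared store tests see it in the request extensions.
fn extensions_with_buffer_sender() -> Extensions {
    let mut ext = Extensions::default();
    let (tx, _rx) = buffer_channel::channel();
    ext.insert(tx);
    ext
}
```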
+ fn extensions(&self) -> Extensions { + Default::default() + } } fn get_result(data: &'static [u8], path: &Path) -> GetResult { @@ -53,14 +60,19 @@ where let location_a = Path::parse("x").unwrap(); let location_b = Path::parse("y").unwrap(); + let get_ops = GetOptions { + extensions: setup.extensions(), + ..Default::default() + }; + Arc::clone(setup.inner()) .mock_next(object_store_mock::MockCall::GetOpts { - params: (location_a.clone(), Default::default()), + params: (location_a.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"foo", &location_a)), }) .mock_next(object_store_mock::MockCall::GetOpts { - params: (location_b.clone(), Default::default()), + params: (location_b.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"bar", &location_b)), }); @@ -107,14 +119,19 @@ where let location_a = Path::parse("x").unwrap(); let location_b = Path::parse("y").unwrap(); + let get_ops = GetOptions { + extensions: setup.extensions(), + ..Default::default() + }; + Arc::clone(setup.inner()) .mock_next(object_store_mock::MockCall::GetOpts { - params: (location_a.clone(), Default::default()), + params: (location_a.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"foo", &location_a)), }) .mock_next(object_store_mock::MockCall::GetOpts { - params: (location_b.clone(), Default::default()), + params: (location_b.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"bar", &location_b)), }); @@ -147,8 +164,13 @@ where let location = Path::parse("x").unwrap(); + let get_ops = GetOptions { + extensions: setup.extensions(), + ..Default::default() + }; + Arc::clone(setup.inner()).mock_next(object_store_mock::MockCall::GetOpts { - params: (location.clone(), Default::default()), + params: (location.clone(), get_ops.clone().into()), barriers: vec![], res: Err(Error::NotFound { path: location.to_string(), @@ -171,8 +193,13 @@ where let location = Path::parse("x").unwrap(); + let get_ops = GetOptions { + extensions: setup.extensions(), + ..Default::default() + }; + Arc::clone(setup.inner()).mock_next(object_store_mock::MockCall::GetOpts { - params: (location.clone(), Default::default()), + params: (location.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(b"foo", &location)), }); @@ -185,9 +212,9 @@ where assert_eq!(data_1.as_ref(), b"foo"); let res_2 = setup.outer().get(&location).await.unwrap(); - assert_eq!( + assert_ne!( CacheState::try_from(res_2.attributes.get(&ATTR_CACHE_STATE).unwrap()).unwrap(), - CacheState::WasCached, + CacheState::NewEntry, // should be loading, or in cache ); let data_2 = res_2.bytes().await.unwrap(); assert_eq!(data_1, data_2); @@ -221,8 +248,11 @@ where let location = Path::parse("x").unwrap(); let data = b"foo"; + let mut get_ops = hint_size(data.len() as u64); + get_ops.extensions.extend(setup.extensions()); + Arc::clone(setup.inner()).mock_next(object_store_mock::MockCall::GetOpts { - params: (location.clone(), hint_size(data.len() as u64).into()), + params: (location.clone(), get_ops.clone().into()), barriers: vec![], res: Ok(get_result(data, &location)), }); diff --git a/object_store_mem_cache/src/store.rs b/object_store_mem_cache/src/store.rs index 486c30c3..ffba3573 100644 --- a/object_store_mem_cache/src/store.rs +++ b/object_store_mem_cache/src/store.rs @@ -3,6 +3,7 @@ use std::{num::NonZeroUsize, ops::Range, sync::Arc}; use async_trait::async_trait; use bytes::Bytes; use futures::{FutureExt, StreamExt, TryStreamExt, stream::BoxStream}; +use linear_buffer::Slice; use 
metric::U64Counter; use object_store::{ AttributeValue, Attributes, DynObjectStore, Error, GetOptions, GetResult, GetResultPayload, @@ -27,9 +28,38 @@ use crate::{ const CACHE_NAME: &str = "object_store"; const STORE_NAME: &str = "mem_cache"; +#[derive(Debug)] +enum CacheValueData { + Owned(Bytes), + Shared(Slice), +} + +impl CacheValueData { + fn size(&self) -> usize { + match self { + Self::Owned(bytes) => bytes.len(), + Self::Shared(slice) => slice.allocation_size(), + } + } + + fn as_bytes(&self) -> Bytes { + match self { + Self::Owned(bytes) => bytes.clone(), + Self::Shared(slice) => Bytes::from_owner(slice.clone()), + } + } + + fn is_unique(&self) -> bool { + match self { + Self::Owned(bytes) => bytes.is_unique(), + Self::Shared(slice) => slice.strong_count() == 1, + } + } +} + #[derive(Debug)] struct CacheValue { - data: Bytes, + data: CacheValueData, meta: ObjectMeta, } @@ -39,29 +69,48 @@ impl CacheValue { location: &Path, size_hint: Option, ) -> Result { - let options = match size_hint { + let mut options = match size_hint { Some(size) => hint_size(size), None => GetOptions::default(), }; + + let (buffer_tx, buffer_rx) = crate::buffer_channel::channel(); + options.extensions.insert(buffer_tx); + let res = store.get_opts(location, options).await?; let meta = res.meta.clone(); - // HACK: `Bytes` is a view-based type and may reference and underlying larger buffer. Maybe that causes - // https://github.com/influxdata/influxdb_iox/issues/13765 (there it was a catalog issue, but we - // seem to have a similar issue with the disk cache interaction?) . So we "unshare" the buffer by - // round-tripping it through an owned type. - // - // We try to be clever by creating 1 "landing buffer" instead of using `res.bytes()` and then an - // additional clone. See https://github.com/influxdata/influxdb_iox/issues/15078#issuecomment-3223376485 - let mut stream = res.into_stream(); - let mut buffer = Vec::with_capacity(meta.size as usize); - while let Some(next) = stream.try_next().await? { - buffer.extend_from_slice(&next); - } - let data = buffer.into(); + let data = if let Some(buffer_rx) = buffer_rx.accepted() { + // drain stream because metric wrappers might depend on it + let mut stream = res.into_stream(); + while stream.try_next().await?.is_some() {} + + CacheValueData::Shared(buffer_rx.await.map_err(|e| Error::Generic { + store: STORE_NAME, + source: Box::new(e), + })?) + } else { + // HACK: `Bytes` is a view-based type and may reference and underlying larger buffer. Maybe that causes + // https://github.com/influxdata/influxdb_iox/issues/13765 (there it was a catalog issue, but we + // seem to have a similar issue with the disk cache interaction?) . So we "unshare" the buffer by + // round-tripping it through an owned type. + // + // We try to be clever by creating 1 "landing buffer" instead of using `res.bytes()` and then an + // additional clone. See https://github.com/influxdata/influxdb_iox/issues/15078#issuecomment-3223376485 + let mut stream = res.into_stream(); + let mut buffer = Vec::with_capacity(meta.size as usize); + while let Some(next) = stream.try_next().await? 
{ + buffer.extend_from_slice(&next); + } + CacheValueData::Owned(buffer.into()) + }; Ok(Self { data, meta }) } + + fn data(&self) -> Bytes { + self.data.as_bytes() + } } impl HasSize for CacheValue { @@ -75,7 +124,7 @@ impl HasSize for CacheValue { version, } = meta; - data.len() + data.size() + location.as_ref().len() + e_tag.as_ref().map(|s| s.capacity()).unwrap_or_default() + version.as_ref().map(|s| s.capacity()).unwrap_or_default() @@ -269,11 +318,13 @@ impl ObjectStore for MemCacheObjectStore { } let (v, state) = self.get_or_fetch(location, size_hint).await?; + let data = v.data(); + let data_len = data.len(); Ok(GetResult { - payload: GetResultPayload::Stream(futures::stream::iter([Ok(v.data.clone())]).boxed()), + payload: GetResultPayload::Stream(futures::stream::iter([Ok(data)]).boxed()), meta: v.meta.clone(), - range: 0..(v.data.len() as u64), + range: 0..(data_len as u64), attributes: Attributes::from_iter([(ATTR_CACHE_STATE, AttributeValue::from(state))]), }) } @@ -289,17 +340,18 @@ impl ObjectStore for MemCacheObjectStore { async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let (v, _state) = self.get_or_fetch(location, None).await?; + let data = v.data(); ranges .iter() .map(|range| { - if range.end > (v.data.len() as u64) { + if range.end > (data.len() as u64) { return Err(Error::Generic { store: STORE_NAME, source: format!( "Range end ({}) out of bounds, object size is {}", range.end, - v.data.len() + data.len(), ) .into(), }); @@ -314,7 +366,7 @@ impl ObjectStore for MemCacheObjectStore { .into(), }); } - Ok(v.data.slice((range.start as usize)..(range.end as usize))) + Ok(data.slice((range.start as usize)..(range.end as usize))) }) .collect() } @@ -374,9 +426,12 @@ impl ObjectStore for MemCacheObjectStore { #[cfg(test)] mod tests { use futures::FutureExt; - use object_store_mock::MockStore; + use http::Extensions; + use linear_buffer::{LinearBuffer, LinearBufferExtend}; + use object_store_mock::{MockCall, MockParam, MockStore, path}; + use tokio::sync::Barrier; - use crate::{gen_store_tests, object_store_cache_tests::Setup}; + use crate::{buffer_channel::BufferSender, gen_store_tests, object_store_cache_tests::Setup}; use super::*; @@ -414,7 +469,129 @@ mod tests { fn outer(&self) -> &Arc { &self.store } + + fn extensions(&self) -> Extensions { + let mut ext = Extensions::default(); + let (tx, _rx) = crate::buffer_channel::channel(); + ext.insert(tx); + ext + } } gen_store_tests!(TestSetup); + + #[tokio::test] + async fn test_cache_value_buffer_copy() { + let location = path(); + let data = Bytes::from(b"foobar".to_vec()); + + let (tx, _rx) = crate::buffer_channel::channel(); + let mut get_ops = GetOptions::default(); + get_ops.extensions.insert(tx); + + let store = MockStore::new() + .mock_next(MockCall::GetOpts { + params: (location.clone(), get_ops.clone().into()), + barriers: vec![], + res: Ok(GetResult { + payload: GetResultPayload::Stream( + futures::stream::iter([Ok(data.clone())]).boxed(), + ), + meta: meta(&location, &data), + range: 0..(data.len() as u64), + attributes: Default::default(), + }), + }) + .as_store(); + + let value = CacheValue::fetch(&store, &location, None).await.unwrap(); + assert!(!value.in_use()); + + let slice = value.data(); + assert_eq!(slice, data); + assert_ne!( + slice.as_ptr().expose_provenance(), + data.as_ptr().expose_provenance(), + "data was copied", + ); + assert!(value.in_use()); + + drop(slice); + assert!(!value.in_use()); + } + + #[tokio::test] + async fn test_cache_value_buffer_nocopy() { + let location = 
path(); + let data = Bytes::from(b"foobar".to_vec()); + + const OVERALLOCATE: usize = 10; + let mut buffer = LinearBuffer::new(data.len() + OVERALLOCATE); + buffer.append(&data); + + let (tx, _rx) = crate::buffer_channel::channel(); + let mut get_ops = GetOptions::default(); + get_ops.extensions.insert(tx); + + let barrier = Arc::new(Barrier::new(2)); + + let store = MockStore::new().mock_next(MockCall::GetOpts { + params: (location.clone(), get_ops.clone().into()), + barriers: vec![Arc::clone(&barrier)], + res: Ok(GetResult { + payload: GetResultPayload::Stream( + futures::stream::iter([Ok(data.clone())]).boxed(), + ), + meta: meta(&location, &data), + range: 0..(data.len() as u64), + attributes: Default::default(), + }), + }); + let mut store_params = store.observed_params(); + let store = store.as_store(); + + let fut_value = async { CacheValue::fetch(&store, &location, None).await.unwrap() }; + let fut_buffer = async { + let param = store_params.recv().await.unwrap(); + let MockParam::GetOpts((_path, get_options)) = param else { + unreachable!() + }; + let tx = get_options.extensions.get::().unwrap(); + let tx = tx.clone().accept(); + tx.send(buffer.slice_initialized_part(0..data.len())); + barrier.wait().await; + }; + + let (value, ()) = tokio::join!(fut_value, fut_buffer); + assert!(value.in_use()); + + let buffer_ptr = buffer + .slice_initialized_part(0..0) + .as_ptr() + .expose_provenance(); + drop(buffer); + assert!(!value.in_use()); + + let slice = value.data(); + assert_eq!(slice, data); + assert_eq!( + slice.as_ptr().expose_provenance(), + buffer_ptr, + "data was NOT copied", + ); + assert!(value.in_use()); + + drop(slice); + assert!(!value.in_use()); + } + + fn meta(location: &Path, data: &[u8]) -> ObjectMeta { + ObjectMeta { + location: location.clone(), + last_modified: Default::default(), + size: data.len() as u64, + e_tag: None, + version: None, + } + } } diff --git a/object_store_metrics/Cargo.toml b/object_store_metrics/Cargo.toml index 15ddb150..02b070ab 100644 --- a/object_store_metrics/Cargo.toml +++ b/object_store_metrics/Cargo.toml @@ -18,7 +18,7 @@ metric = { version = "0.1.0", path = "../metric" } object_store = { workspace = true } tracing = { workspace = true } pin-project = "1.1.10" -tokio = { version = "1.47", features = ["io-util"] } +tokio = { version = "1.48", features = ["io-util"] } tracker = { path = "../tracker" } workspace-hack = { version = "0.1", path = "../workspace-hack" } @@ -27,7 +27,7 @@ futures_test_utils = { path = "../futures_test_utils" } insta = { version = "1", features = ["yaml"] } object_store_mem_cache = { path = "../object_store_mem_cache" } object_store_mock = { version = "0.1", path = "../object_store_mock" } -rust_decimal = "1.38.0" -tempfile = "3.22.0" +rust_decimal = "1.39.0" +tempfile = "3.23.0" test_helpers = { path = "../test_helpers" } -tokio = { version = "1.47", features = ["macros", "io-util"] } +tokio = { version = "1.48", features = ["macros", "io-util"] } diff --git a/object_store_metrics/src/cache_metrics.rs b/object_store_metrics/src/cache_metrics.rs index 73222902..65fded09 100644 --- a/object_store_metrics/src/cache_metrics.rs +++ b/object_store_metrics/src/cache_metrics.rs @@ -1061,10 +1061,15 @@ mod tests { let capture = capture(); let location = path(); + + let mut get_opts = GetOptions::default(); + let (tx, _rx) = object_store_mem_cache::buffer_channel::channel(); + get_opts.extensions.insert(tx); + let barrier = Arc::new(Barrier::new(2)); let inner: Arc = MockStore::new() .mock_next(GetOpts { - params: 
(location.clone(), Default::default()), + params: (location.clone(), get_opts.into()), barriers: vec![Arc::clone(&barrier)], res: Ok(get_result_stream()), }) diff --git a/object_store_mock/Cargo.toml b/object_store_mock/Cargo.toml index af5b0c83..45e1ce29 100644 --- a/object_store_mock/Cargo.toml +++ b/object_store_mock/Cargo.toml @@ -12,7 +12,7 @@ async-trait = { version = "0.1.89", default-features = false } bytes = { version = "1.10.1", default-features = false } futures = { version = "0.3.31" } object_store.workspace = true -tokio = { version = "1.47.1", default-features = false, features = [ +tokio = { version = "1.48.0", default-features = false, features = [ "macros", "rt-multi-thread", ] } diff --git a/object_store_mock/src/lib.rs b/object_store_mock/src/lib.rs index 50e62c7b..aa6f6af0 100644 --- a/object_store_mock/src/lib.rs +++ b/object_store_mock/src/lib.rs @@ -1,6 +1,6 @@ use std::{ fmt::Display, - ops::Range, + ops::{Deref, Range}, sync::{Arc, Mutex}, }; @@ -13,7 +13,10 @@ use object_store::{ GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, path::Path, }; -use tokio::sync::Barrier; +use tokio::sync::{ + Barrier, + mpsc::{UnboundedReceiver, UnboundedSender}, +}; // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -63,6 +66,12 @@ impl From for WrappedGetOptions { } } +impl From for GetOptions { + fn from(options: WrappedGetOptions) -> Self { + options.0 + } +} + impl Clone for WrappedGetOptions { fn clone(&self) -> Self { Self(GetOptions { @@ -78,6 +87,14 @@ impl Clone for WrappedGetOptions { } } +impl Deref for WrappedGetOptions { + type Target = GetOptions; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + /// Wrapper for PutPayload that implements PartialEQ #[derive(Debug, Clone)] pub struct PutPayloadWrapper(PutPayload); @@ -172,6 +189,17 @@ macro_rules! calls { } } } + + #[derive(Debug)] + #[expect( + unused_parens, + reason = "a single param will expand to ($param)" + )] + pub enum MockParam { + $( + $name (($($param),*),), + )* + } }; } @@ -279,9 +307,46 @@ struct MockStoreState { index_counter: usize, } -#[derive(Debug, Default)] pub struct MockStore { state: Mutex, + tx: UnboundedSender, + rx: Mutex>>, +} + +impl Default for MockStore { + fn default() -> Self { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + + Self { + state: Default::default(), + tx, + rx: Mutex::new(Some(rx)), + } + } +} + +impl std::fmt::Debug for MockStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + state, + tx: _, + rx: _, + } = self; + + match state.try_lock() { + Ok(state) => { + let MockStoreState { + calls, + index_counter, + } = state.deref(); + f.debug_struct("MockStore") + .field("calls", calls) + .field("index_counter", index_counter) + .finish_non_exhaustive() + } + Err(_) => f.debug_struct("MockStore").finish_non_exhaustive(), + } + } } impl Drop for MockStore { @@ -327,6 +392,18 @@ impl MockStore { pub fn as_store(self: Arc) -> Arc { self as Arc } + + /// Get receiver for mocked operations. + /// + /// The data will be sent BEFORE barriers are passed and contains the original instance of the parameters, not the + /// one passed to [`mock_next`](Self::mock_next)/[`mock_next_multi`](Self::mock_next_multi). + /// + /// # Panic + /// Since the parameters are not [`Clone`]able, you can only extract the receiver once. 
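Because the observed parameters are delivered before any barrier is awaited, a test can inspect exactly what the caller passed (for example, to pull a `BufferSender` out of the `GetOptions` extensions) while the mocked call is still blocked. A sketch of that pattern; the `store`, `path`, and `barrier` arguments are assumed to come from a mock configured with a single default-options `GetOpts` expectation guarded by one barrier, as in the tests below, and the helper name is hypothetical:

```rust
use std::sync::Arc;

use object_store::{GetOptions, ObjectStore, path::Path};
use object_store_mock::{MockParam, MockStore};
use tokio::sync::Barrier;

async fn inspect_blocked_get(store: Arc<MockStore>, path: Path, barrier: Arc<Barrier>) {
    let mut observed = store.observed_params();

    // The mocked call will block on the barrier once polled.
    let do_get = async {
        store.get_opts(&path, GetOptions::default()).await.unwrap();
    };

    let inspect = async {
        // The parameters arrive before the barrier is released ...
        let MockParam::GetOpts((observed_path, _observed_options)) =
            observed.recv().await.unwrap()
        else {
            unreachable!("expected a GetOpts call");
        };
        assert_eq!(observed_path, path);

        // ... so the test can react first and only then unblock the store.
        barrier.wait().await;
    };

    tokio::join!(do_get, inspect);
}
```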
+ pub fn observed_params(&self) -> UnboundedReceiver { + let maybe_rx = { self.rx.lock().unwrap().take() }; + maybe_rx.expect("cannot take receiver twice") + } } macro_rules! barrier_wait { @@ -383,6 +460,8 @@ macro_rules! mock { params, ); + $self.tx.send(MockParam::$variant(actual)).ok(); + let res = res.into(); if barriers.is_empty() { @@ -635,7 +714,7 @@ mod tests { fn test_debug() { assert_eq!( format!("{:?}", MockStore::new()), - "MockStore { state: Mutex { data: MockStoreState { calls: [], index_counter: 0 }, poisoned: false, .. } }", + "MockStore { calls: [], index_counter: 0, .. }", ); } @@ -814,6 +893,72 @@ mod tests { assert!(stream.next().await.is_none()); } + #[test] + #[should_panic(expected = "cannot take receiver twice")] + fn test_take_param_reciever_twice() { + let store = MockStore::new(); + store.observed_params(); + store.observed_params(); + } + + /// There are two parameters created: + /// + /// 1. the one for the [`MockCall`] that is tested for "equality" + /// 2. the one that the API user of [`ObjectStore`] passes into the respective trait method. + /// + /// The [`MockStore::observed_params`] should return (2), so a test can use it for various things. Returning (1) + /// would be redundant because the mock/test setup actually had that parameter at hand already. + #[tokio::test] + async fn test_param_receiver_has_original_instance() { + let payload_1 = PutPayload::from_bytes(Bytes::from(b"foo".to_vec())); + let payload_1_ptr = payload_1.as_ref().as_ptr().expose_provenance(); + let store = MockStore::new().mock_next(MockCall::Put { + params: (path(), payload_1.clone().into()), + barriers: vec![], + res: Ok(PutResult { + e_tag: None, + version: None, + }), + }); + + let payload_2 = PutPayload::from_bytes(Bytes::from(b"foo".to_vec())); + let payload_2_ptr = payload_2.as_ref().as_ptr().expose_provenance(); + assert_ne!(payload_1_ptr, payload_2_ptr); + store.put(&path(), payload_2.clone()).await.unwrap(); + + let MockParam::Put((_path, payload_3)) = store.observed_params().recv().await.unwrap() + else { + unreachable!() + }; + let payload_3_ptr = payload_3.0.as_ref().as_ptr().expose_provenance(); + assert_eq!(payload_2_ptr, payload_3_ptr); + } + + #[tokio::test] + async fn test_param_receiver_gets_data_before_barrier() { + let barrier = Arc::new(Barrier::new(2)); + let store = MockStore::new().mock_next(MockCall::Copy { + params: (path(), path()), + barriers: vec![Arc::clone(&barrier)], + res: Ok(()), + }); + + let mut recv = store.observed_params(); + + let path = path(); + let mut fut = store.copy(&path, &path); + fut.assert_pending().await; + + // the barrier is still blocked, but we can already retrieve the parameters + assert!(matches!(recv.recv().await.unwrap(), MockParam::Copy(_))); + + // now unblock the barrier + let (res, _) = tokio::join!(fut, async move { + barrier.wait().await; + },); + res.unwrap(); + } + #[test] fn test_paths_different() { assert_ne!(path(), path2()); diff --git a/parquet_file/Cargo.toml b/parquet_file/Cargo.toml index 4b6b6406..085ffc0e 100644 --- a/parquet_file/Cargo.toml +++ b/parquet_file/Cargo.toml @@ -27,9 +27,9 @@ pbjson-types = { workspace = true } prost = { workspace = true } schema = { path = "../schema" } snafu = "0.8" -thiserror = "2.0.16" +thiserror = "2.0.17" thrift = "0.17" -tokio = { version = "1.47", features = [ +tokio = { version = "1.48", features = [ "macros", "parking_lot", "rt", diff --git a/parquet_file/src/lib.rs b/parquet_file/src/lib.rs index f967a4e6..817f316a 100644 --- a/parquet_file/src/lib.rs +++ 
b/parquet_file/src/lib.rs @@ -163,7 +163,7 @@ impl From<&ParquetFile> for ParquetFilePath { table_id: f.table_id, partition_id: TransitionPartitionId::from_parts( f.partition_id, - f.partition_hash_id.clone(), + Some(f.partition_hash_id.clone()), ), object_store_id: f.object_store_id, } @@ -173,7 +173,7 @@ impl From<&ParquetFile> for ParquetFilePath { impl From<&ParquetFileParams> for ParquetFilePath { fn from(f: &ParquetFileParams) -> Self { let partition_id = - TransitionPartitionId::from_parts(f.partition_id, f.partition_hash_id.clone()); + TransitionPartitionId::from_parts(f.partition_id, Some(f.partition_hash_id.clone())); Self { partition_id, diff --git a/parquet_file/src/metadata.rs b/parquet_file/src/metadata.rs index 00d9291f..1ff7d307 100644 --- a/parquet_file/src/metadata.rs +++ b/parquet_file/src/metadata.rs @@ -465,7 +465,7 @@ impl IoxMetadata { pub fn to_parquet_file( &self, partition_id: PartitionId, - partition_hash_id: Option, + partition_hash_id: PartitionHashId, file_size_bytes: u64, metadata: &IoxParquetMetaData, column_id_map: F, diff --git a/parquet_file/src/serialize.rs b/parquet_file/src/serialize.rs index 7843732b..eb115619 100644 --- a/parquet_file/src/serialize.rs +++ b/parquet_file/src/serialize.rs @@ -42,7 +42,7 @@ use crate::{ pub const ROW_GROUP_WRITE_SIZE: usize = 1024 * 1024; /// ensure read and write work well together -const _: () = assert!(ROW_GROUP_WRITE_SIZE % BATCH_SIZE == 0); +const _: () = assert!(ROW_GROUP_WRITE_SIZE.is_multiple_of(BATCH_SIZE)); /// [`RecordBatch`] to Parquet serialisation errors. /// diff --git a/parquet_file/tests/metadata.rs b/parquet_file/tests/metadata.rs index dc7c4875..696eda7f 100644 --- a/parquet_file/tests/metadata.rs +++ b/parquet_file/tests/metadata.rs @@ -441,7 +441,7 @@ async fn test_derive_parquet_file_params() { let partition_id = PartitionId::new(1); let catalog_data = meta.to_parquet_file( partition_id, - Some(partition_hash_id), + partition_hash_id, file_size, &iox_parquet_meta, |name| *column_id_map.get(name).unwrap(), diff --git a/partition/Cargo.toml b/partition/Cargo.toml index 2e9eb169..87170b2d 100644 --- a/partition/Cargo.toml +++ b/partition/Cargo.toml @@ -16,7 +16,7 @@ hashbrown = { workspace = true } mutable_batch = { path = "../mutable_batch" } percent-encoding = "2.3.2" schema = { path = "../schema" } -thiserror = "2.0.16" +thiserror = "2.0.17" unicode-segmentation = "1.12.0" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/query_functions/Cargo.toml b/query_functions/Cargo.toml index 842ff9fb..d6bcfebb 100644 --- a/query_functions/Cargo.toml +++ b/query_functions/Cargo.toml @@ -13,7 +13,7 @@ arrow = { workspace = true } chrono = { version = "0.4", default-features = false } datafusion = { workspace = true } regex = "1" -regex-syntax = "0.8.6" +regex-syntax = "0.8.8" schema = { path = "../schema" } snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } @@ -21,4 +21,4 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] datafusion_util = { path = "../datafusion_util" } itertools = "0.13.0" -tokio = { version = "1.47", features = ["macros", "parking_lot"] } +tokio = { version = "1.48", features = ["macros", "parking_lot"] } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 908d2ecb..43e5784a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.89.0" +channel = "1.90.0" components = ["rustfmt", "clippy"] diff --git a/schema/Cargo.toml b/schema/Cargo.toml index 
fcf44f45..fb2d33f4 100644 --- a/schema/Cargo.toml +++ b/schema/Cargo.toml @@ -13,7 +13,7 @@ workspace = true arrow = { workspace = true } base64 = { version = "0.22", optional = true } hashbrown = { workspace = true } -indexmap = { version = "2.11", features = ["std"] } +indexmap = { version = "2.12", features = ["std"] } tracing = { workspace = true } snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } @@ -24,6 +24,6 @@ rstest = { version = "0.26.1" } [features] # Enable features from the newly proposed v3 data model, see: # https://github.com/influxdata/influxdb/issues/24979 -# +# # This feature is experimental, and is not enabled by default. v3 = ["dep:base64"] diff --git a/schema/src/lib.rs b/schema/src/lib.rs index 7870edcf..f0403f61 100644 --- a/schema/src/lib.rs +++ b/schema/src/lib.rs @@ -56,6 +56,7 @@ use hashbrown::HashSet; use crate::sort::SortKey; use snafu::{OptionExt, Snafu}; + #[cfg(feature = "v3")] use tracing::warn; @@ -1321,6 +1322,10 @@ mod test { use crate::test_util::make_field; use super::{builder::SchemaBuilder, *}; + + use rstest as _; // workaround for "unused crate" false positive + + #[cfg(feature = "v3")] use rstest::rstest; #[test] diff --git a/service_grpc_flight/Cargo.toml b/service_grpc_flight/Cargo.toml index aae1583e..ed6068d4 100644 --- a/service_grpc_flight/Cargo.toml +++ b/service_grpc_flight/Cargo.toml @@ -34,7 +34,7 @@ prost = { workspace = true } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0.145" snafu = "0.8" -tokio = { version = "1.47", features = [ +tokio = { version = "1.48", features = [ "macros", "net", "parking_lot", diff --git a/test_helpers/Cargo.toml b/test_helpers/Cargo.toml index edf091dd..631ae35d 100644 --- a/test_helpers/Cargo.toml +++ b/test_helpers/Cargo.toml @@ -11,14 +11,14 @@ workspace = true [dependencies] # In alphabetical order async-trait = "0.1.89" dotenvy = "0.15.7" -ordered-float = "5.0.0" +ordered-float = "5.1.0" parking_lot = "0.12" prometheus-parse = "0.2.5" reqwest = { workspace = true, features = ["stream", "rustls-tls-native-roots"] } serde = { version = "1.0", features = ["derive"] } -tempfile = "3.22.0" -thiserror = "2.0.16" -tokio = { version = "1.47.1", default-features = false, features = ["time"] } +tempfile = "3.23.0" +thiserror = "2.0.17" +tokio = { version = "1.48.0", default-features = false, features = ["time"] } tracing = { workspace = true } tracing-log = { workspace = true } tracing-subscriber = { workspace = true } diff --git a/test_helpers_authz/Cargo.toml b/test_helpers_authz/Cargo.toml index b51bc51d..3ee9d27c 100644 --- a/test_helpers_authz/Cargo.toml +++ b/test_helpers_authz/Cargo.toml @@ -14,7 +14,7 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" } # Crates.io dependencies, in alphabetical order futures = "0.3" rand = "0.9.2" -tokio = "1.47" +tokio = "1.48" [lints] workspace = true diff --git a/tokio_metrics_bridge/Cargo.toml b/tokio_metrics_bridge/Cargo.toml index d7c797df..17a41fa8 100644 --- a/tokio_metrics_bridge/Cargo.toml +++ b/tokio_metrics_bridge/Cargo.toml @@ -10,8 +10,8 @@ workspace = true [dependencies] metric = { path = "../metric" } -parking_lot = "0.12.4" -tokio = { version = "1.47", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } +parking_lot = "0.12.5" +tokio = { version = "1.48", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/tokio_watchdog/Cargo.toml 
b/tokio_watchdog/Cargo.toml index 14857513..1872dada 100644 --- a/tokio_watchdog/Cargo.toml +++ b/tokio_watchdog/Cargo.toml @@ -11,7 +11,7 @@ workspace = true [dependencies] metric = { path = "../metric" } tracing = { workspace = true } -tokio = { version = "1.47", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.48", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/trace_exporters/Cargo.toml b/trace_exporters/Cargo.toml index f459ba2a..1bef8cb1 100644 --- a/trace_exporters/Cargo.toml +++ b/trace_exporters/Cargo.toml @@ -16,9 +16,9 @@ futures = "0.3" iox_time = { path = "../iox_time" } tracing = { workspace = true } snafu = "0.8" -socket2 = "0.6.0" +socket2 = "0.6.1" thrift = { version = "0.17.0" } -tokio = { version = "1.47", features = ["macros", "parking_lot", "rt", "sync"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "rt", "sync"] } trace = { path = "../trace" } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/tracker/Cargo.toml b/tracker/Cargo.toml index 4accba77..fb6476a2 100644 --- a/tracker/Cargo.toml +++ b/tracker/Cargo.toml @@ -20,13 +20,13 @@ parking_lot = "0.12" pin-project = "1.1" # Delaying upgrade until is fixed sysinfo = "<0.38" -tokio = { version = "1.47", features = ["macros", "parking_lot", "sync", "time"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "sync", "time"] } tokio-util = { version = "0.7.16" } trace = { path = "../trace"} workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -tempfile = "3.22.0" +tempfile = "3.23.0" # Need the multi-threaded executor for testing -tokio = { version = "1.47", features = ["macros", "parking_lot", "rt-multi-thread", "time"] } +tokio = { version = "1.48", features = ["macros", "parking_lot", "rt-multi-thread", "time"] } test_helpers = { path = "../test_helpers" } diff --git a/trogging/Cargo.toml b/trogging/Cargo.toml index 402ab2fa..789bfcc0 100644 --- a/trogging/Cargo.toml +++ b/trogging/Cargo.toml @@ -12,7 +12,7 @@ workspace = true [dependencies] clap = { version = "4", features = ["derive", "env"], optional = true } logfmt = { path = "../logfmt" } -thiserror = "2.0.16" +thiserror = "2.0.17" tracing-log = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 925c95cc..04ff1188 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -23,76 +23,114 @@ ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] arrayvec = { version = "0.7", default-features = false, features = ["std"] } arrow-ipc = { version = "55", features = ["lz4", "zstd"] } arrow-schema = { version = "55", default-features = false, features = ["canonical_extension_types"] } +aws-credential-types = { version = "1", default-features = false, features = ["test-util"] } +aws-sdk-s3 = { version = "1", features = ["behavior-version-latest"] } +aws-smithy-runtime = { version = "1", default-features = false, features = ["client", "default-https-client", "rt-tokio", "tls-rustls"] } +aws-smithy-runtime-api = { version = "1", features = ["client", "http-02x", "http-auth", "test-util"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", 
"test-util"] } base64 = { version = "0.22" } +bigdecimal = { version = "0.4", features = ["serde"] } +bincode = { version = "2", default-features = false, features = ["alloc", "derive", "serde"] } +bloom2 = { version = "0.5", default-features = false, features = ["serde"] } byteorder = { version = "1" } bytes = { version = "1" } -chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } -clap = { version = "4", features = ["derive", "env"] } -clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "suggestions", "usage"] } +chrono = { version = "0.4", features = ["serde"] } +clap = { version = "4", features = ["derive", "env", "string"] } +clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } crypto-common = { version = "0.1", default-features = false, features = ["std"] } -datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc", default-features = false, features = ["object_store", "parquet_encryption", "recursive_protection"] } -datafusion-expr = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc", default-features = false, features = ["recursive_protection"] } -digest = { version = "0.10", features = ["mac", "std"] } +datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d", default-features = false, features = ["object_store", "parquet_encryption", "recursive_protection"] } +datafusion-expr = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d", default-features = false, features = ["recursive_protection"] } +digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1", features = ["serde", "use_std"] } fastrand = { version = "2" } +flatbuffers = { version = "25" } flate2 = { version = "1", features = ["zlib-rs"] } form_urlencoded = { version = "1" } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } -futures-util = { version = "0.3", default-features = false, features = ["async-await-macro", "channel", "io", "sink"] } -getrandom = { version = "0.3", default-features = false, features = ["std"] } +futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } +getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3", default-features = false, features = ["std"] } +getrandom-6f8ce4dd05d13bba = { package = "getrandom", version = "0.2", default-features = false, features = ["std"] } hashbrown-3575ec1268b04181 = { package = "hashbrown", version = "0.15" } hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } +hickory-proto = { version = "0.25", default-features = false, features = ["serde", "text-parsing", "tokio"] } httparse = { version = "1" } hyper = { version = "1", features = ["client", "http1", "http2", 
"server"] } -hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } indexmap = { version = "2" } -libc = { version = "0.2", features = ["use_std"] } +insta = { version = "1", features = ["json", "redactions", "yaml"] } +ipnet = { version = "2", features = ["serde"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } +lock_api = { version = "0.4", features = ["arc_lock"] } log = { version = "0.4", default-features = false, features = ["std"] } md-5 = { version = "0.10" } memchr = { version = "2" } +moka = { version = "0.12", features = ["future", "sync"] } +num-bigint = { version = "0.4", features = ["serde"] } +num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } object_store = { version = "0.12", features = ["aws", "azure", "gcp"] } +once_cell = { version = "1", features = ["critical-section"] } +parking_lot = { version = "0.12", features = ["arc_lock"] } parquet = { version = "55", features = ["encryption", "object_store"] } percent-encoding = { version = "2" } +portable-atomic = { version = "1" } +proptest = { version = "1" } prost = { version = "0.13", features = ["prost-derive"] } prost-types = { version = "0.13" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8", features = ["small_rng"] } rand_chacha = { version = "0.9", default-features = false, features = ["std"] } +rand_core = { version = "0.9", default-features = false, features = ["os_rng", "std"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-build", "dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "std", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.12", default-features = false, features = ["http2", "json", "rustls-tls-native-roots", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["gzip", "http2", "json", "multipart", "rustls-tls", "rustls-tls-native-roots", "stream"] } +ring = { version = "0.17", features = ["std"] } +rustls = { version = "0.23", default-features = false, features = ["logging", "prefer-post-quantum", "ring", "std", "tls12"] } +rustls-pemfile = { version = "2" } +rustls-webpki = { version = "0.103", default-features = false, features = ["aws-lc-rs", "ring", "std"] } serde = { version = "1", features = ["alloc", "derive", "rc"] } -serde_core = { version = "1", default-features = false, features = ["alloc", "rc", "result", "std"] } +serde_core = { version = "1", features = ["alloc", "rc"] } serde_json = { version = "1", features = ["raw_value"] } -sha2 = { version = "0.10" } +sha2 = { version = "0.10", features = ["oid"] } +signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] } similar = { version = "2", features = ["inline"] } smallvec = { version = "1", default-features = false, features = ["const_new", "serde", "union"] } -socket2 = { version = "0.6", default-features = false, features = ["all"] } +snafu = { version = "0.8", features = ["futures"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +spin = { version = "0.9" } sqlparser = { version = "0.55", default-features = false, features = ["recursive-protection", "visitor"] } +sqlx = { 
version = "0.8", features = ["postgres", "runtime-tokio-rustls", "sqlite", "tls-rustls", "uuid"] } sqlx-core = { version = "0.8", features = ["_rt-tokio", "_tls-rustls-ring-webpki", "any", "json", "migrate", "offline", "uuid"] } sqlx-postgres = { version = "0.8", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } sqlx-sqlite = { version = "0.8", default-features = false, features = ["any", "bundled", "json", "migrate", "offline", "uuid"] } +subtle = { version = "2" } sync_wrapper = { version = "1", default-features = false, features = ["futures"] } +thiserror = { version = "2" } thrift = { version = "0.17" } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "parking_lot", "rt-multi-thread", "signal", "test-util"] } +time = { version = "0.3", features = ["formatting", "macros", "parsing"] } +tokio = { version = "1", features = ["full", "test-util", "tracing"] } +tokio-metrics = { version = "0.4" } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["fs", "net"] } -tokio-util = { version = "0.7", features = ["codec", "io"] } +tokio-util = { version = "0.7", features = ["codec", "compat", "io"] } tonic = { version = "0.12", features = ["gzip", "tls-roots", "zstd"] } -tower = { version = "0.5", default-features = false, features = ["util"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed"] } tracing = { version = "0.1", features = ["log", "max_level_trace"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } -twox-hash = { version = "2", default-features = false, features = ["xxhash32", "xxhash64"] } -uuid = { version = "1", features = ["js", "v4"] } +twox-hash = { version = "2" } +url = { version = "2" } +uuid = { version = "1", features = ["js", "serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive", "std"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } @@ -104,123 +142,188 @@ arrow-schema = { version = "55", default-features = false, features = ["canonica base64 = { version = "0.22" } byteorder = { version = "1" } bytes = { version = "1" } -chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] } +cc = { version = "1", default-features = false, features = ["parallel"] } +chrono = { version = "0.4", features = ["serde"] } crossbeam-utils = { version = "0.8" } crypto-common = { version = "0.1", default-features = false, features = ["std"] } -datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc", default-features = false, features = ["object_store", "parquet_encryption", "recursive_protection"] } -datafusion-expr = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "a9cf9aca9ebf0d6c04e0861d2baebffa0ba77dbc", default-features = false, features = ["recursive_protection"] } -digest = { version = "0.10", features = ["mac", "std"] } +datafusion-common = { git = "https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d", default-features = false, features = ["object_store", "parquet_encryption", "recursive_protection"] } +datafusion-expr = { git = 
"https://github.com/influxdata/arrow-datafusion.git", rev = "ee81b1cc652bde6c131973d091b178836692112d", default-features = false, features = ["recursive_protection"] } +digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1", features = ["serde", "use_std"] } fastrand = { version = "2" } +flatbuffers = { version = "25" } flate2 = { version = "1", features = ["zlib-rs"] } form_urlencoded = { version = "1" } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } -futures-util = { version = "0.3", default-features = false, features = ["async-await-macro", "channel", "io", "sink"] } -getrandom = { version = "0.3", default-features = false, features = ["std"] } +futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } +getrandom-468e82937335b1c9 = { package = "getrandom", version = "0.3", default-features = false, features = ["std"] } +getrandom-6f8ce4dd05d13bba = { package = "getrandom", version = "0.2", default-features = false, features = ["std"] } hashbrown-3575ec1268b04181 = { package = "hashbrown", version = "0.15" } hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } httparse = { version = "1" } hyper = { version = "1", features = ["client", "http1", "http2", "server"] } indexmap = { version = "2" } -libc = { version = "0.2", features = ["use_std"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } +lock_api = { version = "0.4", features = ["arc_lock"] } log = { version = "0.4", default-features = false, features = ["std"] } md-5 = { version = "0.10" } memchr = { version = "2" } +num-bigint = { version = "0.4", features = ["serde"] } +num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } object_store = { version = "0.12", features = ["aws", "azure", "gcp"] } +once_cell = { version = "1", features = ["critical-section"] } +parking_lot = { version = "0.12", features = ["arc_lock"] } parquet = { version = "55", features = ["encryption", "object_store"] } percent-encoding = { version = "2" } +portable-atomic = { version = "1" } prost = { version = "0.13", features = ["prost-derive"] } prost-types = { version = "0.13" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8", features = ["small_rng"] } rand_chacha = { version = "0.9", default-features = false, features = ["std"] } +rand_core = { version = "0.9", default-features = false, features = ["os_rng", "std"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-build", "dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "std", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.12", default-features = false, features = ["http2", "json", "rustls-tls-native-roots", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["gzip", "http2", "json", "multipart", "rustls-tls", "rustls-tls-native-roots", "stream"] } +ring = { version = "0.17", features = ["std"] } +rustls = { version = "0.23", default-features = false, features 
= ["logging", "prefer-post-quantum", "ring", "std", "tls12"] } +rustls-pemfile = { version = "2" } +rustls-webpki = { version = "0.103", default-features = false, features = ["aws-lc-rs", "ring", "std"] } serde = { version = "1", features = ["alloc", "derive", "rc"] } -serde_core = { version = "1", default-features = false, features = ["alloc", "rc", "result", "std"] } +serde_core = { version = "1", features = ["alloc", "rc"] } serde_json = { version = "1", features = ["raw_value"] } -sha2 = { version = "0.10" } +sha2 = { version = "0.10", features = ["oid"] } smallvec = { version = "1", default-features = false, features = ["const_new", "serde", "union"] } +spin = { version = "0.9" } sqlparser = { version = "0.55", default-features = false, features = ["recursive-protection", "visitor"] } sqlx-core = { version = "0.8", features = ["_rt-tokio", "_tls-rustls-ring-webpki", "any", "json", "migrate", "offline", "uuid"] } +sqlx-macros = { version = "0.8", features = ["_rt-tokio", "_tls-rustls-ring-webpki", "derive", "json", "macros", "migrate", "postgres", "sqlite", "uuid"] } +sqlx-macros-core = { version = "0.8", features = ["_rt-tokio", "_tls-rustls-ring-webpki", "derive", "json", "macros", "migrate", "postgres", "sqlite", "uuid"] } sqlx-postgres = { version = "0.8", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } sqlx-sqlite = { version = "0.8", default-features = false, features = ["any", "bundled", "json", "migrate", "offline", "uuid"] } +subtle = { version = "2" } syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } sync_wrapper = { version = "1", default-features = false, features = ["futures"] } +thiserror = { version = "2" } thrift = { version = "0.17" } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "parking_lot", "rt-multi-thread", "signal", "test-util"] } +tokio = { version = "1", features = ["full", "test-util", "tracing"] } tokio-stream = { version = "0.1", features = ["fs", "net"] } -tokio-util = { version = "0.7", features = ["codec", "io"] } +tokio-util = { version = "0.7", features = ["codec", "compat", "io"] } tracing = { version = "0.1", features = ["log", "max_level_trace"] } tracing-core = { version = "0.1" } -twox-hash = { version = "2", default-features = false, features = ["xxhash32", "xxhash64"] } -uuid = { version = "1", features = ["js", "v4"] } +twox-hash = { version = "2" } +url = { version = "2" } +uuid = { version = "1", features = ["js", "serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive", "std"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [target.x86_64-unknown-linux-gnu.dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", default-features = false, features = ["client-proxy"] } -once_cell = { version = "1" } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +nix = { version = "0.30", default-features = false, features = ["fs", "ioctl", "poll", "signal", "socket", "term"] } tower = 
{ version = "0.5", default-features = false, features = ["retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-unknown-linux-gnu.build-dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "service"] } -once_cell = { version = "1" } -socket2 = { version = "0.6", default-features = false, features = ["all"] } -tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } +ipnet = { version = "2", features = ["serde"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed", "retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-apple-darwin.dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", default-features = false, features = ["client-proxy"] } -once_cell = { version = "1" } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +nix = { version = "0.30", default-features = false, features = ["fs", "ioctl", "poll", "signal", "socket", "term"] } tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-apple-darwin.build-dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "service"] } -once_cell = { version = "1" } -socket2 = { version = "0.6", default-features = false, features = ["all"] } -tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } +ipnet = { version = "2", features = ["serde"] } +lzma-sys = { version = "0.1", default-features = false, 
features = ["static"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed", "retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.aarch64-apple-darwin.dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", default-features = false, features = ["client-proxy"] } -once_cell = { version = "1" } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +nix = { version = "0.30", default-features = false, features = ["fs", "ioctl", "poll", "signal", "socket", "term"] } tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.aarch64-apple-darwin.build-dependencies] +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } bitflags = { version = "2", default-features = false, features = ["std"] } -hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "service"] } -once_cell = { version = "1" } -socket2 = { version = "0.6", default-features = false, features = ["all"] } -tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } +ipnet = { version = "2", features = ["serde"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed", "retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-pc-windows-msvc.dependencies] -hyper-util = { version = "0.1", default-features = false, features = ["client-proxy"] } -once_cell = { version = "1" } +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +socket2-d8f496e17d97b5cb = { package = "socket2", version = "0.5", default-features = false, features = ["all"] } 
tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } -windows-sys-73dcd821b1037cfd = { package = "windows-sys", version = "0.59", features = ["Wdk_Foundation", "Wdk_Storage_FileSystem", "Wdk_System_IO", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_IO", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } -windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Threading", "Win32_System_WindowsProgramming"] } -windows-sys-d4189bed749088b6 = { package = "windows-sys", version = "0.61", features = ["Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_SystemInformation"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +winapi = { version = "0.3", default-features = false, features = ["cfg", "evntrace", "in6addr", "inaddr", "minwinbase", "minwindef", "ntsecapi", "profileapi", "windef", "winioctl", "winnt"] } +windows-sys-4db8c43aad08e7ae = { package = "windows-sys", version = "0.60", features = ["Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse"] } +windows-sys-73dcd821b1037cfd = { package = "windows-sys", version = "0.59", features = ["Wdk_Foundation", "Wdk_Storage_FileSystem", "Wdk_System_IO", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse"] } +windows-sys-c8eced492e86ede7 = { package = "windows-sys", version = "0.48", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Diagnostics_Debug", "Win32_System_Registry", "Win32_System_Time", "Win32_UI_Shell"] } +windows-sys-d4189bed749088b6 = { package = "windows-sys", version = "0.61", features = ["Win32_Security_Authentication_Identity", "Win32_Security_Credentials", "Win32_Security_Cryptography", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_LibraryLoader", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_SystemInformation", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Shell"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } [target.x86_64-pc-windows-msvc.build-dependencies] -hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "service"] } -once_cell = { version = "1" } -socket2 = { version = "0.6", default-features = false, features = ["all"] } -tower = { version = "0.5", default-features = false, features = ["retry", "timeout"] } -windows-sys-73dcd821b1037cfd = { package = "windows-sys", version = "0.59", features = ["Wdk_Foundation", "Wdk_Storage_FileSystem", "Wdk_System_IO", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_IO", "Win32_System_Memory", "Win32_System_Pipes", 
"Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } -windows-sys-d4189bed749088b6 = { package = "windows-sys", version = "0.61", features = ["Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_SystemInformation"] } +async-compression = { version = "0.4", default-features = false, features = ["bzip2", "gzip", "tokio", "xz", "zstd"] } +hyper-rustls = { version = "0.27", default-features = false, features = ["http1", "http2", "native-tokio", "ring", "tls12", "webpki-tokio"] } +hyper-util = { version = "0.1", features = ["client-legacy", "client-proxy", "server-auto", "server-graceful", "service"] } +ipnet = { version = "2", features = ["serde"] } +lzma-sys = { version = "0.1", default-features = false, features = ["static"] } +socket2-3b31131e45eafb45 = { package = "socket2", version = "0.6", default-features = false, features = ["all"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "load-shed", "retry", "timeout"] } +tower-http = { version = "0.6", features = ["catch-panic", "follow-redirect"] } +windows-sys-4db8c43aad08e7ae = { package = "windows-sys", version = "0.60", features = ["Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse"] } +windows-sys-73dcd821b1037cfd = { package = "windows-sys", version = "0.59", features = ["Wdk_Foundation", "Wdk_Storage_FileSystem", "Wdk_System_IO", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_IO", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Input_KeyboardAndMouse"] } +windows-sys-c8eced492e86ede7 = { package = "windows-sys", version = "0.48", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Diagnostics_Debug", "Win32_System_Registry", "Win32_System_Time", "Win32_UI_Shell"] } +windows-sys-d4189bed749088b6 = { package = "windows-sys", version = "0.61", features = ["Win32_Security_Authentication_Identity", "Win32_Security_Credentials", "Win32_Security_Cryptography", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_LibraryLoader", "Win32_System_Memory", "Win32_System_Pipes", "Win32_System_SystemInformation", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_WindowsProgramming", "Win32_UI_Shell"] } +xz2 = { version = "0.1", default-features = false, features = ["static"] } ### END HAKARI SECTION