oximeter: refactor field query to improve performance.

jmcarp · jmcarp · commit 5532a7254238 · 2025-10-20T15:05:46.000-04:00
As described in #9256, looking up field names and values can take much longer than looking up measurements, making queries surprisingly slow overall. Rather than scanning field tables multiple times and running potentially many joins to combine the results, this patch refactors the field lookup query to scan multiple tables in parallel with the `merge` table function, then pivot the results to a wide table using maps. As described inline, there's a bug in merge that prevents us from merging all field tables at once, so we split tables into groups that merge cleanly, then join the results.
diff --git a/oximeter/db/src/client/oxql.rs b/oximeter/db/src/client/oxql.rs
@@ -22,8 +22,8 @@ use crate::oxql::ast::table_ops::limit::LimitKind;
 use crate::oxql::query::QueryAuthzScope;
 use crate::query::field_table_name;
 use oximeter::Measurement;
-use oximeter::TimeseriesSchema;
 use oximeter::schema::TimeseriesKey;
+use oximeter::{FieldSchema, TimeseriesSchema};
 use slog::Logger;
 use slog::debug;
 use slog::trace;
@@ -949,98 +949,118 @@ impl Client {
         Ok(query)
     }
 
+    // We store fields in long tables of (field_name, field_value, ...) with
+    // separate tables by field type. Here, we need to pivot to wide tables,
+    // collecting all field (name, value) pairs for a given timeseries key. To
+    // do this relatively efficiently, we use the `merge` table function to
+    // scan multiple field tables in parallel, and pivot results using maps.
+    // Due to a bug in the version of ClickHouse we're using at the moment, we
+    // can't merge all field tables together at once, since ClickHouse attempts
+    // to cast values to the wrong types as part of the merge. To work
+    // around this, we group field tables into sets that ClickHouse can merge
+    // correctly, then join them together to collect the results. See #9256 for
+    // details.
+    //
+    // TODO(jmcarp): simplify the merge logic after upgrading to a ClickHouse
+    // version with a fix for this bug, or refactor further with a different
+    // approach. We may be able to do better by aggregating results at the
+    // application layer, or by materializing fields into a wide table on
+    // write.
     fn all_fields_query_raw(
         &self,
         schema: &TimeseriesSchema,
     ) -> (bool, String) {
         match schema.field_schema.len() {
             0 => unreachable!(),
-            1 => {
-                let field_schema = schema.field_schema.first().unwrap();
-                (
-                    true,
-                    format!(
-                        "SELECT DISTINCT timeseries_key, field_value AS {field_name} \
-                        FROM {db_name}.{field_table} \
-                        WHERE \
-                            timeseries_name = '{timeseries_name}' AND \
-                            field_name = '{field_name}'",
-                        field_name = field_schema.name,
-                        db_name = crate::DATABASE_NAME,
-                        field_table = field_table_name(field_schema.field_type),
-                        timeseries_name = schema.timeseries_name,
-                    ),
-                )
-            }
             _ => {
-                let mut top_level_columns =
-                    Vec::with_capacity(schema.field_schema.len());
-                let mut field_subqueries =
-                    Vec::with_capacity(schema.field_schema.len());
+                let mut intish_fields: Vec<&FieldSchema> = Vec::new();
+                let mut textish_fields: Vec<&FieldSchema> = Vec::new();
 
-                // Select each field value, aliasing it to its field name.
+                let mut selects: Vec<String> =
+                    vec![String::from("timeseries_key")];
+                let mut froms: Vec<String> = Vec::new();
                 for field_schema in schema.field_schema.iter() {
-                    top_level_columns.push(format!(
-                        "filter_on_{}.field_value AS {}",
-                        field_schema.name, field_schema.name
-                    ));
-                    field_subqueries.push((
-                        format!(
-                            "SELECT DISTINCT timeseries_key, field_value \
-                                FROM {db_name}.{field_table} \
-                                WHERE \
-                                    timeseries_name = '{timeseries_name}' AND \
-                                    field_name = '{field_name}' \
-                                ",
-                            db_name = crate::DATABASE_NAME,
-                            field_table =
-                                field_table_name(field_schema.field_type),
-                            timeseries_name = schema.timeseries_name,
-                            field_name = field_schema.name,
-                        ),
-                        format!("filter_on_{}", field_schema.name),
-                    ));
-                }
-
-                // Write the top-level select statement, starting by selecting
-                // the timeseries key from the first field schema.
-                let mut out = format!(
-                    "SELECT {}.timeseries_key AS timeseries_key, {} FROM ",
-                    field_subqueries[0].1,
-                    top_level_columns.join(", "),
-                );
-
-                // Then add all the subqueries selecting each field.
-                //
-                // We need to add these, along with their aliases. The first
-                // such subquery has no join conditions, but the later ones all
-                // refer to the previous via:
-                //
-                // `ON <previous_filter_name>.timeseries_key = <current_filter_name>.timeseries_key`
-                for (i, (subq, alias)) in field_subqueries.iter().enumerate() {
-                    // Push the subquery itself, aliased.
-                    out.push('(');
-                    out.push_str(subq);
-                    out.push_str(") AS ");
-                    out.push_str(alias);
-
-                    // Push the join conditions.
-                    if i > 0 {
-                        let previous_alias = &field_subqueries[i - 1].1;
-                        out.push_str(" ON ");
-                        out.push_str(alias);
-                        out.push_str(".timeseries_key = ");
-                        out.push_str(previous_alias);
-                        out.push_str(".timeseries_key");
+                    match field_schema.field_type {
+                        oximeter::FieldType::I8
+                        | oximeter::FieldType::I16
+                        | oximeter::FieldType::I32
+                        | oximeter::FieldType::I64
+                        | oximeter::FieldType::U8
+                        | oximeter::FieldType::U16
+                        | oximeter::FieldType::U32
+                        | oximeter::FieldType::U64
+                        | oximeter::FieldType::Bool => {
+                            intish_fields.push(field_schema);
+                            selects.push(format!(
+                                "intish_fields.fields['{}']::{} AS {}",
+                                field_schema.name,
+                                ch_type_for_field(field_schema.field_type),
+                                field_schema.name
+                            ))
+                        }
+                        oximeter::FieldType::String
+                        | oximeter::FieldType::Uuid
+                        | oximeter::FieldType::IpAddr => {
+                            textish_fields.push(field_schema);
+                            selects.push(format!(
+                                "textish_fields.fields['{}']::{} AS {}",
+                                field_schema.name,
+                                ch_type_for_field(field_schema.field_type),
+                                field_schema.name
+                            ))
+                        }
                     }
+                }
+                let intish_tables: Vec<String> = intish_fields
+                    .iter()
+                    .map(|field| field_table_name(field.field_type))
+                    .collect();
+                let textish_tables: Vec<String> = textish_fields
+                    .iter()
+                    .map(|field| field_table_name(field.field_type))
+                    .collect();
+                let mut ctes: Vec<String> = Vec::new();
+                if !intish_tables.is_empty() {
+                    ctes.push(format!(
+                        "intish_fields AS (
+                            SELECT timeseries_name, timeseries_key, mapFromArrays(groupArray(field_name), groupArray(field_value)) AS fields \
+                            FROM merge('oximeter', '{field_table_regex}') \
+                            WHERE timeseries_name = '{timeseries_name}' \
+                            GROUP BY timeseries_name, timeseries_key \
+                        )",
+                        field_table_regex = intish_tables.join("|"),
+                        timeseries_name = schema.timeseries_name));
+                    froms.push(String::from("intish_fields"));
+                }
+                if !textish_tables.is_empty() {
+                    ctes.push(format!(
+                        "textish_fields AS (
+                            SELECT timeseries_name, timeseries_key, mapFromArrays(groupArray(field_name), groupArray(field_value)) AS fields \
+                            FROM merge('oximeter', '{field_table_regex}') \
+                            WHERE timeseries_name = '{timeseries_name}' \
+                            GROUP BY timeseries_name, timeseries_key \
+                        )",
+                        field_table_regex = textish_tables.join("|"),
+                        timeseries_name = schema.timeseries_name));
+                    froms.push(String::from("textish_fields"));
+                }
 
-                    // Push the "INNER JOIN" expression itself, for all but the
-                    // last subquery.
-                    if i < field_subqueries.len() - 1 {
-                        out.push_str(" INNER JOIN ");
-                    }
+                let mut from = froms[0].clone();
+                for from_item in froms.iter().skip(1) {
+                    from = format!(
+                        "{} JOIN {} USING (timeseries_key)",
+                        from, from_item
+                    );
                 }
-                (false, out)
+                let query = format!(
+                    "WITH {ctes} \
+                        SELECT {select} \
+                        FROM {from}",
+                    ctes = ctes.join(", "),
+                    select = selects.join(",\n"),
+                    from = from,
+                );
+                (false, query)
             }
         }
     }
@@ -1179,6 +1199,29 @@ fn update_total_rows_and_check(
     Ok(())
 }
 
+/// Return the name of the ClickHouse data type used to cast a field of the
+/// given oximeter type.
+///
+/// The returned name is interpolated into `::<type>` casts in the field
+/// query, so it must be spelled exactly as ClickHouse expects.
+fn ch_type_for_field(field: oximeter::FieldType) -> String {
+    match field {
+        oximeter::FieldType::String => String::from("String"),
+        oximeter::FieldType::Bool => String::from("Bool"),
+        oximeter::FieldType::Uuid => String::from("UUID"),
+        // ClickHouse type names are case-sensitive: the type is `IPv6`.
+        oximeter::FieldType::IpAddr => String::from("IPv6"),
+        oximeter::FieldType::I8 => String::from("Int8"),
+        oximeter::FieldType::I16 => String::from("Int16"),
+        oximeter::FieldType::I32 => String::from("Int32"),
+        oximeter::FieldType::I64 => String::from("Int64"),
+        oximeter::FieldType::U8 => String::from("UInt8"),
+        oximeter::FieldType::U16 => String::from("UInt16"),
+        oximeter::FieldType::U32 => String::from("UInt32"),
+        oximeter::FieldType::U64 => String::from("UInt64"),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::ConsistentKeyGroup;