Skip to content

Commit 4e6566f

Browse files
viirya and claude
committed
feat(datafusion): Add schema validation for partition projection
Implement schema validation in project_with_partition to ensure the input schema matches the Iceberg table schema before calculating partition values. This prevents subtle bugs from schema mismatches and provides clear error messages when schemas don't match. Changes: - Add helper functions to recursively strip metadata from Arrow schemas - Implement schema validation that compares input schema with expected Iceberg table schema, ignoring metadata differences - Add comprehensive tests for metadata stripping and schema validation - Closes #1752 The implementation follows the approach suggested in issue #1752: - Recursively visits schema and removes metadata from all fields - Compares cleaned schemas using Arrow's built-in equality operator - Returns helpful error messages showing both schemas on mismatch Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 76cdf28 commit 4e6566f

1 file changed

Lines changed: 348 additions & 5 deletions

File tree

  • crates/integrations/datafusion/src/physical_plan

crates/integrations/datafusion/src/physical_plan/project.rs

Lines changed: 348 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,80 @@
2020
use std::sync::Arc;
2121

2222
use datafusion::arrow::array::RecordBatch;
23-
use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema};
24-
use datafusion::common::Result as DFResult;
23+
use datafusion::arrow::datatypes::{DataType, Field, Fields, Schema as ArrowSchema};
24+
use datafusion::common::{DataFusionError, Result as DFResult};
2525
use datafusion::physical_expr::PhysicalExpr;
2626
use datafusion::physical_expr::expressions::Column;
2727
use datafusion::physical_plan::projection::ProjectionExec;
2828
use datafusion::physical_plan::{ColumnarValue, ExecutionPlan};
29-
use iceberg::arrow::{PROJECTED_PARTITION_VALUE_COLUMN, PartitionValueCalculator};
29+
use iceberg::arrow::{PROJECTED_PARTITION_VALUE_COLUMN, PartitionValueCalculator, schema_to_arrow_schema};
3030
use iceberg::spec::PartitionSpec;
3131
use iceberg::table::Table;
3232

3333
use crate::to_datafusion_error;
3434

35+
/// Recursively strips metadata from an Arrow schema and all its nested fields.
36+
///
37+
/// This function creates a new schema with all metadata removed from fields at every level,
38+
/// including nested struct fields. This is useful for schema comparison where metadata
39+
/// differences should be ignored.
40+
///
41+
/// # Arguments
42+
/// * `schema` - The Arrow schema to strip metadata from
43+
///
44+
/// # Returns
45+
/// A new Arrow schema with all metadata removed
46+
fn strip_metadata_from_schema(schema: &ArrowSchema) -> ArrowSchema {
47+
let fields: Fields = schema
48+
.fields()
49+
.iter()
50+
.map(|field| strip_metadata_from_field(field))
51+
.collect();
52+
ArrowSchema::new(fields)
53+
}
54+
55+
/// Recursively strips metadata from an Arrow field and its nested fields.
56+
///
57+
/// # Arguments
58+
/// * `field` - The Arrow field to strip metadata from
59+
///
60+
/// # Returns
61+
/// A new Arrow field with all metadata removed
62+
fn strip_metadata_from_field(field: &Field) -> Field {
63+
let data_type = strip_metadata_from_datatype(field.data_type());
64+
Field::new(field.name(), data_type, field.is_nullable())
65+
}
66+
67+
/// Recursively strips metadata from an Arrow data type.
68+
///
69+
/// For struct types, this function recursively processes all nested fields.
70+
/// For other types, it returns a clone of the type.
71+
///
72+
/// # Arguments
73+
/// * `data_type` - The Arrow data type to strip metadata from
74+
///
75+
/// # Returns
76+
/// A new Arrow data type with all metadata removed from nested structures
77+
fn strip_metadata_from_datatype(data_type: &DataType) -> DataType {
78+
match data_type {
79+
DataType::Struct(fields) => {
80+
let stripped_fields: Fields = fields
81+
.iter()
82+
.map(|field| strip_metadata_from_field(field))
83+
.collect();
84+
DataType::Struct(stripped_fields)
85+
}
86+
DataType::List(field) => DataType::List(Arc::new(strip_metadata_from_field(field))),
87+
DataType::LargeList(field) => {
88+
DataType::LargeList(Arc::new(strip_metadata_from_field(field)))
89+
}
90+
DataType::Map(field, sorted) => {
91+
DataType::Map(Arc::new(strip_metadata_from_field(field)), *sorted)
92+
}
93+
_ => data_type.clone(),
94+
}
95+
}
96+
3597
/// Extends an ExecutionPlan with partition value calculations for Iceberg tables.
3698
///
3799
/// This function takes an input ExecutionPlan and extends it with an additional column
@@ -58,8 +120,23 @@ pub fn project_with_partition(
58120
}
59121

60122
let input_schema = input.schema();
61-
// TODO: Validate that input_schema matches the Iceberg table schema.
62-
// See: https://github.com/apache/iceberg-rust/issues/1752
123+
124+
// Validate that input_schema matches the Iceberg table schema
125+
// Strip metadata from both schemas before comparison to ignore metadata differences
126+
let expected_arrow_schema =
127+
schema_to_arrow_schema(table_schema.as_ref()).map_err(to_datafusion_error)?;
128+
let input_schema_cleaned = strip_metadata_from_schema(&input_schema);
129+
let expected_schema_cleaned = strip_metadata_from_schema(&expected_arrow_schema);
130+
131+
if input_schema_cleaned != expected_schema_cleaned {
132+
return Err(DataFusionError::Plan(format!(
133+
"Input schema does not match Iceberg table schema.\n\
134+
Expected schema: {}\n\
135+
Input schema: {}",
136+
expected_schema_cleaned, input_schema_cleaned
137+
)));
138+
}
139+
63140
let calculator =
64141
PartitionValueCalculator::try_new(partition_spec.as_ref(), table_schema.as_ref())
65142
.map_err(to_datafusion_error)?;
@@ -377,4 +454,270 @@ mod tests {
377454
assert_eq!(city_partition.value(0), "New York");
378455
assert_eq!(city_partition.value(1), "Los Angeles");
379456
}
457+
458+
#[test]
459+
fn test_strip_metadata_from_simple_schema() {
460+
use std::collections::HashMap;
461+
462+
let mut metadata = HashMap::new();
463+
metadata.insert("key1".to_string(), "value1".to_string());
464+
465+
let field_with_metadata = Field::new("id", DataType::Int32, false).with_metadata(metadata);
466+
let schema = ArrowSchema::new(vec![field_with_metadata]);
467+
468+
let stripped = strip_metadata_from_schema(&schema);
469+
470+
assert_eq!(stripped.fields().len(), 1);
471+
assert_eq!(stripped.field(0).name(), "id");
472+
assert_eq!(*stripped.field(0).data_type(), DataType::Int32);
473+
assert!(stripped.field(0).metadata().is_empty());
474+
}
475+
476+
#[test]
477+
fn test_strip_metadata_from_nested_schema() {
478+
use std::collections::HashMap;
479+
480+
let mut metadata = HashMap::new();
481+
metadata.insert("key1".to_string(), "value1".to_string());
482+
483+
let nested_field_with_metadata =
484+
Field::new("city", DataType::Utf8, false).with_metadata(metadata.clone());
485+
let struct_fields = Fields::from(vec![
486+
Field::new("street", DataType::Utf8, false),
487+
nested_field_with_metadata,
488+
]);
489+
let struct_field =
490+
Field::new("address", DataType::Struct(struct_fields), false).with_metadata(metadata);
491+
492+
let schema = ArrowSchema::new(vec![
493+
Field::new("id", DataType::Int32, false),
494+
struct_field,
495+
]);
496+
497+
let stripped = strip_metadata_from_schema(&schema);
498+
499+
assert_eq!(stripped.fields().len(), 2);
500+
assert!(stripped.field(0).metadata().is_empty());
501+
assert!(stripped.field(1).metadata().is_empty());
502+
503+
if let DataType::Struct(fields) = stripped.field(1).data_type() {
504+
assert_eq!(fields.len(), 2);
505+
assert!(fields[0].metadata().is_empty());
506+
assert!(fields[1].metadata().is_empty());
507+
} else {
508+
panic!("Expected Struct data type");
509+
}
510+
}
511+
512+
#[test]
513+
fn test_strip_metadata_from_list_schema() {
514+
use std::collections::HashMap;
515+
516+
let mut metadata = HashMap::new();
517+
metadata.insert("key1".to_string(), "value1".to_string());
518+
519+
let list_field = Field::new("item", DataType::Int32, false).with_metadata(metadata.clone());
520+
let list_type = DataType::List(Arc::new(list_field));
521+
let field = Field::new("numbers", list_type, false).with_metadata(metadata);
522+
523+
let schema = ArrowSchema::new(vec![field]);
524+
let stripped = strip_metadata_from_schema(&schema);
525+
526+
assert_eq!(stripped.fields().len(), 1);
527+
assert!(stripped.field(0).metadata().is_empty());
528+
529+
if let DataType::List(inner_field) = stripped.field(0).data_type() {
530+
assert!(inner_field.metadata().is_empty());
531+
} else {
532+
panic!("Expected List data type");
533+
}
534+
}
535+
536+
#[test]
537+
fn test_schema_validation_matching_schemas() {
538+
use iceberg::TableIdent;
539+
use iceberg::io::FileIO;
540+
use iceberg::spec::{FormatVersion, NestedField, PrimitiveType, Schema, Type};
541+
542+
let table_schema = Arc::new(
543+
Schema::builder()
544+
.with_fields(vec![
545+
NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
546+
NestedField::required(2, "name", Type::Primitive(PrimitiveType::String))
547+
.into(),
548+
])
549+
.build()
550+
.unwrap(),
551+
);
552+
553+
let partition_spec = iceberg::spec::PartitionSpec::builder(table_schema.clone())
554+
.add_partition_field("id", "id_partition", Transform::Identity)
555+
.unwrap()
556+
.build()
557+
.unwrap();
558+
559+
let sort_order = iceberg::spec::SortOrder::builder()
560+
.build(&table_schema)
561+
.unwrap();
562+
563+
let table_metadata_builder = iceberg::spec::TableMetadataBuilder::new(
564+
(*table_schema).clone(),
565+
partition_spec,
566+
sort_order,
567+
"/test/table".to_string(),
568+
FormatVersion::V2,
569+
std::collections::HashMap::new(),
570+
)
571+
.unwrap();
572+
573+
let table_metadata = table_metadata_builder.build().unwrap();
574+
575+
// Create Arrow schema matching the table schema
576+
let arrow_schema = Arc::new(ArrowSchema::new(vec![
577+
Field::new("id", DataType::Int32, false),
578+
Field::new("name", DataType::Utf8, false),
579+
]));
580+
581+
let input = Arc::new(EmptyExec::new(arrow_schema));
582+
583+
let table = iceberg::table::Table::builder()
584+
.metadata(table_metadata.metadata)
585+
.identifier(TableIdent::from_strs(["test", "table"]).unwrap())
586+
.file_io(FileIO::from_path("/tmp").unwrap().build().unwrap())
587+
.metadata_location("/test/metadata.json".to_string())
588+
.build()
589+
.unwrap();
590+
591+
let result = project_with_partition(input, &table);
592+
assert!(result.is_ok(), "Schema validation should pass");
593+
}
594+
595+
#[test]
596+
fn test_schema_validation_mismatched_schemas() {
597+
use iceberg::TableIdent;
598+
use iceberg::io::FileIO;
599+
use iceberg::spec::{FormatVersion, NestedField, PrimitiveType, Schema, Type};
600+
601+
let table_schema = Arc::new(
602+
Schema::builder()
603+
.with_fields(vec![
604+
NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
605+
NestedField::required(2, "name", Type::Primitive(PrimitiveType::String))
606+
.into(),
607+
])
608+
.build()
609+
.unwrap(),
610+
);
611+
612+
let partition_spec = iceberg::spec::PartitionSpec::builder(table_schema.clone())
613+
.add_partition_field("id", "id_partition", Transform::Identity)
614+
.unwrap()
615+
.build()
616+
.unwrap();
617+
618+
let sort_order = iceberg::spec::SortOrder::builder()
619+
.build(&table_schema)
620+
.unwrap();
621+
622+
let table_metadata_builder = iceberg::spec::TableMetadataBuilder::new(
623+
(*table_schema).clone(),
624+
partition_spec,
625+
sort_order,
626+
"/test/table".to_string(),
627+
FormatVersion::V2,
628+
std::collections::HashMap::new(),
629+
)
630+
.unwrap();
631+
632+
let table_metadata = table_metadata_builder.build().unwrap();
633+
634+
// Create Arrow schema with different field name (mismatched)
635+
let arrow_schema = Arc::new(ArrowSchema::new(vec![
636+
Field::new("id", DataType::Int32, false),
637+
Field::new("different_name", DataType::Utf8, false), // Wrong field name
638+
]));
639+
640+
let input = Arc::new(EmptyExec::new(arrow_schema));
641+
642+
let table = iceberg::table::Table::builder()
643+
.metadata(table_metadata.metadata)
644+
.identifier(TableIdent::from_strs(["test", "table"]).unwrap())
645+
.file_io(FileIO::from_path("/tmp").unwrap().build().unwrap())
646+
.metadata_location("/test/metadata.json".to_string())
647+
.build()
648+
.unwrap();
649+
650+
let result = project_with_partition(input, &table);
651+
assert!(result.is_err(), "Schema validation should fail for mismatched schemas");
652+
assert!(result
653+
.unwrap_err()
654+
.to_string()
655+
.contains("Input schema does not match Iceberg table schema"));
656+
}
657+
658+
#[test]
659+
fn test_schema_validation_with_metadata_differences() {
660+
use iceberg::TableIdent;
661+
use iceberg::io::FileIO;
662+
use iceberg::spec::{FormatVersion, NestedField, PrimitiveType, Schema, Type};
663+
use std::collections::HashMap;
664+
665+
let table_schema = Arc::new(
666+
Schema::builder()
667+
.with_fields(vec![
668+
NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
669+
NestedField::required(2, "name", Type::Primitive(PrimitiveType::String))
670+
.into(),
671+
])
672+
.build()
673+
.unwrap(),
674+
);
675+
676+
let partition_spec = iceberg::spec::PartitionSpec::builder(table_schema.clone())
677+
.add_partition_field("id", "id_partition", Transform::Identity)
678+
.unwrap()
679+
.build()
680+
.unwrap();
681+
682+
let sort_order = iceberg::spec::SortOrder::builder()
683+
.build(&table_schema)
684+
.unwrap();
685+
686+
let table_metadata_builder = iceberg::spec::TableMetadataBuilder::new(
687+
(*table_schema).clone(),
688+
partition_spec,
689+
sort_order,
690+
"/test/table".to_string(),
691+
FormatVersion::V2,
692+
std::collections::HashMap::new(),
693+
)
694+
.unwrap();
695+
696+
let table_metadata = table_metadata_builder.build().unwrap();
697+
698+
// Create Arrow schema with metadata (should be ignored in comparison)
699+
let mut metadata = HashMap::new();
700+
metadata.insert("extra".to_string(), "metadata".to_string());
701+
702+
let arrow_schema = Arc::new(ArrowSchema::new(vec![
703+
Field::new("id", DataType::Int32, false).with_metadata(metadata.clone()),
704+
Field::new("name", DataType::Utf8, false).with_metadata(metadata),
705+
]));
706+
707+
let input = Arc::new(EmptyExec::new(arrow_schema));
708+
709+
let table = iceberg::table::Table::builder()
710+
.metadata(table_metadata.metadata)
711+
.identifier(TableIdent::from_strs(["test", "table"]).unwrap())
712+
.file_io(FileIO::from_path("/tmp").unwrap().build().unwrap())
713+
.metadata_location("/test/metadata.json".to_string())
714+
.build()
715+
.unwrap();
716+
717+
let result = project_with_partition(input, &table);
718+
assert!(
719+
result.is_ok(),
720+
"Schema validation should pass even with metadata differences"
721+
);
722+
}
380723
}

0 commit comments

Comments
 (0)