diff --git a/crates/catalog/loader/tests/schema_update_suite.rs b/crates/catalog/loader/tests/schema_update_suite.rs new file mode 100644 index 0000000000..9421bbf0ee --- /dev/null +++ b/crates/catalog/loader/tests/schema_update_suite.rs @@ -0,0 +1,284 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Common schema-update behavior across catalogs. +//! +//! These tests assume Docker containers are started externally via `make docker-up`. + +mod common; + +use std::collections::HashMap; + +use common::{CatalogKind, cleanup_namespace_dyn, load_catalog}; +use iceberg::spec::{NestedField, PrimitiveType, Schema, StructType, Type}; +use iceberg::transaction::{AddColumn, ApplyTransactionAction, Transaction}; +use iceberg::{ErrorKind, NamespaceIdent, Result, TableCreation, TableIdent}; +use iceberg_test_utils::normalize_test_name_with_parts; +use rstest::rstest; + +fn base_schema() -> Schema { + Schema::builder() + .with_fields(vec![ + NestedField::optional(1, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .with_identifier_field_ids(vec![2]) + .build() + .unwrap() +} + +// Common behavior: adding a top-level column appends it to the schema. +// HMS is excluded because update_table is not yet supported. +#[rstest] +#[case::rest_catalog(CatalogKind::Rest)] +#[case::glue_catalog(CatalogKind::Glue)] +#[case::sql_catalog(CatalogKind::Sql)] +#[case::s3tables_catalog(CatalogKind::S3Tables)] +#[case::memory_catalog(CatalogKind::Memory)] +#[tokio::test] +async fn test_catalog_schema_add_column(#[case] kind: CatalogKind) -> Result<()> { + let Some(harness) = load_catalog(kind).await else { + return Ok(()); + }; + let catalog = harness.catalog; + let namespace = NamespaceIdent::new(normalize_test_name_with_parts!( + "catalog_schema_add_column", + harness.label + )); + + cleanup_namespace_dyn(catalog.as_ref(), &namespace).await; + catalog.create_namespace(&namespace, HashMap::new()).await?; + + let table_name = + normalize_test_name_with_parts!("catalog_schema_add_column", harness.label, "table"); + let table = catalog + .create_table( + &namespace, + TableCreation::builder() + .name(table_name) + .schema(base_schema()) + .build(), + ) + .await?; + + let tx = Transaction::new(&table); + let tx = tx + .update_schema() + .add_column(AddColumn::optional( + "a", + Type::Primitive(PrimitiveType::Int), + )) + .apply(tx)?; + let updated = tx.commit(catalog.as_ref()).await?; + + let schema = updated.metadata().current_schema(); + let field_a = schema.field_by_name("a").expect("field 'a' should exist"); + assert_eq!(field_a.id, 4); + assert_eq!(*field_a.field_type, Type::Primitive(PrimitiveType::Int)); + + Ok(()) +} + +// Common behavior: adding a nested struct column then a sub-field within it, +// and deleting another top-level column, all persist correctly. +// HMS is excluded because update_table is not yet supported. +#[rstest] +#[case::rest_catalog(CatalogKind::Rest)] +#[case::glue_catalog(CatalogKind::Glue)] +#[case::sql_catalog(CatalogKind::Sql)] +#[case::s3tables_catalog(CatalogKind::S3Tables)] +#[case::memory_catalog(CatalogKind::Memory)] +#[tokio::test] +async fn test_catalog_schema_add_nested_and_delete_column(#[case] kind: CatalogKind) -> Result<()> { + let Some(harness) = load_catalog(kind).await else { + return Ok(()); + }; + let catalog = harness.catalog; + let namespace = NamespaceIdent::new(normalize_test_name_with_parts!( + "catalog_schema_add_nested_and_delete_column", + harness.label + )); + + cleanup_namespace_dyn(catalog.as_ref(), &namespace).await; + catalog.create_namespace(&namespace, HashMap::new()).await?; + + let table_name = normalize_test_name_with_parts!( + "catalog_schema_add_nested_and_delete_column", + harness.label, + "table" + ); + let table = catalog + .create_table( + &namespace, + TableCreation::builder() + .name(table_name) + .schema(base_schema()) + .build(), + ) + .await?; + + // First transaction: add a nested struct column. + let tx = Transaction::new(&table); + let tx = tx + .update_schema() + .add_column(AddColumn::optional( + "info", + Type::Struct(StructType::new(vec![ + NestedField::optional(0, "city", Type::Primitive(PrimitiveType::String)).into(), + ])), + )) + .apply(tx)?; + let table = tx.commit(catalog.as_ref()).await?; + + // Second transaction: add a sub-field to the nested struct and delete a top-level column. + let tx = Transaction::new(&table); + let tx = tx + .update_schema() + .add_column( + AddColumn::builder() + .name("zip") + .field_type(Type::Primitive(PrimitiveType::String)) + .parent("info") + .build(), + ) + .delete_column("baz") + .apply(tx)?; + let table = tx.commit(catalog.as_ref()).await?; + + let schema = table.metadata().current_schema(); + assert!(schema.field_by_name("info").is_some()); + assert!(schema.field_by_name("info.city").is_some()); + assert!(schema.field_by_name("info.zip").is_some()); + assert!(schema.field_by_name("baz").is_none()); + + Ok(()) +} + +// Common behavior: deleting an identifier field or a nonexistent field must fail. +// HMS is excluded because update_table is not yet supported. +#[rstest] +#[case::rest_catalog(CatalogKind::Rest)] +#[case::glue_catalog(CatalogKind::Glue)] +#[case::sql_catalog(CatalogKind::Sql)] +#[case::s3tables_catalog(CatalogKind::S3Tables)] +#[case::memory_catalog(CatalogKind::Memory)] +#[tokio::test] +async fn test_catalog_schema_delete_invalid_column_errors(#[case] kind: CatalogKind) -> Result<()> { + let Some(harness) = load_catalog(kind).await else { + return Ok(()); + }; + let catalog = harness.catalog; + let namespace = NamespaceIdent::new(normalize_test_name_with_parts!( + "catalog_schema_delete_invalid_column_errors", + harness.label + )); + + cleanup_namespace_dyn(catalog.as_ref(), &namespace).await; + catalog.create_namespace(&namespace, HashMap::new()).await?; + + let table_name = normalize_test_name_with_parts!( + "catalog_schema_delete_invalid_column_errors", + harness.label, + "table" + ); + let table = catalog + .create_table( + &namespace, + TableCreation::builder() + .name(table_name) + .schema(base_schema()) + .build(), + ) + .await?; + + // Deleting an identifier field must fail. + let tx = Transaction::new(&table); + let tx = tx.update_schema().delete_column("bar").apply(tx)?; + let err = tx.commit(catalog.as_ref()).await.unwrap_err(); + assert_eq!(err.kind(), ErrorKind::PreconditionFailed); + + // Deleting a nonexistent field must fail. + let tx = Transaction::new(&table); + let tx = tx.update_schema().delete_column("nonexistent").apply(tx)?; + let err = tx.commit(catalog.as_ref()).await.unwrap_err(); + assert_eq!(err.kind(), ErrorKind::PreconditionFailed); + + Ok(()) +} + +// Common behavior: loading a table after a schema update reflects the new schema. +// HMS is excluded because update_table is not yet supported. +#[rstest] +#[case::rest_catalog(CatalogKind::Rest)] +#[case::glue_catalog(CatalogKind::Glue)] +#[case::sql_catalog(CatalogKind::Sql)] +#[case::s3tables_catalog(CatalogKind::S3Tables)] +#[case::memory_catalog(CatalogKind::Memory)] +#[tokio::test] +async fn test_catalog_schema_update_persisted_after_reload( + #[case] kind: CatalogKind, +) -> Result<()> { + let Some(harness) = load_catalog(kind).await else { + return Ok(()); + }; + let catalog = harness.catalog; + let namespace = NamespaceIdent::new(normalize_test_name_with_parts!( + "catalog_schema_update_persisted_after_reload", + harness.label + )); + + cleanup_namespace_dyn(catalog.as_ref(), &namespace).await; + catalog.create_namespace(&namespace, HashMap::new()).await?; + + let table_name = normalize_test_name_with_parts!( + "catalog_schema_update_persisted_after_reload", + harness.label, + "table" + ); + let table_ident = TableIdent::new(namespace.clone(), table_name.clone()); + let table = catalog + .create_table( + &namespace, + TableCreation::builder() + .name(table_name) + .schema(base_schema()) + .build(), + ) + .await?; + + let tx = Transaction::new(&table); + let tx = tx + .update_schema() + .add_column(AddColumn::optional( + "new_field", + Type::Primitive(PrimitiveType::Long), + )) + .apply(tx)?; + tx.commit(catalog.as_ref()).await?; + + let reloaded = catalog.load_table(&table_ident).await?; + assert!( + reloaded + .metadata() + .current_schema() + .field_by_name("new_field") + .is_some() + ); + + Ok(()) +} diff --git a/crates/iceberg/src/spec/schema/mod.rs b/crates/iceberg/src/spec/schema/mod.rs index 13ad41818b..9109990e19 100644 --- a/crates/iceberg/src/spec/schema/mod.rs +++ b/crates/iceberg/src/spec/schema/mod.rs @@ -51,6 +51,8 @@ pub type SchemaId = i32; pub type SchemaRef = Arc; /// Default schema id. pub const DEFAULT_SCHEMA_ID: SchemaId = 0; +/// Delimiter for schema name, which denotes a nested struct. +pub const SCHEMA_NAME_DELIMITER: &str = "."; /// Defines schema in iceberg. #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/crates/iceberg/src/transaction/mod.rs b/crates/iceberg/src/transaction/mod.rs index 113e1a3128..159021d9f2 100644 --- a/crates/iceberg/src/transaction/mod.rs +++ b/crates/iceberg/src/transaction/mod.rs @@ -58,6 +58,7 @@ mod snapshot; mod sort_order; mod update_location; mod update_properties; +mod update_schema; mod update_statistics; mod upgrade_format_version; @@ -65,6 +66,7 @@ use std::sync::Arc; use std::time::Duration; use backon::{BackoffBuilder, ExponentialBackoff, ExponentialBuilder, RetryableWithContext}; +pub use update_schema::AddColumn; use crate::error::Result; use crate::spec::TableProperties; @@ -74,6 +76,7 @@ use crate::transaction::append::FastAppendAction; use crate::transaction::sort_order::ReplaceSortOrderAction; use crate::transaction::update_location::UpdateLocationAction; use crate::transaction::update_properties::UpdatePropertiesAction; +use crate::transaction::update_schema::UpdateSchemaAction; use crate::transaction::update_statistics::UpdateStatisticsAction; use crate::transaction::upgrade_format_version::UpgradeFormatVersionAction; use crate::{Catalog, TableCommit, TableRequirement, TableUpdate}; @@ -136,6 +139,11 @@ impl Transaction { UpdatePropertiesAction::new() } + /// Creates an update schema action. + pub fn update_schema(&self) -> UpdateSchemaAction { + UpdateSchemaAction::new() + } + /// Creates a fast append action. pub fn fast_append(&self) -> FastAppendAction { FastAppendAction::new() diff --git a/crates/iceberg/src/transaction/update_schema.rs b/crates/iceberg/src/transaction/update_schema.rs new file mode 100644 index 0000000000..6ee37be2c9 --- /dev/null +++ b/crates/iceberg/src/transaction/update_schema.rs @@ -0,0 +1,1147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use async_trait::async_trait; +use typed_builder::TypedBuilder; + +use crate::spec::{ + ListType, Literal, MapType, NestedField, NestedFieldRef, SCHEMA_NAME_DELIMITER, Schema, + StructType, Type, +}; +use crate::table::Table; +use crate::transaction::action::{ActionCommit, TransactionAction}; +use crate::{Error, ErrorKind, Result, TableRequirement, TableUpdate}; + +// Default ID for a new column. This will be re-assigned to a fresh ID at commit time. +const DEFAULT_FIELD_ID: i32 = 0; + +/// Declarative specification for adding a column in [`UpdateSchemaAction`]. +/// +/// Use helper constructors such as [`AddColumn::optional`] and [`AddColumn::required`], +/// optionally combined with [`AddColumn::with_parent`] and [`AddColumn::with_doc`], then pass +/// the value to +/// [`UpdateSchemaAction::add_column`]. +#[derive(TypedBuilder)] +pub struct AddColumn { + #[builder(default = None, setter(strip_option, into))] + parent: Option, + #[builder(setter(into))] + name: String, + #[builder(default = false)] + required: bool, + field_type: Type, + #[builder(default = None, setter(strip_option, into))] + doc: Option, + #[builder(default = None, setter(strip_option))] + initial_default: Option, + #[builder(default = None, setter(strip_option))] + write_default: Option, +} + +impl AddColumn { + /// Create a root-level optional column specification. + pub fn optional(name: impl ToString, field_type: Type) -> Self { + Self::builder() + .name(name.to_string()) + .field_type(field_type) + .required(false) + .build() + } + + /// Create a root-level required column specification. + pub fn required(name: impl ToString, field_type: Type, initial_default: Literal) -> Self { + Self::builder() + .name(name.to_string()) + .field_type(field_type) + .required(true) + .initial_default(initial_default.clone()) + .write_default(initial_default) + .build() + } + + fn to_nested_field(&self) -> NestedFieldRef { + let mut field = NestedField::new( + DEFAULT_FIELD_ID, + self.name.clone(), + self.field_type.clone(), + self.required, + ); + + field.doc = self.doc.clone(); + field.initial_default = self.initial_default.clone(); + field.write_default = self.write_default.clone(); + Arc::new(field) + } +} + +/// Schema evolution API modeled after the Java `SchemaUpdate` implementation. +/// +/// This action accumulates schema modifications (column additions and deletions) +/// via builder methods. At commit time, it validates all operations against the +/// current table schema, auto-assigns field IDs from `table.metadata().last_column_id()`, +/// builds a new schema, and emits `AddSchema` + `SetCurrentSchema` updates with a +/// `CurrentSchemaIdMatch` requirement. +/// +/// # Example +/// +/// ```ignore +/// let tx = Transaction::new(&table); +/// let action = tx.update_schema() +/// .add_column(AddColumn::optional("new_col", Type::Primitive(PrimitiveType::Int))) +/// .add_column( +/// AddColumn::optional("email", Type::Primitive(PrimitiveType::String)) +/// .with_parent("person") +/// ) +/// .delete_column("old_col"); +/// let tx = action.apply(tx).unwrap(); +/// let table = tx.commit(&catalog).await.unwrap(); +/// ``` +pub struct UpdateSchemaAction { + additions: Vec, + deletes: Vec, +} + +impl UpdateSchemaAction { + /// Creates a new empty `UpdateSchemaAction`. + pub(crate) fn new() -> Self { + Self { + additions: Vec::new(), + deletes: Vec::new(), + } + } + + // --- Root-level additions --- + + /// Add a column to the table schema. + /// + /// To add a root-level column, leave `AddColumn::parent` as `None`. + /// For nested additions, set a parent path (for example via [`AddColumn::with_parent`]). + /// If the parent resolves to a map/list, the column is added to map value/list element. + pub fn add_column(mut self, add_column: AddColumn) -> Self { + self.additions.push(add_column); + self + } + + // --- Other builder methods --- + + /// Record a column deletion by name. + /// + /// At commit time, the column must exist in the current schema. + pub fn delete_column(mut self, name: impl ToString) -> Self { + self.deletes.push(name.to_string()); + self + } +} + +// --------------------------------------------------------------------------- +// ID assignment helpers +// --------------------------------------------------------------------------- + +/// Recursively assign fresh field IDs to a `NestedField` and all its nested sub-fields. +/// +/// This follows the same recursive pattern as `ReassignFieldIds::reassign_ids_visit_type` +/// from `crate::spec::schema::id_reassigner`, but operates on new fields with placeholder +/// IDs rather than reassigning an existing schema. `ReassignFieldIds` cannot be used +/// directly here because it rejects duplicate old IDs (all new fields share placeholder +/// ID `DEFAULT_FIELD_ID`). +fn assign_fresh_ids(field: &NestedField, next_id: &mut i32) -> NestedFieldRef { + *next_id += 1; + let new_id = *next_id; + let new_type = assign_fresh_ids_to_type(&field.field_type, next_id); + + Arc::new(NestedField { + id: new_id, + name: field.name.clone(), + required: field.required, + field_type: Box::new(new_type), + doc: field.doc.clone(), + initial_default: field.initial_default.clone(), + write_default: field.write_default.clone(), + }) +} + +/// Recursively assign fresh field IDs to all nested fields within a `Type`. +fn assign_fresh_ids_to_type(field_type: &Type, next_id: &mut i32) -> Type { + match field_type { + Type::Primitive(_) => field_type.clone(), + Type::Struct(struct_type) => { + let new_fields: Vec = struct_type + .fields() + .iter() + .map(|f| assign_fresh_ids(f, next_id)) + .collect(); + Type::Struct(StructType::new(new_fields)) + } + Type::List(list_type) => { + let new_element = assign_fresh_ids(&list_type.element_field, next_id); + Type::List(ListType { + element_field: new_element, + }) + } + Type::Map(map_type) => { + let new_key = assign_fresh_ids(&map_type.key_field, next_id); + let new_value = assign_fresh_ids(&map_type.value_field, next_id); + Type::Map(MapType { + key_field: new_key, + value_field: new_value, + }) + } + } +} + +// --------------------------------------------------------------------------- +// Parent path resolution +// --------------------------------------------------------------------------- + +/// Resolve a parent path to the target struct's parent field ID and a reference +/// to its `StructType`. +/// +/// If the parent is a map, navigates to the value field. If a list, navigates to +/// the element field. The final target must be a struct type. +fn resolve_parent_target<'a>( + base_schema: &'a Schema, + parent: &str, +) -> Result<(i32, &'a StructType)> { + base_schema + .field_by_name(parent) + .ok_or_else(|| { + Error::new( + ErrorKind::PreconditionFailed, + format!("Cannot add column: parent '{parent}' not found"), + ) + }) + .and_then(|parent_field| match parent_field.field_type.as_ref() { + Type::Struct(s) => Ok((parent_field.id, s)), + Type::Map(m) => match m.value_field.field_type.as_ref() { + Type::Struct(s) => Ok((m.value_field.id, s)), + _ => Err(Error::new( + ErrorKind::PreconditionFailed, + format!("Cannot add column: map value of '{parent}' is not a struct"), + )), + }, + Type::List(l) => match l.element_field.field_type.as_ref() { + Type::Struct(s) => Ok((l.element_field.id, s)), + _ => Err(Error::new( + ErrorKind::PreconditionFailed, + format!("Cannot add column: list element of '{parent}' is not a struct"), + )), + }, + _ => Err(Error::new( + ErrorKind::PreconditionFailed, + format!("Cannot add column: parent '{parent}' is not a struct, map, or list"), + )), + }) +} + +// --------------------------------------------------------------------------- +// Schema tree rebuild +// --------------------------------------------------------------------------- + +/// Rebuild a slice of fields, applying deletions and additions at every level, +/// plus any additions keyed by `parent_id` (`None` represents the table root). +fn rebuild_fields( + fields: &[NestedFieldRef], + adds: &HashMap, Vec>, + delete_ids: &HashSet, + parent_id: Option, +) -> Vec { + fields + .iter() + .filter(|f| !delete_ids.contains(&f.id)) + .map(|f| rebuild_field(f, adds, delete_ids)) + .chain(adds.get(&parent_id).into_iter().flatten().cloned()) + .collect() +} + +/// Recursively rebuild a single field. If the field (or any descendant) is a struct +/// that has pending additions, those additions are appended to the struct's fields. +/// Fields whose IDs appear in `delete_ids` are filtered out at every struct level. +fn rebuild_field( + field: &NestedFieldRef, + adds: &HashMap, Vec>, + delete_ids: &HashSet, +) -> NestedFieldRef { + match field.field_type.as_ref() { + Type::Primitive(_) => field.clone(), + Type::Struct(s) => { + let new_fields = rebuild_fields(s.fields(), adds, delete_ids, Some(field.id)); + Arc::new(NestedField { + id: field.id, + name: field.name.clone(), + required: field.required, + field_type: Box::new(Type::Struct(StructType::new(new_fields))), + doc: field.doc.clone(), + initial_default: field.initial_default.clone(), + write_default: field.write_default.clone(), + }) + } + Type::List(l) => { + let new_element = rebuild_field(&l.element_field, adds, delete_ids); + Arc::new(NestedField { + id: field.id, + name: field.name.clone(), + required: field.required, + field_type: Box::new(Type::List(ListType { + element_field: new_element, + })), + doc: field.doc.clone(), + initial_default: field.initial_default.clone(), + write_default: field.write_default.clone(), + }) + } + Type::Map(m) => { + let new_key = rebuild_field(&m.key_field, adds, delete_ids); + let new_value = rebuild_field(&m.value_field, adds, delete_ids); + Arc::new(NestedField { + id: field.id, + name: field.name.clone(), + required: field.required, + field_type: Box::new(Type::Map(MapType { + key_field: new_key, + value_field: new_value, + })), + doc: field.doc.clone(), + initial_default: field.initial_default.clone(), + write_default: field.write_default.clone(), + }) + } + } +} + +// --------------------------------------------------------------------------- +// TransactionAction implementation +// --------------------------------------------------------------------------- + +#[async_trait] +impl TransactionAction for UpdateSchemaAction { + async fn commit(self: Arc, table: &Table) -> Result { + let base_schema = table.metadata().current_schema(); + let mut last_column_id = table.metadata().last_column_id(); + + // --- 1. Validate deletes --- + let delete_ids = self + .deletes + .iter() + .map(|name: &String| { + base_schema + .field_by_name(name) + .ok_or_else(|| { + Error::new( + ErrorKind::PreconditionFailed, + format!("Cannot delete missing column: {name}"), + ) + }) + .and_then(|field| { + match base_schema + .identifier_field_ids() + .find(|id| *id == field.id) + { + Some(_) => Err(Error::new( + ErrorKind::PreconditionFailed, + format!("Cannot delete identifier field: {name}"), + )), + None => Ok(field.id), + } + }) + }) + .collect::>>()?; + + // --- 2. Resolve parents, validate additions, assign IDs, and group by parent ID --- + // We assign IDs inline (before grouping) to preserve the caller's insertion order, + // since HashMap iteration order is non-deterministic. + let mut additions_by_parent: HashMap, Vec> = HashMap::new(); + + for add in &self.additions { + let pending_field = add.to_nested_field(); + + // Check that name does not contain `SCHEMA_NAME_DELIMITER`. + if pending_field.name.contains(SCHEMA_NAME_DELIMITER) { + return Err(Error::new( + ErrorKind::PreconditionFailed, + format!( + "Cannot add column with ambiguous name: {}. Use `AddColumn::with_parent` to add a column to a nested struct.", + pending_field.name + ), + )); + } + + // Required columns without an initial default need allow_incompatible_changes. + if pending_field.required && pending_field.initial_default.is_none() { + return Err(Error::new( + ErrorKind::PreconditionFailed, + format!( + "Incompatible change: cannot add required column without an initial default: {}", + pending_field.name + ), + )); + } + + let parent_id = match &add.parent { + None => { + // Root-level: check name conflict against root-level fields. + if let Some(existing) = base_schema.field_by_name(&pending_field.name) + && !delete_ids.contains(&existing.id) + { + return Err(Error::new( + ErrorKind::PreconditionFailed, + format!( + "Cannot add column, name already exists: {}", + pending_field.name + ), + )); + } + None + } + Some(parent_path) => { + // Nested: resolve parent, check name conflict within parent struct. + let (resolved_parent_id, parent_struct) = + resolve_parent_target(base_schema, parent_path)?; + + if parent_struct.fields().iter().any(|f| { + f.name == pending_field.name + && !delete_ids.contains(&f.id) + && !delete_ids.contains(&resolved_parent_id) + }) { + return Err(Error::new( + ErrorKind::PreconditionFailed, + format!( + "Cannot add column, name already exists in '{}': {}", + parent_path, pending_field.name + ), + )); + } + + Some(resolved_parent_id) + } + }; + + // Assign fresh IDs immediately, preserving insertion order. + let field = assign_fresh_ids(&pending_field, &mut last_column_id); + + additions_by_parent + .entry(parent_id) + .or_default() + .push(field); + } + + // --- 4. Rebuild the schema tree with additions and deletions --- + let new_fields = rebuild_fields( + base_schema.as_struct().fields(), + &additions_by_parent, + &delete_ids, + None, + ); + + // --- 5. Build the new schema --- + let schema = Schema::builder() + .with_fields(new_fields) + .with_identifier_field_ids(base_schema.identifier_field_ids()) + .build()?; + + let updates = vec![ + TableUpdate::AddSchema { schema }, + TableUpdate::SetCurrentSchema { schema_id: -1 }, + ]; + + let requirements = vec![TableRequirement::CurrentSchemaIdMatch { + current_schema_id: base_schema.schema_id(), + }]; + + Ok(ActionCommit::new(updates, requirements)) + } +} + +#[cfg(test)] +mod tests { + use std::io::BufReader; + use std::sync::Arc; + + use as_any::Downcast; + + use crate::spec::{ + DEFAULT_SCHEMA_ID, Literal, NestedField, PrimitiveType, StructType, TableMetadata, Type, + }; + use crate::table::Table; + use crate::transaction::Transaction; + use crate::transaction::action::{ApplyTransactionAction, TransactionAction}; + use crate::transaction::tests::make_v2_table; + use crate::transaction::update_schema::{AddColumn, DEFAULT_FIELD_ID, UpdateSchemaAction}; + use crate::{ErrorKind, TableIdent, TableRequirement, TableUpdate}; + + // The V2 test table has: + // last_column_id: 3 + // current schema (id=1): x(1, req, long), y(2, req, long), z(3, req, long) + // identifier_field_ids: [1, 2] + + /// Build a V2 test table that includes nested types: + /// + /// last_column_id: 14 + /// current schema (id=0): + /// x(1, req, long) -- identifier + /// y(2, req, long) -- identifier + /// z(3, req, long) + /// person(4, opt, struct) + /// name(5, opt, string) + /// age(6, req, int) + /// tags(7, opt, list) + /// element(8, req, struct) + /// key(9, opt, string) + /// value(10, opt, string) + /// props(11, opt, map) + /// key(12, req, string) + /// value(13, req, struct) + /// data(14, opt, string) + fn make_v2_table_with_nested() -> Table { + let json = r#"{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c2", + "location": "s3://bucket/test/location", + "last-sequence-number": 0, + "last-updated-ms": 1602638573590, + "last-column-id": 14, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "identifier-field-ids": [1, 2], + "fields": [ + {"id": 1, "name": "x", "required": true, "type": "long"}, + {"id": 2, "name": "y", "required": true, "type": "long"}, + {"id": 3, "name": "z", "required": true, "type": "long"}, + {"id": 4, "name": "person", "required": false, "type": { + "type": "struct", + "fields": [ + {"id": 5, "name": "name", "required": false, "type": "string"}, + {"id": 6, "name": "age", "required": true, "type": "int"} + ] + }}, + {"id": 7, "name": "tags", "required": false, "type": { + "type": "list", + "element-id": 8, + "element": { + "type": "struct", + "fields": [ + {"id": 9, "name": "key", "required": false, "type": "string"}, + {"id": 10, "name": "value", "required": false, "type": "string"} + ] + }, + "element-required": true + }}, + {"id": 11, "name": "props", "required": false, "type": { + "type": "map", + "key-id": 12, + "key": "string", + "value-id": 13, + "value": { + "type": "struct", + "fields": [ + {"id": 14, "name": "data", "required": false, "type": "string"} + ] + }, + "value-required": true + }} + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + {"spec-id": 0, "fields": []} + ], + "last-partition-id": 999, + "default-sort-order-id": 0, + "sort-orders": [ + {"order-id": 0, "fields": []} + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] + }"#; + + let reader = BufReader::new(json.as_bytes()); + let metadata = serde_json::from_reader::<_, TableMetadata>(reader).unwrap(); + + Table::builder() + .metadata(metadata) + .metadata_location("s3://bucket/test/location/metadata/v1.json".to_string()) + .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) + .file_io(crate::io::FileIO::new_with_memory()) + .build() + .unwrap() + } + + // ----------------------------------------------------------------------- + // Existing root-level tests + // ----------------------------------------------------------------------- + + #[tokio::test] + async fn test_add_column() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + let action = tx.update_schema().add_column(AddColumn::optional( + "new_col", + Type::Primitive(PrimitiveType::Int), + )); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + let requirements = action_commit.take_requirements(); + + assert_eq!(updates.len(), 2); + + // Extract the new schema from the AddSchema update. + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + let expected_schema = table + .metadata() + .current_schema() + .as_ref() + .clone() + .into_builder() + .with_schema_id(DEFAULT_SCHEMA_ID) + .with_fields([ + NestedField::optional(4, "new_col", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(); + assert_eq!(new_schema, &expected_schema); + + assert_eq!(updates[1], TableUpdate::SetCurrentSchema { schema_id: -1 }); + + // Verify requirement. + assert_eq!(requirements.len(), 1); + assert_eq!(requirements[0], TableRequirement::CurrentSchemaIdMatch { + current_schema_id: table.metadata().current_schema().schema_id() + }); + } + + #[tokio::test] + async fn test_add_column_with_doc() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + let action = tx.update_schema().add_column( + AddColumn::builder() + .name("documented_col") + .field_type(Type::Primitive(PrimitiveType::String)) + .doc("A documented column") + .build(), + ); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + let field = new_schema + .field_by_name("documented_col") + .expect("documented_col should exist"); + assert_eq!(field.id, 4); + assert!(!field.required); + assert_eq!(field.doc.as_deref(), Some("A documented column")); + } + + #[tokio::test] + async fn test_add_required_column_with_initial_default() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + let action = tx.update_schema().add_column(AddColumn::required( + "req_col", + Type::Primitive(PrimitiveType::Int), + Literal::int(0), + )); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + let field = new_schema + .field_by_name("req_col") + .expect("req_col should exist"); + assert_eq!(field.id, 4); + assert!(field.required); + assert_eq!(field.initial_default, Some(Literal::int(0))); + assert_eq!(field.write_default, Some(Literal::int(0))); + } + + #[tokio::test] + async fn test_add_column_name_conflict_fails() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + // "x" already exists in the V2 test schema. + let action = tx.update_schema().add_column(AddColumn::optional( + "x", + Type::Primitive(PrimitiveType::Int), + )); + + let result = Arc::new(action).commit(&table).await; + let err = match result { + Err(e) => e, + Ok(_) => panic!("should reject adding a column with an existing name"), + }; + assert_eq!(err.kind(), ErrorKind::PreconditionFailed); + assert!( + err.message().contains("already exists"), + "error should mention name conflict, got: {}", + err.message() + ); + } + + #[tokio::test] + async fn test_delete_column() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + // z is not an identifier field, so we can delete it. + let action = tx.update_schema().delete_column("z"); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + assert!( + new_schema.field_by_name("z").is_none(), + "z should be deleted" + ); + assert!(new_schema.field_by_name("x").is_some()); + assert!(new_schema.field_by_name("y").is_some()); + } + + #[tokio::test] + async fn test_delete_missing_column_fails() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + let action = tx.update_schema().delete_column("nonexistent"); + + let result = Arc::new(action).commit(&table).await; + let err = match result { + Err(e) => e, + Ok(_) => panic!("should reject deleting a non-existent column"), + }; + assert_eq!(err.kind(), ErrorKind::PreconditionFailed); + assert!( + err.message().contains("nonexistent"), + "error should mention the missing column, got: {}", + err.message() + ); + } + + #[tokio::test] + async fn test_add_and_delete_combined() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + // Delete z, add a new column. + let action = tx + .update_schema() + .delete_column("z") + .add_column(AddColumn::optional( + "w", + Type::Primitive(PrimitiveType::Boolean), + )); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + assert!( + new_schema.field_by_name("z").is_none(), + "z should be deleted" + ); + let w = new_schema.field_by_name("w").expect("w should exist"); + assert_eq!(w.id, 4); + assert!(!w.required); + } + + #[tokio::test] + async fn test_delete_and_readd_same_name() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + // Delete z, then add a new column named z -- should succeed. + let action = tx + .update_schema() + .delete_column("z") + .add_column(AddColumn::optional( + "z", + Type::Primitive(PrimitiveType::Boolean), + )); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + let z = new_schema + .field_by_name("z") + .expect("z should exist with new type"); + assert_eq!(z.id, 4); // new ID, not the old 3 + assert_eq!(*z.field_type, Type::Primitive(PrimitiveType::Boolean)); + } + + #[test] + fn test_apply() { + let table = make_v2_table(); + let tx = Transaction::new(&table); + + let tx = tx + .update_schema() + .add_column(AddColumn::optional( + "new_col", + Type::Primitive(PrimitiveType::Int), + )) + .apply(tx) + .unwrap(); + + assert_eq!(tx.actions.len(), 1); + (*tx.actions[0]) + .downcast_ref::() + .expect("UpdateSchemaAction was not applied to Transaction!"); + } + + // ----------------------------------------------------------------------- + // Nested add tests + // ----------------------------------------------------------------------- + + #[tokio::test] + async fn test_add_column_to_struct() { + let table = make_v2_table_with_nested(); + let tx = Transaction::new(&table); + + // Add "email" to the "person" struct. + let action = tx.update_schema().add_column( + AddColumn::builder() + .name("email") + .field_type(Type::Primitive(PrimitiveType::String)) + .parent("person") + .build(), + ); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + // "email" should be nested under "person" with ID = last_column_id + 1 = 15. + let email = new_schema + .field_by_name("person.email") + .expect("person.email should exist"); + assert_eq!(email.id, 15); + assert!(!email.required); + assert_eq!(*email.field_type, Type::Primitive(PrimitiveType::String)); + + // Original nested fields should still be there. + assert!(new_schema.field_by_name("person.name").is_some()); + assert!(new_schema.field_by_name("person.age").is_some()); + } + + #[tokio::test] + async fn test_add_column_to_struct_with_doc() { + let table = make_v2_table_with_nested(); + let tx = Transaction::new(&table); + + let action = tx.update_schema().add_column( + AddColumn::builder() + .name("phone") + .field_type(Type::Primitive(PrimitiveType::String)) + .parent("person") + .doc("Phone number") + .build(), + ); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + let phone = new_schema + .field_by_name("person.phone") + .expect("person.phone should exist"); + assert_eq!(phone.id, 15); + assert_eq!(phone.doc.as_deref(), Some("Phone number")); + } + + #[tokio::test] + async fn test_add_column_to_list_element_struct() { + let table = make_v2_table_with_nested(); + let tx = Transaction::new(&table); + + // "tags" is a list. Adding to the list navigates to its + // element struct automatically. + let action = tx.update_schema().add_column( + AddColumn::builder() + .name("score") + .field_type(Type::Primitive(PrimitiveType::Double)) + .parent("tags") + .build(), + ); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + // The list element struct should now contain "score". + let score = new_schema + .field_by_name("tags.element.score") + .expect("tags.element.score should exist"); + assert_eq!(score.id, 15); + assert!(!score.required); + + // Existing fields preserved. + assert!(new_schema.field_by_name("tags.element.key").is_some()); + assert!(new_schema.field_by_name("tags.element.value").is_some()); + } + + #[tokio::test] + async fn test_add_column_to_map_value_struct() { + let table = make_v2_table_with_nested(); + let tx = Transaction::new(&table); + + // "props" is a map. Adding to the map navigates to its + // value struct automatically. + let action = tx.update_schema().add_column( + AddColumn::builder() + .name("version") + .field_type(Type::Primitive(PrimitiveType::Int)) + .parent("props") + .build(), + ); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + let version = new_schema + .field_by_name("props.value.version") + .expect("props.value.version should exist"); + assert_eq!(version.id, 15); + + // Existing map value fields preserved. + assert!(new_schema.field_by_name("props.value.data").is_some()); + } + + #[tokio::test] + async fn test_add_column_to_nonexistent_parent_fails() { + let table = make_v2_table_with_nested(); + let tx = Transaction::new(&table); + + let action = tx.update_schema().add_column( + AddColumn::builder() + .name("col") + .field_type(Type::Primitive(PrimitiveType::Int)) + .parent("nonexistent") + .build(), + ); + + let err = match Arc::new(action).commit(&table).await { + Err(e) => e, + Ok(_) => panic!("should reject adding to a nonexistent parent"), + }; + assert_eq!(err.kind(), ErrorKind::PreconditionFailed); + assert!( + err.message().contains("nonexistent"), + "error should mention the missing parent, got: {}", + err.message() + ); + } + + #[tokio::test] + async fn test_add_column_to_primitive_parent_fails() { + let table = make_v2_table_with_nested(); + let tx = Transaction::new(&table); + + // "x" is a primitive (long), not a struct. + let action = tx.update_schema().add_column( + AddColumn::builder() + .name("col") + .field_type(Type::Primitive(PrimitiveType::Int)) + .parent("x") + .build(), + ); + + let err = match Arc::new(action).commit(&table).await { + Err(e) => e, + Ok(_) => panic!("should reject adding to a primitive parent"), + }; + assert_eq!(err.kind(), ErrorKind::PreconditionFailed); + assert!( + err.message().contains("not a struct"), + "error should mention type mismatch, got: {}", + err.message() + ); + } + + #[tokio::test] + async fn test_add_column_to_nested_name_conflict_fails() { + let table = make_v2_table_with_nested(); + let tx = Transaction::new(&table); + + // "name" already exists in the "person" struct. + let action = tx.update_schema().add_column( + AddColumn::builder() + .name("name") + .field_type(Type::Primitive(PrimitiveType::String)) + .parent("person") + .build(), + ); + + let err = match Arc::new(action).commit(&table).await { + Err(e) => e, + Ok(_) => panic!("should reject adding a column with conflicting name"), + }; + assert_eq!(err.kind(), ErrorKind::PreconditionFailed); + assert!( + err.message().contains("already exists"), + "error should mention name conflict, got: {}", + err.message() + ); + } + + #[tokio::test] + async fn test_root_and_nested_add_combined() { + let table = make_v2_table_with_nested(); + let tx = Transaction::new(&table); + + // Add a root column and a nested column in the same action. + let action = tx + .update_schema() + .add_column(AddColumn::optional( + "root_col", + Type::Primitive(PrimitiveType::Boolean), + )) + .add_column( + AddColumn::builder() + .name("email") + .field_type(Type::Primitive(PrimitiveType::String)) + .parent("person") + .build(), + ); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + // Root column gets the first fresh ID. + let root_col = new_schema + .field_by_name("root_col") + .expect("root_col should exist"); + assert_eq!(root_col.id, 15); + + // Nested column gets the next ID. + let email = new_schema + .field_by_name("person.email") + .expect("person.email should exist"); + assert_eq!(email.id, 16); + } + + #[tokio::test] + async fn test_add_nested_struct_type_with_fresh_ids() { + // Adding a new column whose TYPE contains nested fields (e.g. a struct column). All sub-fields must receive + // fresh IDs, not placeholder `DEFAULT_FIELD_ID`. + let table = make_v2_table(); + let tx = Transaction::new(&table); + + let action = tx.update_schema().add_column(AddColumn::optional( + "address", + Type::Struct(StructType::new(vec![ + NestedField::optional( + DEFAULT_FIELD_ID, + "street", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::optional( + DEFAULT_FIELD_ID, + "city", + Type::Primitive(PrimitiveType::String), + ) + .into(), + ])), + )); + + let mut action_commit = Arc::new(action).commit(&table).await.unwrap(); + let updates = action_commit.take_updates(); + + let new_schema = match &updates[0] { + TableUpdate::AddSchema { schema } => schema, + other => panic!("expected AddSchema, got {other:?}"), + }; + + // "address" gets ID 4 (last_column_id=3, +1). + let address = new_schema + .field_by_name("address") + .expect("address should exist"); + assert_eq!(address.id, 4); + + // Sub-fields get IDs 5 and 6. + let street = new_schema + .field_by_name("address.street") + .expect("address.street should exist"); + assert_eq!(street.id, 5); + + let city = new_schema + .field_by_name("address.city") + .expect("address.city should exist"); + assert_eq!(city.id, 6); + } +}