@@ -4590,3 +4590,73 @@ def test_orc_stripe_based_batching(tmp_path: Path) -> None:
45904590 # Verify total rows
45914591 total_rows = sum (batch .num_rows for batch in batches )
45924592 assert total_rows == 10000 , f"Expected 10000 total rows, got { total_rows } "
4593+
4594+
def test_partition_column_projection_with_schema_evolution(catalog: InMemoryCatalog) -> None:
    """Test column projection on partitioned table after schema evolution (https://github.com/apache/iceberg-python/issues/2672)."""
    # Initial table: identity-partitioned on the date column.
    schema_before_evolution = Schema(
        NestedField(1, "partition_date", DateType(), required=False),
        NestedField(2, "id", IntegerType(), required=False),
        NestedField(3, "name", StringType(), required=False),
        NestedField(4, "value", IntegerType(), required=False),
    )
    spec = PartitionSpec(
        PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="partition_date"),
    )

    catalog.create_namespace("default")
    tbl = catalog.create_table(
        "default.test_schema_evolution_projection",
        schema=schema_before_evolution,
        partition_spec=spec,
    )

    # First batch written against the original (pre-evolution) schema.
    arrow_schema_v1 = pa.schema(
        [
            ("partition_date", pa.date32()),
            ("id", pa.int32()),
            ("name", pa.string()),
            ("value", pa.int32()),
        ]
    )
    rows_v1 = [
        {"partition_date": date(2024, 1, 1), "id": 1, "name": "Alice", "value": 100},
        {"partition_date": date(2024, 1, 1), "id": 2, "name": "Bob", "value": 200},
    ]
    tbl.append(pa.Table.from_pylist(rows_v1, schema=arrow_schema_v1))

    # Evolve the schema by adding a column, then re-load so the scan
    # below runs against the refreshed table metadata.
    with tbl.update_schema() as update:
        update.add_column("new_column", StringType())
    tbl = catalog.load_table("default.test_schema_evolution_projection")

    # Second batch written against the evolved schema.
    arrow_schema_v2 = pa.schema(
        [
            ("partition_date", pa.date32()),
            ("id", pa.int32()),
            ("name", pa.string()),
            ("value", pa.int32()),
            ("new_column", pa.string()),
        ]
    )
    rows_v2 = [
        {"partition_date": date(2024, 1, 2), "id": 3, "name": "Charlie", "value": 300, "new_column": "new1"},
        {"partition_date": date(2024, 1, 2), "id": 4, "name": "David", "value": 400, "new_column": "new2"},
    ]
    tbl.append(pa.Table.from_pylist(rows_v2, schema=arrow_schema_v2))

    # Project a subset of columns (excluding the partition column); files
    # written before the evolution must surface NULL for the added column.
    projected = tbl.scan(selected_fields=("id", "name", "value", "new_column")).to_arrow()

    assert set(projected.schema.names) == {"id", "name", "value", "new_column"}
    assert projected.num_rows == 4
    by_name = projected.sort_by("name")
    assert by_name["name"].to_pylist() == ["Alice", "Bob", "Charlie", "David"]
    assert by_name["new_column"].to_pylist() == [None, None, "new1", "new2"]
0 commit comments