
Commit 9f9b7ee

Drop column support for various destinations
1 parent 02d81b3 commit 9f9b7ee

11 files changed: 311 additions & 15 deletions

dlt/common/libs/deltalake.py

Lines changed: 34 additions & 0 deletions
@@ -113,6 +113,40 @@ def write_delta_table(
     )
 
 
+def drop_columns_delta_table(
+    table: DeltaTable,
+    columns_to_drop: List[str],
+) -> None:
+    """Drops columns from a Delta table by rewriting it with the remaining columns.
+
+    This function reads the entire table, removes the specified columns, and rewrites
+    the table with the remaining columns. This is a workaround for the limitation
+    that delta-rs cannot natively drop columns without column mapping enabled.
+
+    Args:
+        table: The DeltaTable to modify (should already have storage options configured)
+        columns_to_drop: List of column names to drop
+    """
+    arrow_table = table.to_pyarrow_table()
+
+    current_schema = arrow_table.schema
+    remaining_columns = [col for col in current_schema.names if col not in columns_to_drop]
+
+    filtered_table = arrow_table.select(remaining_columns)
+
+    partition_columns = []
+    metadata = table.metadata()
+    partition_columns = [col for col in metadata.partition_columns if col in remaining_columns]
+
+    write_deltalake(
+        table_or_uri=table,
+        data=ensure_delta_compatible_arrow_data(filtered_table, partition_columns or None),
+        partition_by=partition_columns or None,
+        mode="overwrite",
+        schema_mode="overwrite",
+    )
+
+
 def merge_delta_table(
     table: DeltaTable,
     data: Union[pa.Table, pa.RecordBatchReader],
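
Note: a minimal usage sketch of the new helper, assuming the deltalake package is installed and an existing Delta table at an illustrative local path; because the helper rewrites the whole table, cost grows with table size.

# Standalone sketch, not part of the commit; the path and column names are illustrative.
from deltalake import DeltaTable

from dlt.common.libs.deltalake import drop_columns_delta_table

# open an existing Delta table; for remote stores, storage options would be passed here
table = DeltaTable("./data/events")

# rewrite the table without the listed columns; remaining data and partitioning are kept
drop_columns_delta_table(table, columns_to_drop=["raw_payload", "tmp_flag"])

print(DeltaTable("./data/events").to_pyarrow_table().column_names)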

dlt/destinations/impl/athena/athena.py

Lines changed: 22 additions & 0 deletions
@@ -406,6 +406,28 @@ def should_load_data_to_staging_dataset_on_staging_destination(self, table_name:
             return True
         return super().should_load_data_to_staging_dataset_on_staging_destination(table_name)
 
+    def drop_columns(
+        self,
+        from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]],
+        update_schema: bool = True,
+    ) -> None:
+        """Drops specified columns from specified tables, using appropriate method based on table format"""
+        for from_table_drop_cols in from_tables_drop_cols:
+            table_name = cast(str, from_table_drop_cols["from_table"])
+            columns_to_drop = cast(List[str], from_table_drop_cols["drop_columns"])
+
+            table_schema = self.prepare_load_table(table_name)
+
+            if self._is_iceberg_table(table_schema):
+                # For Iceberg tables, use the base SQL client method (ALTER TABLE DROP COLUMN)
+                self.sql_client.drop_columns([from_table_drop_cols])
+            else:
+                # For Hive tables, use the special REPLACE COLUMNS method
+                self.sql_client.drop_columns_hive(table_name, columns_to_drop)
+
+        if update_schema:
+            self._update_schema_in_storage(self.schema)
+
     @staticmethod
     def is_dbapi_exception(ex: Exception) -> bool:
         from pyathena.error import Error
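
Note: the from_tables_drop_cols payload shared by these drop_columns implementations is a list of per-table specs. A hedged sketch of the expected shape (table and column names are illustrative); the commented call assumes an already configured Athena job client, e.g. one obtained from a pipeline.

from typing import Dict, List, Union

# one spec per table: the table name plus the columns to remove from it
from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]] = [
    {"from_table": "events", "drop_columns": ["legacy_id", "tmp_flag"]},
    {"from_table": "users", "drop_columns": ["ssn"]},
]

# with a configured Athena job client, Iceberg tables take the ALTER TABLE
# DROP COLUMN path while Hive tables take the REPLACE COLUMNS path:
# client.drop_columns(from_tables_drop_cols, update_schema=True)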

dlt/destinations/impl/athena/sql_client.py

Lines changed: 34 additions & 0 deletions
@@ -158,6 +158,40 @@ def drop_columns(self, from_tables_drop_cols: List[Dict[str, Union[str, List[str
 
         self.execute_many(statements)
 
+    def drop_columns_hive(self, table_name: str, columns_to_drop: List[str]) -> None:
+        """Drop columns from Hive table using ALTER TABLE REPLACE COLUMNS"""
+        qualified_table_name = self.make_qualified_ddl_table_name(table_name)
+
+        # Get current table schema
+        describe_query = f"DESCRIBE {qualified_table_name}"
+        result = self.execute_sql(describe_query)
+
+        current_columns = []
+        if result:
+            for row in result:
+                # DESCRIBE returns rows like ('col_name \tdata_type \tcomment',)
+                parts = row[0].split()
+                current_columns.append(
+                    {
+                        "name": parts[0],
+                        "type": parts[1],
+                    }
+                )
+
+        remaining_columns = [col for col in current_columns if col["name"] not in columns_to_drop]
+
+        # Build column definitions for REPLACE COLUMNS
+        column_definitions = []
+        for col in remaining_columns:
+            col_name = self.escape_ddl_identifier(col["name"])
+            col_type = col["type"]
+            column_definitions.append(f"{col_name} {col_type}")
+
+        replace_sql = (
+            f"ALTER TABLE {qualified_table_name} REPLACE COLUMNS ({', '.join(column_definitions)})"
+        )
+        self.execute_sql(replace_sql)
+
     @contextmanager
     @raise_database_error
     def begin_transaction(self) -> Iterator[DBTransaction]:
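
Note: Hive-format Athena tables have no DROP COLUMN, so the method above re-declares the surviving columns. A self-contained sketch of the same statement shape, using plain identifiers instead of dlt's DDL escaping (names are illustrative):

from typing import Dict, List

def build_replace_columns_sql(
    qualified_table: str,
    current_columns: List[Dict[str, str]],
    columns_to_drop: List[str],
) -> str:
    """Re-declare the table with only the surviving columns, Hive-style."""
    remaining = [c for c in current_columns if c["name"] not in columns_to_drop]
    column_defs = ", ".join(f"{c['name']} {c['type']}" for c in remaining)
    return f"ALTER TABLE {qualified_table} REPLACE COLUMNS ({column_defs})"

# DESCRIBE output reduced to name/type pairs
schema = [
    {"name": "id", "type": "bigint"},
    {"name": "value", "type": "double"},
    {"name": "tmp_flag", "type": "boolean"},
]
print(build_replace_columns_sql("analytics.events", schema, ["tmp_flag"]))
# ALTER TABLE analytics.events REPLACE COLUMNS (id bigint, value double)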

dlt/destinations/impl/databricks/sql_client.py

Lines changed: 23 additions & 0 deletions
@@ -7,6 +7,10 @@
     Iterator,
     Optional,
     Sequence,
+    cast,
+    List,
+    Dict,
+    Union,
 )
 from databricks import sql as databricks_lib
 from databricks.sql.client import (
@@ -106,6 +110,25 @@ def drop_tables(self, *tables: str) -> None:
         with suppress(DatabaseUndefinedRelation):
             super().drop_tables(*tables)
 
+    def drop_columns(self, from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]]) -> None:
+        """Drops specified columns from specified tables if they exist"""
+
+        statements = []
+        for from_table_drop_cols in from_tables_drop_cols:
+            table = cast(str, from_table_drop_cols["from_table"])
+            statements.append(
+                f"ALTER TABLE {self.make_qualified_table_name(table)} SET TBLPROPERTIES"
+                " ('delta.columnMapping.mode' = 'name', 'delta.minReaderVersion' = '2',"
+                " 'delta.minWriterVersion' = '5')"
+            )
+            for column in from_table_drop_cols["drop_columns"]:
+                statements.append(
+                    f"ALTER TABLE {self.make_qualified_table_name(table)} DROP COLUMN IF EXISTS"
+                    f" {self.escape_column_name(column)};"
+                )
+
+        self.execute_many(statements)
+
     def execute_sql(
         self, sql: AnyStr, *args: Any, **kwargs: Any
     ) -> Optional[Sequence[Sequence[Any]]]:
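
Note: Databricks can only drop columns from Delta tables with column mapping enabled, which is why each table first gets a SET TBLPROPERTIES statement. A sketch of the resulting statement sequence for one table, with backtick escaping standing in for the client's escape_column_name (names are illustrative):

from typing import List

def databricks_drop_column_statements(qualified_table: str, columns: List[str]) -> List[str]:
    """Approximate the statement sequence built by drop_columns above."""
    statements = [
        f"ALTER TABLE {qualified_table} SET TBLPROPERTIES"
        " ('delta.columnMapping.mode' = 'name', 'delta.minReaderVersion' = '2',"
        " 'delta.minWriterVersion' = '5')"
    ]
    statements += [
        f"ALTER TABLE {qualified_table} DROP COLUMN IF EXISTS `{col}`;" for col in columns
    ]
    return statements

for stmt in databricks_drop_column_statements("catalog.schema.events", ["tmp_flag"]):
    print(stmt)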

dlt/destinations/impl/filesystem/filesystem.py

Lines changed: 70 additions & 0 deletions
@@ -16,6 +16,7 @@
     Literal,
     Any,
     Dict,
+    Union,
 )
 from fsspec import AbstractFileSystem
 
@@ -937,3 +938,72 @@ def is_open_table(self, table_format: TTableFormat, table_name: str) -> bool:
             return False
         detected_format = prepared_table.get("table_format")
         return table_format == detected_format
+
+    def drop_columns(
+        self,
+        from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]],
+        update_schema: bool = True,
+    ) -> None:
+        for table_spec in from_tables_drop_cols:
+            table_name = cast(str, table_spec["from_table"])
+            columns_to_drop = list(table_spec["drop_columns"])
+
+            if self.is_open_table("iceberg", table_name):
+                ice_table = self.load_open_table("iceberg", table_name)
+                # Alter schema – delete columns one by one (requires incompatible changes flag)
+                with ice_table.update_schema(allow_incompatible_changes=True) as update:
+                    for col in columns_to_drop:
+                        update.delete_column(col)
+
+            elif self.is_open_table("delta", table_name):
+                from dlt.common.libs.deltalake import drop_columns_delta_table
+
+                delta_table = self.load_open_table("delta", table_name)
+                # Drop columns by rewriting the table
+                drop_columns_delta_table(
+                    table=delta_table,
+                    columns_to_drop=columns_to_drop,
+                )
+            else:
+                # Handle regular filesystem tables (jsonl, parquet, csv)
+                self._drop_columns_from_regular_table(table_name, columns_to_drop)
+
+        if update_schema:
+            self._update_schema_in_storage(self.schema)
+
+    def _drop_columns_from_regular_table(self, table_name: str, columns_to_drop: List[str]) -> None:
+        from dlt.common.libs.pyarrow import pyarrow as pa
+
+        table_files = self.list_table_files(table_name)
+
+        for file_path in table_files:
+            file_ext = os.path.splitext(file_path)[1].lower()
+
+            if file_ext == ".parquet":
+                table = pa.parquet.read_table(self.make_remote_url(file_path))
+                columns_to_keep = [col for col in table.column_names if col not in columns_to_drop]
+                filtered_table = table.select(columns_to_keep)
+                with pa.parquet.ParquetWriter(
+                    self.make_remote_url(file_path), filtered_table.schema
+                ) as writer:
+                    writer.write_table(filtered_table)
+
+            elif file_ext == ".jsonl":
+                content = self.fs_client.read_text(file_path, encoding="utf-8")
+                lines = content.strip().split("\n")
+
+                filtered_lines = []
+                for line in lines:
+                    if line.strip():
+                        record = json.loads(line)
+                        for col in columns_to_drop:
+                            record.pop(col, None)
+                        filtered_lines.append(json.dumps(record))
+
+                self.fs_client.write_text(file_path, "\n".join(filtered_lines), encoding="utf-8")
+
+            elif file_ext == ".csv":
+                table = pa.csv.read_csv(self.make_remote_url(file_path))
+                columns_to_keep = [col for col in table.column_names if col not in columns_to_drop]
+                filtered_table = table.select(columns_to_keep)
+                pa.csv.write_csv(filtered_table, self.make_remote_url(file_path))
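
Note: for plain filesystem tables the data files themselves are rewritten in place. A local sketch of the parquet branch using a temporary file instead of dlt's fs_client, assuming pyarrow is installed (pq.write_table stands in for the ParquetWriter used above):

import pyarrow as pa
import pyarrow.parquet as pq

# build a small parquet file containing a column we want to remove (illustrative path)
path = "events.parquet"
pq.write_table(pa.table({"id": [1, 2], "value": [10.0, 20.0], "tmp_flag": [True, False]}), path)

# read, select the surviving columns, and overwrite the same file
table = pq.read_table(path)
keep = [c for c in table.column_names if c != "tmp_flag"]
pq.write_table(table.select(keep), path)

print(pq.read_table(path).column_names)  # ['id', 'value']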

dlt/destinations/impl/redshift/redshift.py

Lines changed: 15 additions & 1 deletion
@@ -13,7 +13,7 @@
 
 # from psycopg2.sql import SQL, Composed
 
-from typing import Dict, List, Optional, Sequence
+from typing import Dict, List, Optional, Sequence, cast, Union
 
 from dlt.common.destination.client import (
     FollowupJobRequest,
@@ -59,6 +59,20 @@ def _maybe_make_terminal_exception_from_data_error(
             return DatabaseTerminalException(pg_ex)
         return None
 
+    def drop_columns(self, from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]]) -> None:
+        """Drops specified columns from specified tables if they exist"""
+
+        statements = []
+        for from_table_drop_cols in from_tables_drop_cols:
+            table = cast(str, from_table_drop_cols["from_table"])
+            for column in from_table_drop_cols["drop_columns"]:
+                statements.append(
+                    f"ALTER TABLE {self.make_qualified_table_name(table)} DROP COLUMN"
+                    f" {self.escape_column_name(column)};"
+                )
+
+        self.execute_many(statements)
+
 
 class RedshiftCopyFileLoadJob(CopyRemoteFileLoadJob):
     def __init__(
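
Note: Redshift takes the simplest route, one ALTER TABLE ... DROP COLUMN per column in each spec, executed via execute_many; the Synapse client further down follows the same pattern. A sketch of the statements produced for a sample payload, with plain quoted identifiers in place of the client's qualification and escaping (names are illustrative):

from typing import Dict, List, Union

from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]] = [
    {"from_table": "events", "drop_columns": ["legacy_id", "tmp_flag"]},
]

statements = [
    f'ALTER TABLE {spec["from_table"]} DROP COLUMN "{column}";'
    for spec in from_tables_drop_cols
    for column in spec["drop_columns"]
]
print("\n".join(statements))
# ALTER TABLE events DROP COLUMN "legacy_id";
# ALTER TABLE events DROP COLUMN "tmp_flag";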

dlt/destinations/impl/sqlalchemy/db_api_client.py

Lines changed: 20 additions & 4 deletions
@@ -1,4 +1,16 @@
-from typing import Optional, Iterator, Any, Sequence, AnyStr, Union, Tuple, List, Dict, Set, cast
+from typing import (
+    Optional,
+    Iterator,
+    Any,
+    Sequence,
+    AnyStr,
+    Union,
+    Tuple,
+    List,
+    Dict,
+    Set,
+    cast,
+)
 from contextlib import contextmanager
 from functools import wraps
 import inspect
@@ -299,15 +311,19 @@ def drop_tables(self, *tables: str) -> None:
 
     def drop_columns(self, from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]]) -> None:
         for from_table_drop_cols in from_tables_drop_cols:
-            table_name = from_table_drop_cols["from_table"]
+            table_name = cast(str, from_table_drop_cols["from_table"])
             drop_columns = from_table_drop_cols["drop_columns"]
 
-            tbl = sa.Table(table_name, self.metadata, schema=self.dataset_name, keep_existing=True)
+            # Reflect current table definition to fetch existing columns
+            tbl = self.reflect_table(table_name)
             existing_cols = {col.name for col in tbl.columns}
 
             for column in drop_columns:
                 if column in existing_cols:
-                    ddl = f"ALTER TABLE {self.make_qualified_table_name(table_name)} DROP COLUMN {self.escape_column_name(column)}"  # type: ignore[arg-type]
+                    ddl = (
+                        f"ALTER TABLE {self.make_qualified_table_name(table_name)} "
+                        f"DROP COLUMN {self.escape_column_name(column)}"
+                    )
                     self.execute_sql(ddl)
 
     def execute_sql(
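
Note: the sqlalchemy client now reflects the live table before dropping, so columns that are already gone are skipped. The reflect_table call above is the client's own helper; this minimal sketch uses plain SQLAlchemy reflection against an in-memory SQLite database instead (assumes SQLAlchemy 1.4+ and SQLite 3.35+ for DROP COLUMN support; names are illustrative):

import sqlalchemy as sa

# in-memory SQLite stands in for the destination
engine = sa.create_engine("sqlite://")
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE events (id INTEGER, value REAL, tmp_flag BOOLEAN)"))

# reflect the live table to learn which columns actually exist
tbl = sa.Table("events", sa.MetaData(), autoload_with=engine)
existing_cols = {col.name for col in tbl.columns}

for column in ["tmp_flag", "already_gone"]:
    if column in existing_cols:  # skip columns that are not present
        with engine.begin() as conn:
            conn.execute(sa.text(f'ALTER TABLE events DROP COLUMN "{column}"'))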

dlt/destinations/impl/synapse/sql_client.py

Lines changed: 14 additions & 0 deletions
@@ -1,3 +1,4 @@
+from typing import List, Dict, Union, cast
 from contextlib import suppress
 
 from dlt.destinations.impl.mssql.sql_client import PyOdbcMsSqlClient
@@ -14,3 +15,16 @@ def drop_tables(self, *tables: str) -> None:
         for statement in statements:
             with suppress(DatabaseUndefinedRelation):
                 self.execute_sql(statement)
+
+    def drop_columns(self, from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]]) -> None:
+        """Drops specified columns from specified tables if they exist"""
+        statements = []
+        for from_table_drop_cols in from_tables_drop_cols:
+            table = cast(str, from_table_drop_cols["from_table"])
+            for column in from_table_drop_cols["drop_columns"]:
+                statements.append(
+                    f"ALTER TABLE {self.make_qualified_table_name(table)} DROP COLUMN"
+                    f" {self.escape_column_name(column)};"
+                )
+
+        self.execute_many(statements)

dlt/destinations/job_client_impl.py

Lines changed: 3 additions & 3 deletions
@@ -333,7 +333,7 @@ def drop_tables(self, *tables: str, delete_schema: bool = True) -> None:
     def drop_columns(
         self,
         from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]],
-        delete_schema: bool = True,
+        update_schema: bool = True,
     ) -> None:
         """Drop columns in destination database and optionally delete the stored schema as well.
         Clients that support ddl transactions will have both operations performed in a single transaction.
@@ -345,8 +345,8 @@ def drop_columns(
         """
         with self.maybe_ddl_transaction():
            self.sql_client.drop_columns(from_tables_drop_cols)
-            if delete_schema:
-                self._delete_schema_in_storage(self.schema)
+            if update_schema:
+                self._update_schema_in_storage(self.schema)
 
     @contextlib.contextmanager
     def maybe_ddl_transaction(self) -> Iterator[None]:
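
Note: the keyword rename from delete_schema to update_schema changes the post-drop behavior: the stored schema is now re-written rather than deleted. A hedged end-to-end sketch; it assumes the chosen destination's job client and SQL client support drop_columns (this commit adds or adjusts that support for several destinations) and uses illustrative pipeline, table, and column names:

import dlt

pipeline = dlt.pipeline(pipeline_name="drop_cols_demo", destination="duckdb")

# job clients are context managers; drop_columns runs inside a DDL transaction
# where the destination supports one
with pipeline.destination_client() as client:
    client.drop_columns(
        [{"from_table": "events", "drop_columns": ["tmp_flag"]}],
        update_schema=True,  # persist the updated schema instead of deleting it
    )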

dlt/load/utils.py

Lines changed: 1 addition & 1 deletion
@@ -185,7 +185,7 @@ def _init_dataset_and_update_schema(
                 f"Client for {job_client.config.destination_type} will drop columns"
                 f" from tables {table_names} {staging_text}"
             )
-            job_client.drop_columns(from_tables_drop_cols, delete_schema=True)
+            job_client.drop_columns(from_tables_drop_cols, update_schema=True)
         else:
             logger.warning(
                 f"Client for {job_client.config.destination_type} does not implement drop columns."
