Skip to content

Commit 161c379

Browse files
committed
Initial commit with --from --columns enabled
1 parent bd4e17b commit 161c379

File tree

9 files changed

+279
-36
lines changed

9 files changed

+279
-36
lines changed

dlt/cli/pipeline_command.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,3 +393,46 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
393393
fmt.warning(warning)
394394
if fmt.confirm("Do you want to apply these changes?", default=False):
395395
drop()
396+
397+
if operation == "drop-columns":
398+
drop = DropCommand(p, **command_kwargs)
399+
if drop.is_empty:
400+
fmt.echo(
401+
"Could not select any columns to drop. Use"
402+
" the command below to inspect the pipeline:"
403+
)
404+
fmt.echo(f"dlt pipeline -v {p.pipeline_name} info")
405+
if len(drop.info["warnings"]):
406+
fmt.echo("Additional warnings are available")
407+
for warning in drop.info["warnings"]:
408+
fmt.warning(warning)
409+
return
410+
411+
fmt.echo(
412+
"About to drop the following columns in dataset %s in destination %s:"
413+
% (
414+
fmt.bold(p.dataset_name),
415+
fmt.bold(p.destination.destination_name),
416+
)
417+
)
418+
fmt.echo("%s: %s" % (fmt.style("Selected schema", fg="green"), drop.info["schema_name"]))
419+
fmt.echo(
420+
"%s: %s"
421+
% (
422+
fmt.style("Selected resource(s)", fg="green"),
423+
drop.info["resource_names"],
424+
)
425+
)
426+
fmt.echo("%s: %s" % (fmt.style("Table(s) to be affected", fg="green"), drop.info["tables"]))
427+
if drop.from_tables_drop_cols:
428+
fmt.echo(f"{fmt.style('Column(s) to be dropped', fg='green')}:")
429+
for from_table_drop_cols in drop.from_tables_drop_cols:
430+
table_name = from_table_drop_cols["from_table"]
431+
columns = from_table_drop_cols["drop_columns"]
432+
fmt.echo(f"\t{fmt.style('from table:', fg='green')} {table_name}")
433+
fmt.echo(f"\t\t{fmt.style('columns:', fg='green')} {columns}")
434+
435+
for warning in drop.info["warnings"]:
436+
fmt.warning(warning)
437+
if fmt.confirm("Do you want to apply these changes?", default=False):
438+
drop()

dlt/cli/plugins.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def configure_parser(self, pipe_cmd: argparse.ArgumentParser) -> None:
390390
help="Drop all resources found in schema. Supersedes [resources] argument.",
391391
)
392392
pipe_cmd_drop.add_argument(
393-
"--state-paths", nargs="*", help="State keys or json paths to drop", default=()
393+
"--state-paths", nargs="*", help="State keys or json paths to drop.", default=()
394394
)
395395
pipe_cmd_drop.add_argument(
396396
"--schema",
@@ -422,6 +422,28 @@ def configure_parser(self, pipe_cmd: argparse.ArgumentParser) -> None:
422422
help="Load id of completed or normalized package. Defaults to the most recent package.",
423423
)
424424

425+
pipe_cmd_drop_columns = pipeline_subparsers.add_parser(
426+
"drop-columns",
427+
help="Selectively drop columns from specified tables",
428+
description="Selectively drop columns from the specified tables in the destination dataset.",
429+
epilog=(
430+
f"See {DLT_PIPELINE_COMMAND_DOCS_URL}#selectively-drop-tables-and-reset-state for"
431+
" more info"
432+
),
433+
)
434+
pipe_cmd_drop_columns.add_argument(
435+
"--from",
436+
dest="from_resources",
437+
nargs="*",
438+
help="(With --columns) Resource names to drop columns from.",
439+
)
440+
pipe_cmd_drop_columns.add_argument(
441+
"--columns",
442+
nargs="*",
443+
help="(With --from) Column names to drop from the specified resources.",
444+
default=(),
445+
)
446+
425447
def execute(self, args: argparse.Namespace) -> None:
426448
if args.list_pipelines:
427449
pipeline_command_wrapper("list", "-", args.pipelines_dir, args.verbosity)

dlt/common/storages/load_package.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ class TPipelineStateDoc(TypedDict, total=False):
7171

7272

7373
class TLoadPackageDropTablesState(TypedDict):
74+
from_tables_drop_columns: NotRequired[List[Any]]
75+
"""List of tables and columns that are to be dropped from them"""
7476
dropped_tables: NotRequired[List[TTableSchema]]
7577
"""List of tables that are to be dropped from the schema and destination (i.e. when `refresh` mode is used)"""
7678
truncated_tables: NotRequired[List[TTableSchema]]

dlt/destinations/job_client_impl.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Type,
1616
Iterable,
1717
Iterator,
18+
Union,
1819
)
1920
import zlib
2021
import re
@@ -329,6 +330,24 @@ def drop_tables(self, *tables: str, delete_schema: bool = True) -> None:
329330
if delete_schema:
330331
self._delete_schema_in_storage(self.schema)
331332

333+
def drop_columns(
334+
self,
335+
from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]],
336+
delete_schema: bool = True,
337+
) -> None:
338+
"""Drop columns in destination database and optionally delete the stored schema as well.
339+
Clients that support ddl transactions will have both operations performed in a single transaction.
340+
341+
Args:
342+
from_tables: Names of tables from which columns are to be dropped.
343+
from_tables_drop_cols: Names of columns to be dropped grouped by table.
344+
delete_schema: If True, also delete all versions of the current schema from storage
345+
"""
346+
with self.maybe_ddl_transaction():
347+
self.sql_client.drop_columns(from_tables_drop_cols)
348+
if delete_schema:
349+
self._delete_schema_in_storage(self.schema)
350+
332351
@contextlib.contextmanager
333352
def maybe_ddl_transaction(self) -> Iterator[None]:
334353
"""Begins a transaction if sql client supports it, otherwise works in auto commit."""

dlt/destinations/sql_client.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@
1818
List,
1919
Generator,
2020
cast,
21+
Union,
2122
)
2223

2324
from dlt.common.typing import TFun, TypedDict, Self
24-
from dlt.common.schema.typing import TTableSchemaColumns
25+
from dlt.common.schema.typing import TTableSchemaColumns, TColumnSchema
2526
from dlt.common.destination import DestinationCapabilitiesContext
2627
from dlt.common.utils import concat_strings_with_limit
2728
from dlt.common.destination.client import JobClientBase
@@ -141,6 +142,20 @@ def drop_tables(self, *tables: str) -> None:
141142
]
142143
self.execute_many(statements)
143144

145+
def drop_columns(self, from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]]) -> None:
146+
"""Drops specified columns from specified tables if they exist"""
147+
148+
statements = []
149+
for from_table_drop_cols in from_tables_drop_cols:
150+
table = cast(str, from_table_drop_cols["from_table"])
151+
for column in from_table_drop_cols["drop_columns"]:
152+
statements.append(
153+
f"ALTER TABLE {self.make_qualified_table_name(table)} DROP COLUMN IF EXISTS"
154+
f" {self.escape_column_name(column)};"
155+
)
156+
157+
self.execute_many(statements)
158+
144159
def _to_named_paramstyle(self, query: str, args: Sequence[Any]) -> Tuple[str, Dict[str, Any]]:
145160
"""Convert a query from "format" ( %s ) paramstyle to "named" ( :param_name ) paramstyle.
146161
The %s are replaced with :arg0, :arg1, ... and the arguments are returned as a dictionary.

dlt/load/load.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import contextlib
22
from functools import reduce
3-
from typing import Dict, List, Optional, Tuple, Iterator, Sequence
3+
from typing import Dict, List, Optional, Tuple, Iterator, Sequence, Union
44
from concurrent.futures import Executor
55
import os
66

@@ -517,6 +517,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None:
517517
# and they must be like that in order to drop existing tables
518518
dropped_tables = current_load_package()["state"].get("dropped_tables", [])
519519
truncated_tables = current_load_package()["state"].get("truncated_tables", [])
520+
from_tables_drop_cols = current_load_package()["state"].get("from_tables_drop_columns", [])
520521

521522
self.init_jobs_counter(load_id)
522523

@@ -537,6 +538,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None:
537538
),
538539
drop_tables=dropped_tables,
539540
truncate_tables=truncated_tables,
541+
from_tables_drop_cols=from_tables_drop_cols,
540542
)
541543

542544
# init staging client

dlt/load/utils.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Set, Iterable, Callable, Optional, Tuple, Sequence
1+
from typing import List, Set, Iterable, Callable, Optional, Tuple, Sequence, Dict, Union, Any
22
from itertools import groupby
33

44
from dlt.common import logger
@@ -11,7 +11,7 @@
1111
)
1212
from dlt.common.storages.load_storage import ParsedLoadJobFileName
1313
from dlt.common.schema import Schema, TSchemaTables
14-
from dlt.common.schema.typing import TTableSchema
14+
from dlt.common.schema.typing import TTableSchema, TColumnSchema
1515
from dlt.common.destination.client import JobClientBase, WithStagingDataset, LoadJob
1616
from dlt.load.configuration import LoaderConfiguration
1717
from dlt.common.destination import DestinationCapabilitiesContext
@@ -71,6 +71,7 @@ def init_client(
7171
load_staging_filter: Callable[[str], bool],
7272
drop_tables: Optional[List[TTableSchema]] = None,
7373
truncate_tables: Optional[List[TTableSchema]] = None,
74+
from_tables_drop_cols: Optional[List[Dict[str, Union[str, List[str]]]]] = None,
7475
) -> TSchemaTables:
7576
"""Initializes destination storage including staging dataset if supported
7677
@@ -85,6 +86,7 @@ def init_client(
8586
load_staging_filter (Callable[[str], bool]): A filter which tell which table in the staging dataset may be loaded into
8687
drop_tables (Optional[List[TTableSchema]]): List of tables to drop before initializing storage
8788
truncate_tables (Optional[List[TTableSchema]]): List of tables to truncate before initializing storage
89+
from_tables_drop_cols (Optional[List[Dict[str, Union[str, List[str]]]]]): List of columns to drop grouped by table.
8890
8991
Returns:
9092
TSchemaTables: Actual migrations done at destination
@@ -113,13 +115,15 @@ def init_client(
113115

114116
# get tables to drop
115117
drop_table_names = {table["name"] for table in drop_tables} if drop_tables else set()
118+
116119
job_client.verify_schema(only_tables=tables_with_jobs | dlt_tables, new_jobs=new_jobs)
117120
applied_update = _init_dataset_and_update_schema(
118121
job_client,
119122
expected_update,
120123
tables_with_jobs | dlt_tables,
121124
truncate_table_names,
122125
drop_tables=drop_table_names,
126+
from_tables_drop_cols=from_tables_drop_cols,
123127
)
124128

125129
# update the staging dataset if client supports this
@@ -153,6 +157,7 @@ def _init_dataset_and_update_schema(
153157
truncate_tables: Iterable[str] = None,
154158
staging_info: bool = False,
155159
drop_tables: Iterable[str] = None,
160+
from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]] = None,
156161
) -> TSchemaTables:
157162
staging_text = "for staging dataset" if staging_info else ""
158163
logger.info(
@@ -171,6 +176,22 @@ def _init_dataset_and_update_schema(
171176
f"Client for {job_client.config.destination_type} does not implement drop table."
172177
f" Following tables {drop_tables} will not be dropped {staging_text}"
173178
)
179+
if from_tables_drop_cols and job_client.is_storage_initialized():
180+
table_names = [
181+
from_table_drop_cols["from_table"] for from_table_drop_cols in from_tables_drop_cols
182+
]
183+
if hasattr(job_client, "drop_columns"):
184+
logger.info(
185+
f"Client for {job_client.config.destination_type} will drop columns"
186+
f" from tables {table_names} {staging_text}"
187+
)
188+
job_client.drop_columns(from_tables_drop_cols, delete_schema=True)
189+
else:
190+
logger.warning(
191+
f"Client for {job_client.config.destination_type} does not implement drop columns."
192+
" Columns will not be dropped from tables"
193+
f" {table_names} {staging_text}"
194+
)
174195

175196
job_client.initialize_storage()
176197

0 commit comments

Comments
 (0)