Skip to content

Commit 4c1bafe

Browse files
committed
Initial commit with --from --columns enabled
1 parent f42921f commit 4c1bafe

File tree

9 files changed

+223
-40
lines changed

9 files changed

+223
-40
lines changed

dlt/cli/pipeline_command.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -343,8 +343,8 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
343343
drop = DropCommand(p, **command_kwargs)
344344
if drop.is_empty:
345345
fmt.echo(
346-
"Could not select any resources to drop and no resource/source state to reset. Use"
347-
" the command below to inspect the pipeline:"
346+
"Could not select any resources or columns to drop and no resource/source state to"
347+
" reset. Use the command below to inspect the pipeline:"
348348
)
349349
fmt.echo(f"dlt pipeline -v {p.pipeline_name} info")
350350
if len(drop.info["warnings"]):
@@ -368,11 +368,20 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
368368
drop.info["resource_names"],
369369
)
370370
)
371-
fmt.echo("%s: %s" % (fmt.style("Table(s) to drop", fg="green"), drop.info["tables"]))
371+
label = "Table(s) to be affected" if drop.from_tables_drop_cols else "Table(s) to drop"
372+
fmt.echo(f"{fmt.style(label, fg='green')}: {drop.info['tables']}")
372373
fmt.echo(
373-
"%s: %s"
374-
% (fmt.style("\twith data in destination", fg="green"), drop.info["tables_with_data"])
374+
f"{fmt.style('\twith data in destination', fg='green')}:"
375+
f" {drop.info['tables_with_data']}"
375376
)
377+
if drop.from_tables_drop_cols:
378+
fmt.echo(f"{fmt.style('Column(s) to be dropped', fg='green')}:")
379+
for from_table_drop_cols in drop.from_tables_drop_cols:
380+
table_name = from_table_drop_cols["from_table"]
381+
columns = from_table_drop_cols["drop_columns"]
382+
fmt.echo(f"{fmt.style('\tfrom table:', fg='green')} {table_name}")
383+
fmt.echo(f"{fmt.style('\tcolumns:', fg='green')} {columns}")
384+
376385
fmt.echo(
377386
"%s: %s"
378387
% (

dlt/cli/plugins.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def configure_parser(self, pipe_cmd: argparse.ArgumentParser) -> None:
390390
help="Drop all resources found in schema. Supersedes [resources] argument.",
391391
)
392392
pipe_cmd_drop.add_argument(
393-
"--state-paths", nargs="*", help="State keys or json paths to drop", default=()
393+
"--state-paths", nargs="*", help="State keys or json paths to drop.", default=()
394394
)
395395
pipe_cmd_drop.add_argument(
396396
"--schema",
@@ -403,6 +403,18 @@ def configure_parser(self, pipe_cmd: argparse.ArgumentParser) -> None:
403403
help="Only wipe state for matching resources without dropping tables.",
404404
default=False,
405405
)
406+
pipe_cmd_drop.add_argument(
407+
"--from",
408+
dest="from_resources",
409+
nargs="*",
410+
help="(With --columns) Resource name to drop columns from.",
411+
)
412+
pipe_cmd_drop.add_argument(
413+
"--columns",
414+
nargs="*",
415+
help="(With --from) One or more column names to drop from the specified resource.",
416+
default=(),
417+
)
406418

407419
pipe_cmd_package = pipeline_subparsers.add_parser(
408420
"load-package",
@@ -425,6 +437,11 @@ def configure_parser(self, pipe_cmd: argparse.ArgumentParser) -> None:
425437
def execute(self, args: argparse.Namespace) -> None:
426438
if args.list_pipelines:
427439
pipeline_command_wrapper("list", "-", args.pipelines_dir, args.verbosity)
440+
elif (hasattr(args, "from_resources") and hasattr(args, "columns")) and (
441+
(args.from_resources and not args.columns) or (not args.from_resources and args.columns)
442+
):
443+
fmt.error("Please use --from and --columns together.")
444+
raise CliCommandException()
428445
else:
429446
command_kwargs = dict(args._get_kwargs())
430447
if not command_kwargs.get("pipeline_name"):

dlt/common/storages/load_package.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
cast,
2121
Any,
2222
Tuple,
23+
TYPE_CHECKING,
2324
)
2425
from typing_extensions import NotRequired
2526

@@ -53,6 +54,9 @@
5354
)
5455
from dlt.common.time import precise_time
5556

57+
if TYPE_CHECKING:
58+
from dlt.pipeline.drop import _FromTableDropCols
59+
5660
TJobFileFormat = Literal["sql", "reference", TLoaderFileFormat]
5761
"""Loader file formats with internal job types"""
5862
JOB_EXCEPTION_EXTENSION = ".exception"
@@ -71,6 +75,8 @@ class TPipelineStateDoc(TypedDict, total=False):
7175

7276

7377
class TLoadPackageDropTablesState(TypedDict):
78+
from_tables_drop_columns: NotRequired[List["_FromTableDropCols"]]
79+
"""List of tables and columns that are to be dropped from them"""
7480
dropped_tables: NotRequired[List[TTableSchema]]
7581
"""List of tables that are to be dropped from the schema and destination (i.e. when `refresh` mode is used)"""
7682
truncated_tables: NotRequired[List[TTableSchema]]

dlt/destinations/job_client_impl.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Type,
1616
Iterable,
1717
Iterator,
18+
Union,
1819
)
1920
import zlib
2021
import re
@@ -329,6 +330,24 @@ def drop_tables(self, *tables: str, delete_schema: bool = True) -> None:
329330
if delete_schema:
330331
self._delete_schema_in_storage(self.schema)
331332

333+
def drop_columns(
    self,
    from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]],
    delete_schema: bool = True,
) -> None:
    """Drop columns in destination database and optionally delete the stored schema as well.

    Clients that support ddl transactions will have both operations performed in a single transaction.

    Args:
        from_tables_drop_cols: List of mappings, each containing a "from_table" table name
            and a "drop_columns" list of column names to drop from that table.
        delete_schema: If True, also delete all versions of the current schema from storage.
    """
    with self.maybe_ddl_transaction():
        self.sql_client.drop_columns(from_tables_drop_cols)
        # deleting the stored schema inside the same (maybe-)transaction mirrors drop_tables
        if delete_schema:
            self._delete_schema_in_storage(self.schema)
350+
332351
@contextlib.contextmanager
333352
def maybe_ddl_transaction(self) -> Iterator[None]:
334353
"""Begins a transaction if sql client supports it, otherwise works in auto commit."""

dlt/destinations/sql_client.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@
1818
List,
1919
Generator,
2020
cast,
21+
Union,
2122
)
2223

2324
from dlt.common.typing import TFun, TypedDict, Self
24-
from dlt.common.schema.typing import TTableSchemaColumns
25+
from dlt.common.schema.typing import TTableSchemaColumns, TColumnSchema
2526
from dlt.common.destination import DestinationCapabilitiesContext
2627
from dlt.common.utils import concat_strings_with_limit
2728
from dlt.common.destination.client import JobClientBase
@@ -141,6 +142,20 @@ def drop_tables(self, *tables: str) -> None:
141142
]
142143
self.execute_many(statements)
143144

145+
def drop_columns(self, from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]]) -> None:
    """Issue an ALTER TABLE ... DROP COLUMN IF EXISTS for every requested (table, column) pair."""

    statements: List[str] = []
    for entry in from_tables_drop_cols:
        # each entry names one table and the columns to remove from it
        qualified_table = self.make_qualified_table_name(cast(str, entry["from_table"]))
        statements.extend(
            f"ALTER TABLE {qualified_table} DROP COLUMN IF EXISTS"
            f" {self.escape_column_name(column_name)};"
            for column_name in entry["drop_columns"]
        )

    self.execute_many(statements)
158+
144159
def _to_named_paramstyle(self, query: str, args: Sequence[Any]) -> Tuple[str, Dict[str, Any]]:
145160
"""Convert a query from "format" ( %s ) paramstyle to "named" ( :param_name ) paramstyle.
146161
The %s are replaced with :arg0, :arg1, ... and the arguments are returned as a dictionary.

dlt/load/load.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import contextlib
22
from functools import reduce
3-
from typing import Dict, List, Optional, Tuple, Iterator, Sequence
3+
from typing import Dict, List, Optional, Tuple, Iterator, Sequence, Union
44
from concurrent.futures import Executor
55
import os
66

@@ -517,6 +517,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None:
517517
# and they must be like that in order to drop existing tables
518518
dropped_tables = current_load_package()["state"].get("dropped_tables", [])
519519
truncated_tables = current_load_package()["state"].get("truncated_tables", [])
520+
from_tables_drop_cols = current_load_package()["state"].get("from_tables_drop_columns", [])
520521

521522
self.init_jobs_counter(load_id)
522523

@@ -537,6 +538,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None:
537538
),
538539
drop_tables=dropped_tables,
539540
truncate_tables=truncated_tables,
541+
from_tables_drop_cols=from_tables_drop_cols,
540542
)
541543

542544
# init staging client

dlt/load/utils.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Set, Iterable, Callable, Optional, Tuple, Sequence
1+
from typing import List, Set, Iterable, Callable, Optional, Tuple, Sequence, Dict, Union, Any
22
from itertools import groupby
33

44
from dlt.common import logger
@@ -11,7 +11,7 @@
1111
)
1212
from dlt.common.storages.load_storage import ParsedLoadJobFileName
1313
from dlt.common.schema import Schema, TSchemaTables
14-
from dlt.common.schema.typing import TTableSchema
14+
from dlt.common.schema.typing import TTableSchema, TColumnSchema
1515
from dlt.common.destination.client import JobClientBase, WithStagingDataset, LoadJob
1616
from dlt.load.configuration import LoaderConfiguration
1717
from dlt.common.destination import DestinationCapabilitiesContext
@@ -71,6 +71,7 @@ def init_client(
7171
load_staging_filter: Callable[[str], bool],
7272
drop_tables: Optional[List[TTableSchema]] = None,
7373
truncate_tables: Optional[List[TTableSchema]] = None,
74+
from_tables_drop_cols: Any = None,
7475
) -> TSchemaTables:
7576
"""Initializes destination storage including staging dataset if supported
7677
@@ -85,6 +86,7 @@ def init_client(
8586
load_staging_filter (Callable[[str], bool]): A filter which tell which table in the staging dataset may be loaded into
8687
drop_tables (Optional[List[TTableSchema]]): List of tables to drop before initializing storage
8788
truncate_tables (Optional[List[TTableSchema]]): List of tables to truncate before initializing storage
89+
drop_columns Optional[Dict[str, Union[List[TTableSchema], List[str]]]]: Columns to drop from specified tables
8890
8991
Returns:
9092
TSchemaTables: Actual migrations done at destination
@@ -113,13 +115,15 @@ def init_client(
113115

114116
# get tables to drop
115117
drop_table_names = {table["name"] for table in drop_tables} if drop_tables else set()
118+
116119
job_client.verify_schema(only_tables=tables_with_jobs | dlt_tables, new_jobs=new_jobs)
117120
applied_update = _init_dataset_and_update_schema(
118121
job_client,
119122
expected_update,
120123
tables_with_jobs | dlt_tables,
121124
truncate_table_names,
122125
drop_tables=drop_table_names,
126+
from_tables_drop_cols=from_tables_drop_cols,
123127
)
124128

125129
# update the staging dataset if client supports this
@@ -153,6 +157,7 @@ def _init_dataset_and_update_schema(
153157
truncate_tables: Iterable[str] = None,
154158
staging_info: bool = False,
155159
drop_tables: Iterable[str] = None,
160+
from_tables_drop_cols: List[Dict[str, Union[str, List[str]]]] = None,
156161
) -> TSchemaTables:
157162
staging_text = "for staging dataset" if staging_info else ""
158163
logger.info(
@@ -171,6 +176,22 @@ def _init_dataset_and_update_schema(
171176
f"Client for {job_client.config.destination_type} does not implement drop table."
172177
f" Following tables {drop_tables} will not be dropped {staging_text}"
173178
)
179+
if from_tables_drop_cols and job_client.is_storage_initialized():
180+
table_names = [
181+
from_table_drop_cols["from_table"] for from_table_drop_cols in from_tables_drop_cols
182+
]
183+
if hasattr(job_client, "drop_columns"):
184+
logger.info(
185+
f"Client for {job_client.config.destination_type} will drop columns"
186+
f" from tables {table_names} {staging_text}"
187+
)
188+
job_client.drop_columns(from_tables_drop_cols, delete_schema=True)
189+
else:
190+
logger.warning(
191+
f"Client for {job_client.config.destination_type} does not implement drop columns."
192+
" Columns will not be dropped from tables"
193+
f" {table_names} {staging_text}"
194+
)
174195

175196
job_client.initialize_storage()
176197

dlt/pipeline/drop.py

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
)
1515
from dlt.common.typing import TypedDict
1616

17-
from dlt.common.schema.typing import TSimpleRegex, TTableSchema
17+
from dlt.common.schema.typing import TSimpleRegex, TTableSchema, TColumnSchema
1818
from dlt.common.schema.utils import (
1919
group_tables_by_resource,
2020
compile_simple_regexes,
@@ -35,14 +35,21 @@ class _DropInfo(TypedDict):
3535
drop_all: bool
3636
resource_pattern: Optional[REPattern]
3737
warnings: List[str]
38+
drop_columns: bool
39+
40+
41+
class _FromTableDropCols(TypedDict):
    """One entry of a column-drop request: a table name and the columns to drop from it."""

    # name of the table the columns are dropped from
    from_table: str
    # names of the columns to be dropped from that table
    drop_columns: List[str]
3844

3945

4046
@dataclass
class _DropResult:
    """Outcome of a drop operation: the mutated schema plus bookkeeping for the caller."""

    # schema after the requested tables/columns were removed (mutated in place)
    schema: Schema
    # summary of what was (or would be) dropped, including any warnings
    info: _DropInfo
    # tables with data in the destination affected by the operation
    modified_tables: List[TTableSchema]
    # modified pipeline state — set by resource drops, None for column drops
    state: Optional[TPipelineState] = None
    # per-table column-drop requests — set by column drops, None otherwise
    from_tables_drop_cols: Optional[List[_FromTableDropCols]] = None
4653

4754

4855
def _create_modified_state(
@@ -62,6 +69,7 @@ def _create_modified_state(
6269
for key in _get_matching_resources(resource_pattern, source_state):
6370
info["resource_states"].append(key)
6471
reset_resource_state(key, source_state)
72+
6573
# drop additional state paths
6674
# Don't drop 'resources' key if jsonpath is wildcard
6775
resolved_paths = [
@@ -153,6 +161,7 @@ def drop_resources(
153161
drop_all=drop_all,
154162
resource_pattern=resource_pattern,
155163
warnings=[],
164+
drop_columns=False,
156165
)
157166

158167
new_state, info = _create_modified_state(
@@ -170,4 +179,71 @@ def drop_resources(
170179
if not state_only:
171180
# drop only the selected tables
172181
schema.drop_tables(tables_to_drop_from_schema_names)
173-
return _DropResult(schema, new_state, info, tables_to_drop_from_dest)
182+
return _DropResult(schema, info, tables_to_drop_from_dest, new_state, None)
183+
184+
185+
def drop_columns(
    schema: Schema,
    from_resources: Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]],
    columns: Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]],
) -> _DropResult:
    """Remove the requested columns from `schema` and collect drop information.

    Args:
        schema: The schema to modify. Note that the schema is changed in place.
        from_resources: Resource name(s) or simple regex pattern(s) selecting the
            resources whose tables will have columns dropped. A bare string is
            treated as a single name, not iterated character by character.
        columns: Column name(s) to drop from the tables of the selected resources.
            A bare string is treated as a single column name.

    Returns:
        A `_DropResult` carrying the modified schema, a `_DropInfo` describing the
        operation, the affected tables with data in the destination, no pipeline
        state (column drops do not touch state), and the per-table list of dropped
        columns (None when no matching column was found).
    """
    # accept a single name as well as an iterable of names — a bare str would
    # otherwise be exploded into its characters by set()
    if isinstance(from_resources, str):
        from_resources = [from_resources]
    if isinstance(columns, str):
        columns = [columns]
    from_resources = set(from_resources)
    columns = set(columns)
    resource_pattern = compile_simple_regexes(TSimpleRegex(r) for r in from_resources)
    # all tables that are supposed to have data
    data_tables = {t["name"]: t for t in schema.data_tables(include_incomplete=True)}
    # resources with tables with data {resource: tables_with_data}
    resource_tables = group_tables_by_resource(data_tables, pattern=resource_pattern)
    resource_names = list(resource_tables.keys())
    # tables affected by the drop; reversed to mirror the processing order used by
    # drop_resources — presumably child tables before parents (TODO confirm)
    affected_tables = list(chain.from_iterable(resource_tables.values()))
    affected_tables.reverse()
    affected_table_names = [t["name"] for t in affected_tables]
    # tables that have actually seen data in the destination
    tables_with_data = [t for t in affected_tables if has_table_seen_data(t)]
    tables_with_data_names = [t["name"] for t in tables_with_data]

    # collect columns to drop per table, removing them from the schema in place
    collected: List[_FromTableDropCols] = []
    for table in tables_with_data:
        table_name = table["name"]
        table_schema_cols = table["columns"]
        drop_cols: List[str] = []

        for col in columns:
            if col in table_schema_cols:
                col_schema = schema.tables[table_name]["columns"].pop(col)
                drop_cols.append(col_schema["name"])

        if drop_cols:
            collected.append({"from_table": table_name, "drop_columns": drop_cols})

    # None signals that no columns need to be dropped
    from_tables_drop_cols: Optional[List[_FromTableDropCols]] = collected or None

    info: _DropInfo = dict(
        tables=affected_table_names if from_tables_drop_cols else [],
        tables_with_data=tables_with_data_names if from_tables_drop_cols else [],
        resource_states=[],
        state_paths=[],
        resource_names=resource_names if from_tables_drop_cols else [],
        schema_name=schema.name,
        drop_all=False,
        resource_pattern=resource_pattern,
        warnings=[],
        drop_columns=True,
    )

    return _DropResult(schema, info, tables_with_data, None, from_tables_drop_cols)

0 commit comments

Comments
 (0)