
Commit c7248f0

filesystem table format logic added
1 parent 4f52769 commit c7248f0

File tree: 8 files changed, +174 -138 lines changed

dlt/cli/pipeline_command.py

Lines changed: 12 additions & 2 deletions
````diff
@@ -441,6 +441,10 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
             fmt.echo("Additional warnings are available")
             for warning in drop.info["warnings"]:
                 fmt.warning(warning)
+        if len(drop.info["notes"]):
+            fmt.echo("Additional notes are available")
+            for note in drop.info["notes"]:
+                fmt.echo(fmt.style(note, fg="yellow"))
         return
 
         fmt.echo(
@@ -467,7 +471,13 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]:
             fmt.echo(f"\t{fmt.style('from table:', fg='green')} {table_name}")
             fmt.echo(f"\t\t{fmt.style('columns:', fg='green')} {columns}")
 
-    for warning in drop.info["warnings"]:
-        fmt.warning(warning)
+    if len(drop.info["warnings"]):
+        fmt.echo("Additional warnings are available")
+        for warning in drop.info["warnings"]:
+            fmt.warning(warning)
+    if len(drop.info["notes"]):
+        fmt.echo("Additional info is available:")
+        for note in drop.info["notes"]:
+            fmt.echo(fmt.style(note, fg="yellow"))
     if fmt.confirm("Do you want to apply these changes?", default=False):
         drop()
````

dlt/cli/plugins.py

Lines changed: 43 additions & 10 deletions
````diff
@@ -324,11 +324,11 @@ def configure_parser(self, pipe_cmd: argparse.ArgumentParser) -> None:
 
 ```text
 About to drop the following data in dataset airflow_events_1 in destination dlt.destinations.duckdb:
-Selected schema:: github_repo_events
-Selected resource(s):: ['repo_events']
-Table(s) to drop:: ['issues_event', 'fork_event', 'pull_request_event', 'pull_request_review_event', 'pull_request_review_comment_event', 'watch_event', 'issue_comment_event', 'push_event__payload__commits', 'push_event']
-Resource(s) state to reset:: ['repo_events']
-Source state path(s) to reset:: []
+Selected schema: github_repo_events
+Selected resource(s): ['repo_events']
+Table(s) to drop: ['issues_event', 'fork_event', 'pull_request_event', 'pull_request_review_event', 'pull_request_review_comment_event', 'watch_event', 'issue_comment_event', 'push_event__payload__commits', 'push_event']
+Resource(s) state to reset: ['repo_events']
+Source state path(s) to reset: []
 Do you want to apply these changes? [y/N]
 ```
 
@@ -436,16 +436,49 @@ def configure_parser(self, pipe_cmd: argparse.ArgumentParser) -> None:
     "drop-columns",
     help="Selectively drop columns from specified tables",
     description="""
-Selectively drop columns from specified resources.
+Selectively drop columns from specified resources and tables.
 
 ```sh
-dlt pipeline <pipeline name> drop-columns --from [resource_1] [resource_2] --columns [column_1] [column_2]
+dlt pipeline <pipeline name> drop-columns --from-resources [resource_1] [resource_2] --from-tables [table_1] [table_2] --columns [column_1] [column_2]
 ```
 
-Drops selected columns in the tables generated by selected resources,
-unless the columns have a hint that makes them undroppable.
+**How column selection works:**
 
-You can use regexes to select resources and columns. Prepend the `re:` string to indicate a regex pattern.
+1. **Resource resolution**: If `--from-resources` is specified, tables are grouped by resource using regex pattern matching. If omitted, all resources are considered.
+
+2. **Table resolution**: If `--from-tables` is specified, only tables matching the pattern(s) within the selected resources are considered. If omitted, all tables from selected resources are considered.
+
+3. **Column resolution**: Columns are matched against the specified pattern(s) within the selected tables. Only nullable columns without disqualifying hints can be dropped.
+
+**Column safety rules:**
+
+Only columns that meet ALL of the following criteria can be dropped:
+- The column is nullable (can contain NULL values)
+- The column does not have any disqualifying hints such as: `partition`, `cluster`, `unique`, `sort`, `primary_key`, `row_key`, `parent_key`, `root_key`, `merge_key`, `variant`, `hard_delete`, `dedup_sort`, or `incremental`
+- After dropping the matched columns, at least one non-dlt internal column must remain in the table
+
+**Filesystem destination note:**
+
+For the filesystem destination, column dropping is only supported for tables that have an associated `table_format` (e.g., Iceberg, Delta). Tables without a table format will be skipped with a notification.
+
+**Example output:**
+
+```text
+About to drop the following columns in dataset my_dataset in destination dlt.destinations.duckdb:
+Selected schema: droppable_source
+Selected resource(s): ['droppable_b']
+Table(s) to be affected: ['droppable_b__items']
+Column(s) to be dropped:
+    from table: droppable_b__items
+        columns: ['m']
+Do you want to apply these changes?
+```
+
+**Pattern matching:**
+
+You can use regexes to select resources, tables and columns. Prepend the `re:` string to indicate a regex pattern. For example:
+- `--from-resources "re:^user"` matches all resources starting with "user"
+- `--columns "re:.*_temp$"` matches all columns ending with "_temp"
 """,
     epilog=(
         f"See {DLT_PIPELINE_COMMAND_DOCS_URL}#selectively-drop-tables-and-reset-state for"
````

dlt/destinations/impl/filesystem/filesystem.py

Lines changed: 0 additions & 40 deletions
````diff
@@ -964,46 +964,6 @@ def drop_columns(
                     table=delta_table,
                     columns_to_drop=columns_to_drop,
                 )
-            else:
-                # Handle regular filesystem tables (jsonl, parquet, csv)
-                self._drop_columns_from_regular_table(table_name, columns_to_drop)
 
         if update_schema:
             self._update_schema_in_storage(self.schema)
-
-    def _drop_columns_from_regular_table(self, table_name: str, columns_to_drop: List[str]) -> None:
-        from dlt.common.libs.pyarrow import pyarrow as pa
-
-        table_files = self.list_table_files(table_name)
-
-        for file_path in table_files:
-            file_ext = os.path.splitext(file_path)[1].lower()
-
-            if file_ext == ".parquet":
-                table = pa.parquet.read_table(self.make_remote_url(file_path))
-                columns_to_keep = [col for col in table.column_names if col not in columns_to_drop]
-                filtered_table = table.select(columns_to_keep)
-                with pa.parquet.ParquetWriter(
-                    self.make_remote_url(file_path), filtered_table.schema
-                ) as writer:
-                    writer.write_table(filtered_table)
-
-            elif file_ext == ".jsonl":
-                content = self.fs_client.read_text(file_path, encoding="utf-8")
-                lines = content.strip().split("\n")
-
-                filtered_lines = []
-                for line in lines:
-                    if line.strip():
-                        record = json.loads(line)
-                        for col in columns_to_drop:
-                            record.pop(col, None)
-                        filtered_lines.append(json.dumps(record))
-
-                self.fs_client.write_text(file_path, "\n".join(filtered_lines), encoding="utf-8")
-
-            elif file_ext == ".csv":
-                table = pa.csv.read_csv(self.make_remote_url(file_path))
-                columns_to_keep = [col for col in table.column_names if col not in columns_to_drop]
-                filtered_table = table.select(columns_to_keep)
-                pa.csv.write_csv(filtered_table, self.make_remote_url(file_path))
````

dlt/pipeline/drop.py

Lines changed: 19 additions & 2 deletions
````diff
@@ -41,6 +41,7 @@ class _DropInfo(TypedDict):
     drop_all: bool
     resource_pattern: Optional[REPattern]
     warnings: List[str]
+    notes: List[str]
     drop_columns: bool
 
 
@@ -171,6 +172,7 @@ def drop_resources(
         drop_all=drop_all,
         resource_pattern=resource_pattern,
         warnings=[],
+        notes=[],
        drop_columns=False,
    )
 
@@ -247,6 +249,7 @@ def drop_columns(
     from_resources: Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]] = (),
     from_tables: Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]] = (),
     columns: Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]] = (),
+    is_filesystem: bool = False,
 ) -> _DropResult:
     """Generate a new schema and pipeline state with the requested columns removed.
 
@@ -295,6 +298,7 @@ def drop_columns(
 
     # Collect columns to drop grouped by table
     warnings: List[str] = []
+    notes: List[str] = []
    from_tables_drop_cols: List[_FromTableDropCols] = []
    affected_schema_table_names: List[str] = []
 
@@ -308,8 +312,8 @@ def drop_columns(
 
         if not can_drop and len(matched_droppable_cols) > 0:
             warning = (
-                f"""After dropping matched droppable columns {matched_droppable_cols} from table '{table_name}'"""
-                " only internal dlt columns will remain. This is not allowed."
+                f"After dropping matched droppable columns {matched_droppable_cols} from table"
+                f" '{table_name}' only internal dlt columns will remain. This is not allowed."
             )
             warnings.append(warning)
             continue
@@ -320,6 +324,18 @@ def drop_columns(
         ]
 
         if drop_cols:
+            # Tables without a table format are not supported
+            if is_filesystem and "table_format" not in table:
+                drop_cols_str = ",".join(drop_cols)
+                note = (
+                    f"Skipped table '{table_name}' with selected column(s) '{drop_cols_str}'"
+                    " because it does not use a supported table format. Column dropping in"
+                    " filesystem destinations requires the table to have an associated table"
+                    " format."
+                )
+                notes.append(note)
+                continue
+
             from_tables_drop_cols.append({"from_table": table_name, "drop_columns": drop_cols})
             affected_schema_table_names.append(table_name)
 
@@ -348,6 +364,7 @@ def drop_columns(
         drop_all=False,
         resource_pattern=resource_pattern,
        warnings=warnings,
+        notes=notes,
        drop_columns=drop_columns,
    )
 
````
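
Restating the column safety rules from the docs as a standalone predicate may help when reading the hunks above. This is a minimal sketch: `is_droppable`, the tuple of hint names, and the plain-dict column representation are illustrative assumptions, not the actual logic in `drop.py`.

```python
from typing import Any, Dict

# hint names the docs list as disqualifying a column from being dropped
DISQUALIFYING_HINTS = (
    "partition", "cluster", "unique", "sort", "primary_key", "row_key",
    "parent_key", "root_key", "merge_key", "variant", "hard_delete",
    "dedup_sort", "incremental",
)

def is_droppable(column: Dict[str, Any]) -> bool:
    # a column must be nullable and carry none of the disqualifying hints;
    # the "at least one non-dlt column must remain" rule is checked per table,
    # not per column, so it is out of scope for this predicate
    if not column.get("nullable", False):
        return False
    return not any(column.get(hint) for hint in DISQUALIFYING_HINTS)

print(is_droppable({"name": "m", "nullable": True}))                        # True
print(is_droppable({"name": "id", "nullable": True, "primary_key": True}))  # False
```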

dlt/pipeline/helpers.py

Lines changed: 1 addition & 0 deletions
````diff
@@ -91,6 +91,7 @@ def __init__(
                 from_resources,
                 from_tables,
                 columns,
+                pipeline.destination.destination_name == "filesystem",
             )
         else:
             drop_result = drop_resources(
````
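
The one-line flag above, combined with the `table_format` check added in `drop.py`, amounts to the following gate, shown here as a hypothetical standalone helper for clarity:

```python
from typing import Any, Dict

def should_skip_column_drop(table: Dict[str, Any], destination_name: str) -> bool:
    # on the filesystem destination, only tables backed by a table format
    # (e.g. "delta" or "iceberg") support column drops; plain jsonl/parquet/csv
    # tables are skipped and reported via a note instead
    return destination_name == "filesystem" and "table_format" not in table

print(should_skip_column_drop({"name": "events"}, "filesystem"))                           # True
print(should_skip_column_drop({"name": "events", "table_format": "delta"}, "filesystem"))  # False
print(should_skip_column_drop({"name": "events"}, "duckdb"))                               # False
```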

docs/website/docs/reference/command-line-interface.md

Lines changed: 43 additions & 10 deletions
````diff
@@ -384,11 +384,11 @@ reset:
 
 ```text
 About to drop the following data in dataset airflow_events_1 in destination dlt.destinations.duckdb:
-Selected schema:: github_repo_events
-Selected resource(s):: ['repo_events']
-Table(s) to drop:: ['issues_event', 'fork_event', 'pull_request_event', 'pull_request_review_event', 'pull_request_review_comment_event', 'watch_event', 'issue_comment_event', 'push_event__payload__commits', 'push_event']
-Resource(s) state to reset:: ['repo_events']
-Source state path(s) to reset:: []
+Selected schema: github_repo_events
+Selected resource(s): ['repo_events']
+Table(s) to drop: ['issues_event', 'fork_event', 'pull_request_event', 'pull_request_review_event', 'pull_request_review_comment_event', 'watch_event', 'issue_comment_event', 'push_event__payload__commits', 'push_event']
+Resource(s) state to reset: ['repo_events']
+Source state path(s) to reset: []
 Do you want to apply these changes? [y/N]
 ```
 
@@ -501,16 +501,49 @@ dlt pipeline [pipeline_name] drop-columns [-h] [--from-resources [FROM_RESOURCES
 
 **Description**
 
-Selectively drop columns from specified resources.
+Selectively drop columns from specified resources and tables.
 
 ```sh
-dlt pipeline <pipeline name> drop-columns --from [resource_1] [resource_2] --columns [column_1] [column_2]
+dlt pipeline <pipeline name> drop-columns --from-resources [resource_1] [resource_2] --from-tables [table_1] [table_2] --columns [column_1] [column_2]
 ```
 
-Drops selected columns in the tables generated by selected resources,
-unless the columns have a hint that makes them undroppable.
+**How column selection works:**
 
-You can use regexes to select resources and columns. Prepend the `re:` string to indicate a regex pattern.
+1. **Resource resolution**: If `--from-resources` is specified, tables are grouped by resource using regex pattern matching. If omitted, all resources are considered.
+
+2. **Table resolution**: If `--from-tables` is specified, only tables matching the pattern(s) within the selected resources are considered. If omitted, all tables from selected resources are considered.
+
+3. **Column resolution**: Columns are matched against the specified pattern(s) within the selected tables. Only nullable columns without disqualifying hints can be dropped.
+
+**Column safety rules:**
+
+Only columns that meet ALL of the following criteria can be dropped:
+- The column is nullable (can contain NULL values)
+- The column does not have any disqualifying hints such as: `partition`, `cluster`, `unique`, `sort`, `primary_key`, `row_key`, `parent_key`, `root_key`, `merge_key`, `variant`, `hard_delete`, `dedup_sort`, or `incremental`
+- After dropping the matched columns, at least one non-dlt internal column must remain in the table
+
+**Filesystem destination note:**
+
+For the filesystem destination, column dropping is only supported for tables that have an associated `table_format` (e.g., Iceberg, Delta). Tables without a table format will be skipped with a notification.
+
+**Example output:**
+
+```text
+About to drop the following columns in dataset my_dataset in destination dlt.destinations.duckdb:
+Selected schema: droppable_source
+Selected resource(s): ['droppable_b']
+Table(s) to be affected: ['droppable_b__items']
+Column(s) to be dropped:
+    from table: droppable_b__items
+        columns: ['m']
+Do you want to apply these changes?
+```
+
+**Pattern matching:**
+
+You can use regexes to select resources, tables and columns. Prepend the `re:` string to indicate a regex pattern. For example:
+- `--from-resources "re:^user"` matches all resources starting with "user"
+- `--columns "re:.*_temp$"` matches all columns ending with "_temp".
 
 <details>
````
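
To tie the three resolution steps together, here is a compact sketch of the narrowing order (resources, then tables, then columns). `resolve` and its dict-shaped inputs are hypothetical stand-ins for dlt's actual schema traversal, written only to mirror the order described in the docs above.

```python
import re
from typing import Dict, List

def resolve(
    tables_by_resource: Dict[str, List[str]],
    columns_by_table: Dict[str, List[str]],
    from_resources: List[str],
    from_tables: List[str],
    columns: List[str],
) -> Dict[str, List[str]]:
    def matches(name: str, patterns: List[str]) -> bool:
        if not patterns:
            return True  # an omitted selector means "everything"
        return any(
            re.match(p[3:], name) if p.startswith("re:") else p == name
            for p in patterns
        )

    # narrow in three passes: resource -> table -> column
    result: Dict[str, List[str]] = {}
    for resource, tables in tables_by_resource.items():
        if not matches(resource, from_resources):
            continue
        for table in tables:
            if not matches(table, from_tables):
                continue
            hit = [c for c in columns_by_table.get(table, []) if matches(c, columns)]
            if hit:
                result[table] = hit
    return result

print(resolve(
    {"droppable_b": ["droppable_b", "droppable_b__items"]},
    {"droppable_b": ["id", "m"], "droppable_b__items": ["m", "value"]},
    ["droppable_b"], [], ["m"],
))
# {'droppable_b': ['m'], 'droppable_b__items': ['m']}
```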
