@@ -667,6 +667,91 @@ def _get_entries(manifest: ManifestFile) -> list[ManifestEntry]:
667667 return []
668668
669669
class _RewriteFiles(_SnapshotProducer["_RewriteFiles"]):
    """Snapshot producer for rewrite operations, committed as a REPLACE snapshot.

    Intended for cases where data files are rearranged (e.g. compaction) while
    the logical records of the table stay the same: none are added or removed.
    """

    def _existing_manifests(self) -> list[ManifestFile]:
        """Return the manifests to carry over from the target branch's snapshot.

        A manifest is reused unchanged when its manifest evaluator rejects it or
        when it references none of the deleted data files, dropped when every one
        of its entries refers to a deleted data file, and otherwise rewritten so
        that only the surviving entries remain (marked as EXISTING).
        """
        carried_over = []

        evaluator_by_spec: dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator)
        branch_head = self._transaction.table_metadata.snapshot_by_name(name=self._target_branch)
        if not branch_head:
            # No snapshot was found for the target branch, so there is nothing to carry over
            return carried_over

        for manifest in branch_head.manifests(io=self._io):
            # The evaluator rules this manifest out for the deleted files' partitions
            if not evaluator_by_spec[manifest.partition_spec_id](manifest):
                carried_over.append(manifest)
                continue

            # Split the live entries into the ones to keep and the ones being removed
            surviving: set[ManifestEntry] = set()
            dropped: set[ManifestEntry] = set()
            for candidate in manifest.fetch_manifest_entry(io=self._io, discard_deleted=True):
                bucket = dropped if candidate.data_file in self._deleted_data_files else surviving
                bucket.add(candidate)

            if not dropped:
                # None of the deleted data files live in this manifest: reuse it as is
                carried_over.append(manifest)
                continue

            if not surviving:
                # Every entry refers to a deleted data file: leave the manifest out
                continue

            # Mixed content: write a new manifest that holds only the surviving entries
            with self.new_manifest_writer(self.spec(manifest.partition_spec_id)) as writer:
                for kept in surviving:
                    rewritten = ManifestEntry.from_args(
                        status=ManifestEntryStatus.EXISTING,
                        snapshot_id=kept.snapshot_id,
                        sequence_number=kept.sequence_number,
                        file_sequence_number=kept.file_sequence_number,
                        data_file=kept.data_file,
                    )
                    writer.add_entry(rewritten)
            carried_over.append(writer.to_manifest_file())

        return carried_over

    def _deleted_entries(self) -> list[ManifestEntry]:
        """Collect DELETED entries for the removed data files of the parent snapshot.

        Returns:
            One DELETED manifest entry per live DATA entry of the parent snapshot
            whose data file is in the set of deleted data files, or an empty list
            when there is no parent snapshot.

        Raises:
            ValueError: If the parent snapshot id cannot be resolved to a snapshot.
        """
        if self._parent_snapshot_id is None:
            return []

        parent = self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
        if parent is None:
            raise ValueError(f"Could not find the previous snapshot: {self._parent_snapshot_id} ")

        executor = ExecutorFactory.get_or_create()
        evaluator_by_spec: dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator)

        def _deleted_in(manifest: ManifestFile) -> list[ManifestEntry]:
            # Skip manifests that the evaluator rules out
            if not evaluator_by_spec[manifest.partition_spec_id](manifest):
                return []

            removed = (
                entry
                for entry in manifest.fetch_manifest_entry(self._io, discard_deleted=True)
                if entry.data_file.content == DataFileContent.DATA and entry.data_file in self._deleted_data_files
            )
            return [
                ManifestEntry.from_args(
                    status=ManifestEntryStatus.DELETED,
                    snapshot_id=entry.snapshot_id,
                    sequence_number=entry.sequence_number,
                    file_sequence_number=entry.file_sequence_number,
                    data_file=entry.data_file,
                )
                for entry in removed
            ]

        # One task per manifest; the per-manifest results are flattened into one list
        per_manifest = executor.map(_deleted_in, parent.manifests(self._io))
        return list(itertools.chain.from_iterable(per_manifest))
754+
670755class UpdateSnapshot :
671756 _transaction : Transaction
672757 _io : FileIO
@@ -715,6 +800,16 @@ def overwrite(self, commit_uuid: uuid.UUID | None = None) -> _OverwriteFiles:
715800 snapshot_properties = self ._snapshot_properties ,
716801 )
717802
def replace(self, commit_uuid: uuid.UUID | None = None) -> _RewriteFiles:
    """Create a producer for a REPLACE snapshot.

    Args:
        commit_uuid: Forwarded unchanged to the producer; defaults to None.

    Returns:
        A _RewriteFiles producer set up with Operation.REPLACE and this
        object's transaction, IO, branch and snapshot properties.
    """
    producer = _RewriteFiles(
        operation=Operation.REPLACE,
        commit_uuid=commit_uuid,
        transaction=self._transaction,
        io=self._io,
        branch=self._branch,
        snapshot_properties=self._snapshot_properties,
    )
    return producer
812+
718813 def delete (self ) -> _DeleteFiles :
719814 return _DeleteFiles (
720815 operation = Operation .DELETE ,
0 commit comments