|
20 | 20 | from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Set, Tuple |
21 | 21 |
|
22 | 22 | from pyiceberg.conversions import from_bytes |
23 | | -from pyiceberg.manifest import DataFile, DataFileContent, ManifestContent, ManifestFile, PartitionFieldSummary |
| 23 | +from pyiceberg.manifest import DataFileContent, ManifestContent, ManifestFile, PartitionFieldSummary |
24 | 24 | from pyiceberg.partitioning import PartitionSpec |
25 | 25 | from pyiceberg.table.snapshots import Snapshot, ancestors_of |
26 | 26 | from pyiceberg.types import PrimitiveType |
@@ -288,64 +288,86 @@ def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table": |
288 | 288 |
|
289 | 289 | table_schema = pa.unify_schemas([partitions_schema, table_schema]) |
290 | 290 |
|
291 | | - def update_partitions_map( |
292 | | - partitions_map: Dict[Tuple[str, Any], Any], |
293 | | - file: DataFile, |
294 | | - partition_record_dict: Dict[str, Any], |
295 | | - snapshot: Optional[Snapshot], |
296 | | - ) -> None: |
| 291 | + snapshot = self._get_snapshot(snapshot_id) |
| 292 | + executor = ExecutorFactory.get_or_create() |
| 293 | + local_partitions_maps = executor.map(self._process_manifest, snapshot.manifests(self.tbl.io)) |
| 294 | + |
| 295 | + partitions_map: Dict[Tuple[str, Any], Any] = {} |
| 296 | + for local_map in local_partitions_maps: |
| 297 | + for partition_record_key, partition_row in local_map.items(): |
| 298 | + if partition_record_key not in partitions_map: |
| 299 | + partitions_map[partition_record_key] = partition_row |
| 300 | + else: |
| 301 | + existing = partitions_map[partition_record_key] |
| 302 | + existing["record_count"] += partition_row["record_count"] |
| 303 | + existing["file_count"] += partition_row["file_count"] |
| 304 | + existing["total_data_file_size_in_bytes"] += partition_row["total_data_file_size_in_bytes"] |
| 305 | + existing["position_delete_record_count"] += partition_row["position_delete_record_count"] |
| 306 | + existing["position_delete_file_count"] += partition_row["position_delete_file_count"] |
| 307 | + existing["equality_delete_record_count"] += partition_row["equality_delete_record_count"] |
| 308 | + existing["equality_delete_file_count"] += partition_row["equality_delete_file_count"] |
| 309 | + |
| 310 | + if partition_row["last_updated_at"] and ( |
| 311 | + not existing["last_updated_at"] or partition_row["last_updated_at"] > existing["last_updated_at"] |
| 312 | + ): |
| 313 | + existing["last_updated_at"] = partition_row["last_updated_at"] |
| 314 | + existing["last_updated_snapshot_id"] = partition_row["last_updated_snapshot_id"] |
| 315 | + |
| 316 | + return pa.Table.from_pylist( |
| 317 | + partitions_map.values(), |
| 318 | + schema=table_schema, |
| 319 | + ) |
| 320 | + |
| 321 | + def _process_manifest(self, manifest: ManifestFile) -> Dict[Tuple[str, Any], Any]: |
| 322 | + partitions_map: Dict[Tuple[str, Any], Any] = {} |
| 323 | + for entry in manifest.fetch_manifest_entry(io=self.tbl.io): |
| 324 | + partition = entry.data_file.partition |
| 325 | + partition_record_dict = { |
| 326 | + field.name: partition[pos] |
| 327 | + for pos, field in enumerate(self.tbl.metadata.specs()[manifest.partition_spec_id].fields) |
| 328 | + } |
| 329 | + entry_snapshot = self.tbl.snapshot_by_id(entry.snapshot_id) if entry.snapshot_id is not None else None |
| 330 | + |
297 | 331 | partition_record_key = _convert_to_hashable_type(partition_record_dict) |
298 | 332 | if partition_record_key not in partitions_map: |
299 | 333 | partitions_map[partition_record_key] = { |
300 | 334 | "partition": partition_record_dict, |
301 | | - "spec_id": file.spec_id, |
| 335 | + "spec_id": entry.data_file.spec_id, |
302 | 336 | "record_count": 0, |
303 | 337 | "file_count": 0, |
304 | 338 | "total_data_file_size_in_bytes": 0, |
305 | 339 | "position_delete_record_count": 0, |
306 | 340 | "position_delete_file_count": 0, |
307 | 341 | "equality_delete_record_count": 0, |
308 | 342 | "equality_delete_file_count": 0, |
309 | | - "last_updated_at": snapshot.timestamp_ms if snapshot else None, |
310 | | - "last_updated_snapshot_id": snapshot.snapshot_id if snapshot else None, |
| 343 | + "last_updated_at": entry_snapshot.timestamp_ms if entry_snapshot else None, |
| 344 | + "last_updated_snapshot_id": entry_snapshot.snapshot_id if entry_snapshot else None, |
311 | 345 | } |
312 | 346 |
|
313 | 347 | partition_row = partitions_map[partition_record_key] |
314 | 348 |
|
315 | | - if snapshot is not None: |
316 | | - if partition_row["last_updated_at"] is None or partition_row["last_updated_snapshot_id"] < snapshot.timestamp_ms: |
317 | | - partition_row["last_updated_at"] = snapshot.timestamp_ms |
318 | | - partition_row["last_updated_snapshot_id"] = snapshot.snapshot_id |
| 349 | + if entry_snapshot is not None: |
| 350 | + if ( |
| 351 | + partition_row["last_updated_at"] is None |
| 352 | + or partition_row["last_updated_snapshot_id"] < entry_snapshot.timestamp_ms |
| 353 | + ): |
| 354 | + partition_row["last_updated_at"] = entry_snapshot.timestamp_ms |
| 355 | + partition_row["last_updated_snapshot_id"] = entry_snapshot.snapshot_id |
319 | 356 |
|
320 | | - if file.content == DataFileContent.DATA: |
321 | | - partition_row["record_count"] += file.record_count |
| 357 | + if entry.data_file.content == DataFileContent.DATA: |
| 358 | + partition_row["record_count"] += entry.data_file.record_count |
322 | 359 | partition_row["file_count"] += 1 |
323 | | - partition_row["total_data_file_size_in_bytes"] += file.file_size_in_bytes |
324 | | - elif file.content == DataFileContent.POSITION_DELETES: |
325 | | - partition_row["position_delete_record_count"] += file.record_count |
| 360 | + partition_row["total_data_file_size_in_bytes"] += entry.data_file.file_size_in_bytes |
| 361 | + elif entry.data_file.content == DataFileContent.POSITION_DELETES: |
| 362 | + partition_row["position_delete_record_count"] += entry.data_file.record_count |
326 | 363 | partition_row["position_delete_file_count"] += 1 |
327 | | - elif file.content == DataFileContent.EQUALITY_DELETES: |
328 | | - partition_row["equality_delete_record_count"] += file.record_count |
| 364 | + elif entry.data_file.content == DataFileContent.EQUALITY_DELETES: |
| 365 | + partition_row["equality_delete_record_count"] += entry.data_file.record_count |
329 | 366 | partition_row["equality_delete_file_count"] += 1 |
330 | 367 | else: |
331 | | - raise ValueError(f"Unknown DataFileContent ({file.content})") |
332 | | - |
333 | | - partitions_map: Dict[Tuple[str, Any], Any] = {} |
334 | | - snapshot = self._get_snapshot(snapshot_id) |
335 | | - for manifest in snapshot.manifests(self.tbl.io): |
336 | | - for entry in manifest.fetch_manifest_entry(io=self.tbl.io): |
337 | | - partition = entry.data_file.partition |
338 | | - partition_record_dict = { |
339 | | - field.name: partition[pos] |
340 | | - for pos, field in enumerate(self.tbl.metadata.specs()[manifest.partition_spec_id].fields) |
341 | | - } |
342 | | - entry_snapshot = self.tbl.snapshot_by_id(entry.snapshot_id) if entry.snapshot_id is not None else None |
343 | | - update_partitions_map(partitions_map, entry.data_file, partition_record_dict, entry_snapshot) |
| 368 | + raise ValueError(f"Unknown DataFileContent ({entry.data_file.content})") |
344 | 369 |
|
345 | | - return pa.Table.from_pylist( |
346 | | - partitions_map.values(), |
347 | | - schema=table_schema, |
348 | | - ) |
| 370 | + return partitions_map |
349 | 371 |
|
350 | 372 | def _get_manifests_schema(self) -> "pa.Schema": |
351 | 373 | import pyarrow as pa |
|
0 commit comments