Skip to content

Commit be914fd

Browse files
committed
Add support for v3 snapshot metadata fields
1 parent 2a9f2ea commit be914fd

File tree

2 files changed

+117
-1
lines changed

2 files changed

+117
-1
lines changed

pyiceberg/table/snapshots.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from enum import Enum
2323
from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping, Optional
2424

25-
from pydantic import Field, PrivateAttr, model_serializer
25+
from pydantic import Field, PrivateAttr, field_validator, model_serializer, model_validator
2626

2727
from pyiceberg.io import FileIO
2828
from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, _manifests
@@ -237,14 +237,55 @@ def __eq__(self, other: Any) -> bool:
237237

238238

239239
class Snapshot(IcebergBaseModel):
240+
"""Represents a snapshot of an Iceberg table at a specific point in time.
241+
242+
A snapshot tracks the state of a table, including all data and delete files,
243+
at the time the snapshot was created.
244+
"""
245+
240246
snapshot_id: int = Field(alias="snapshot-id")
241247
parent_snapshot_id: Optional[int] = Field(alias="parent-snapshot-id", default=None)
242248
sequence_number: Optional[int] = Field(alias="sequence-number", default=INITIAL_SEQUENCE_NUMBER)
243249
timestamp_ms: int = Field(alias="timestamp-ms", default_factory=lambda: int(time.time() * 1000))
244250
manifest_list: str = Field(alias="manifest-list", description="Location of the snapshot's manifest list file")
251+
first_row_id: Optional[int] = Field(
252+
alias="first-row-id",
253+
default=None,
254+
description="The row-id of the first newly added row in this snapshot. Returns None when row lineage is not supported.",
255+
)
256+
added_rows: Optional[int] = Field(
257+
alias="added-rows",
258+
default=None,
259+
description=(
260+
"The upper bound of number of rows with assigned row IDs in this snapshot. Returns None if the value was not stored."
261+
),
262+
)
245263
summary: Optional[Summary] = Field(default=None)
246264
schema_id: Optional[int] = Field(alias="schema-id", default=None)
247265

266+
@field_validator("first_row_id")
@classmethod
def validate_first_row_id(cls, v: Optional[int]) -> Optional[int]:
    """Reject a negative first-row-id.

    Args:
        v: The first-row-id value to check, or None when the field is unset.

    Returns:
        The value unchanged when it is None or not negative.

    Raises:
        ValueError: If the value is negative.
    """
    # An unset field is valid; only concrete values are range-checked.
    if v is None:
        return v
    if v < 0:
        raise ValueError(f"Invalid first-row-id (cannot be negative): {v}")
    return v
273+
274+
@field_validator("added_rows")
@classmethod
def validate_added_rows(cls, v: Optional[int]) -> Optional[int]:
    """Reject a negative added-rows count.

    Args:
        v: The added-rows value to check, or None when the field is unset.

    Returns:
        The value unchanged when it is None or not negative.

    Raises:
        ValueError: If the value is negative.
    """
    # An unset field is valid; only concrete values are range-checked.
    if v is None:
        return v
    if v < 0:
        raise ValueError(f"Invalid added-rows (cannot be negative): {v}")
    return v
281+
282+
@model_validator(mode="after")
def validate_row_lineage_fields(self) -> "Snapshot":
    """Require added_rows whenever first_row_id is set.

    The check is one-directional: added_rows on its own (without
    first_row_id) is accepted.

    Returns:
        This instance, unchanged, when the combination is valid.

    Raises:
        ValueError: If first_row_id is set but added_rows is None.
    """
    # Valid when there is no first_row_id to anchor, or when added_rows accompanies it.
    if self.first_row_id is None or self.added_rows is not None:
        return self
    raise ValueError("Invalid added-rows (required when first-row-id is set): None")
288+
248289
def __str__(self) -> str:
249290
"""Return the string representation of the Snapshot class."""
250291
operation = f"{self.summary.operation}: " if self.summary else ""

tests/table/test_snapshots.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,3 +456,78 @@ def test_ancestors_between(table_v2_with_extensive_snapshots: Table) -> None:
456456
)
457457
== 2000
458458
)
459+
460+
461+
def test_snapshot_v3_fields() -> None:
    """Build a snapshot with first-row-id/added-rows and check attributes and JSON output."""
    payload = {
        "snapshot-id": 1,
        "timestamp-ms": 1234567890,
        "manifest-list": "s3:/a/b/c.avro",
        "first-row-id": 100,
        "added-rows": 1000,
        "summary": {"operation": "append"},
    }
    snapshot = Snapshot(**payload)

    assert snapshot.first_row_id == 100
    assert snapshot.added_rows == 1000

    # Adjacent literals concatenate into the single-line JSON document the model should emit.
    expected = (
        '{"snapshot-id":1,"sequence-number":0,"timestamp-ms":1234567890,'
        '"manifest-list":"s3:/a/b/c.avro","first-row-id":100,"added-rows":1000,'
        '"summary":{"operation":"append"}}'
    )
    assert snapshot.model_dump_json() == expected
479+
480+
481+
def test_snapshot_v3_fields_validation_negative_first_row_id() -> None:
    """Constructing a snapshot with first-row-id of -1 must raise with the negative-value message."""
    payload = {
        "snapshot-id": 1,
        "timestamp-ms": 1234567890,
        "manifest-list": "s3:/a/b/c.avro",
        "first-row-id": -1,
        "added-rows": 1000,
        "summary": {"operation": "append"},
    }
    # Only the constructor call sits inside the context manager so nothing else can satisfy it.
    with pytest.raises(ValueError, match=r"Invalid first-row-id \(cannot be negative\): -1"):
        Snapshot(**payload)
493+
494+
495+
def test_snapshot_v3_fields_validation_negative_added_rows() -> None:
    """Constructing a snapshot with added-rows of -1 must raise with the negative-value message."""
    payload = {
        "snapshot-id": 1,
        "timestamp-ms": 1234567890,
        "manifest-list": "s3:/a/b/c.avro",
        "first-row-id": 100,
        "added-rows": -1,
        "summary": {"operation": "append"},
    }
    # Only the constructor call sits inside the context manager so nothing else can satisfy it.
    with pytest.raises(ValueError, match=r"Invalid added-rows \(cannot be negative\): -1"):
        Snapshot(**payload)
507+
508+
509+
def test_snapshot_v3_fields_validation_first_row_id_requires_added_rows() -> None:
    """Supplying first-row-id without added-rows must raise with the required-field message."""
    # "added-rows" is deliberately omitted from the payload.
    payload = {
        "snapshot-id": 1,
        "timestamp-ms": 1234567890,
        "manifest-list": "s3:/a/b/c.avro",
        "first-row-id": 100,
        "summary": {"operation": "append"},
    }
    with pytest.raises(ValueError, match=r"Invalid added-rows \(required when first-row-id is set\): None"):
        Snapshot(**payload)
520+
521+
522+
def test_snapshot_v3_fields_added_rows_without_first_row_id() -> None:
    """Supplying added-rows without first-row-id is accepted; first_row_id stays None."""
    # "first-row-id" is deliberately omitted from the payload.
    payload = {
        "snapshot-id": 1,
        "timestamp-ms": 1234567890,
        "manifest-list": "s3:/a/b/c.avro",
        "added-rows": 1000,
        "summary": {"operation": "append"},
    }
    snapshot = Snapshot(**payload)

    assert snapshot.first_row_id is None
    assert snapshot.added_rows == 1000

0 commit comments

Comments
 (0)