Skip to content

Commit e147381

Browse files
committed
Merge branch 'main' of github.com:apache/iceberg-python into fd-rust
2 parents a08c353 + 1f9c46b commit e147381

File tree

19 files changed

+1049
-601
lines changed

19 files changed

+1049
-601
lines changed

.github/workflows/nightly-pypi-build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ jobs:
7171

7272
steps:
7373
- name: Download all the artifacts
74-
uses: actions/download-artifact@v5
74+
uses: actions/download-artifact@v6
7575
with:
7676
merge-multiple: true
7777
path: dist/

mkdocs/docs/how-to-release.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ This guide outlines the process for releasing PyIceberg in accordance with the [
3030
## Requirements
3131

3232
* A GPG key must be registered and published in the [Apache Iceberg KEYS file](https://downloads.apache.org/iceberg/KEYS). Follow [the instructions for setting up a GPG key and uploading it to the KEYS file](#set-up-gpg-key-and-upload-to-apache-iceberg-keys-file).
33+
* Permission to update the `KEYS` artifact in the [Apache release distribution](https://dist.apache.org/repos/dist/release/iceberg/) (requires Iceberg PMC privileges).
3334
* SVN Access
3435
* Permission to upload artifacts to the [Apache development distribution](https://dist.apache.org/repos/dist/dev/iceberg/) (requires Apache Committer access).
3536
* Permission to upload artifacts to the [Apache release distribution](https://dist.apache.org/repos/dist/release/iceberg/) (requires Apache PMC access).
@@ -405,5 +406,12 @@ cd icebergsvn
405406
echo "" >> KEYS # append a newline
406407
gpg --list-sigs <YOUR KEY ID HERE> >> KEYS # append signatures
407408
gpg --armor --export <YOUR KEY ID HERE> >> KEYS # append public key block
408-
svn commit -m "add key for <YOUR NAME HERE>"
409+
svn commit -m "add key for <YOUR NAME HERE>" # this requires Iceberg PMC privileges
409410
```
411+
412+
<!-- prettier-ignore-start -->
413+
414+
!!! note
415+
Updating the `KEYS` artifact in the `release/` distribution requires Iceberg PMC privileges. Please work with a PMC member to update the file.
416+
417+
<!-- prettier-ignore-end -->

poetry.lock

Lines changed: 610 additions & 500 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyiceberg/catalog/glue.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -355,34 +355,29 @@ def __init__(self, name: str, client: Optional["GlueClient"] = None, **propertie
355355
_register_glue_catalog_id_with_glue_client(self.glue, glue_catalog_id)
356356

357357
def _convert_glue_to_iceberg(self, glue_table: "TableTypeDef") -> Table:
358-
properties: Properties = glue_table["Parameters"]
359-
360-
database_name = glue_table.get("DatabaseName", None)
361-
if database_name is None:
358+
if (database_name := glue_table.get("DatabaseName")) is None:
362359
raise ValueError("Glue table is missing DatabaseName property")
363360

364-
parameters = glue_table.get("Parameters", None)
365-
if parameters is None:
366-
raise ValueError("Glue table is missing Parameters property")
361+
if (table_name := glue_table.get("Name")) is None:
362+
raise ValueError("Glue table is missing Name property")
367363

368-
table_name = glue_table["Name"]
364+
if (parameters := glue_table.get("Parameters")) is None:
365+
raise ValueError("Glue table is missing Parameters property")
369366

370-
if TABLE_TYPE not in properties:
367+
if (glue_table_type := parameters.get(TABLE_TYPE)) is None:
371368
raise NoSuchPropertyException(
372369
f"Property {TABLE_TYPE} missing, could not determine type: {database_name}.{table_name}"
373370
)
374-
glue_table_type = properties[TABLE_TYPE]
375371

376372
if glue_table_type.lower() != ICEBERG:
377373
raise NoSuchIcebergTableError(
378374
f"Property table_type is {glue_table_type}, expected {ICEBERG}: {database_name}.{table_name}"
379375
)
380376

381-
if METADATA_LOCATION not in properties:
377+
if (metadata_location := parameters.get(METADATA_LOCATION)) is None:
382378
raise NoSuchPropertyException(
383379
f"Table property {METADATA_LOCATION} is missing, cannot find metadata for: {database_name}.{table_name}"
384380
)
385-
metadata_location = properties[METADATA_LOCATION]
386381

387382
io = self._load_file_io(location=metadata_location)
388383
file = io.new_input(metadata_location)

pyiceberg/catalog/rest/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,9 @@ def _create_session(self) -> Session:
238238
"""Create a request session with provided catalog configuration."""
239239
session = Session()
240240

241+
# Set HTTP headers
242+
self._config_headers(session)
243+
241244
# Sets the client side and server side SSL cert verification, if provided as properties.
242245
if ssl_config := self.properties.get(SSL):
243246
if ssl_ca_bundle := ssl_config.get(CA_BUNDLE):
@@ -265,9 +268,6 @@ def _create_session(self) -> Session:
265268
else:
266269
session.auth = AuthManagerAdapter(self._create_legacy_oauth2_auth_manager(session))
267270

268-
# Set HTTP headers
269-
self._config_headers(session)
270-
271271
# Configure SigV4 Request Signing
272272
if property_as_bool(self.properties, SIGV4, False):
273273
self._init_sigv4(session)

pyiceberg/expressions/__init__.py

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
)
3434
from typing import Literal as TypingLiteral
3535

36-
from pydantic import Field
36+
from pydantic import ConfigDict, Field
3737

3838
from pyiceberg.expressions.literals import (
3939
AboveMax,
@@ -302,12 +302,19 @@ def __getnewargs__(self) -> Tuple[BooleanExpression, BooleanExpression]:
302302
return (self.left, self.right)
303303

304304

305-
class Or(BooleanExpression):
305+
class Or(IcebergBaseModel, BooleanExpression):
306306
"""OR operation expression - logical disjunction."""
307307

308+
model_config = ConfigDict(arbitrary_types_allowed=True)
309+
310+
type: TypingLiteral["or"] = Field(default="or", alias="type")
308311
left: BooleanExpression
309312
right: BooleanExpression
310313

314+
def __init__(self, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> None:
315+
if isinstance(self, Or) and not hasattr(self, "left") and not hasattr(self, "right"):
316+
super().__init__(left=left, right=right)
317+
311318
def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: BooleanExpression) -> BooleanExpression: # type: ignore
312319
if rest:
313320
return _build_balanced_tree(Or, (left, right, *rest))
@@ -319,10 +326,12 @@ def __new__(cls, left: BooleanExpression, right: BooleanExpression, *rest: Boole
319326
return left
320327
else:
321328
obj = super().__new__(cls)
322-
obj.left = left
323-
obj.right = right
324329
return obj
325330

331+
def __str__(self) -> str:
332+
"""Return the string representation of the Or class."""
333+
return f"{str(self.__class__.__name__)}(left={repr(self.left)}, right={repr(self.right)})"
334+
326335
def __eq__(self, other: Any) -> bool:
327336
"""Return the equality of two instances of the Or class."""
328337
return self.left == other.left and self.right == other.right if isinstance(other, Or) else False
@@ -341,22 +350,31 @@ def __getnewargs__(self) -> Tuple[BooleanExpression, BooleanExpression]:
341350
return (self.left, self.right)
342351

343352

344-
class Not(BooleanExpression):
353+
class Not(IcebergBaseModel, BooleanExpression):
345354
"""NOT operation expression - logical negation."""
346355

347-
child: BooleanExpression
356+
model_config = ConfigDict(arbitrary_types_allowed=True)
357+
358+
type: TypingLiteral["not"] = Field(default="not")
359+
child: BooleanExpression = Field()
360+
361+
def __init__(self, child: BooleanExpression, **_: Any) -> None:
362+
super().__init__(child=child)
348363

349-
def __new__(cls, child: BooleanExpression) -> BooleanExpression: # type: ignore
364+
def __new__(cls, child: BooleanExpression, **_: Any) -> BooleanExpression: # type: ignore
350365
if child is AlwaysTrue():
351366
return AlwaysFalse()
352367
elif child is AlwaysFalse():
353368
return AlwaysTrue()
354369
elif isinstance(child, Not):
355370
return child.child
356371
obj = super().__new__(cls)
357-
obj.child = child
358372
return obj
359373

374+
def __str__(self) -> str:
375+
"""Return the string representation of the Not class."""
376+
return f"Not(child={self.child})"
377+
360378
def __repr__(self) -> str:
361379
"""Return the string representation of the Not class."""
362380
return f"Not(child={repr(self.child)})"
@@ -373,8 +391,6 @@ def __getnewargs__(self) -> Tuple[BooleanExpression]:
373391
"""Pickle the Not class."""
374392
return (self.child,)
375393

376-
"""TRUE expression."""
377-
378394

379395
class AlwaysTrue(BooleanExpression, Singleton, IcebergRootModel[str]):
380396
"""TRUE expression."""
@@ -447,7 +463,20 @@ def bind(self, schema: Schema, case_sensitive: bool = True) -> BooleanExpression
447463
def as_bound(self) -> Type[BoundPredicate[L]]: ...
448464

449465

450-
class UnaryPredicate(UnboundPredicate[Any], ABC):
466+
class UnaryPredicate(IcebergBaseModel, UnboundPredicate[Any], ABC):
467+
type: str
468+
469+
model_config = {"arbitrary_types_allowed": True}
470+
471+
def __init__(self, term: Union[str, UnboundTerm[Any]]):
472+
unbound = _to_unbound_term(term)
473+
super().__init__(term=unbound)
474+
475+
def __str__(self) -> str:
476+
"""Return the string representation of the UnaryPredicate class."""
477+
# Sort to make it deterministic
478+
return f"{str(self.__class__.__name__)}(term={str(self.term)})"
479+
451480
def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundUnaryPredicate[Any]:
452481
bound_term = self.term.bind(schema, case_sensitive)
453482
return self.as_bound(bound_term)
@@ -506,6 +535,8 @@ def as_unbound(self) -> Type[NotNull]:
506535

507536

508537
class IsNull(UnaryPredicate):
538+
type: str = "is-null"
539+
509540
def __invert__(self) -> NotNull:
510541
"""Transform the Expression into its negated version."""
511542
return NotNull(self.term)
@@ -516,6 +547,8 @@ def as_bound(self) -> Type[BoundIsNull[L]]:
516547

517548

518549
class NotNull(UnaryPredicate):
550+
type: str = "not-null"
551+
519552
def __invert__(self) -> IsNull:
520553
"""Transform the Expression into its negated version."""
521554
return IsNull(self.term)
@@ -558,6 +591,8 @@ def as_unbound(self) -> Type[NotNaN]:
558591

559592

560593
class IsNaN(UnaryPredicate):
594+
type: str = "is-nan"
595+
561596
def __invert__(self) -> NotNaN:
562597
"""Transform the Expression into its negated version."""
563598
return NotNaN(self.term)
@@ -568,6 +603,8 @@ def as_bound(self) -> Type[BoundIsNaN[L]]:
568603

569604

570605
class NotNaN(UnaryPredicate):
606+
type: str = "not-nan"
607+
571608
def __invert__(self) -> IsNaN:
572609
"""Transform the Expression into its negated version."""
573610
return IsNaN(self.term)

pyiceberg/io/pyarrow.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2082,13 +2082,18 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc
20822082
self.trunc_length = trunc_length
20832083

20842084
expected_physical_type = _primitive_to_physical(iceberg_type)
2085+
2086+
# TODO: Refactor to use promotion logic
20852087
if expected_physical_type != physical_type_string:
20862088
# Allow promotable physical types
20872089
# INT32 -> INT64 and FLOAT -> DOUBLE are safe type casts
20882090
if (physical_type_string == "INT32" and expected_physical_type == "INT64") or (
20892091
physical_type_string == "FLOAT" and expected_physical_type == "DOUBLE"
20902092
):
20912093
pass
2094+
# Allow DECIMAL to be stored as FIXED_LEN_BYTE_ARRAY, INT32 or INT64
2095+
elif physical_type_string == "FIXED_LEN_BYTE_ARRAY" and expected_physical_type in ("INT32", "INT64"):
2096+
pass
20922097
else:
20932098
raise ValueError(
20942099
f"Unexpected physical type {physical_type_string} for {iceberg_type}, expected {expected_physical_type}"
@@ -2506,12 +2511,16 @@ def data_file_statistics_from_parquet_metadata(
25062511

25072512
if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
25082513
scale = stats_col.iceberg_type.scale
2509-
col_aggs[field_id].update_min(
2510-
unscaled_to_decimal(statistics.min_raw, scale)
2511-
) if statistics.min_raw is not None else None
2512-
col_aggs[field_id].update_max(
2513-
unscaled_to_decimal(statistics.max_raw, scale)
2514-
) if statistics.max_raw is not None else None
2514+
(
2515+
col_aggs[field_id].update_min(unscaled_to_decimal(statistics.min_raw, scale))
2516+
if statistics.min_raw is not None
2517+
else None
2518+
)
2519+
(
2520+
col_aggs[field_id].update_max(unscaled_to_decimal(statistics.max_raw, scale))
2521+
if statistics.max_raw is not None
2522+
else None
2523+
)
25152524
else:
25162525
col_aggs[field_id].update_min(statistics.min)
25172526
col_aggs[field_id].update_max(statistics.max)

pyiceberg/table/snapshots.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,12 @@ class Snapshot(IcebergBaseModel):
244244
manifest_list: str = Field(alias="manifest-list", description="Location of the snapshot's manifest list file")
245245
summary: Optional[Summary] = Field(default=None)
246246
schema_id: Optional[int] = Field(alias="schema-id", default=None)
247+
first_row_id: Optional[int] = Field(
248+
alias="first-row-id", default=None, description="assigned to the first row in the first data file in the first manifest"
249+
)
250+
added_rows: Optional[int] = Field(
251+
alias="added-rows", default=None, description="The upper bound of the number of rows with assigned row IDs"
252+
)
247253

248254
def __str__(self) -> str:
249255
"""Return the string representation of the Snapshot class."""
@@ -253,6 +259,22 @@ def __str__(self) -> str:
253259
result_str = f"{operation}id={self.snapshot_id}{parent_id}{schema_id}"
254260
return result_str
255261

262+
def __repr__(self) -> str:
263+
"""Return the string representation of the Snapshot class."""
264+
fields = [
265+
f"snapshot_id={self.snapshot_id}",
266+
f"parent_snapshot_id={self.parent_snapshot_id}",
267+
f"sequence_number={self.sequence_number}",
268+
f"timestamp_ms={self.timestamp_ms}",
269+
f"manifest_list='{self.manifest_list}'",
270+
f"summary={repr(self.summary)}" if self.summary else None,
271+
f"schema_id={self.schema_id}" if self.schema_id is not None else None,
272+
f"first_row_id={self.first_row_id}" if self.first_row_id is not None else None,
273+
f"added_rows={self.added_rows}" if self.added_rows is not None else None,
274+
]
275+
filtered_fields = [field for field in fields if field is not None]
276+
return f"Snapshot({', '.join(filtered_fields)})"
277+
256278
def manifests(self, io: FileIO) -> List[ManifestFile]:
257279
"""Return the manifests for the given snapshot."""
258280
return list(_manifests(io, self.manifest_list))

pyiceberg/table/update/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,13 +437,29 @@ def _(update: AddSnapshotUpdate, base_metadata: TableMetadata, context: _TableMe
437437
f"Cannot add snapshot with sequence number {update.snapshot.sequence_number} "
438438
f"older than last sequence number {base_metadata.last_sequence_number}"
439439
)
440+
elif base_metadata.format_version >= 3 and update.snapshot.first_row_id is None:
441+
raise ValueError("Cannot add snapshot without first row id")
442+
elif (
443+
base_metadata.format_version >= 3
444+
and update.snapshot.first_row_id is not None
445+
and base_metadata.next_row_id is not None
446+
and update.snapshot.first_row_id < base_metadata.next_row_id
447+
):
448+
raise ValueError(
449+
f"Cannot add a snapshot with first row id smaller than the table's next-row-id {update.snapshot.first_row_id} < {base_metadata.next_row_id}"
450+
)
440451

441452
context.add_update(update)
442453
return base_metadata.model_copy(
443454
update={
444455
"last_updated_ms": update.snapshot.timestamp_ms,
445456
"last_sequence_number": update.snapshot.sequence_number,
446457
"snapshots": base_metadata.snapshots + [update.snapshot],
458+
"next_row_id": base_metadata.next_row_id + update.snapshot.added_rows
459+
if base_metadata.format_version >= 3
460+
and base_metadata.next_row_id is not None
461+
and update.snapshot.added_rows is not None
462+
else None,
447463
}
448464
)
449465

0 commit comments

Comments
 (0)