Skip to content

Commit a4bf206

Browse files
committed
Merge branch 'main' of github.com:apache/iceberg-python into fd-deserialize-expr
2 parents 2793895 + 32c97ae commit a4bf206

File tree

3 files changed: 46 additions and 3 deletions

.asf.yaml

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,9 @@ github:
4242
required_approving_review_count: 1
4343

4444
required_linear_history: true
45-
del_branch_on_merge: true
45+
pull_requests:
46+
# auto-delete head branches after being merged
47+
del_branch_on_merge: true
4648
features:
4749
wiki: true
4850
issues: true

pyiceberg/io/pyarrow.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,7 @@
196196
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
197197
# ORC field ID key for Iceberg field IDs in ORC metadata
198198
ORC_FIELD_ID_KEY = b"iceberg.id"
199+
ORC_FIELD_REQUIRED_KEY = b"iceberg.required"
199200
PYARROW_FIELD_DOC_KEY = b"doc"
200201
LIST_ELEMENT_NAME = "element"
201202
MAP_KEY_NAME = "key"
@@ -717,6 +718,8 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
717718
else:
718719
# Default to Parquet for backward compatibility
719720
metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
721+
if self._file_format == FileFormat.ORC:
722+
metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower()
720723

721724
return pa.field(
722725
name=field.name,

tests/io/test_pyarrow.py

Lines changed: 40 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3840,8 +3840,46 @@ def test_orc_schema_conversion_with_field_ids() -> None:
38403840
id_field_no_ids = arrow_schema_no_ids.field(0)
38413841
name_field_no_ids = arrow_schema_no_ids.field(1)
38423842

3843-
assert not id_field_no_ids.metadata
3844-
assert not name_field_no_ids.metadata
3843+
assert ORC_FIELD_ID_KEY not in id_field_no_ids.metadata
3844+
assert ORC_FIELD_ID_KEY not in name_field_no_ids.metadata
3845+
assert PYARROW_PARQUET_FIELD_ID_KEY not in id_field_no_ids.metadata
3846+
assert PYARROW_PARQUET_FIELD_ID_KEY not in name_field_no_ids.metadata
3847+
3848+
3849+
def test_orc_schema_conversion_with_required_attribute() -> None:
3850+
"""
3851+
Test that schema_to_pyarrow correctly adds ORC iceberg.required attribute.
3852+
To run just this test:
3853+
pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute
3854+
"""
3855+
from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
3856+
from pyiceberg.manifest import FileFormat
3857+
from pyiceberg.schema import Schema
3858+
from pyiceberg.types import IntegerType, StringType
3859+
3860+
# Define schema
3861+
schema = Schema(
3862+
NestedField(1, "id", IntegerType(), required=True),
3863+
NestedField(2, "name", StringType(), required=False),
3864+
)
3865+
3866+
# Test 1: Specify Parquet format
3867+
arrow_schema_default = schema_to_pyarrow(schema, file_format=FileFormat.PARQUET)
3868+
3869+
id_field = arrow_schema_default.field(0)
3870+
name_field = arrow_schema_default.field(1)
3871+
3872+
assert ORC_FIELD_REQUIRED_KEY not in id_field.metadata
3873+
assert ORC_FIELD_REQUIRED_KEY not in name_field.metadata
3874+
3875+
# Test 2: Specify ORC format
3876+
arrow_schema_orc = schema_to_pyarrow(schema, file_format=FileFormat.ORC)
3877+
3878+
id_field_orc = arrow_schema_orc.field(0)
3879+
name_field_orc = arrow_schema_orc.field(1)
3880+
3881+
assert id_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"true"
3882+
assert name_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"false"
38453883

38463884

38473885
def test_orc_batching_behavior_documentation(tmp_path: Path) -> None:

0 commit comments

Comments
 (0)