Skip to content

Commit 48068e5

Browse files
committed
Add missing ORC iceberg.required attribute
1 parent e759044 commit 48068e5

File tree

2 files changed

+39
-0
lines changed

2 files changed

+39
-0
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@
201201
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
202202
# ORC field ID key for Iceberg field IDs in ORC metadata
203203
ORC_FIELD_ID_KEY = b"iceberg.id"
204+
# ORC field metadata key under which an Iceberg field's `required` flag is stored
ORC_FIELD_REQUIRED_KEY = b"iceberg.required"
204205
PYARROW_FIELD_DOC_KEY = b"doc"
205206
LIST_ELEMENT_NAME = "element"
206207
MAP_KEY_NAME = "key"
@@ -722,6 +723,8 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
722723
else:
723724
# Default to Parquet for backward compatibility
724725
metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id)
726+
if self._file_format == FileFormat.ORC:
727+
metadata[ORC_FIELD_REQUIRED_KEY] = str(field.required)
725728

726729
return pa.field(
727730
name=field.name,

tests/io/test_pyarrow.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3844,6 +3844,42 @@ def test_orc_schema_conversion_with_field_ids() -> None:
38443844
assert not name_field_no_ids.metadata
38453845

38463846

3847+
def test_orc_schema_conversion_with_required_attribute() -> None:
    """
    Test that schema_to_pyarrow correctly adds ORC iceberg.required attribute.

    The attribute must be absent from field metadata when converting for
    Parquet and present (reflecting each field's ``required`` flag) when
    converting for ORC.

    To run just this test:
        pytest tests/io/test_pyarrow.py -k test_orc_schema_conversion_with_required_attribute
    """
    from pyiceberg.io.pyarrow import ORC_FIELD_REQUIRED_KEY, schema_to_pyarrow
    from pyiceberg.manifest import FileFormat
    from pyiceberg.schema import Schema
    from pyiceberg.types import IntegerType, StringType

    # Define schema: one required and one optional field so both flag values are covered
    schema = Schema(
        NestedField(1, "id", IntegerType(), required=True),
        NestedField(2, "name", StringType(), required=False),
    )

    # Test 1: Specify Parquet format -- the ORC-only attribute must not leak in
    arrow_schema_default = schema_to_pyarrow(schema, file_format=FileFormat.PARQUET)

    id_field = arrow_schema_default.field(0)
    name_field = arrow_schema_default.field(1)

    assert ORC_FIELD_REQUIRED_KEY not in id_field.metadata
    assert ORC_FIELD_REQUIRED_KEY not in name_field.metadata

    # Test 2: Specify ORC format -- the attribute must be present on every field
    arrow_schema_orc = schema_to_pyarrow(schema, file_format=FileFormat.ORC)

    id_field_orc = arrow_schema_orc.field(0)
    name_field_orc = arrow_schema_orc.field(1)

    # PyArrow stores field metadata values as bytes, and the converter writes
    # str(field.required), so the stored values are b"True" / b"False".
    # An identity comparison against the bool singletons (`is True`) can never hold.
    # NOTE(review): other Iceberg ORC writers appear to use lowercase "true"/"false";
    # confirm the intended casing and update the converter and these literals together.
    assert id_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"True"
    assert name_field_orc.metadata[ORC_FIELD_REQUIRED_KEY] == b"False"
3881+
3882+
38473883
def test_orc_batching_behavior_documentation(tmp_path: Path) -> None:
38483884
"""
38493885
Document and verify PyArrow's exact batching behavior for ORC files.

0 commit comments

Comments
 (0)