Commit ab43c6c

fix KeyError raised by add_files when parquet file does not have column stats (#1354)
* fix KeyError by switching del to pop
* added unit test
* update test
* fix Python 3.9 compatibility, and refactor test
* update test
1 parent cc1ab2c commit ab43c6c

File tree

2 files changed: +43 -4 lines changed


pyiceberg/io/pyarrow.py

Lines changed: 2 additions & 2 deletions
@@ -2397,8 +2397,8 @@ def data_file_statistics_from_parquet_metadata(
     split_offsets.sort()
 
     for field_id in invalidate_col:
-        del col_aggs[field_id]
-        del null_value_counts[field_id]
+        col_aggs.pop(field_id, None)
+        null_value_counts.pop(field_id, None)
 
     return DataFileStatistics(
         record_count=parquet_metadata.num_rows,
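
The hunk above replaces del with dict.pop and a default, so dropping a field id that was never populated (for example, because the parquet writer skipped statistics for that column) becomes a no-op instead of a KeyError. A minimal standalone sketch of the difference, using illustrative values rather than real pyiceberg state:

# Illustrative values only: field 2 stands in for a column whose parquet
# statistics were never written, so it has no entry in either dict.
col_aggs = {1: "aggregate-for-field-1"}
null_value_counts = {1: 0}
invalidate_col = {2}

for field_id in invalidate_col:
    # del col_aggs[field_id] would raise KeyError here, because field 2 is absent.
    col_aggs.pop(field_id, None)            # pop with a default ignores missing keys
    null_value_counts.pop(field_id, None)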

tests/io/test_pyarrow_stats.py

Lines changed: 41 additions & 2 deletions
@@ -81,7 +81,9 @@ class TestStruct:
     y: Optional[float]
 
 
-def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+def construct_test_table(
+    write_statistics: Union[bool, List[str]] = True,
+) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
     table_metadata = {
         "format-version": 2,
         "location": "s3://bucket/test/location",
@@ -169,7 +171,9 @@ def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, Tabl
     metadata_collector: List[Any] = []
 
     with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+        with pq.ParquetWriter(
+            f, table.schema, metadata_collector=metadata_collector, write_statistics=write_statistics
+        ) as writer:
             writer.write_table(table)
 
     return metadata_collector[0], table_metadata
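
The ParquetWriter call above forwards write_statistics, which pyarrow accepts either as a bool or as a list of column names to collect statistics for. A small self-contained sketch of that parameter, independent of the test helper and with illustrative table contents:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"strings": ["aaaa", "zzzz"], "ints": [1, 2]})

buf = pa.BufferOutputStream()
# Statistics are written only for the "strings" column; "ints" gets none.
pq.write_table(table, buf, write_statistics=["strings"])

meta = pq.read_metadata(pa.BufferReader(buf.getvalue()))
strings_col = meta.row_group(0).column(0)
ints_col = meta.row_group(0).column(1)
assert strings_col.is_stats_set and strings_col.statistics is not None
assert not ints_col.is_stats_set and ints_col.statistics is None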
@@ -681,6 +685,41 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]
 
 
+def test_read_missing_statistics() -> None:
+    # write statistics only for the "strings" column
+    metadata, table_metadata = construct_test_table(write_statistics=["strings"])
+
+    # expect only the "strings" column to have statistics in metadata
+    # and all other columns to have no statistics
+    for r in range(metadata.num_row_groups):
+        for pos in range(metadata.num_columns):
+            if metadata.row_group(r).column(pos).path_in_schema == "strings":
+                assert metadata.row_group(r).column(pos).is_stats_set is True
+                assert metadata.row_group(r).column(pos).statistics is not None
+            else:
+                assert metadata.row_group(r).column(pos).is_stats_set is False
+                assert metadata.row_group(r).column(pos).statistics is None
+
+    schema = get_current_schema(table_metadata)
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=metadata,
+        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(schema),
+    )
+
+    datafile = DataFile(**statistics.to_serialized_dict())
+
+    # expect only the "strings" column values to be reflected in the
+    # upper_bound, lower_bound and null_value_counts props of datafile
+    string_col_idx = 1
+    assert len(datafile.lower_bounds) == 1
+    assert datafile.lower_bounds[string_col_idx].decode() == "aaaaaaaaaaaaaaaa"
+    assert len(datafile.upper_bounds) == 1
+    assert datafile.upper_bounds[string_col_idx].decode() == "zzzzzzzzzzzzzzz{"
+    assert len(datafile.null_value_counts) == 1
+    assert datafile.null_value_counts[string_col_idx] == 1
+
+
 # This is commented out for now because write_to_dataset drops the partition
 # columns making it harder to calculate the mapping from the column index to
 # datatype id
