@@ -81,7 +81,9 @@ class TestStruct:
     y: Optional[float]


-def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+def construct_test_table(
+    write_statistics: Union[bool, List[str]] = True,
+) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
     table_metadata = {
         "format-version": 2,
         "location": "s3://bucket/test/location",
@@ -169,7 +171,9 @@ def construct_test_table() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, Tabl
     metadata_collector: List[Any] = []

     with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+        with pq.ParquetWriter(
+            f, table.schema, metadata_collector=metadata_collector, write_statistics=write_statistics
+        ) as writer:
             writer.write_table(table)

     return metadata_collector[0], table_metadata
@@ -681,6 +685,41 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


+def test_read_missing_statistics() -> None:
+    # write statistics only for the "strings" column
+    metadata, table_metadata = construct_test_table(write_statistics=["strings"])
+
+    # expect only the "strings" column to have statistics in metadata
+    # and all other columns to have no statistics
+    for r in range(metadata.num_row_groups):
+        for pos in range(metadata.num_columns):
+            if metadata.row_group(r).column(pos).path_in_schema == "strings":
+                assert metadata.row_group(r).column(pos).is_stats_set is True
+                assert metadata.row_group(r).column(pos).statistics is not None
+            else:
+                assert metadata.row_group(r).column(pos).is_stats_set is False
+                assert metadata.row_group(r).column(pos).statistics is None
+
+    schema = get_current_schema(table_metadata)
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=metadata,
+        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(schema),
+    )
+
+    datafile = DataFile(**statistics.to_serialized_dict())
+
+    # expect only the "strings" column values to be reflected in the
+    # lower_bounds, upper_bounds and null_value_counts props of datafile
+    string_col_idx = 1
+    assert len(datafile.lower_bounds) == 1
+    assert datafile.lower_bounds[string_col_idx].decode() == "aaaaaaaaaaaaaaaa"
+    assert len(datafile.upper_bounds) == 1
+    assert datafile.upper_bounds[string_col_idx].decode() == "zzzzzzzzzzzzzzz{"
+    assert len(datafile.null_value_counts) == 1
+    assert datafile.null_value_counts[string_col_idx] == 1
+
+
 # This is commented out for now because write_to_dataset drops the partition
 # columns making it harder to calculate the mapping from the column index to
 # datatype id
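For reference, the `write_statistics` option threaded through `construct_test_table` is pyarrow's own: `pq.ParquetWriter` accepts either a bool or a list of column names, and emits column-chunk statistics only for the listed columns. A minimal standalone sketch of that behavior (the table contents and variable names below are illustrative, not taken from the fixture above):

import pyarrow as pa
import pyarrow.parquet as pq

# two-column table; "strings" contains one null so null_count is observable
table = pa.table({"strings": ["a", None], "ints": [1, 2]})

collector = []  # receives one pq.FileMetaData per written file
with pa.BufferOutputStream() as sink:
    with pq.ParquetWriter(sink, table.schema, metadata_collector=collector, write_statistics=["strings"]) as writer:
        writer.write_table(table)

meta = collector[0]
strings_col = meta.row_group(0).column(0)
ints_col = meta.row_group(0).column(1)
assert strings_col.is_stats_set and strings_col.statistics.null_count == 1
assert not ints_col.is_stats_set and ints_col.statistics is None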