|
17 | 17 |
|
18 | 18 | #include "parquet/metadata.h" |
19 | 19 |
|
| 20 | +#include <gmock/gmock.h> |
20 | 21 | #include <gtest/gtest.h> |
21 | 22 |
|
22 | 23 | #include "arrow/util/key_value_metadata.h" |
@@ -427,6 +428,45 @@ TEST(Metadata, TestReadPageIndex) { |
427 | 428 | } |
428 | 429 | } |
429 | 430 |
|
| 431 | +// Regression test: a column with max_definition_level == 0 cannot encode nulls,
| 432 | +// so a positive null_count in its statistics indicates a malformed file. The test
| 433 | +// hand-builds such a chunk and expects ColumnChunkMetaData::Make to throw on it.
| 434 | +TEST(Metadata, RejectsRequiredColumnWithNonZeroNullCount) {
| 435 | + schema::NodeVector fields;
| 436 | + fields.push_back(schema::Int32("required_col", Repetition::REQUIRED));  // only column
| 437 | + auto schema_node = std::static_pointer_cast<schema::GroupNode>(
| 438 | + schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
| 439 | +
| 440 | + SchemaDescriptor schema_descr;
| 441 | + schema_descr.Init(schema_node);
| 442 | +
| 443 | + format::ColumnChunk column_chunk;
| 444 | + format::ColumnMetaData& column_metadata = column_chunk.meta_data;  // alias into the chunk
| 445 | + column_chunk.__isset.meta_data = true;  // flag meta_data as populated
| 446 | +
| 447 | + column_metadata.type = format::Type::INT32;  // same physical type as the schema field
| 448 | + column_metadata.codec = format::CompressionCodec::UNCOMPRESSED;
| 449 | + column_metadata.num_values = 1000;  // counts/sizes/offset are not checked by the matcher below
| 450 | + column_metadata.total_uncompressed_size = 4000;
| 451 | + column_metadata.total_compressed_size = 4000;
| 452 | + column_metadata.data_page_offset = 4;
| 453 | + column_metadata.path_in_schema.push_back("required_col");  // same name as the schema field
| 454 | +
| 455 | + column_metadata.statistics.null_count = 105;  // the invalid statistic under test
| 456 | + column_metadata.statistics.__isset.null_count = true;
| 457 | + column_metadata.__isset.statistics = true;
| 458 | +
| 459 | + EXPECT_THROW_THAT(  // arguments: callable, expected exception type, matcher on the exception
| 460 | + [&]() { ColumnChunkMetaData::Make(&column_chunk, schema_descr.Column(0)); },
| 461 | + ParquetException,
| 462 | + ::testing::Property(
| 463 | + &ParquetException::what,
| 464 | + ::testing::AllOf(::testing::HasSubstr("Malformed Parquet file"),  // all four must appear
| 465 | + ::testing::HasSubstr("required_col"),  // column name set above
| 466 | + ::testing::HasSubstr("max_definition_level == 0"),
| 467 | + ::testing::HasSubstr("null_count=105"))));  // value set above
| 468 | +}
| 469 | + |
430 | 470 | TEST(Metadata, TestSortingColumns) { |
431 | 471 | schema::NodeVector fields; |
432 | 472 | fields.push_back(schema::Int32("sort_col", Repetition::REQUIRED)); |
|
0 commit comments