Skip to content

Commit 76111a6

Browse files
committed
update
1 parent 0600621 commit 76111a6

2 files changed

Lines changed: 53 additions & 0 deletions

File tree

cpp/src/parquet/metadata.cc

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,19 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
307307
possible_encoded_stats_ = nullptr;
308308
possible_geo_stats_ = nullptr;
309309
InitKeyValueMetadata();
310+
311+
// Per the Parquet specification, a column with max_definition_level == 0
312+
// is required at every level and therefore cannot contain null values.
313+
// Reject inconsistent metadata from writers violating this invariant.
314+
if (descr_->max_definition_level() == 0 && column_metadata_->__isset.statistics &&
315+
column_metadata_->statistics.__isset.null_count &&
316+
column_metadata_->statistics.null_count > 0) {
317+
std::stringstream ss;
318+
ss << "Malformed Parquet file: column '" << descr_->path()->ToDotString()
319+
<< "' has max_definition_level == 0 but statistics report null_count="
320+
<< column_metadata_->statistics.null_count;
321+
throw ParquetException(ss.str());
322+
}
310323
}
311324

312325
bool Equals(const ColumnChunkMetaDataImpl& other) const {

cpp/src/parquet/metadata_test.cc

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "parquet/metadata.h"
1919

20+
#include <gmock/gmock.h>
2021
#include <gtest/gtest.h>
2122

2223
#include "arrow/util/key_value_metadata.h"
@@ -427,6 +428,45 @@ TEST(Metadata, TestReadPageIndex) {
427428
}
428429
}
429430

431+
// Regression test: a column with max_definition_level == 0 cannot encode
432+
// nulls, so any positive null_count in its statistics indicates a malformed
433+
// file. The reader should reject it when the metadata is loaded.
434+
TEST(Metadata, RejectsRequiredColumnWithNonZeroNullCount) {
435+
schema::NodeVector fields;
436+
fields.push_back(schema::Int32("required_col", Repetition::REQUIRED));  // single REQUIRED INT32 leaf
437+
auto schema_node = std::static_pointer_cast<schema::GroupNode>(
438+
schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));  // REQUIRED root group wrapping the leaf
439+
440+
SchemaDescriptor schema_descr;
441+
schema_descr.Init(schema_node);  // Column(0) below is looked up from this descriptor
442+
443+
format::ColumnChunk column_chunk;  // Thrift-level column chunk built by hand for this test
444+
format::ColumnMetaData& column_metadata = column_chunk.meta_data;  // alias for the nested metadata struct
445+
column_chunk.__isset.meta_data = true;  // flag the nested meta_data field as present
446+
447+
column_metadata.type = format::Type::INT32;  // same physical type as the schema leaf
448+
column_metadata.codec = format::CompressionCodec::UNCOMPRESSED;
449+
column_metadata.num_values = 1000;
450+
column_metadata.total_uncompressed_size = 4000;
451+
column_metadata.total_compressed_size = 4000;
452+
column_metadata.data_page_offset = 4;
453+
column_metadata.path_in_schema.push_back("required_col");  // same name as the schema leaf
454+
455+
column_metadata.statistics.null_count = 105;  // positive null count: the inconsistency under test
456+
column_metadata.statistics.__isset.null_count = true;  // both __isset flags are set so the null_count
457+
column_metadata.__isset.statistics = true;             // reads as present rather than absent
458+
459+
EXPECT_THROW_THAT(  // Make() must throw ParquetException whose what() contains every substring below
460+
[&]() { ColumnChunkMetaData::Make(&column_chunk, schema_descr.Column(0)); },
461+
ParquetException,
462+
::testing::Property(
463+
&ParquetException::what,
464+
::testing::AllOf(::testing::HasSubstr("Malformed Parquet file"),
465+
::testing::HasSubstr("required_col"),  // the offending column is named in the message
466+
::testing::HasSubstr("max_definition_level == 0"),
467+
::testing::HasSubstr("null_count=105"))));  // the reported value is echoed back
468+
}
469+
430470
TEST(Metadata, TestSortingColumns) {
431471
schema::NodeVector fields;
432472
fields.push_back(schema::Int32("sort_col", Repetition::REQUIRED));

0 commit comments

Comments
 (0)