Skip to content

Commit e8fea7e

Browse files
committed
GH-48858: [C++][Parquet] Avoid re-serializing footer for signature verification
1 parent 7b99930 commit e8fea7e

File tree

3 files changed

+99
-71
lines changed

3 files changed

+99
-71
lines changed

cpp/src/parquet/file_reader.cc

Lines changed: 55 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ using arrow::internal::AddWithOverflow;
5555

5656
namespace parquet {
5757

58+
using ::arrow::Future;
59+
using ::arrow::Result;
60+
using ::arrow::Status;
61+
5862
namespace {
5963
bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) {
6064
// Check the encoding_stats to see if all data pages are dictionary encoded.
@@ -398,7 +402,7 @@ class SerializedFile : public ParquetFileReader::Contents {
398402
PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
399403
}
400404

401-
::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
405+
Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
402406
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
403407
int64_t hole_size_limit, int64_t range_size_limit) {
404408
std::vector<::arrow::io::ReadRange> ranges;
@@ -413,10 +417,10 @@ class SerializedFile : public ParquetFileReader::Contents {
413417
range_size_limit);
414418
}
415419

416-
::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
417-
const std::vector<int>& column_indices) const {
420+
Future<> WhenBuffered(const std::vector<int>& row_groups,
421+
const std::vector<int>& column_indices) const {
418422
if (!cached_source_) {
419-
return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
423+
return Status::Invalid("Must call PreBuffer before WhenBuffered");
420424
}
421425
std::vector<::arrow::io::ReadRange> ranges;
422426
for (int row : row_groups) {
@@ -465,23 +469,8 @@ class SerializedFile : public ParquetFileReader::Contents {
465469
// Fall through
466470
}
467471

468-
const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
469-
metadata_buffer, metadata_len, std::move(file_decryptor));
470-
auto file_decryption_properties = properties_.file_decryption_properties();
471-
if (is_encrypted_footer) {
472-
// Nothing else to do here.
473-
return;
474-
} else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file.
475-
if (file_decryption_properties != nullptr) {
476-
if (!file_decryption_properties->plaintext_files_allowed()) {
477-
throw ParquetException("Applying decryption properties on plaintext file");
478-
}
479-
}
480-
} else {
481-
// Encrypted file with plaintext footer mode.
482-
ParseMetaDataOfEncryptedFileWithPlaintextFooter(
483-
file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
484-
}
472+
ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer,
473+
std::move(file_decryptor));
485474
}
486475

487476
// Validate the source size and get the initial read size.
@@ -522,16 +511,15 @@ class SerializedFile : public ParquetFileReader::Contents {
522511
}
523512

524513
// Does not throw.
525-
::arrow::Future<> ParseMetaDataAsync() {
514+
Future<> ParseMetaDataAsync() {
526515
int64_t footer_read_size;
527516
BEGIN_PARQUET_CATCH_EXCEPTIONS
528517
footer_read_size = GetFooterReadSize();
529518
END_PARQUET_CATCH_EXCEPTIONS
530519
// Assumes this is kept alive externally
531520
return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
532-
.Then([this,
533-
footer_read_size](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
534-
-> ::arrow::Future<> {
521+
.Then([this, footer_read_size](
522+
const std::shared_ptr<::arrow::Buffer>& footer_buffer) -> Future<> {
535523
uint32_t metadata_len;
536524
BEGIN_PARQUET_CATCH_EXCEPTIONS
537525
metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
@@ -557,7 +545,7 @@ class SerializedFile : public ParquetFileReader::Contents {
557545
}
558546

559547
// Continuation
560-
::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
548+
Future<> ParseMaybeEncryptedMetaDataAsync(
561549
std::shared_ptr<::arrow::Buffer> footer_buffer,
562550
std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
563551
uint32_t metadata_len) {
@@ -580,26 +568,30 @@ class SerializedFile : public ParquetFileReader::Contents {
580568
file_decryptor = std::move(file_decryptor)](
581569
const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
582570
// Continue and read the file footer
583-
return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer,
584-
file_decryptor);
571+
BEGIN_PARQUET_CATCH_EXCEPTIONS
572+
ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer,
573+
file_decryptor);
574+
END_PARQUET_CATCH_EXCEPTIONS
575+
return Status::OK();
585576
});
586577
}
587-
return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
588-
is_encrypted_footer, std::move(file_decryptor));
578+
BEGIN_PARQUET_CATCH_EXCEPTIONS
579+
ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer,
580+
std::move(file_decryptor));
581+
END_PARQUET_CATCH_EXCEPTIONS
582+
return Status::OK();
589583
}
590584

591585
// Continuation
592-
::arrow::Status ParseMetaDataFinal(
593-
std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len,
594-
const bool is_encrypted_footer,
595-
std::shared_ptr<InternalFileDecryptor> file_decryptor) {
596-
BEGIN_PARQUET_CATCH_EXCEPTIONS
586+
void ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
587+
uint32_t metadata_len, const bool is_encrypted_footer,
588+
std::shared_ptr<InternalFileDecryptor> file_decryptor) {
597589
const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
598590
metadata_buffer, metadata_len, std::move(file_decryptor));
599591
auto file_decryption_properties = properties_.file_decryption_properties();
600592
if (is_encrypted_footer) {
601593
// Nothing else to do here.
602-
return ::arrow::Status::OK();
594+
return;
603595
} else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file.
604596
if (file_decryption_properties != nullptr) {
605597
if (!file_decryption_properties->plaintext_files_allowed()) {
@@ -611,8 +603,6 @@ class SerializedFile : public ParquetFileReader::Contents {
611603
ParseMetaDataOfEncryptedFileWithPlaintextFooter(
612604
file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
613605
}
614-
END_PARQUET_CATCH_EXCEPTIONS
615-
return ::arrow::Status::OK();
616606
}
617607

618608
private:
@@ -707,20 +697,16 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
707697
auto file_decryptor = std::make_shared<InternalFileDecryptor>(
708698
file_decryption_properties, file_aad, algo.algorithm,
709699
file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
710-
// set the InternalFileDecryptor in the metadata as well, as it's used
711-
// for signature verification and for ColumnChunkMetaData creation.
712-
file_metadata_->set_file_decryptor(std::move(file_decryptor));
700+
// Set the InternalFileDecryptor in the metadata as well, as it's used
701+
// for ColumnChunkMetaData creation.
702+
file_metadata_->set_file_decryptor(file_decryptor);
713703

714704
if (file_decryption_properties->check_plaintext_footer_integrity()) {
715-
if (metadata_len - read_metadata_len !=
716-
(parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
717-
throw ParquetInvalidOrCorruptedFileException(
718-
"Failed reading metadata for encryption signature (requested ",
719-
parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
720-
" bytes but have ", metadata_len - read_metadata_len, " bytes)");
721-
}
722-
723-
if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
705+
auto serialized_metadata =
706+
metadata_buffer->span_as<uint8_t>().subspan(0, read_metadata_len);
707+
auto signature = metadata_buffer->span_as<uint8_t>().subspan(read_metadata_len);
708+
if (!FileMetaData::VerifySignature(serialized_metadata, signature,
709+
file_decryptor.get())) {
724710
throw ParquetInvalidOrCorruptedFileException(
725711
"Parquet crypto signature verification failed");
726712
}
@@ -804,7 +790,7 @@ std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open(
804790
return result;
805791
}
806792

807-
::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
793+
Future<std::unique_ptr<ParquetFileReader::Contents>>
808794
ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
809795
const ReaderProperties& props,
810796
std::shared_ptr<FileMetaData> metadata) {
@@ -815,7 +801,7 @@ ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
815801
if (metadata == nullptr) {
816802
// TODO(ARROW-12259): workaround since we have Future<(move-only type)>
817803
struct {
818-
::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
804+
Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
819805
return std::move(result);
820806
}
821807

@@ -825,7 +811,7 @@ ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
825811
return file->ParseMetaDataAsync().Then(std::move(Continuation));
826812
} else {
827813
file->set_metadata(std::move(metadata));
828-
return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
814+
return Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
829815
std::move(result));
830816
}
831817
END_PARQUET_CATCH_EXCEPTIONS
@@ -855,24 +841,24 @@ std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
855841
return Open(std::move(source), props, std::move(metadata));
856842
}
857843

858-
::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
844+
Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
859845
std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
860846
std::shared_ptr<FileMetaData> metadata) {
861847
BEGIN_PARQUET_CATCH_EXCEPTIONS
862848
auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
863849
// TODO(ARROW-12259): workaround since we have Future<(move-only type)>
864-
auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
865-
fut.AddCallback([fut, completed](
866-
const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
867-
contents) mutable {
868-
if (!contents.ok()) {
869-
completed.MarkFinished(contents.status());
870-
return;
871-
}
872-
std::unique_ptr<ParquetFileReader> result = std::make_unique<ParquetFileReader>();
873-
result->Open(fut.MoveResult().MoveValueUnsafe());
874-
completed.MarkFinished(std::move(result));
875-
});
850+
auto completed = Future<std::unique_ptr<ParquetFileReader>>::Make();
851+
fut.AddCallback(
852+
[fut, completed](
853+
const Result<std::unique_ptr<ParquetFileReader::Contents>>& contents) mutable {
854+
if (!contents.ok()) {
855+
completed.MarkFinished(contents.status());
856+
return;
857+
}
858+
std::unique_ptr<ParquetFileReader> result = std::make_unique<ParquetFileReader>();
859+
result->Open(fut.MoveResult().MoveValueUnsafe());
860+
completed.MarkFinished(std::move(result));
861+
});
876862
return completed;
877863
END_PARQUET_CATCH_EXCEPTIONS
878864
}
@@ -919,7 +905,7 @@ void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups,
919905
file->PreBuffer(row_groups, column_indices, ctx, options);
920906
}
921907

922-
::arrow::Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
908+
Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
923909
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
924910
int64_t hole_size_limit, int64_t range_size_limit) {
925911
// Access private methods here
@@ -929,8 +915,8 @@ ::arrow::Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadR
929915
range_size_limit);
930916
}
931917

932-
::arrow::Future<> ParquetFileReader::WhenBuffered(
933-
const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
918+
Future<> ParquetFileReader::WhenBuffered(const std::vector<int>& row_groups,
919+
const std::vector<int>& column_indices) const {
934920
// Access private methods here
935921
SerializedFile* file =
936922
::arrow::internal::checked_cast<SerializedFile*>(contents_.get());

cpp/src/parquet/metadata.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,6 +1169,42 @@ void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
11691169
return impl_->WriteTo(dst, encryptor);
11701170
}
11711171

1172+
bool FileMetaData::VerifySignature(std::span<const uint8_t> serialized_metadata,
1173+
std::span<const uint8_t> signature,
1174+
InternalFileDecryptor* file_decryptor) {
1175+
DCHECK_NE(file_decryptor, nullptr);
1176+
1177+
// In plaintext footer, the "signature" is the concatenation of the nonce used
1178+
// for GCM encryption, and the authentication tag obtained after GCM encryption.
1179+
if (signature.size() != encryption::kGcmTagLength + encryption::kNonceLength) {
1180+
throw ParquetInvalidOrCorruptedFileException(
1181+
"Invalid footer encryption signature (expected ",
1182+
encryption::kGcmTagLength + encryption::kNonceLength, " bytes, got ",
1183+
signature.size(), ")");
1184+
}
1185+
1186+
// Encrypt plaintext serialized metadata so as to compute its signature
1187+
auto nonce = signature.subspan(0, encryption::kNonceLength);
1188+
auto tag = signature.subspan(encryption::kNonceLength);
1189+
const SecureString& key = file_decryptor->GetFooterKey();
1190+
const std::string& aad = encryption::CreateFooterAad(file_decryptor->file_aad());
1191+
1192+
auto aes_encryptor = encryption::AesEncryptor::Make(
1193+
file_decryptor->algorithm(), static_cast<int>(key.size()), /*metadata=*/true,
1194+
/*write_length=*/false);
1195+
1196+
std::shared_ptr<Buffer> encrypted_buffer =
1197+
AllocateBuffer(file_decryptor->pool(),
1198+
aes_encryptor->CiphertextLength(serialized_metadata.size()));
1199+
int32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
1200+
serialized_metadata, key.as_span(), str2span(aad), nonce,
1201+
encrypted_buffer->mutable_span_as<uint8_t>());
1202+
DCHECK_EQ(encrypted_len, encrypted_buffer->size());
1203+
// Check computed signature against expected
1204+
return 0 == memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
1205+
tag.data(), encryption::kGcmTagLength);
1206+
}
1207+
11721208
class FileCryptoMetaData::FileCryptoMetaDataImpl {
11731209
public:
11741210
FileCryptoMetaDataImpl() = default;

cpp/src/parquet/metadata.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <map>
2222
#include <memory>
2323
#include <optional>
24+
#include <span>
2425
#include <string>
2526
#include <vector>
2627

@@ -322,8 +323,8 @@ class PARQUET_EXPORT FileMetaData {
322323
EncryptionAlgorithm encryption_algorithm() const;
323324
const std::string& footer_signing_key_metadata() const;
324325

325-
/// \brief Verify signature of FileMetaData when file is encrypted but footer
326-
/// is not encrypted (plaintext footer).
326+
PARQUET_DEPRECATED(
327+
"Deprecated in 24.0.0. If you need this functionality, please report an issue.")
327328
bool VerifySignature(const void* signature);
328329

329330
void WriteTo(::arrow::io::OutputStream* dst,
@@ -383,6 +384,11 @@ class PARQUET_EXPORT FileMetaData {
383384
void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
384385
const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;
385386

387+
// Verify the signature of a plaintext footer.
388+
static bool VerifySignature(std::span<const uint8_t> serialized_metadata,
389+
std::span<const uint8_t> signature,
390+
InternalFileDecryptor* file_decryptor);
391+
386392
// PIMPL Idiom
387393
FileMetaData();
388394
class FileMetaDataImpl;

0 commit comments

Comments
 (0)