GH-48467: [C++][Parquet] Add configure to limit the row group size in bytes #48468
base: main
Changes from all commits
7b8e058
8142f17
e19db37
13fe7b1
0e6e303
ea88cc7
@@ -397,13 +397,18 @@ class FileWriterImpl : public FileWriter {
     if (chunk_size <= 0 && table.num_rows() > 0) {
       return Status::Invalid("chunk size per row_group must be greater than 0");
-    } else if (!table.schema()->Equals(*schema_, false)) {
+    } else if (!table.schema()->Equals(*schema_, /*check_metadata=*/false)) {
       return Status::Invalid("table schema does not match this writer's. table:'",
                              table.schema()->ToString(), "' this:'", schema_->ToString(),
                              "'");
     } else if (chunk_size > this->properties().max_row_group_length()) {
       chunk_size = this->properties().max_row_group_length();
     }
+    if (auto avg_row_size = EstimateCompressedBytesPerRow()) {
+      chunk_size = std::min(
+          chunk_size, static_cast<int64_t>(this->properties().max_row_group_bytes() /
+                                           avg_row_size.value()));
+    }

Author: The
Member: We need to clamp the chunk size between 1 and

     auto WriteRowGroup = [&](int64_t offset, int64_t size) {
       RETURN_NOT_OK(NewRowGroup());

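For orientation, here is a sketch of how the new byte limit might be set from application code. The WriterProperties::Builder setter max_row_group_bytes is an assumption made to mirror the getter used in the hunk above (a builder setter is not shown in this diff); max_row_group_length and parquet::arrow::WriteTable are existing API.

```cpp
#include <memory>
#include <string>

#include <arrow/io/file.h>
#include <arrow/memory_pool.h>
#include <arrow/result.h>
#include <arrow/status.h>
#include <arrow/table.h>
#include <parquet/arrow/writer.h>
#include <parquet/properties.h>

// Sketch only: cap each row group at 1Mi rows and (assumed setter) ~128 MiB.
arrow::Status WriteWithByteLimit(const std::shared_ptr<arrow::Table>& table,
                                 const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::FileOutputStream::Open(path));
  std::shared_ptr<parquet::WriterProperties> properties =
      parquet::WriterProperties::Builder()
          .max_row_group_length(1 << 20)            // existing row-count cap
          ->max_row_group_bytes(128 * 1024 * 1024)  // hypothetical byte cap from this PR
          ->build();
  // WriteTable's chunk_size is further clamped internally using
  // max_row_group_bytes / EstimateCompressedBytesPerRow(), per the hunk above.
  return parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/1 << 20, properties);
}
```
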
@@ -442,12 +447,8 @@ class FileWriterImpl : public FileWriter {
     return Status::OK();
   }

-  // Max number of rows allowed in a row group.
-  const int64_t max_row_group_length = this->properties().max_row_group_length();
-
   // Initialize a new buffered row group writer if necessary.
-  if (row_group_writer_ == nullptr || !row_group_writer_->buffered() ||
-      row_group_writer_->num_rows() >= max_row_group_length) {
+  if (row_group_writer_ == nullptr || !row_group_writer_->buffered()) {
     RETURN_NOT_OK(NewBufferedRowGroup());
   }

@@ -480,17 +481,24 @@ class FileWriterImpl : public FileWriter {
     return Status::OK();
   };

+  const int64_t max_row_group_length = this->properties().max_row_group_length();
+  const int64_t max_row_group_bytes = this->properties().max_row_group_bytes();

   int64_t offset = 0;
   while (offset < batch.num_rows()) {
-    const int64_t batch_size =
-        std::min(max_row_group_length - row_group_writer_->num_rows(),
-                 batch.num_rows() - offset);
-    RETURN_NOT_OK(WriteBatch(offset, batch_size));
-    offset += batch_size;
-
-    // Flush current row group writer and create a new writer if it is full.
-    if (row_group_writer_->num_rows() >= max_row_group_length &&
-        offset < batch.num_rows()) {
+    int64_t batch_size = std::min(max_row_group_length - row_group_writer_->num_rows(),
+                                  batch.num_rows() - offset);
+    if (auto avg_row_size = EstimateCompressedBytesPerRow()) {
+      int64_t buffered_bytes = row_group_writer_->EstimatedTotalCompressedBytes();
+      batch_size = std::min(
+          batch_size, static_cast<int64_t>((max_row_group_bytes - buffered_bytes) /
+                                           avg_row_size.value()));
+    }

Author: ditto.
+    if (batch_size > 0) {
+      RETURN_NOT_OK(WriteBatch(offset, batch_size));
+      offset += batch_size;
+    } else if (offset < batch.num_rows()) {
+      // Current row group is full, write remaining rows in a new group.
Member: Will it cause an infinite loop at this line if the computed batch_size stays 0?
Author: It would cause an infinite loop only when the configured max_row_group_bytes cannot hold even a single row. In that case we could return an error:

    if (batch_size == 0 && row_group_writer_->num_rows() == 0) {
      return Status::Invalid(
          "Configured max_row_group_bytes is too small to hold a single row");
    }

Member: We cannot accept an infinite loop, so perhaps we have to set the minimum batch size to 1 in this case?
Author: Setting the minimum batch size to 1 is not reasonable: when buffered_bytes > max_row_group_bytes we would still set the batch size to 1, and then we would continually append one row to the active row group and never create a new one. Returning an invalid status might be more intuitive.
Member: Shouldn't we check the row group size after writing each batch? If a large per-row size leads to a batch size of 1, we just end up checking the row group size after writing every row.
Author: We do not check the row group size after writing a batch; the current write logic is:
1. Clamp the batch size by both the remaining row budget (max_row_group_length - num_rows) and the remaining byte budget ((max_row_group_bytes - buffered_bytes) / avg_row_size).
2. If the clamped batch size is positive, write that batch and advance the offset.
3. Otherwise, if rows remain, the current row group is full, so flush it and start a new buffered row group.
In this way we don't need to check the size after writing (step 1 guarantees it), and we will not leave a possibly empty row group at the end of the batch (step 3 guarantees that).
       RETURN_NOT_OK(NewBufferedRowGroup());
     }
   }

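To make the three-step loop described above concrete, here is a self-contained sketch with the writer reduced to plain counters. PlanBatches, its parameters, and the numbers in main are invented for illustration and are not part of the Parquet API.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch of the batching loop: the row group writer is reduced to
// a row counter and an estimated-compressed-bytes counter.
std::vector<int64_t> PlanBatches(int64_t num_rows, int64_t max_row_group_length,
                                 int64_t max_row_group_bytes, double avg_row_size) {
  std::vector<int64_t> batches;
  int64_t offset = 0;
  int64_t group_rows = 0;    // rows buffered in the active row group
  double group_bytes = 0.0;  // estimated compressed bytes in the active row group
  while (offset < num_rows) {
    // Step 1: clamp by the remaining row budget and the remaining byte budget.
    int64_t batch_size = std::min(max_row_group_length - group_rows, num_rows - offset);
    batch_size = std::min(
        batch_size,
        static_cast<int64_t>((max_row_group_bytes - group_bytes) / avg_row_size));
    if (batch_size > 0) {
      // Step 2: "write" the clamped batch and advance.
      batches.push_back(batch_size);
      offset += batch_size;
      group_rows += batch_size;
      group_bytes += batch_size * avg_row_size;
    } else if (group_rows > 0) {
      // Step 3: the active row group is full; start a new, empty one.
      group_rows = 0;
      group_bytes = 0.0;
    } else {
      // The case debated above: a single row is larger than max_row_group_bytes.
      // Without special handling this would loop forever, so bail out here.
      break;
    }
  }
  return batches;
}

int main() {
  // 1000 rows, at most 400 rows or 1024 bytes per group, ~10 bytes per row:
  // expect nine groups of 102 rows followed by a final group of 82 rows.
  for (int64_t n : PlanBatches(1000, 400, 1024, 10.0)) std::cout << n << ' ';
  std::cout << '\n';
}
```
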
@@ -516,6 +524,17 @@ class FileWriterImpl : public FileWriter {
     return Status::OK();
   }

+  std::optional<double> EstimateCompressedBytesPerRow() const override {
+    if (auto value = writer_->EstimateCompressedBytesPerRow()) {
+      return value;
+    }
+    if (row_group_writer_ != nullptr && row_group_writer_->num_rows() > 0) {
+      return static_cast<double>(row_group_writer_->EstimatedTotalCompressedBytes()) /
+             row_group_writer_->num_rows();
+    }
+    return std::nullopt;
+  }

  private:
   friend class FileWriter;

@@ -68,6 +68,12 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const {
   return contents_->total_compressed_bytes_written();
 }

+int64_t RowGroupWriter::EstimatedTotalCompressedBytes() const {
+  return contents_->total_compressed_bytes() +
+         contents_->total_compressed_bytes_written() +
+         contents_->EstimatedBufferedValueBytes();
+}

Member: Are we sure we want to account for contents not serialized into a page yet?
Author: This encoding size is a reference before the first page is written, and its impact diminishes as more pages are written.
Member: I'm not sure that makes it useful in any way, though.
Member: In many common cases, the compression ratio is close to 3:1. So I used something like
Author: Do we need this logic here? We can do it in

 bool RowGroupWriter::buffered() const { return contents_->buffered(); }

 int RowGroupWriter::current_column() { return contents_->current_column(); }

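The 3:1 remark hints at one possible refinement: discount the not-yet-paged value bytes by an assumed compression ratio instead of counting them at full size. A minimal sketch of that idea follows; the constant and the free function are hypothetical, and the PR as written counts buffered value bytes undiscounted.

```cpp
#include <cstdint>

// Hypothetical constant: reviewers note ~3:1 is a common compression ratio.
constexpr double kAssumedCompressionRatio = 3.0;

// Alternative estimate for a row group's eventual compressed size: bytes already
// flushed to pages are exact, bytes still buffered as raw values are discounted
// by the assumed ratio. This is not the logic the PR implements.
int64_t EstimateRowGroupCompressedBytes(int64_t compressed_bytes_written,
                                        int64_t compressed_page_bytes_buffered,
                                        int64_t raw_value_bytes_buffered) {
  return compressed_bytes_written + compressed_page_bytes_buffered +
         static_cast<int64_t>(raw_value_bytes_buffered / kAssumedCompressionRatio);
}
```
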
@@ -195,6 +201,20 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
     return total_compressed_bytes_written;
   }

+  int64_t EstimatedBufferedValueBytes() const override {
+    if (closed_) {
+      return 0;
+    }
+    int64_t estimated_buffered_value_bytes = 0;
+    for (size_t i = 0; i < column_writers_.size(); i++) {
+      if (column_writers_[i]) {
+        estimated_buffered_value_bytes +=
+            column_writers_[i]->estimated_buffered_value_bytes();
+      }
+    }
+    return estimated_buffered_value_bytes;
+  }

   bool buffered() const override { return buffered_row_group_; }

   void Close() override {

@@ -329,6 +349,7 @@ class FileSerializer : public ParquetFileWriter::Contents {
     if (row_group_writer_) {
       num_rows_ += row_group_writer_->num_rows();
       row_group_writer_->Close();
+      written_compressed_bytes_ += row_group_writer_->total_compressed_bytes_written();
     }
     row_group_writer_.reset();

@@ -352,6 +373,8 @@ class FileSerializer : public ParquetFileWriter::Contents {
   int64_t num_rows() const override { return num_rows_; }

+  int64_t written_compressed_bytes() const override { return written_compressed_bytes_; }

   const std::shared_ptr<WriterProperties>& properties() const override {
     return properties_;
   }

@@ -360,6 +383,7 @@ class FileSerializer : public ParquetFileWriter::Contents {
     if (row_group_writer_) {
       num_rows_ += row_group_writer_->num_rows();
       row_group_writer_->Close();
+      written_compressed_bytes_ += row_group_writer_->total_compressed_bytes_written();
     }
     int16_t row_group_ordinal = -1;  // row group ordinal not set
     if (file_encryptor_ != nullptr) {

@@ -415,6 +439,7 @@ class FileSerializer : public ParquetFileWriter::Contents {
         properties_(std::move(properties)),
         num_row_groups_(0),
         num_rows_(0),
+        written_compressed_bytes_(0),
         metadata_(FileMetaDataBuilder::Make(&schema_, properties_)) {
     PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
     if (position == 0) {

@@ -468,6 +493,7 @@ class FileSerializer : public ParquetFileWriter::Contents {
   const std::shared_ptr<WriterProperties> properties_;
   int num_row_groups_;
   int64_t num_rows_;
+  int64_t written_compressed_bytes_;
   std::unique_ptr<FileMetaDataBuilder> metadata_;
   // Only one of the row group writers is active at a time
   std::unique_ptr<RowGroupWriter> row_group_writer_;

@@ -640,6 +666,29 @@ void ParquetFileWriter::AddKeyValueMetadata(
   }
 }

+std::optional<double> ParquetFileWriter::EstimateCompressedBytesPerRow() const {
+  if (contents_ && contents_->num_rows() > 0) {
+    // Use written row groups to estimate.
+    return static_cast<double>(contents_->written_compressed_bytes()) /
+           contents_->num_rows();
+  }
+  if (file_metadata_) {
+    // Use closed file metadata to estimate.
+    int64_t total_compressed_bytes = 0;
+    int64_t total_rows = 0;
+    for (int i = 0; i < file_metadata_->num_row_groups(); i++) {
+      const auto row_group = file_metadata_->RowGroup(i);
+      total_compressed_bytes += row_group->total_compressed_size();
+      total_rows += row_group->num_rows();
+    }
+    if (total_compressed_bytes == 0 || total_rows == 0) {
+      return std::nullopt;
+    }
+    return static_cast<double>(total_compressed_bytes) / total_rows;
+  }
+  return std::nullopt;
+}

 const std::shared_ptr<WriterProperties>& ParquetFileWriter::properties() const {
   if (contents_) {
     return contents_->properties();

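As a rough illustration of how this estimate feeds back into chunking (numbers invented for the example): if 200 MiB of compressed data has been written across 2,000,000 rows, EstimateCompressedBytesPerRow() returns about 105 bytes, so a max_row_group_bytes of 128 MiB caps subsequent row groups at roughly 1.28 million rows, or at max_row_group_length if that is smaller.
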
@@ -58,6 +58,9 @@ class PARQUET_EXPORT RowGroupWriter {
   virtual int64_t total_compressed_bytes() const = 0;
   /// \brief total compressed bytes written by the page writer
   virtual int64_t total_compressed_bytes_written() const = 0;
+  /// \brief estimated bytes of values that are buffered by the page writer
+  /// but not written to a page yet
+  virtual int64_t EstimatedBufferedValueBytes() const = 0;

   virtual bool buffered() const = 0;
 };

@@ -99,6 +102,8 @@ class PARQUET_EXPORT RowGroupWriter {
   int64_t total_compressed_bytes() const;
   /// \brief total compressed bytes written by the page writer
   int64_t total_compressed_bytes_written() const;
+  /// \brief Estimate total compressed bytes including written and buffered bytes.
+  int64_t EstimatedTotalCompressedBytes() const;

   /// Returns whether the current RowGroupWriter is in the buffered mode and is created
   /// by calling ParquetFileWriter::AppendBufferedRowGroup.

@@ -151,6 +156,7 @@ class PARQUET_EXPORT ParquetFileWriter {
   virtual RowGroupWriter* AppendBufferedRowGroup() = 0;

   virtual int64_t num_rows() const = 0;
+  virtual int64_t written_compressed_bytes() const = 0;

Member (comment on lines 158 to +159): Suggested change

   virtual int num_columns() const = 0;
   virtual int num_row_groups() const = 0;

@@ -207,6 +213,10 @@ class PARQUET_EXPORT ParquetFileWriter {
   void AddKeyValueMetadata(
       const std::shared_ptr<const KeyValueMetadata>& key_value_metadata);

+  /// \brief Estimate compressed bytes per row from closed row groups.
+  /// \return Estimated bytes or std::nullopt when no written row group.
+  std::optional<double> EstimateCompressedBytesPerRow() const;

Member: Unrelated to this PR: perhaps it is also useful to provide an estimation of the current file size to help downstream implement a rolling file writer.

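On the rolling-file-writer suggestion above, a rough sketch of how a caller might combine the new estimate with a size target. RollingWriter, OpenNextFile, and the threshold are invented for illustration; only EstimateCompressedBytesPerRow comes from this PR, while WriteRecordBatch and Close are existing parquet::arrow::FileWriter API.

```cpp
#include <cstdint>
#include <memory>

#include <arrow/record_batch.h>
#include <arrow/status.h>
#include <parquet/arrow/writer.h>

// Sketch: approximate the current file size as rows_written * estimated bytes per
// row, and roll to a new file once a target size is reached.
class RollingWriter {
 public:
  arrow::Status Write(const arrow::RecordBatch& batch) {
    ARROW_RETURN_NOT_OK(writer_->WriteRecordBatch(batch));
    rows_written_ += batch.num_rows();
    // The per-row estimate only becomes available once a row group has closed.
    if (auto bytes_per_row = writer_->EstimateCompressedBytesPerRow()) {
      if (rows_written_ * bytes_per_row.value() >= target_file_bytes_) {
        ARROW_RETURN_NOT_OK(writer_->Close());
        ARROW_RETURN_NOT_OK(OpenNextFile());
        rows_written_ = 0;
      }
    }
    return arrow::Status::OK();
  }

 private:
  // Placeholder: a real implementation would open a new sink and reset writer_.
  arrow::Status OpenNextFile() {
    return arrow::Status::NotImplemented("open the next output file here");
  }

  std::unique_ptr<parquet::arrow::FileWriter> writer_;
  int64_t rows_written_ = 0;
  int64_t target_file_bytes_ = 512LL * 1024 * 1024;  // roll at ~512 MiB (arbitrary)
};
```
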
   /// Number of columns.
   ///
   /// This number is fixed during the lifetime of the writer as it is determined via

These three lines are unrelated changes introduced by an inconsistent local clang-format version.