72 changes: 69 additions & 3 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -378,19 +378,19 @@ const double test_traits<::arrow::DoubleType>::value(4.2);
template <>
struct test_traits<::arrow::StringType> {
static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY;
static std::string const value;
static const std::string value;
Member:

These three lines are unrelated changes introduced by an inconsistent local clang-format version.

};

template <>
struct test_traits<::arrow::BinaryType> {
static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY;
static std::string const value;
static const std::string value;
};

template <>
struct test_traits<::arrow::FixedSizeBinaryType> {
static constexpr ParquetType::type parquet_enum = ParquetType::FIXED_LEN_BYTE_ARRAY;
static std::string const value;
static const std::string value;
};

const std::string test_traits<::arrow::StringType>::value("Test"); // NOLINT
@@ -5794,6 +5794,72 @@ TEST(TestArrowReadWrite, WriteRecordBatchNotProduceEmptyRowGroup) {
}
}

TEST(TestArrowReadWrite, WriteRecordBatchFlushRowGroupByBufferedSize) {
auto pool = ::arrow::default_memory_pool();
auto sink = CreateOutputStream();
// Limit the max bytes in a row group to 100 so that each batch produces a new group.
auto writer_properties = WriterProperties::Builder().max_row_group_bytes(100)->build();
auto arrow_writer_properties = default_arrow_writer_properties();

// Prepare schema
auto schema = ::arrow::schema({::arrow::field("a", ::arrow::int64())});
std::shared_ptr<SchemaDescriptor> parquet_schema;
ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties,
*arrow_writer_properties, &parquet_schema));
auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());

auto gen = ::arrow::random::RandomArrayGenerator(/*seed=*/42);

// Create writer to write data via RecordBatch.
ASSERT_OK_AND_ASSIGN(auto arrow_writer, parquet::arrow::FileWriter::Open(
*schema, pool, sink, writer_properties,
arrow_writer_properties));
// NewBufferedRowGroup() is not called explicitly; it will be called
// inside WriteRecordBatch().
for (int i = 0; i < 5; ++i) {
auto record_batch =
gen.BatchOf({::arrow::field("a", ::arrow::int64())}, /*length=*/1);
ASSERT_OK_NO_THROW(arrow_writer->WriteRecordBatch(*record_batch));
}
ASSERT_OK_NO_THROW(arrow_writer->Close());
ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());

auto file_metadata = arrow_writer->metadata();
EXPECT_EQ(5, file_metadata->num_row_groups());
for (int i = 0; i < 5; ++i) {
EXPECT_EQ(1, file_metadata->RowGroup(i)->num_rows());
}
}

TEST(TestArrowReadWrite, WriteTableFlushRowGroupByBufferedSize) {
auto pool = ::arrow::default_memory_pool();
auto sink = CreateOutputStream();
// Limit the max bytes in a row group to 100, so the first table produces one row
// group and the second table produces 5 row groups.
auto writer_properties = WriterProperties::Builder().max_row_group_bytes(100)->build();
auto arrow_writer_properties = default_arrow_writer_properties();

// Prepare schema
auto schema = ::arrow::schema({::arrow::field("a", ::arrow::int64())});
auto table = ::arrow::Table::Make(
schema, {::arrow::ArrayFromJSON(::arrow::int64(), R"([1, 2, 3, 4, 5])")});
ASSERT_OK_AND_ASSIGN(auto arrow_writer, parquet::arrow::FileWriter::Open(
*schema, pool, sink, writer_properties,
arrow_writer_properties));
for (int i = 0; i < 2; ++i) {
ASSERT_OK_NO_THROW(arrow_writer->WriteTable(*table, 5));
}
ASSERT_OK_NO_THROW(arrow_writer->Close());
ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());

auto file_metadata = arrow_writer->metadata();
EXPECT_EQ(6, file_metadata->num_row_groups());
EXPECT_EQ(5, file_metadata->RowGroup(0)->num_rows());
for (int i = 1; i < 6; ++i) {
EXPECT_EQ(1, file_metadata->RowGroup(i)->num_rows());
}
}

TEST(TestArrowReadWrite, MultithreadedWrite) {
const int num_columns = 20;
const int num_rows = 1000;
49 changes: 34 additions & 15 deletions cpp/src/parquet/arrow/writer.cc
@@ -397,13 +397,18 @@ class FileWriterImpl : public FileWriter {

if (chunk_size <= 0 && table.num_rows() > 0) {
return Status::Invalid("chunk size per row_group must be greater than 0");
} else if (!table.schema()->Equals(*schema_, false)) {
} else if (!table.schema()->Equals(*schema_, /*check_metadata=*/false)) {
return Status::Invalid("table schema does not match this writer's. table:'",
table.schema()->ToString(), "' this:'", schema_->ToString(),
"'");
} else if (chunk_size > this->properties().max_row_group_length()) {
chunk_size = this->properties().max_row_group_length();
}
if (auto avg_row_size = EstimateCompressedBytesPerRow()) {
chunk_size = std::min(
chunk_size, static_cast<int64_t>(this->properties().max_row_group_bytes() /
avg_row_size.value()));
Author:

The chunk_size could be 0 if the configured max_row_group_bytes is less than avg_row_size. Do we need an extra check here?

Member:

We need to clamp the chunk size between 1 and max_row_group_bytes/avg_row_size.
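
A minimal sketch of that clamping, reusing the names from this diff (std::clamp comes from <algorithm>); this is an illustration, not the final implementation:

    // Hypothetical clamping: keep chunk_size within [1, max_row_group_bytes / avg_row_size].
    if (auto avg_row_size = EstimateCompressedBytesPerRow()) {
      const auto rows_per_group = static_cast<int64_t>(
          this->properties().max_row_group_bytes() / avg_row_size.value());
      chunk_size = std::clamp<int64_t>(chunk_size, 1, std::max<int64_t>(1, rows_per_group));
    }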

}

auto WriteRowGroup = [&](int64_t offset, int64_t size) {
RETURN_NOT_OK(NewRowGroup());
@@ -442,12 +447,8 @@
return Status::OK();
}

// Max number of rows allowed in a row group.
const int64_t max_row_group_length = this->properties().max_row_group_length();

// Initialize a new buffered row group writer if necessary.
if (row_group_writer_ == nullptr || !row_group_writer_->buffered() ||
row_group_writer_->num_rows() >= max_row_group_length) {
if (row_group_writer_ == nullptr || !row_group_writer_->buffered()) {
RETURN_NOT_OK(NewBufferedRowGroup());
}

@@ -480,17 +481,24 @@
return Status::OK();
};

const int64_t max_row_group_length = this->properties().max_row_group_length();
const int64_t max_row_group_bytes = this->properties().max_row_group_bytes();

int64_t offset = 0;
while (offset < batch.num_rows()) {
const int64_t batch_size =
std::min(max_row_group_length - row_group_writer_->num_rows(),
batch.num_rows() - offset);
RETURN_NOT_OK(WriteBatch(offset, batch_size));
offset += batch_size;

// Flush current row group writer and create a new writer if it is full.
if (row_group_writer_->num_rows() >= max_row_group_length &&
offset < batch.num_rows()) {
int64_t batch_size = std::min(max_row_group_length - row_group_writer_->num_rows(),
batch.num_rows() - offset);
if (auto avg_row_size = EstimateCompressedBytesPerRow()) {
int64_t buffered_bytes = row_group_writer_->EstimatedTotalCompressedBytes();
batch_size = std::min(
batch_size, static_cast<int64_t>((max_row_group_bytes - buffered_bytes) /
avg_row_size.value()));
Author:

ditto.

}
if (batch_size > 0) {
RETURN_NOT_OK(WriteBatch(offset, batch_size));
offset += batch_size;
} else if (offset < batch.num_rows()) {
// The current row group is full; write the remaining rows in a new group.
Member:

Will this cause an infinite loop at this line if batch_size is always 0?

Author:

It would cause an infinite loop only when max_row_group_bytes / avg_row_size is 0. Is it OK to return an Invalid status from WriteXxx() in that case?

        if (batch_size == 0 && row_group_writer_->num_rows() == 0) {
          return Status::Invalid(
              "Configured max_row_group_bytes is too small to hold a single row");
        }

Member:

We cannot accept an infinite loop, so perhaps we have to set the minimum batch size to 1 in this case?

Author:

Setting the minimum batch size to 1 is not reasonable: when buffered_bytes > max_row_group_bytes we would still set the batch size to 1, continually appending one row to the active row group and never creating a new one. Returning an Invalid status might be more intuitive.

Member:

Shouldn't we check the row group size after writing each batch? If a large per-row size leads to a batch size of 1, we just end up checking the row group size after writing every row.

Author:

We do not check the row group size after writing a batch; the current write logic is:

  1. check rows and bytes to determine the current batch_size
  2. if batch_size > 0, write these rows to the current row group; it is guaranteed not to exceed the row group limits
  3. if batch_size == 0 and there are still rows to write, start a new row group
  4. loop back to step 1

In this way we don't need to check the size after writing (it's guaranteed in step 1), and we won't leave a possibly empty row group at the end of the batch (that's guaranteed in step 3).
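
To make those four steps concrete, here is a toy, self-contained model of the loop; the limits, the per-row estimate, and all names are made up for illustration and this is not the actual FileWriterImpl code:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      const int64_t num_rows = 10;
      const int64_t max_row_group_length = 4;  // row cap per group
      const int64_t max_row_group_bytes = 64;  // byte cap per group
      const double avg_row_size = 20.0;        // pretend per-row estimate

      int64_t offset = 0;
      int64_t group_rows = 0;  // rows in the active row group
      std::vector<int64_t> groups;

      while (offset < num_rows) {
        // Step 1: determine batch_size from both the row and the byte limits.
        int64_t batch_size =
            std::min(max_row_group_length - group_rows, num_rows - offset);
        const auto buffered_bytes = static_cast<int64_t>(group_rows * avg_row_size);
        batch_size = std::min(
            batch_size,
            static_cast<int64_t>((max_row_group_bytes - buffered_bytes) / avg_row_size));
        if (batch_size > 0) {
          // Step 2: these rows are guaranteed to fit in the active row group.
          group_rows += batch_size;
          offset += batch_size;
        } else {
          // Step 3: the active group is full; start a new one. (As discussed above,
          // a single row larger than max_row_group_bytes would loop forever here
          // without an extra guard.)
          groups.push_back(group_rows);
          group_rows = 0;
        }
        // Step 4: loop back to step 1.
      }
      groups.push_back(group_rows);

      for (size_t i = 0; i < groups.size(); ++i) {
        std::cout << "row group " << i << ": " << groups[i] << " rows\n";
      }
      return 0;
    }

With these numbers the loop produces four row groups of 3, 3, 3, and 1 rows.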

RETURN_NOT_OK(NewBufferedRowGroup());
}
}
@@ -516,6 +524,17 @@
return Status::OK();
}

std::optional<double> EstimateCompressedBytesPerRow() const override {
if (auto value = writer_->EstimateCompressedBytesPerRow()) {
return value;
}
if (row_group_writer_ != nullptr && row_group_writer_->num_rows() > 0) {
return static_cast<double>(row_group_writer_->EstimatedTotalCompressedBytes()) /
row_group_writer_->num_rows();
}
return std::nullopt;
}

private:
friend class FileWriter;

9 changes: 6 additions & 3 deletions cpp/src/parquet/arrow/writer.h
@@ -111,9 +111,9 @@ class PARQUET_EXPORT FileWriter {
/// Multiple RecordBatches can be written into the same row group
/// through this method.
///
/// WriterProperties.max_row_group_length() is respected and a new
/// row group will be created if the current row group exceeds the
/// limit.
/// WriterProperties.max_row_group_length() and WriterProperties.max_row_group_bytes()
/// are respected and a new row group will be created if the current row group exceeds
/// the limits.
///
/// Batches get flushed to the output stream once NewBufferedRowGroup()
/// or Close() is called.
@@ -139,6 +139,9 @@
/// `store_schema` being unusable during read.
virtual ::arrow::Status AddKeyValueMetadata(
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;
/// \brief Estimate compressed bytes per row from data written so far.
/// \note std::nullopt will be returned if no rows have been written.
virtual std::optional<double> EstimateCompressedBytesPerRow() const = 0;
/// \brief Return the file metadata, only available after calling Close().
virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
};
49 changes: 49 additions & 0 deletions cpp/src/parquet/file_writer.cc
@@ -68,6 +68,12 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const {
return contents_->total_compressed_bytes_written();
}

int64_t RowGroupWriter::EstimatedTotalCompressedBytes() const {
return contents_->total_compressed_bytes() +
contents_->total_compressed_bytes_written() +
contents_->EstimatedBufferedValueBytes();
Member:

EstimatedBufferedValueBytes does not account for compression and may therefore wildly overestimate the final compressed size?

Are we sure we want to account for contents not serialized into a page yet?

Author:

This encoded size serves as a reference before the first page is written, and its impact diminishes as more pages are written.

Member:

I'm not sure that makes it useful in any way, though.

Member:

In many common cases the compression ratio is close to 3:1, so in the past I used something like total_compressed_bytes + total_compressed_bytes_written + EstimatedBufferedValueBytes / (codec_type != NONE ? 3 : 1) as an empirical value.
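
As a sketch, that heuristic could be expressed as a small helper; the 3:1 ratio is only the empirical guess from this comment, not something measured:

    #include <cstdint>

    // Hypothetical variant of the estimate that discounts not-yet-compressed
    // buffered values by an assumed ~3:1 compression ratio.
    int64_t EstimateTotalCompressedBytes(int64_t compressed_bytes,
                                         int64_t compressed_bytes_written,
                                         int64_t buffered_value_bytes,
                                         bool has_compression) {
      const int64_t assumed_ratio = has_compression ? 3 : 1;
      return compressed_bytes + compressed_bytes_written +
             buffered_value_bytes / assumed_ratio;
    }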

Author:

Do we need this logic here? We could do it in RowGroupWriter::Contents::EstimatedBufferedValueBytes() and rename it to something like EstimatedCompressedBufferedBytes().

}

bool RowGroupWriter::buffered() const { return contents_->buffered(); }

int RowGroupWriter::current_column() { return contents_->current_column(); }
@@ -195,6 +201,20 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
return total_compressed_bytes_written;
}

int64_t EstimatedBufferedValueBytes() const override {
if (closed_) {
return 0;
}
int64_t estimated_buffered_value_bytes = 0;
for (size_t i = 0; i < column_writers_.size(); i++) {
if (column_writers_[i]) {
estimated_buffered_value_bytes +=
column_writers_[i]->estimated_buffered_value_bytes();
}
}
return estimated_buffered_value_bytes;
}

bool buffered() const override { return buffered_row_group_; }

void Close() override {
@@ -329,6 +349,7 @@ class FileSerializer : public ParquetFileWriter::Contents {
if (row_group_writer_) {
num_rows_ += row_group_writer_->num_rows();
row_group_writer_->Close();
written_compressed_bytes_ += row_group_writer_->total_compressed_bytes_written();
}
row_group_writer_.reset();

@@ -352,6 +373,8 @@

int64_t num_rows() const override { return num_rows_; }

int64_t written_compressed_bytes() const override { return written_compressed_bytes_; }

const std::shared_ptr<WriterProperties>& properties() const override {
return properties_;
}
@@ -360,6 +383,7 @@
if (row_group_writer_) {
num_rows_ += row_group_writer_->num_rows();
row_group_writer_->Close();
written_compressed_bytes_ += row_group_writer_->total_compressed_bytes_written();
}
int16_t row_group_ordinal = -1; // row group ordinal not set
if (file_encryptor_ != nullptr) {
@@ -415,6 +439,7 @@
properties_(std::move(properties)),
num_row_groups_(0),
num_rows_(0),
written_compressed_bytes_(0),
metadata_(FileMetaDataBuilder::Make(&schema_, properties_)) {
PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
if (position == 0) {
@@ -468,6 +493,7 @@
const std::shared_ptr<WriterProperties> properties_;
int num_row_groups_;
int64_t num_rows_;
int64_t written_compressed_bytes_;
std::unique_ptr<FileMetaDataBuilder> metadata_;
// Only one of the row group writers is active at a time
std::unique_ptr<RowGroupWriter> row_group_writer_;
@@ -640,6 +666,29 @@ void ParquetFileWriter::AddKeyValueMetadata(
}
}

std::optional<double> ParquetFileWriter::EstimateCompressedBytesPerRow() const {
if (contents_ && contents_->num_rows() > 0) {
// Use written row groups to estimate.
return static_cast<double>(contents_->written_compressed_bytes()) /
contents_->num_rows();
}
if (file_metadata_) {
// Use closed file metadata to estimate.
int64_t total_compressed_bytes = 0;
int64_t total_rows = 0;
for (int i = 0; i < file_metadata_->num_row_groups(); i++) {
const auto row_group = file_metadata_->RowGroup(i);
total_compressed_bytes += row_group->total_compressed_size();
total_rows += row_group->num_rows();
}
if (total_compressed_bytes == 0 || total_rows == 0) {
return std::nullopt;
}
return static_cast<double>(total_compressed_bytes) / total_rows;
}
return std::nullopt;
}

const std::shared_ptr<WriterProperties>& ParquetFileWriter::properties() const {
if (contents_) {
return contents_->properties();
10 changes: 10 additions & 0 deletions cpp/src/parquet/file_writer.h
@@ -58,6 +58,9 @@ class PARQUET_EXPORT RowGroupWriter {
virtual int64_t total_compressed_bytes() const = 0;
/// \brief total compressed bytes written by the page writer
virtual int64_t total_compressed_bytes_written() const = 0;
/// \brief estimated bytes of values that are buffered by the page writer
/// but not written to a page yet
virtual int64_t EstimatedBufferedValueBytes() const = 0;

virtual bool buffered() const = 0;
};
@@ -99,6 +102,8 @@
int64_t total_compressed_bytes() const;
/// \brief total compressed bytes written by the page writer
int64_t total_compressed_bytes_written() const;
/// \brief Estimate total compressed bytes including written and buffered bytes.
int64_t EstimatedTotalCompressedBytes() const;

/// Returns whether the current RowGroupWriter is in the buffered mode and is created
/// by calling ParquetFileWriter::AppendBufferedRowGroup.
@@ -151,6 +156,7 @@ class PARQUET_EXPORT ParquetFileWriter {
virtual RowGroupWriter* AppendBufferedRowGroup() = 0;

virtual int64_t num_rows() const = 0;
virtual int64_t written_compressed_bytes() const = 0;
Comment on lines 158 to +159
Member:
Suggested change:

    -virtual int64_t num_rows() const = 0;
    -virtual int64_t written_compressed_bytes() const = 0;
    +virtual int64_t written_compressed_bytes() const = 0;
    +virtual int64_t num_rows() const = 0;

virtual int num_columns() const = 0;
virtual int num_row_groups() const = 0;

@@ -207,6 +213,10 @@
void AddKeyValueMetadata(
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata);

/// \brief Estimate compressed bytes per row from closed row groups.
/// \return Estimated bytes per row, or std::nullopt when no row group has been written.
std::optional<double> EstimateCompressedBytesPerRow() const;
Member:
Unrelated to this PR: perhaps it would also be useful to provide an estimate of the current file size, to help downstream users implement a rolling file writer.
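
A rough sketch of what such a rolling writer could look like, built on the FileWriter API touched by this PR; the RollingWriter class, the file-naming scheme, and the threshold handling are all hypothetical:

    #include <cstdint>
    #include <memory>
    #include <string>
    #include <utility>

    #include "arrow/io/file.h"
    #include "arrow/record_batch.h"
    #include "arrow/result.h"
    #include "arrow/status.h"
    #include "parquet/arrow/writer.h"
    #include "parquet/properties.h"

    // Closes the current file and opens a new one once a size estimate derived
    // from EstimateCompressedBytesPerRow() crosses a threshold.
    class RollingWriter {
     public:
      RollingWriter(std::shared_ptr<::arrow::Schema> schema,
                    std::shared_ptr<parquet::WriterProperties> props,
                    std::string base_path, int64_t max_file_bytes)
          : schema_(std::move(schema)),
            props_(std::move(props)),
            base_path_(std::move(base_path)),
            max_file_bytes_(max_file_bytes) {}

      ::arrow::Status Write(const ::arrow::RecordBatch& batch) {
        if (writer_ == nullptr) {
          ARROW_ASSIGN_OR_RAISE(
              auto sink, ::arrow::io::FileOutputStream::Open(
                             base_path_ + "-" + std::to_string(file_index_++) + ".parquet"));
          ARROW_ASSIGN_OR_RAISE(
              writer_, parquet::arrow::FileWriter::Open(
                           *schema_, ::arrow::default_memory_pool(), sink, props_,
                           parquet::default_arrow_writer_properties()));
          rows_in_file_ = 0;
        }
        ARROW_RETURN_NOT_OK(writer_->WriteRecordBatch(batch));
        rows_in_file_ += batch.num_rows();
        // Roll to a new file once the rough size estimate crosses the threshold.
        if (auto per_row = writer_->EstimateCompressedBytesPerRow();
            per_row && rows_in_file_ * per_row.value() >= max_file_bytes_) {
          ARROW_RETURN_NOT_OK(writer_->Close());
          writer_.reset();
        }
        return ::arrow::Status::OK();
      }

      ::arrow::Status Close() {
        return writer_ ? writer_->Close() : ::arrow::Status::OK();
      }

     private:
      std::shared_ptr<::arrow::Schema> schema_;
      std::shared_ptr<parquet::WriterProperties> props_;
      std::string base_path_;
      int64_t max_file_bytes_;
      int file_index_ = 0;
      int64_t rows_in_file_ = 0;
      std::unique_ptr<parquet::arrow::FileWriter> writer_;
    };

Usage would be to call Write() repeatedly and Close() at the end; each output file stays roughly under max_file_bytes.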


/// Number of columns.
///
/// This number is fixed during the lifetime of the writer as it is determined via
14 changes: 14 additions & 0 deletions cpp/src/parquet/properties.cc
@@ -67,6 +67,20 @@ std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties() {
return default_writer_properties;
}

WriterProperties::Builder* WriterProperties::Builder::max_row_group_length(
int64_t max_row_group_length) {
ARROW_CHECK_GT(max_row_group_length, 0) << "max_row_group_length must be positive";
max_row_group_length_ = max_row_group_length;
return this;
}

WriterProperties::Builder* WriterProperties::Builder::max_row_group_bytes(
int64_t max_row_group_bytes) {
ARROW_CHECK_GT(max_row_group_bytes, 0) << "max_row_group_bytes must be positive";
max_row_group_bytes_ = max_row_group_bytes;
return this;
}

void WriterProperties::Builder::CopyColumnSpecificProperties(
const WriterProperties& properties) {
for (const auto& [col_path, col_props] : properties.column_properties_) {