Skip to content

Commit 84dd650

Browse files
committed
support customizing more Zstd parameters
Address review: use ZSTD_compress2 instead of ZSTD_compressCCtx; add a test to ensure the zstd context parameters take effect.
1 parent 7820f67 commit 84dd650

7 files changed

Lines changed: 209 additions & 109 deletions

File tree

cpp/src/arrow/util/compression.cc

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <memory>
2121
#include <string>
2222
#include <utility>
23+
#include <vector>
2324

2425
#include "arrow/result.h"
2526
#include "arrow/status.h"
@@ -200,11 +201,16 @@ Result<std::unique_ptr<Codec>> Codec::Create(Compression::type codec_type,
200201
codec = internal::MakeLz4HadoopRawCodec();
201202
#endif
202203
break;
203-
case Compression::ZSTD:
204+
case Compression::ZSTD: {
204205
#ifdef ARROW_WITH_ZSTD
205-
codec = internal::MakeZSTDCodec(compression_level);
206+
auto opt = dynamic_cast<const ZstdCodecOptions*>(&codec_options);
207+
codec = internal::MakeZSTDCodec(
208+
compression_level,
209+
opt ? opt->compression_context_params : std::vector<std::pair<int, int>>{},
210+
opt ? opt->decompression_context_params : std::vector<std::pair<int, int>>{});
206211
#endif
207212
break;
213+
}
208214
case Compression::BZ2:
209215
#ifdef ARROW_WITH_BZ2
210216
codec = internal::MakeBZ2Codec(compression_level);

cpp/src/arrow/util/compression.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include <memory>
2323
#include <optional>
2424
#include <string>
25+
#include <utility>
26+
#include <vector>
2527

2628
#include "arrow/result.h"
2729
#include "arrow/status.h"
@@ -142,6 +144,15 @@ class ARROW_EXPORT BrotliCodecOptions : public CodecOptions {
142144
std::optional<int> window_bits;
143145
};
144146

147+
// ----------------------------------------------------------------------
148+
// Zstd codec options implementation
149+
150+
/// \brief Codec options specific to the Zstd codec
///
/// When Codec::Create is called for Compression::ZSTD with an instance of this
/// class, both lists below are handed to the Zstd codec factory; with any other
/// CodecOptions type, empty lists are used instead.
class ARROW_EXPORT ZstdCodecOptions : public CodecOptions {
151+
public:
152+
/// (parameter, value) pairs for the compression side, e.g. {ZSTD_c_windowLog, 23}.
/// NOTE(review): presumably the first element is a zstd ZSTD_cParameter id applied
/// to the compression context -- confirm against the Zstd codec implementation.
std::vector<std::pair<int, int>> compression_context_params;
153+
/// (parameter, value) pairs for the decompression side, e.g. {ZSTD_d_windowLogMax, 23}.
/// NOTE(review): presumably the first element is a zstd ZSTD_dParameter id applied
/// to the decompression context -- confirm against the Zstd codec implementation.
std::vector<std::pair<int, int>> decompression_context_params;
154+
};
155+
145156
/// \brief Compression codec
146157
class ARROW_EXPORT Codec {
147158
public:

cpp/src/arrow/util/compression_internal.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#pragma once
1919

2020
#include <memory>
21+
#include <utility>
22+
#include <vector>
2123

2224
#include "arrow/util/compression.h" // IWYU pragma: export
2325

@@ -74,7 +76,9 @@ std::unique_ptr<Codec> MakeLz4HadoopRawCodec();
7476
constexpr int kZSTDDefaultCompressionLevel = 1;
7577

7678
// Factory for the Zstd codec.
//
// compression_level defaults to kZSTDDefaultCompressionLevel. The two
// context-parameter lists are (parameter, value) pairs and default to empty;
// Codec::Create fills them from ZstdCodecOptions when one is supplied.
std::unique_ptr<Codec> MakeZSTDCodec(
77-
int compression_level = kZSTDDefaultCompressionLevel);
79+
int compression_level = kZSTDDefaultCompressionLevel,
80+
std::vector<std::pair<int, int>> compression_context_params = {},
81+
std::vector<std::pair<int, int>> decompression_context_params = {});
7882

7983
} // namespace internal
8084
} // namespace util

cpp/src/arrow/util/compression_test.cc

Lines changed: 92 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616
// under the License.
1717

1818
#include <algorithm>
19+
#include <concepts>
1920
#include <cstdint>
2021
#include <cstring>
2122
#include <memory>
2223
#include <ostream>
2324
#include <random>
25+
#include <span>
2426
#include <string>
27+
#include <utility>
2528
#include <vector>
2629

2730
#include <gtest/gtest.h>
@@ -446,36 +449,17 @@ TEST(TestCodecMisc, SpecifyCompressionLevel) {
446449
}
447450
}
448451

449-
TEST(TestCodecMisc, SpecifyCodecOptionsGZip) {
450-
// for now only GZIP & Brotli codec options supported, since it has specific parameters
451-
// to be customized, other codecs could directly go with CodecOptions, could add more
452-
// specific codec options if needed.
453-
struct CombinationOption {
454-
int level;
455-
GZipFormat format;
456-
int window_bits;
457-
bool expect_success;
458-
};
459-
constexpr CombinationOption combinations[] = {{2, GZipFormat::ZLIB, 12, true},
460-
{9, GZipFormat::GZIP, 9, true},
461-
{9, GZipFormat::GZIP, 20, false},
462-
{5, GZipFormat::DEFLATE, -12, false},
463-
{-992, GZipFormat::GZIP, 15, false}};
464-
452+
template <std::derived_from<arrow::util::CodecOptions> T>
453+
void CheckSpecifyCodecOptions(Compression::type compression,
454+
std::span<const std::pair<T, bool>> options) {
465455
std::vector<uint8_t> data = MakeRandomData(2000);
466-
for (const auto& combination : combinations) {
467-
const auto compression = Compression::GZIP;
456+
for (const auto& [codec_option, expect_success] : options) {
468457
if (!Codec::IsAvailable(compression)) {
469458
// Support for this codec hasn't been built
470459
continue;
471460
}
472-
auto codec_options = arrow::util::GZipCodecOptions();
473-
codec_options.compression_level = combination.level;
474-
codec_options.gzip_format = combination.format;
475-
codec_options.window_bits = combination.window_bits;
476-
const auto expect_success = combination.expect_success;
477-
auto result1 = Codec::Create(compression, codec_options);
478-
auto result2 = Codec::Create(compression, codec_options);
461+
auto result1 = Codec::Create(compression, codec_option);
462+
auto result2 = Codec::Create(compression, codec_option);
479463
ASSERT_EQ(expect_success, result1.ok());
480464
ASSERT_EQ(expect_success, result2.ok());
481465
if (expect_success) {
@@ -484,37 +468,92 @@ TEST(TestCodecMisc, SpecifyCodecOptionsGZip) {
484468
}
485469
}
486470

471+
// Table-driven check of Codec::Create with GZip-specific options: each entry
// pairs a GZipCodecOptions value with whether codec creation should succeed.
TEST(TestCodecMisc, SpecifyCodecOptionsGZip) {
472+
// Builds a GZipCodecOptions from (level, format, window_bits).
auto make_option = [](int compression_level, GZipFormat format,
473+
std::optional<int> window_bits) {
474+
arrow::util::GZipCodecOptions option;
475+
option.compression_level = compression_level;
476+
option.gzip_format = format;
477+
option.window_bits = window_bits;
478+
return option;
479+
};
480+
// The first three entries must be accepted; the last two (window_bits 25 with
// level 10, and level -992) must be rejected by Codec::Create.
const std::pair<arrow::util::GZipCodecOptions, bool> options[]{
481+
{make_option(5, GZipFormat::GZIP, 15), true},
482+
{make_option(9, GZipFormat::ZLIB, 12), true},
483+
{make_option(-1, GZipFormat::DEFLATE, 10), true},
484+
{make_option(10, GZipFormat::GZIP, 25), false},
485+
{make_option(-992, GZipFormat::GZIP, 15), false},
486+
};
487+
CheckSpecifyCodecOptions<arrow::util::GZipCodecOptions>(Compression::GZIP, options);
488+
}
489+
487490
// Table-driven check of Codec::Create with Brotli-specific options: each entry
// pairs a BrotliCodecOptions value (level, window_bits) with whether codec
// creation should succeed; the table is handed to CheckSpecifyCodecOptions.
TEST(TestCodecMisc, SpecifyCodecOptionsBrotli) {
488-
// for now only GZIP & Brotli codec options supported, since it has specific parameters
489-
// to be customized, other codecs could directly go with CodecOptions, could add more
490-
// specific codec options if needed.
491-
struct CombinationOption {
492-
int level;
493-
int window_bits;
494-
bool expect_success;
491+
auto make_option = [](int compression_level, std::optional<int> window_bits) {
492+
arrow::util::BrotliCodecOptions option;
493+
option.compression_level = compression_level;
494+
option.window_bits = window_bits;
495+
return option;
495496
};
496-
constexpr CombinationOption combinations[] = {
497-
{8, 22, true}, {11, 10, true}, {1, 24, true}, {5, -12, false}, {-992, 25, false}};
497+
const std::pair<arrow::util::BrotliCodecOptions, bool> options[]{
498+
{make_option(8, 22), true}, {make_option(11, 10), true},
499+
{make_option(1, 24), true}, {make_option(5, -12), false},
500+
{make_option(-992, 25), false},
501+
};
502+
CheckSpecifyCodecOptions<arrow::util::BrotliCodecOptions>(Compression::BROTLI, options);
503+
}
498504

499-
std::vector<uint8_t> data = MakeRandomData(2000);
500-
for (const auto& combination : combinations) {
501-
const auto compression = Compression::BROTLI;
502-
if (!Codec::IsAvailable(compression)) {
503-
// Support for this codec hasn't been built
504-
continue;
505-
}
506-
auto codec_options = arrow::util::BrotliCodecOptions();
507-
codec_options.compression_level = combination.level;
508-
codec_options.window_bits = combination.window_bits;
509-
const auto expect_success = combination.expect_success;
510-
auto result1 = Codec::Create(compression, codec_options);
511-
auto result2 = Codec::Create(compression, codec_options);
512-
ASSERT_EQ(expect_success, result1.ok());
513-
ASSERT_EQ(expect_success, result2.ok());
514-
if (expect_success) {
515-
CheckCodecRoundtrip(*result1, *result2, data);
516-
}
517-
}
505+
// Table-driven check of Codec::Create with Zstd-specific options: each entry
// pairs a ZstdCodecOptions value with whether codec creation should succeed.
// Every entry in this table is expected to succeed.
TEST(TestCodecMisc, SpecifyCodecOptionsZstd) {
506+
// Builds a ZstdCodecOptions from a level plus the two (parameter, value) lists.
auto make_option = [](int compression_level,
507+
std::vector<std::pair<int, int>> compression_context_params,
508+
std::vector<std::pair<int, int>> decompression_context_params) {
509+
arrow::util::ZstdCodecOptions option;
510+
option.compression_level = compression_level;
511+
option.compression_context_params = std::move(compression_context_params);
512+
option.decompression_context_params = std::move(decompression_context_params);
513+
return option;
514+
};
515+
// Parameter ids are spelled out locally rather than taken from a zstd header.
// NOTE(review): presumably these mirror the enum values in zstd.h -- confirm.
constexpr int ZSTD_c_windowLog = 101;
516+
constexpr int ZSTD_d_windowLogMax = 100;
517+
const std::pair<arrow::util::ZstdCodecOptions, bool> options[]{
518+
{make_option(2, {}, {}), true},
519+
{make_option(9, {}, {}), true},
520+
{make_option(15, {}, {}), true},
521+
// Unlike the GZip and Brotli tables, level -992 is expected to be accepted here.
// NOTE(review): presumably out-of-range levels are clamped -- confirm.
{make_option(-992, {}, {}), true},
522+
{make_option(3, {{ZSTD_c_windowLog, 23}}, {}), true},
523+
{make_option(3, {{ZSTD_c_windowLog, 28}}, {{ZSTD_d_windowLogMax, 28}}), true}};
524+
CheckSpecifyCodecOptions<arrow::util::ZstdCodecOptions>(Compression::ZSTD, options);
525+
}
526+
527+
// Checks that a compression context parameter actually changes the output:
// the same buffer is compressed once with default parameters and once with
// windowLog set to 23, and the second result must be strictly smaller.
//
// NOTE(review): unlike the table-driven option tests, there is no
// Codec::IsAvailable(Compression::ZSTD) guard here, so this presumably fails
// rather than skips when Zstd support is not built -- confirm and guard if so.
TEST(TestCodecMisc, ZstdLargerWindowLog) {
528+
// Parameter ids are spelled out locally rather than taken from a zstd header.
// NOTE(review): presumably these mirror the enum values in zstd.h -- confirm.
constexpr int ZSTD_c_windowLog = 101;
529+
constexpr int ZSTD_d_windowLogMax = 100;
530+
531+
// option1: level 3 with no extra context parameters.
arrow::util::ZstdCodecOptions option1;
532+
option1.compression_level = 3;
533+
534+
// option2: same level, but with the window parameters set to 23.
// NOTE(review): nothing is decompressed in this test, so
// decompression_context_params is set but never exercised.
arrow::util::ZstdCodecOptions option2;
535+
option2.compression_level = 3;
536+
option2.compression_context_params = {{ZSTD_c_windowLog, 23}};
537+
option2.decompression_context_params = {{ZSTD_d_windowLogMax, 23}};
538+
539+
// Random data followed by an exact copy of itself.
// NOTE(review): presumably windowLog 23 (2^23 bytes = 8 MiB) is chosen so the
// second copy can be matched against the first -- confirm against zstd docs.
std::vector<uint8_t> data = MakeRandomData(4 * 1024 * 1024);
540+
data.reserve(data.size() * 2);
541+
// NOTE(review): the inserted range points into `data` itself. The standard's
// sequence-container requirements make that undefined behaviour for range
// insert; reserve() only prevents reallocation. Consider inserting from a copy.
data.insert(data.end(), data.begin(), data.end());
542+
543+
ASSERT_OK_AND_ASSIGN(auto result1, Codec::Create(Compression::ZSTD, option1));
544+
ASSERT_OK_AND_ASSIGN(auto result2, Codec::Create(Compression::ZSTD, option2));
545+
546+
int max_compressed_len =
547+
static_cast<int>(result1->MaxCompressedLen(data.size(), data.data()));
548+
// Both Compress calls write into this one buffer; only the sizes are compared.
std::vector<uint8_t> compressed(max_compressed_len);
549+
550+
ASSERT_OK_AND_ASSIGN(
551+
int64_t actual_size1,
552+
result1->Compress(data.size(), data.data(), max_compressed_len, compressed.data()));
553+
ASSERT_OK_AND_ASSIGN(
554+
int64_t actual_size2,
555+
result2->Compress(data.size(), data.data(), max_compressed_len, compressed.data()));
556+
ASSERT_GT(actual_size1, actual_size2);
518557
}
519558

520559
TEST_P(CodecTest, MinMaxCompressionLevel) {

0 commit comments

Comments
 (0)