Skip to content

Commit 316cda5

Browse files
committed
[#24540] docdb: Added flag rocksdb_allow_multiple_pending_compactions_for_priority_thread_pool
Summary: Idea is to avoid adding new compaction task for the same RocksDB instance while we already have a pending one. That was the way compaction logic worked in original RocksDB but due to some reasons wasn’t implemented for compaction tasks scheduling logic which is used with priority thread pool. As we saw earlier scheduling compaction tasks for the same RocksDB can lead to concurrent compaction tasks for the same tablet and also interferes with compaction picker logic making it unable to select (and as a result execute) a new compaction task for delayed time. This change adds `rocksdb_allow_multiple_pending_compactions_for_priority_thread_pool` flag which limits number of pending compaction to 1 small and 1 large when set to false. Jira: DB-13574 Test Plan: `ybd --cxx-test rocksdb_db_compaction_test --gtest_filter DBCompactionTest.LimitPendingCompactionTasks -n 50` for debug/asan/tsan Reviewers: sergei, arybochkin Reviewed By: sergei, arybochkin Subscribers: rthallam, ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D39261
1 parent 3f2ac8c commit 316cda5

File tree

9 files changed

+446
-94
lines changed

9 files changed

+446
-94
lines changed

src/yb/rocksdb/db/column_family.cc

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,10 @@ ColumnFamilyData::ColumnFamilyData(
363363
log_number_(0),
364364
column_family_set_(column_family_set),
365365
pending_flush_(false),
366-
pending_compaction_(false),
367366
prev_compaction_needed_bytes_(0) {
367+
for (auto& num_pending_compactions : num_pending_compactions_) {
368+
num_pending_compactions = 0;
369+
}
368370
Ref();
369371

370372
// Convert user defined table properties collector factories to internal ones.
@@ -442,7 +444,11 @@ ColumnFamilyData::~ColumnFamilyData() {
442444
// It would be wrong if this ColumnFamilyData is in flush_queue_ or
443445
// compaction_queue_ and we destroyed it
444446
DCHECK(!pending_flush_);
445-
DCHECK(!pending_compaction_);
447+
for (size_t idx = 0; idx < num_pending_compactions_.size(); ++idx) {
448+
LOG_IF(DFATAL, num_pending_compactions_[idx] != 0)
449+
<< "Expected no " << yb::AsString(CompactionSizeKind(idx))
450+
<< " pending compactions, but got: " << num_pending_compactions_[idx];
451+
}
446452

447453
if (super_version_ != nullptr) {
448454
// Release SuperVersion reference kept in ThreadLocalPtr.
@@ -556,6 +562,33 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
556562
}
557563
} // namespace
558564

565+
void ColumnFamilyData::PendingCompactionAdded(CompactionSizeKind compaction_size_kind) {
566+
num_pending_compactions_[yb::to_underlying(compaction_size_kind)].fetch_add(1);
567+
}
568+
569+
void ColumnFamilyData::PendingCompactionRemoved(CompactionSizeKind compaction_size_kind) {
570+
if (num_pending_compactions_[yb::to_underlying(compaction_size_kind)].fetch_sub(1) == 0) {
571+
LOG_WITH_FUNC(DFATAL) << ioptions_.info_log->Prefix() << "No pending "
572+
<< yb::AsString(compaction_size_kind) << " compactions";
573+
num_pending_compactions_[yb::to_underlying(compaction_size_kind)].fetch_add(1);
574+
}
575+
}
576+
577+
void ColumnFamilyData::PendingCompactionSizeKindUpdated(
578+
CompactionSizeKind from, CompactionSizeKind to) {
579+
PendingCompactionRemoved(from);
580+
PendingCompactionAdded(to);
581+
}
582+
583+
bool ColumnFamilyData::pending_compaction() const {
584+
for (auto& num_pending_compactions : num_pending_compactions_) {
585+
if (num_pending_compactions > 0) {
586+
return true;
587+
}
588+
}
589+
return false;
590+
}
591+
559592
void ColumnFamilyData::RecalculateWriteStallConditions(
560593
const MutableCFOptions& mutable_cf_options) {
561594
Version* current_version = current();
@@ -631,10 +664,10 @@ void ColumnFamilyData::RecalculateWriteStallConditions(
631664
InternalStats::LEVEL0_SLOWDOWN_WITH_COMPACTION, 1);
632665
}
633666
RLOG(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
634-
"[%s] Stalling writes because we have %d level-0 files "
667+
"[%s] Stalling writes because we have %d level-0 files (trigger: %d) "
635668
"rate %" PRIu64,
636669
name_.c_str(), vstorage->l0_delay_trigger_count(),
637-
write_controller->delayed_write_rate());
670+
mutable_cf_options.level0_slowdown_writes_trigger, write_controller->delayed_write_rate());
638671
} else if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
639672
vstorage->estimated_compaction_needed_bytes() >=
640673
mutable_cf_options.soft_pending_compaction_bytes_limit) {

src/yb/rocksdb/db/column_family.h

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
#include "yb/rocksdb/util/mutable_cf_options.h"
4141
#include "yb/rocksdb/util/thread_local.h"
4242

43+
#include "yb/util/enums.h"
44+
4345
namespace rocksdb {
4446

4547
class Version;
@@ -56,6 +58,8 @@ class LogBuffer;
5658
class InstrumentedMutex;
5759
class InstrumentedMutexLock;
5860

61+
YB_DEFINE_ENUM(CompactionSizeKind, (kSmall)(kLarge));
62+
5963
extern const double kSlowdownRatio;
6064

6165
// ColumnFamilyHandleImpl is the class that clients use to access different
@@ -326,9 +330,20 @@ class ColumnFamilyData {
326330

327331
// Protected by DB mutex
328332
void set_pending_flush(bool value) { pending_flush_ = value; }
329-
void set_pending_compaction(bool value) { pending_compaction_ = value; }
330333
bool pending_flush() { return pending_flush_; }
331-
bool pending_compaction() { return pending_compaction_; }
334+
335+
void PendingCompactionAdded(CompactionSizeKind compaction_size_kind);
336+
void PendingCompactionRemoved(CompactionSizeKind compaction_size_kind);
337+
void PendingCompactionSizeKindUpdated(CompactionSizeKind from, CompactionSizeKind to);
338+
bool pending_compaction() const;
339+
340+
bool pending_compaction(CompactionSizeKind compaction_size_kind) const {
341+
return num_pending_compactions_[yb::to_underlying(compaction_size_kind)] > 0;
342+
}
343+
344+
size_t TEST_num_pending_compactions(CompactionSizeKind compaction_size_kind) const {
345+
return num_pending_compactions_[yb::to_underlying(compaction_size_kind)];
346+
}
332347

333348
// Recalculate some small conditions, which are changed only during
334349
// compaction, adding new memtable and/or
@@ -403,9 +418,11 @@ class ColumnFamilyData {
403418
// If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
404419
bool pending_flush_;
405420

406-
// If true --> this ColumnFamily is currently present in
407-
// DBImpl::compaction_queue_
408-
bool pending_compaction_;
421+
// How many times this ColumnFamily is currently present in DBImpl::compaction_queue_.
422+
// Note: in general it might be not effective to use such nearly aligned atomics. This is not a
423+
// problem for this particular use case because this is not a hot path, but shouldn't be applied
424+
// in other cases where performance is critical.
425+
std::array<std::atomic<size_t>, kElementsInCompactionSizeKind> num_pending_compactions_;
409426

410427
uint64_t prev_compaction_needed_bytes_;
411428
};

src/yb/rocksdb/db/db_compaction_test.cc

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,13 @@
2727
#include "yb/rocksdb/port/stack_trace.h"
2828
#include "yb/rocksdb/rate_limiter.h"
2929
#include "yb/rocksdb/util/file_util.h"
30+
#include "yb/rocksdb/util/task_metrics.h"
3031
#include "yb/rocksdb/util/testutil.h"
3132

3233
#include "yb/rocksutil/yb_rocksdb_logger.h"
3334

3435
#include "yb/util/backoff_waiter.h"
36+
#include "yb/util/metrics.h"
3537
#include "yb/util/priority_thread_pool.h"
3638
#include "yb/util/random_util.h"
3739
#include "yb/util/sync_point.h"
@@ -41,6 +43,7 @@
4143
DECLARE_bool(flush_rocksdb_on_shutdown);
4244
DECLARE_bool(use_priority_thread_pool_for_compactions);
4345
DECLARE_bool(use_priority_thread_pool_for_flushes);
46+
DECLARE_bool(rocksdb_allow_multiple_pending_compactions_for_priority_thread_pool);
4447

4548
using std::atomic;
4649
using namespace std::literals;
@@ -2948,6 +2951,206 @@ TEST_F_EX(DBCompactionTest, AbortManualCompactionOnShutdown, RocksDBTest) {
29482951
}
29492952
}
29502953

2954+
namespace {
2955+
2956+
class DelayFilter : public CompactionFilter {
2957+
public:
2958+
explicit DelayFilter(std::atomic<int>* delay_ms_per_entry)
2959+
: delay_ms_per_entry_(delay_ms_per_entry) {}
2960+
2961+
FilterDecision Filter(int level, const Slice& key, const Slice& value,
2962+
std::string* new_value, bool* value_changed) override {
2963+
auto delay_ms = delay_ms_per_entry_->load();
2964+
if (delay_ms > 0) {
2965+
std::this_thread::sleep_for(delay_ms * 1ms);
2966+
}
2967+
return FilterDecision::kKeep;
2968+
}
2969+
2970+
const char* Name() const override { return "KeepFilter"; }
2971+
2972+
private:
2973+
std::atomic<int>* delay_ms_per_entry_;
2974+
};
2975+
2976+
class DelayFilterFactory : public CompactionFilterFactory {
2977+
public:
2978+
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
2979+
const CompactionFilter::Context& context) override {
2980+
return std::unique_ptr<CompactionFilter>(new DelayFilter(&delay_ms_per_entry_));
2981+
}
2982+
2983+
void SetDelayMsPerEntry(int delay_ms_per_entry) {
2984+
delay_ms_per_entry_ = delay_ms_per_entry;
2985+
}
2986+
2987+
const char* Name() const override { return "DelayFilterFactory"; }
2988+
2989+
private:
2990+
std::atomic<int> delay_ms_per_entry_;
2991+
};
2992+
2993+
Result<std::pair<size_t, size_t>> CheckPendingCompactions(DBImpl* db) {
2994+
size_t num_small_pending_compactions;
2995+
size_t num_large_pending_compactions;
2996+
size_t num_small_not_started_compactions;
2997+
size_t num_large_not_started_compactions;
2998+
{
2999+
std::lock_guard db_lock(*db->TEST_mutex());
3000+
std::lock_guard priority_thread_pool_lock(
3001+
*db->GetOptions().priority_thread_pool_for_compactions_and_flushes->TEST_mutex());
3002+
3003+
auto* cfd = pointer_cast<ColumnFamilyHandleImpl*>(db->DefaultColumnFamily())->cfd();
3004+
num_small_pending_compactions = cfd->TEST_num_pending_compactions(CompactionSizeKind::kSmall);
3005+
num_large_pending_compactions = cfd->TEST_num_pending_compactions(CompactionSizeKind::kLarge);
3006+
3007+
num_small_not_started_compactions =
3008+
db->TEST_NumNotStartedCompactionsUnlocked(CompactionSizeKind::kSmall);
3009+
num_large_not_started_compactions =
3010+
db->TEST_NumNotStartedCompactionsUnlocked(CompactionSizeKind::kLarge);
3011+
}
3012+
3013+
LOG(INFO) << "num_small_pending_compactions: " << num_small_pending_compactions
3014+
<< " num_large_pending_compactions: " << num_large_pending_compactions;
3015+
LOG(INFO) << "num_small_not_started_compactions: " << num_small_not_started_compactions
3016+
<< " num_large_not_started_compactions: " << num_large_not_started_compactions;
3017+
3018+
SCHECK_LE(num_small_not_started_compactions, num_small_pending_compactions, IllegalState,
3019+
"Pending compactions should include not started and paused.");
3020+
SCHECK_LE(num_large_not_started_compactions, num_large_pending_compactions, IllegalState,
3021+
"Pending compactions should include not started and paused.");
3022+
3023+
// Probably we should abort not yet started compaction if pausing another one in order to limit
3024+
// number of pending compactions but this is non-trivial and should be addressed by
3025+
// https://github.com/yugabyte/yugabyte-db/issues/24541.
3026+
SCHECK_LE(
3027+
num_small_not_started_compactions, std::size_t{1}, IllegalState,
3028+
"Expected at most 1 not started small compaction.");
3029+
SCHECK_LE(
3030+
num_large_not_started_compactions, std::size_t{1}, IllegalState,
3031+
"Expected at most 1 not started large compaction.");
3032+
3033+
SCHECK_LE(
3034+
num_small_pending_compactions, std::size_t{2}, IllegalState,
3035+
"Expected at most 2 pending small compaction.");
3036+
SCHECK_LE(
3037+
num_large_pending_compactions, std::size_t{2}, IllegalState,
3038+
"Expected at most 2 pending large compaction.");
3039+
return std::make_pair(num_small_pending_compactions, num_large_pending_compactions);
3040+
}
3041+
3042+
} // namespace
3043+
3044+
TEST_F(DBCompactionTest, LimitPendingCompactionTasks) {
3045+
ANNOTATE_UNPROTECTED_WRITE(FLAGS_use_priority_thread_pool_for_compactions) = true;
3046+
ANNOTATE_UNPROTECTED_WRITE(
3047+
FLAGS_rocksdb_allow_multiple_pending_compactions_for_priority_thread_pool) = false;
3048+
3049+
constexpr auto kMaxBackgroundCompactions = 1;
3050+
constexpr auto kNumKeysPerSmallSstFile = 100;
3051+
constexpr auto kNumKeysPerLargeSstFile = 1000;
3052+
constexpr auto kNumSstFiles = 30;
3053+
constexpr auto kNumLargeSstFiles = 10;
3054+
constexpr auto kCompactionDelayMsPerEntry = 10000 * yb::kTimeMultiplier / kNumKeysPerSmallSstFile;
3055+
constexpr auto kTimeout = 10s * yb::kTimeMultiplier;
3056+
constexpr auto kMaxCompactFlushRate = 256_MB;
3057+
constexpr auto kValueSizeBytes = 1_KB;
3058+
constexpr auto kNumFilesCompactionTrigger = 5;
3059+
constexpr auto kMaxNumExpectedFiles = 2 * kNumFilesCompactionTrigger;
3060+
3061+
// Static to avoid destruction before RocksDB is destroyed by owning test object.
3062+
static yb::PriorityThreadPool thread_pool(kMaxBackgroundCompactions);
3063+
3064+
std::shared_ptr<RateLimiter> rate_limiter(NewGenericRateLimiter(kMaxCompactFlushRate));
3065+
auto compaction_filter_factory = std::make_shared<DelayFilterFactory>();
3066+
compaction_filter_factory->SetDelayMsPerEntry(kCompactionDelayMsPerEntry);
3067+
3068+
rocksdb::BlockBasedTableOptions table_options;
3069+
table_options.block_size = 2_KB;
3070+
table_options.filter_block_size = 2_KB;
3071+
table_options.index_block_size = 2_KB;
3072+
3073+
Options options;
3074+
options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
3075+
// Setting write_buffer_size big enough, because we use manual flush in this test.
3076+
options.write_buffer_size = 128_MB;
3077+
options.arena_block_size = 4_KB;
3078+
options.num_levels = 1;
3079+
options.compaction_style = kCompactionStyleUniversal;
3080+
options.compaction_filter_factory = compaction_filter_factory;
3081+
options.level0_file_num_compaction_trigger = 5;
3082+
options.level0_stop_writes_trigger = kNumSstFiles * 2;
3083+
options.level0_slowdown_writes_trigger = options.level0_stop_writes_trigger;
3084+
options.max_background_compactions = kMaxBackgroundCompactions;
3085+
options.priority_thread_pool_for_compactions_and_flushes = &thread_pool;
3086+
options.info_log_level = InfoLogLevel::DEBUG_LEVEL;
3087+
options.info_log = std::make_shared<yb::YBRocksDBLogger>(options.log_prefix);
3088+
options.rate_limiter = rate_limiter;
3089+
options.create_if_missing = true;
3090+
options.compaction_size_threshold_bytes = 2 * kValueSizeBytes * kNumKeysPerLargeSstFile;
3091+
3092+
auto& compaction_options_universal = options.compaction_options_universal;
3093+
compaction_options_universal.stop_style =
3094+
rocksdb::CompactionStopStyle::kCompactionStopStyleTotalSize;
3095+
compaction_options_universal.min_merge_width = 4;
3096+
compaction_options_universal.size_ratio = 20;
3097+
compaction_options_universal.always_include_size_threshold =
3098+
2 * kValueSizeBytes * kNumKeysPerSmallSstFile;
3099+
3100+
METRIC_DEFINE_entity(test_entity);
3101+
ROCKSDB_PRIORITY_THREAD_POOL_METRICS_DEFINE(test_entity);
3102+
yb::MetricRegistry registry;
3103+
auto entity = METRIC_ENTITY_test_entity.Instantiate(&registry, "task metrics");
3104+
3105+
auto priority_thread_pool_metrics =
3106+
std::make_shared<RocksDBPriorityThreadPoolMetrics>(
3107+
ROCKSDB_PRIORITY_THREAD_POOL_METRICS_INSTANCE(entity));
3108+
options.priority_thread_pool_metrics = priority_thread_pool_metrics;
3109+
3110+
DestroyAndReopen(options);
3111+
3112+
ColumnFamilyData* cfd = pointer_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
3113+
3114+
int num_keys = 0;
3115+
3116+
for (auto num = 0; num < kNumSstFiles; num++) {
3117+
const auto num_keys_per_file =
3118+
(num < kNumLargeSstFiles) ? kNumKeysPerLargeSstFile : kNumKeysPerSmallSstFile;
3119+
for (auto i = 0; i < num_keys_per_file; i++) {
3120+
auto key = Key(++num_keys);
3121+
ASSERT_OK(Put(key, yb::RandomHumanReadableString(kValueSizeBytes)));
3122+
}
3123+
ASSERT_OK(Flush());
3124+
ASSERT_OK(ResultToStatus(CheckPendingCompactions(dbfull())));
3125+
}
3126+
LOG(INFO) << "Waiting for flushes to complete...";
3127+
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
3128+
LOG(INFO) << "Waiting for flushes to complete - DONE";
3129+
3130+
auto num_pending_compactions = ASSERT_RESULT(CheckPendingCompactions(dbfull()));
3131+
// We should achieve at least 1 small and 1 large pending compactions (and we've verified that at
3132+
// most 2 is pending and at most 1 is not yet started in each category).
3133+
ASSERT_GE(num_pending_compactions.first, 1);
3134+
ASSERT_GE(num_pending_compactions.second, 1);
3135+
3136+
compaction_filter_factory->SetDelayMsPerEntry(0);
3137+
3138+
auto deadline = yb::CoarseMonoClock::Now() + kTimeout;
3139+
while (yb::CoarseMonoClock::Now() <= deadline) {
3140+
auto num_sst_files = db_->GetCurrentVersionNumSSTFiles();
3141+
LOG(INFO) << "num_sst_files: " << num_sst_files;
3142+
if (std::cmp_less(num_sst_files, kMaxNumExpectedFiles + 1)) {
3143+
break;
3144+
}
3145+
ASSERT_OK(ResultToStatus(CheckPendingCompactions(dbfull())));
3146+
std::this_thread::sleep_for(1s);
3147+
}
3148+
3149+
ASSERT_LE(db_->GetCurrentVersionNumSSTFiles(), kMaxNumExpectedFiles);
3150+
3151+
Close();
3152+
}
3153+
29513154
INSTANTIATE_TEST_CASE_P(
29523155
CompactionPriTest, CompactionPriTest,
29533156
::testing::Values(CompactionPri::kByCompensatedSize,

0 commit comments

Comments
 (0)