Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[core]Make GCS InternalKV workload configurable to the Policy. #47736

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/ray/common/asio/asio_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include "ray/common/asio/instrumented_io_context.h"
#include "ray/util/array.h"
#include "ray/util/type_traits.h"
#include "ray/util/util.h"

template <typename Duration>
Expand Down Expand Up @@ -139,7 +140,11 @@ class IOContextProvider {
instrumented_io_context &GetIOContext() const {
constexpr int index = Policy::template GetDedicatedIOContextIndex<T>();
static_assert(
index >= -1 && index < Policy::kAllDedicatedIOContextNames.size(),
(index == -1) ||
(index >= 0 &&
static_cast<size_t>(index) < Policy::kAllDedicatedIOContextNames.size()) ||
// To show index in compile error...
ray::AlwaysFalseValue<index>,
"index out of bound, invalid GetDedicatedIOContextIndex implementation! Index "
"can only be -1 or within range of kAllDedicatedIOContextNames");

Expand Down
47 changes: 20 additions & 27 deletions src/ray/common/asio/periodical_runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
namespace ray {

PeriodicalRunner::PeriodicalRunner(instrumented_io_context &io_service)
: io_service_(io_service), mutex_(), stopped_(std::make_shared<bool>(false)) {}
: io_service_(io_service) {}

PeriodicalRunner::~PeriodicalRunner() {
RAY_LOG(DEBUG) << "PeriodicalRunner is destructed";
Expand All @@ -38,7 +38,7 @@ void PeriodicalRunner::Clear() {

void PeriodicalRunner::RunFnPeriodically(std::function<void()> fn,
uint64_t period_ms,
const std::string name) {
const std::string &name) {
*stopped_ = false;
if (period_ms > 0) {
auto timer = std::make_shared<boost::asio::deadline_timer>(io_service_);
Expand Down Expand Up @@ -74,28 +74,27 @@ void PeriodicalRunner::DoRunFnPeriodically(
fn();
absl::MutexLock lock(&mutex_);
timer->expires_from_now(period);
timer->async_wait(
[this, stopped = stopped_, fn = std::move(fn), period, timer = std::move(timer)](
const boost::system::error_code &error) {
if (*stopped) {
return;
}
if (error == boost::asio::error::operation_aborted) {
// `operation_aborted` is set when `timer` is canceled or destroyed.
// The Monitor lifetime may be short than the object who use it. (e.g.
// gcs_server)
return;
}
RAY_CHECK(!error) << error.message();
DoRunFnPeriodically(fn, period, timer);
});
timer->async_wait([this, stopped = stopped_, fn, period, timer = std::move(timer)](
const boost::system::error_code &error) {
if (*stopped) {
return;
}
if (error == boost::asio::error::operation_aborted) {
// `operation_aborted` is set when `timer` is canceled or destroyed.
// The Monitor lifetime may be short than the object who use it. (e.g.
// gcs_server)
return;
}
RAY_CHECK(!error) << error.message();
DoRunFnPeriodically(fn, period, timer);
});
}

void PeriodicalRunner::DoRunFnPeriodicallyInstrumented(
const std::function<void()> &fn,
boost::posix_time::milliseconds period,
std::shared_ptr<boost::asio::deadline_timer> timer,
const std::string name) {
const std::string &name) {
fn();
absl::MutexLock lock(&mutex_);
timer->expires_from_now(period);
Expand All @@ -104,7 +103,7 @@ void PeriodicalRunner::DoRunFnPeriodicallyInstrumented(
// event loop.
auto stats_handle = io_service_.stats().RecordStart(name, period.total_nanoseconds());
timer->async_wait([this,
fn = std::move(fn),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why removing move?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

because that fn move did not work, because fn is a const& and move is no-op.

fn,
stopped = stopped_,
period,
timer = std::move(timer),
Expand All @@ -114,13 +113,7 @@ void PeriodicalRunner::DoRunFnPeriodicallyInstrumented(
return;
}
io_service_.stats().RecordExecution(
[this,
stopped = stopped_,
fn = std::move(fn),
error,
period,
timer = std::move(timer),
name]() {
[this, stopped = stopped, fn, error, period, timer, name]() {
if (*stopped) {
return;
}
Expand All @@ -133,7 +126,7 @@ void PeriodicalRunner::DoRunFnPeriodicallyInstrumented(
RAY_CHECK(!error) << error.message();
DoRunFnPeriodicallyInstrumented(fn, period, timer, name);
},
std::move(stats_handle));
stats_handle);
});
}

Expand Down
10 changes: 6 additions & 4 deletions src/ray/common/asio/periodical_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ namespace ray {
/// All registered functions will stop running once this object is destructed.
class PeriodicalRunner {
public:
PeriodicalRunner(instrumented_io_context &io_service);
explicit PeriodicalRunner(instrumented_io_context &io_service);

~PeriodicalRunner();

void Clear();

void RunFnPeriodically(std::function<void()> fn,
uint64_t period_ms,
const std::string name) ABSL_LOCKS_EXCLUDED(mutex_);
const std::string &name) ABSL_LOCKS_EXCLUDED(mutex_);

private:
void DoRunFnPeriodically(const std::function<void()> &fn,
Expand All @@ -49,14 +49,16 @@ class PeriodicalRunner {
void DoRunFnPeriodicallyInstrumented(const std::function<void()> &fn,
boost::posix_time::milliseconds period,
std::shared_ptr<boost::asio::deadline_timer> timer,
const std::string name)
const std::string &name)
ABSL_LOCKS_EXCLUDED(mutex_);

instrumented_io_context &io_service_;
mutable absl::Mutex mutex_;
std::vector<std::shared_ptr<boost::asio::deadline_timer>> timers_
ABSL_GUARDED_BY(mutex_);
std::shared_ptr<bool> stopped_;
// `stopped_` is copied to the timer callback, and may outlive `this`.
std::shared_ptr<std::atomic<bool>> stopped_ =
std::make_shared<std::atomic<bool>>(false);
};

} // namespace ray
17 changes: 10 additions & 7 deletions src/ray/gcs/gcs_server/gcs_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,17 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config,
periodical_runner_(io_context_provider_.GetDefaultIOContext()),
is_started_(false),
is_stopped_(false) {
// Init GCS table storage.
// Init GCS table storage. Note this is on the default io context, not the one with
// GcsInternalKVManager, to avoid congestion on the latter.
Comment on lines +72 to +73
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does this mean?

Copy link
Contributor Author

@rynewang rynewang Oct 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Meaning gcs_table_storage_ should always go with GetDefaultIOContext, not with GetIOContext<GcsInternalKVManager>()

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to avoid congestion on the latter.

What does this mean?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If gcs_table_storage_ operations go with GcsInternalKVManager's, it may slow down GcsInternalKVManager

RAY_LOG(INFO) << "GCS storage type is " << storage_type_;
switch (storage_type_) {
case StorageType::IN_MEMORY:
gcs_table_storage_ = std::make_shared<InMemoryGcsTableStorage>(
io_context_provider_.GetDefaultIOContext());
break;
case StorageType::REDIS_PERSIST:
gcs_table_storage_ = std::make_shared<gcs::RedisGcsTableStorage>(GetOrConnectRedis());
gcs_table_storage_ = std::make_shared<gcs::RedisGcsTableStorage>(
GetOrConnectRedis(io_context_provider_.GetDefaultIOContext()));
break;
default:
RAY_LOG(FATAL) << "Unexpected storage type: " << storage_type_;
Expand Down Expand Up @@ -557,13 +559,13 @@ void GcsServer::InitKVManager() {
std::unique_ptr<InternalKVInterface> instance;
switch (storage_type_) {
case (StorageType::REDIS_PERSIST):
instance = std::make_unique<StoreClientInternalKV>(
std::make_unique<RedisStoreClient>(GetOrConnectRedis()));
instance = std::make_unique<StoreClientInternalKV>(std::make_unique<RedisStoreClient>(
GetOrConnectRedis(io_context_provider_.GetIOContext<GcsInternalKVManager>())));
break;
case (StorageType::IN_MEMORY):
instance = std::make_unique<StoreClientInternalKV>(
std::make_unique<ObservableStoreClient>(std::make_unique<InMemoryStoreClient>(
io_context_provider_.GetDefaultIOContext())));
io_context_provider_.GetIOContext<GcsInternalKVManager>())));
break;
default:
RAY_LOG(FATAL) << "Unexpected storage type! " << storage_type_;
Expand All @@ -576,7 +578,7 @@ void GcsServer::InitKVManager() {
void GcsServer::InitKVService() {
RAY_CHECK(kv_manager_);
kv_service_ = std::make_unique<rpc::InternalKVGrpcService>(
io_context_provider_.GetDefaultIOContext(), *kv_manager_);
io_context_provider_.GetIOContext<GcsInternalKVManager>(), *kv_manager_);
// Register service.
rpc_server_.RegisterService(*kv_service_, false /* token_auth */);
}
Expand Down Expand Up @@ -819,7 +821,8 @@ std::string GcsServer::GetDebugState() const {
return stream.str();
}

std::shared_ptr<RedisClient> GcsServer::GetOrConnectRedis() {
std::shared_ptr<RedisClient> GcsServer::GetOrConnectRedis(
instrumented_io_context &io_service) {
if (redis_client_ == nullptr) {
redis_client_ = std::make_shared<RedisClient>(GetRedisClientOptions());
auto status = redis_client_->Connect(io_context_provider_.GetDefaultIOContext());
Expand Down
2 changes: 1 addition & 1 deletion src/ray/gcs/gcs_server/gcs_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ class GcsServer {
void PrintAsioStats();

/// Get or connect to a redis server
std::shared_ptr<RedisClient> GetOrConnectRedis();
std::shared_ptr<RedisClient> GetOrConnectRedis(instrumented_io_context &io_service);

void TryGlobalGC();

Expand Down
8 changes: 6 additions & 2 deletions src/ray/gcs/gcs_server/gcs_server_io_context_policy.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "ray/gcs/gcs_server/gcs_task_manager.h"
#include "ray/gcs/pubsub/gcs_pub_sub.h"
#include "ray/util/array.h"
#include "ray/util/type_traits.h"

namespace ray {
namespace gcs {
Expand All @@ -40,10 +41,13 @@ struct GcsServerIOContextPolicy {
return IndexOf("pubsub_io_context");
} else if constexpr (std::is_same_v<T, syncer::RaySyncer>) {
return IndexOf("ray_syncer_io_context");
} else if constexpr (std::is_same_v<T, GcsInternalKVManager>) {
// default io context
return -1;
} else {
// Due to if-constexpr limitations, this have to be in an else block.
// Using this tuple_size_v to put T into compile error message.
static_assert(std::tuple_size_v<std::tuple<T>> == 0, "unknown type");
// Using this template to put T into compile error message.
static_assert(AlwaysFalse<T>, "unknown type");
}
}

Expand Down
31 changes: 31 additions & 0 deletions src/ray/util/type_traits.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Copyright 2024 The Ray Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

namespace ray {

template <typename T>
constexpr bool AlwaysFalse = false;

template <typename T>
constexpr bool AlwaysTrue = true;

template <int N>
constexpr bool AlwaysFalseValue = false;

template <int N>
constexpr bool AlwaysTrueValue = true;

} // namespace ray