Skip to content

Commit 51b6652

Browse files
authored
state storage in hc (#26890)
1 parent 924d2e5 commit 51b6652

File tree

7 files changed

+239
-4
lines changed

7 files changed

+239
-4
lines changed

ydb/core/base/statestorage.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,13 +356,13 @@ TIntrusivePtr<TStateStorageInfo> BuildStateStorageInfoImpl(const char* namePrefi
356356
memset(name + offset, 0, TActorId::MaxServiceIDLength - offset);
357357
for (size_t i = 0; i < config.RingGroupsSize(); i++) {
358358
auto& ringGroup = config.GetRingGroups(i);
359-
info->RingGroups.push_back({GetRingGroupState(ringGroup), ringGroup.GetWriteOnly(), ringGroup.GetNToSelect(), {}});
359+
info->RingGroups.push_back({GetRingGroupState(ringGroup), ringGroup.GetWriteOnly(), ringGroup.GetNToSelect(), TBridgePileId::FromProto(&ringGroup, &NKikimrConfig::TDomainsConfig::TStateStorage::TRing::GetBridgePileId), {}});
360360
CopyStateStorageRingInfo(ringGroup, info->RingGroups.back(), name, offset, ringGroup.GetRingGroupActorIdOffset());
361361
memset(name + offset, 0, TActorId::MaxServiceIDLength - offset);
362362
}
363363
if (config.HasRing()) {
364364
auto& ring = config.GetRing();
365-
info->RingGroups.push_back({ERingGroupState::PRIMARY, false, ring.GetNToSelect(), {}});
365+
info->RingGroups.push_back({ERingGroupState::PRIMARY, false, ring.GetNToSelect(), {}, {}});
366366
CopyStateStorageRingInfo(ring, info->RingGroups.back(), name, offset, ring.GetRingGroupActorIdOffset());
367367
}
368368
return info;

ydb/core/base/statestorage.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#pragma once
2+
#include "bridge.h"
23
#include "defs.h"
34
#include "events.h"
45
#include <ydb/core/protos/statestorage.pb.h>
@@ -30,6 +31,7 @@ struct TEvStateStorage {
3031
EvPublishActorGone,
3132
EvRingGroupPassAway,
3233
EvConfigVersionInfo,
34+
EvListBoard,
3335

3436
// replies (local, from proxy)
3537
EvInfo = EvLookup + 512,
@@ -39,6 +41,7 @@ struct TEvStateStorage {
3941
EvDeleteResult,
4042
EvListSchemeBoardResult,
4143
EvListStateStorageResult,
44+
EvListBoardResult,
4245

4346
// replicas interface
4447
EvReplicaLookup = EvLock + 2 * 512,
@@ -383,6 +386,8 @@ struct TEvStateStorage {
383386
struct TEvPublishActorGone;
384387
struct TEvUpdateGroupConfig;
385388
struct TEvRingGroupPassAway;
389+
struct TEvListBoard;
390+
struct TEvListBoardResult;
386391

387392
struct TEvReplicaShutdown : public TEventPB<TEvStateStorage::TEvReplicaShutdown, NKikimrStateStorage::TEvReplicaShutdown, TEvStateStorage::EvReplicaShutdown> {
388393
};
@@ -514,6 +519,7 @@ struct TStateStorageInfo : public TThrRefBase {
514519
ERingGroupState State;
515520
bool WriteOnly = false;
516521
ui32 NToSelect = 0;
522+
TBridgePileId BridgePileId;
517523
TVector<TRing> Rings;
518524

519525
TString ToString() const;

ydb/core/base/statestorage_impl.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,17 @@ struct TEvStateStorage::TEvListStateStorageResult : public TEventLocal<TEvListSt
218218
{}
219219
};
220220

221+
// Local request event with no payload. The state storage proxy answers it
// with a TEvListBoardResult.
struct TEvStateStorage::TEvListBoard
    : public TEventLocal<TEvListBoard, EvListBoard>
{};
223+
224+
// Local reply event for TEvListBoard. Holds a shared, read-only pointer to
// the TStateStorageInfo supplied by whoever constructs the event.
struct TEvStateStorage::TEvListBoardResult
    : public TEventLocal<TEvListBoardResult, EvListBoardResult>
{
    // Configuration snapshot handed over by the sender.
    TIntrusiveConstPtr<TStateStorageInfo> Info;

    TEvListBoardResult(const TIntrusiveConstPtr<TStateStorageInfo>& info)
        : Info(info)
    {
    }
};
231+
221232
struct TEvStateStorage::TEvPublishActorGone : public TEventLocal<TEvPublishActorGone, EvPublishActorGone> {
222233
TActorId Replica;
223234

ydb/core/base/statestorage_proxy.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,10 @@ class TStateStorageProxy : public TActor<TStateStorageProxy> {
10701070
Send(ev->Sender, new TEvStateStorage::TEvListStateStorageResult(Info), 0, ev->Cookie);
10711071
}
10721072

1073+
// Replies to a TEvListBoard request with the proxy's current BoardInfo,
// echoing the request cookie back to the sender.
void Handle(TEvStateStorage::TEvListBoard::TPtr &ev) {
    auto* reply = new TEvStateStorage::TEvListBoardResult(BoardInfo);
    Send(ev->Sender, reply, 0, ev->Cookie);
}
1076+
10731077
void Handle(TEvStateStorage::TEvUpdateGroupConfig::TPtr &ev) {
10741078
auto *msg = ev->Get();
10751079
Info = msg->GroupConfig;
@@ -1142,6 +1146,7 @@ class TStateStorageProxy : public TActor<TStateStorageProxy> {
11421146
hFunc(TEvStateStorage::TEvListStateStorage, Handle);
11431147
hFunc(TEvStateStorage::TEvUpdateGroupConfig, Handle);
11441148
hFunc(TEvStateStorage::TEvRingGroupPassAway, Handle);
1149+
hFunc(TEvStateStorage::TEvListBoard, Handle);
11451150
fFunc(TEvents::TSystem::Unsubscribe, HandleUnsubscribe);
11461151
default:
11471152
if (Info->RingGroups.size() > 1)

ydb/core/health_check/health_check.cpp

Lines changed: 139 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <ydb/core/base/hive.h>
1616
#include <ydb/core/base/path.h>
1717
#include <ydb/core/base/statestorage.h>
18+
#include <ydb/core/base/statestorage_impl.h>
1819
#include <ydb/core/base/tablet_pipe.h>
1920
#include <ydb/core/cms/console/configs_dispatcher.h>
2021
#include <ydb/core/mon/mon.h>
@@ -155,6 +156,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
155156
QuotaUsage,
156157
BridgeGroupState,
157158
PileComputeState,
159+
StateStorage,
160+
StateStorageRing,
161+
StateStorageNode,
158162
};
159163

160164
enum ETimeoutTag {
@@ -316,7 +320,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
316320
if (issueLog.status() != Ydb::Monitoring::StatusFlag::UNSPECIFIED) {
317321
id << Ydb::Monitoring::StatusFlag_Status_Name(issueLog.status()) << '-';
318322
}
319-
id << crc16(issueLog.message());
323+
id << crc16(TStringBuilder() << issueLog.message() << issueLog.type());
320324
if (location.database().name()) {
321325
id << '-' << crc32(location.database().name());
322326
}
@@ -363,6 +367,15 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
363367
if (location.compute().schema().path()) {
364368
id << '-' << crc32(location.compute().schema().path());
365369
}
370+
if (location.compute().state_storage().pile().name()) {
371+
id << '-' << location.compute().state_storage().pile().name();
372+
}
373+
if (location.compute().state_storage().ring()) {
374+
id << '-' << location.compute().state_storage().ring();
375+
}
376+
if (location.compute().state_storage().node().id()) {
377+
id << '-' << location.compute().state_storage().node().id();
378+
}
366379
return id.Str();
367380
}
368381

@@ -392,10 +405,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
392405
if (Location.ByteSizeLong() > 0) {
393406
issueLog.mutable_location()->CopyFrom(Location);
394407
}
395-
issueLog.set_id(GetIssueId(issueLog));
396408
if (Type) {
397409
issueLog.set_type(Type);
398410
}
411+
issueLog.set_id(GetIssueId(issueLog));
399412
issueLog.set_level(Level);
400413
if (!reason.empty()) {
401414
for (const TString& r : reason) {
@@ -675,6 +688,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
675688
std::optional<TRequestResponse<TEvSysView::TEvGetPDisksResponse>> PDisks;
676689
std::optional<TRequestResponse<TEvNodeWardenStorageConfig>> NodeWardenStorageConfig;
677690
std::optional<TRequestResponse<TEvStateStorage::TEvBoardInfo>> DatabaseBoardInfo;
691+
std::optional<TRequestResponse<TEvStateStorage::TEvListStateStorageResult>> StateStorageInfo;
692+
std::optional<TRequestResponse<TEvStateStorage::TEvListSchemeBoardResult>> SchemeBoardInfo;
693+
std::optional<TRequestResponse<TEvStateStorage::TEvListBoardResult>> BoardInfo;
678694
THashSet<TNodeId> UnknownStaticGroups;
679695

680696
const NKikimrConfig::THealthCheckConfig& HealthCheckConfig;
@@ -837,6 +853,16 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
837853
NodeWardenStorageConfig = RequestStorageConfig();
838854
}
839855

856+
if (!IsSpecificDatabaseFilter()) {
857+
StateStorageInfo = TRequestResponse<TEvStateStorage::TEvListStateStorageResult>(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvStateStorage::TEvListStateStorageResult"));
858+
Send(MakeStateStorageProxyID(), new TEvStateStorage::TEvListStateStorage(), 0/*flags*/, 0/*cookie*/, Span.GetTraceId());
859+
SchemeBoardInfo = TRequestResponse<TEvStateStorage::TEvListSchemeBoardResult>(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvStateStorage::TEvListSchemeBoardResult"));
860+
Send(MakeStateStorageProxyID(), new TEvStateStorage::TEvListSchemeBoard(false), 0/*flags*/, 0/*cookie*/, Span.GetTraceId());
861+
BoardInfo = TRequestResponse<TEvStateStorage::TEvListBoardResult>(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvStateStorage::TEvListBoardResult"));
862+
Send(MakeStateStorageProxyID(), new TEvStateStorage::TEvListBoard(), 0/*flags*/, 0/*cookie*/, Span.GetTraceId());
863+
Requests += 3;
864+
}
865+
840866

841867
NodesInfo = TRequestResponse<TEvInterconnect::TEvNodesInfo>(Span.CreateChild(TComponentTracingLevels::TTablet::Detailed, "TEvInterconnect::TEvListNodes"));
842868
Send(GetNameserviceActorId(), new TEvInterconnect::TEvListNodes(), 0/*flags*/, 0/*cookie*/, Span.GetTraceId());
@@ -921,6 +947,37 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
921947
}
922948
}
923949

950+
// Walks every ring group, ring and replica of the given configuration and
// calls RequestGenericNode() for the node hosting each replica.
void RequestNodes(TIntrusiveConstPtr<TStateStorageInfo> info) {
    for (const auto& ringGroup : info->RingGroups) {
        for (const auto& currentRing : ringGroup.Rings) {
            for (const auto& replicaId : currentRing.Replicas) {
                const auto nodeId = replicaId.NodeId();
                RequestGenericNode(nodeId);
            }
        }
    }
}
959+
960+
// Stores the state storage configuration reply. When Set() accepts it,
// requests the nodes of all listed replicas and marks this request as done.
void Handle(TEvStateStorage::TEvListStateStorageResult::TPtr& ev) {
    if (!StateStorageInfo->Set(std::move(ev))) {
        return; // Set() rejected the reply; nothing more to do
    }
    RequestNodes(StateStorageInfo->Get()->Info);
    RequestDone("TEvListStateStorageResult");
}
966+
967+
// Stores the scheme board configuration reply. When Set() accepts it,
// requests the nodes of all listed replicas and marks this request as done.
void Handle(TEvStateStorage::TEvListSchemeBoardResult::TPtr& ev) {
    if (SchemeBoardInfo->Set(std::move(ev))) {
        RequestNodes(SchemeBoardInfo->Get()->Info);
        // The request name is plain ASCII; a look-alike non-ASCII letter
        // here would make the name impossible to grep for or match.
        RequestDone("TEvListSchemeBoardResult");
    }
}
973+
974+
// Stores the board configuration reply. When Set() accepts it, requests the
// nodes of all listed replicas and marks this request as done.
void Handle(TEvStateStorage::TEvListBoardResult::TPtr& ev) {
    if (!BoardInfo->Set(std::move(ev))) {
        return; // Set() rejected the reply; nothing more to do
    }
    RequestNodes(BoardInfo->Get()->Info);
    RequestDone("TEvListBoardResult");
}
980+
924981
STATEFN(StateWait) {
925982
switch (ev->GetTypeRewrite()) {
926983
hFunc(TEvents::TEvUndelivered, Handle);
@@ -946,6 +1003,9 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
9461003
hFunc(TEvStateStorage::TEvBoardInfo, Handle);
9471004
hFunc(TEvents::TEvWakeup, HandleTimeout);
9481005
hFunc(TEvNodeWardenStorageConfig, Handle);
1006+
hFunc(TEvStateStorage::TEvListStateStorageResult, Handle);
1007+
hFunc(TEvStateStorage::TEvListSchemeBoardResult, Handle);
1008+
hFunc(TEvStateStorage::TEvListBoardResult, Handle);
9491009
}
9501010
}
9511011

@@ -2999,6 +3059,18 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
29993059
message = std::regex_replace(message.c_str(), std::regex("^PDisk "), "PDisks ");
30003060
break;
30013061
}
3062+
case ETags::StateStorageRing: {
3063+
message = std::regex_replace(message.c_str(), std::regex("^Ring has "), "Rings have ");
3064+
message = std::regex_replace(message.c_str(), std::regex("^Ring is "), "Rings are ");
3065+
message = std::regex_replace(message.c_str(), std::regex("^Ring "), "Rings ");
3066+
break;
3067+
}
3068+
case ETags::StateStorageNode: {
    // Node-level records carry "Node ..." messages (e.g. "Node is not available"),
    // so the "Node" prefix is what has to be pluralized for this tag.
    message = std::regex_replace(message.c_str(), std::regex("^Node has "), "Nodes have ");
    message = std::regex_replace(message.c_str(), std::regex("^Node is "), "Nodes are ");
    message = std::regex_replace(message.c_str(), std::regex("^Node "), "Nodes ");
    break;
}
30023074
default:
30033075
break;
30043076
}
@@ -3054,6 +3126,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
30543126
isSimilar = it->IssueLog.location().storage().pool().group().pile().name()
30553127
== similar.begin()->IssueLog.location().storage().pool().group().pile().name();
30563128
}
3129+
if (isSimilar && similar.begin()->IssueLog.location().compute().state_storage().has_pile()) {
3130+
isSimilar = it->IssueLog.location().compute().state_storage().pile().name()
3131+
== similar.begin()->IssueLog.location().compute().state_storage().pile().name();
3132+
}
30573133
if (isSimilar) {
30583134
auto move = it++;
30593135
similar.splice(similar.end(), records, move);
@@ -3295,6 +3371,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
32953371
MergeLevelRecords(mergeContext, ETags::VDiskState, ETags::BridgeGroupState);
32963372
MergeLevelRecords(mergeContext, ETags::VDiskState, ETags::GroupState);
32973373
MergeLevelRecords(mergeContext, ETags::PDiskState, ETags::VDiskState);
3374+
MergeLevelRecords(mergeContext, ETags::StateStorageRing);
3375+
MergeLevelRecords(mergeContext, ETags::StateStorageNode, ETags::StateStorageRing);
32983376
}
32993377
mergeContext.FillRecords(records);
33003378
}
@@ -3463,13 +3541,72 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
34633541
}
34643542
}
34653543

3544+
// Builds health-check issues for one replicated service described by `info`
// and merges them into `context`.
//
// context - overall result accumulator; receives the max status and issues.
// type    - issue type prefix; also used to derive "PILE_<type>",
//           "<type>_RING" and "<type>_NODE" sub-context types.
// info    - ring groups / rings / replicas to evaluate.
//
// Only PRIMARY and SYNCHRONIZED ring groups are evaluated. A ring is "bad"
// when its overall status is RED after reporting unavailable replica nodes.
void FillStateStorage(TOverallStateContext& context, TString type, TIntrusiveConstPtr<TStateStorageInfo> info) {
    TSelfCheckResult ssContext;
    ssContext.Type = type;
    for (const auto& ringGroup : info->RingGroups) {
        if (ringGroup.State != ERingGroupState::PRIMARY && ringGroup.State != ERingGroupState::SYNCHRONIZED) {
            continue;
        }
        TSelfCheckResult* currentContext = &ssContext;
        TSelfCheckContext pileContext(&ssContext, TStringBuilder() << "PILE_" << type);
        if (static_cast<bool>(ringGroup.BridgePileId) && NodeWardenStorageConfig && NodeWardenStorageConfig->IsOk()) {
            // BridgeInfo is checked before use so a storage config without
            // bridge information cannot cause a null dereference; in that
            // case issues are attached to the service-level context instead.
            const auto& bridgeInfo = NodeWardenStorageConfig->Get()->BridgeInfo;
            if (bridgeInfo) {
                const auto& pileName = bridgeInfo->GetPile(ringGroup.BridgePileId)->Name;
                pileContext.Location.mutable_compute()->mutable_state_storage()->mutable_pile()->set_name(pileName);
                currentContext = &pileContext;
            }
        }
        ui32 disabledRings = 0;
        ui32 badRings = 0;
        for (size_t ringIdx = 0; ringIdx < ringGroup.Rings.size(); ++ringIdx) {
            const auto& ring = ringGroup.Rings[ringIdx];
            TSelfCheckContext ringContext(currentContext, TStringBuilder() << type << "_RING");
            // Rings are reported 1-based in the issue location.
            ringContext.Location.mutable_compute()->mutable_state_storage()->set_ring(ringIdx + 1);
            if (ring.IsDisabled) {
                ++disabledRings;
                continue;
            }
            for (const auto& replica : ring.Replicas) {
                const auto node = replica.NodeId();
                // Use find() instead of operator[] so that checking a node
                // never inserts a placeholder entry into NodeSystemState.
                // A missing entry is treated like a failed response.
                const auto nodeState = NodeSystemState.find(node);
                if (nodeState == NodeSystemState.end() || !nodeState->second.IsOk()) {
                    TSelfCheckContext nodeContext(&ringContext, TStringBuilder() << type << "_NODE");
                    nodeContext.Location.mutable_compute()->mutable_state_storage()->mutable_node()->set_id(node);
                    nodeContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is not available", ETags::StateStorageNode);
                }
            }
            ringContext.ReportWithMaxChildStatus("Ring has unavailable nodes", ETags::StateStorageRing, {ETags::StateStorageNode});
            if (ringContext.GetOverallStatus() == Ydb::Monitoring::StatusFlag::RED) {
                ++badRings;
            }
        }
        // Disabled and bad rings together may not exceed (NToSelect - 1) / 2.
        // NOTE(review): NToSelect is unsigned, so NToSelect == 0 wraps around
        // and disables the RED branch - confirm 0 can never reach this point.
        if (disabledRings + badRings > (ringGroup.NToSelect - 1) / 2) {
            currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::RED, "There is not enough functional rings", ETags::StateStorage);
        } else if (badRings > 1) {
            currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Multiple rings have unavailable replicas", ETags::StateStorage);
        } else if (badRings > 0) {
            currentContext->ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, "One ring has unavailable replicas", ETags::StateStorage);
        }
    }
    MergeRecords(ssContext.IssueRecords);
    context.UpdateMaxStatus(ssContext.GetOverallStatus());
    context.AddIssues(ssContext.IssueRecords);
}
3593+
34663594
void FillResult(TOverallStateContext context) {
34673595
if (IsSpecificDatabaseFilter()) {
34683596
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
34693597
} else {
34703598
for (auto& [path, state] : DatabaseState) {
34713599
FillDatabaseResult(context, path, state);
34723600
}
3601+
if (StateStorageInfo && StateStorageInfo->IsOk()) {
3602+
FillStateStorage(context, "STATE_STORAGE", StateStorageInfo->Get()->Info);
3603+
}
3604+
if (SchemeBoardInfo && SchemeBoardInfo->IsOk()) {
3605+
FillStateStorage(context, "SCHEME_BOARD", SchemeBoardInfo->Get()->Info);
3606+
}
3607+
if (BoardInfo && BoardInfo->IsOk()) {
3608+
FillStateStorage(context, "BOARD", BoardInfo->Get()->Info);
3609+
}
34733610
}
34743611
if (DatabaseState.empty()) {
34753612
Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status());

0 commit comments

Comments
 (0)