aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorinnokentii <innokentii@yandex-team.com>2022-11-23 18:01:44 +0300
committerinnokentii <innokentii@yandex-team.com>2022-11-23 18:01:44 +0300
commit5acd8c75835f61b89d3fb33d96a84d8c07ccdbb4 (patch)
tree0dfb9689f809372aff490851dd10588ee73df616
parent852c6a25f38f4967fa9a5a5d999f50cac0974b8e (diff)
downloadydb-5acd8c75835f61b89d3fb33d96a84d8c07ccdbb4.tar.gz
Sentinel add ignore reason
add ignore reason
-rw-r--r--ydb/core/cms/sentinel.cpp26
-rw-r--r--ydb/core/cms/sentinel_impl.h6
-rw-r--r--ydb/core/cms/sentinel_ut.cpp18
-rw-r--r--ydb/core/cms/ui/sentinel_state.js2
-rw-r--r--ydb/core/protos/cms.proto10
5 files changed, 50 insertions, 12 deletions
diff --git a/ydb/core/cms/sentinel.cpp b/ydb/core/cms/sentinel.cpp
index e7ccde90a9a..a818da004f4 100644
--- a/ydb/core/cms/sentinel.cpp
+++ b/ydb/core/cms/sentinel.cpp
@@ -213,7 +213,7 @@ TGuardian::TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio, ui32 room
}
TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TString& issues,
- TPDiskIDSet& disallowed) const {
+ TPDiskIgnoredMap& disallowed) const {
TPDiskIDSet result;
TStringBuilder issuesBuilder;
@@ -232,7 +232,9 @@ TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TSt
result.insert(kv.second.begin(), kv.second.end());
} else {
LOG_IGNORED(DataCenter);
- disallowed.insert(kv.second.begin(), kv.second.end());
+ for (auto& pdisk : kv.second) {
+ disallowed.emplace(pdisk, NKikimrCms::TPDiskInfo::RATIO_BY_DATACENTER);
+ }
}
}
@@ -241,7 +243,9 @@ TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TSt
if (kv.first && !CheckRatio(kv, all.ByRoom, RoomRatio)) {
LOG_IGNORED(Room);
- disallowed.insert(kv.second.begin(), kv.second.end());
+ for (auto& pdisk : kv.second) {
+ disallowed.emplace(pdisk, NKikimrCms::TPDiskInfo::RATIO_BY_ROOM);
+ }
EraseNodesIf(result, [&room = kv.second](const TPDiskID& id) {
return room.contains(id);
});
@@ -257,7 +261,9 @@ TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TSt
}
if (kv.first && !CheckRatio(kv, all.ByRack, RackRatio)) {
LOG_IGNORED(Rack);
- disallowed.insert(kv.second.begin(), kv.second.end());
+ for (auto& pdisk : kv.second) {
+ disallowed.emplace(pdisk, NKikimrCms::TPDiskInfo::RATIO_BY_RACK);
+ }
EraseNodesIf(result, [&rack = kv.second](const TPDiskID& id) {
return rack.contains(id);
});
@@ -967,6 +973,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
if (!SentinelState->Nodes.contains(id.NodeId)) {
LOG_E("Missing node info"
<< ": pdiskId# " << id);
+ info.IgnoreReason = NKikimrCms::TPDiskInfo::MISSING_NODE;
continue;
}
@@ -983,7 +990,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
}
TString issues;
- THashSet<TPDiskID, TPDiskIDHash> disallowed;
+ TClusterMap::TPDiskIgnoredMap disallowed;
TClusterMap::TPDiskIDSet allowed = changed.GetAllowedPDisks(all, issues, disallowed);
std::move(alwaysAllowed.begin(), alwaysAllowed.end(), std::inserter(allowed, allowed.begin()));
@@ -991,6 +998,8 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
Y_VERIFY(SentinelState->PDisks.contains(id));
TPDiskInfo::TPtr info = SentinelState->PDisks.at(id);
+ info->IgnoreReason = NKikimrCms::TPDiskInfo::NOT_IGNORED;
+
if (!info->IsChangingAllowed()) {
info->AllowChanging();
continue;
@@ -1019,9 +1028,11 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
}
}
- for (const auto& id : disallowed) {
+ for (const auto& [id, reason] : disallowed) {
Y_VERIFY(SentinelState->PDisks.contains(id));
- SentinelState->PDisks.at(id)->DisallowChanging();
+ auto& pdisk = SentinelState->PDisks.at(id);
+ pdisk->DisallowChanging();
+ pdisk->IgnoreReason = reason;
}
if (issues) {
@@ -1127,6 +1138,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
entry.MutableInfo()->SetPrevDesiredStatus(info->PrevStatusChangerState->Status);
entry.MutableInfo()->SetPrevStatusChangeAttempts(info->PrevStatusChangerState->Attempt);
}
+ entry.MutableInfo()->SetIgnoreReason(info->IgnoreReason);
}
}
}
diff --git a/ydb/core/cms/sentinel_impl.h b/ydb/core/cms/sentinel_impl.h
index 00029ed6169..bdb380858b1 100644
--- a/ydb/core/cms/sentinel_impl.h
+++ b/ydb/core/cms/sentinel_impl.h
@@ -88,10 +88,13 @@ struct TPDiskInfo
{
using TPtr = TIntrusivePtr<TPDiskInfo>;
+ using EIgnoreReason = NKikimrCms::TPDiskInfo::EIgnoreReason;
+
TActorId StatusChanger;
TInstant LastStatusChange;
TStatusChangerState::TPtr StatusChangerState;
TStatusChangerState::TPtr PrevStatusChangerState;
+ EIgnoreReason IgnoreReason = NKikimrCms::TPDiskInfo::NOT_IGNORED;
explicit TPDiskInfo(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits);
@@ -137,6 +140,7 @@ struct TSentinelState: public TSimpleRefCount<TSentinelState> {
class TClusterMap {
public:
using TPDiskIDSet = THashSet<TPDiskID, TPDiskIDHash>;
+ using TPDiskIgnoredMap = THashMap<TPDiskID, TPDiskInfo::EIgnoreReason, TPDiskIDHash>;
using TDistribution = THashMap<TString, TPDiskIDSet>;
using TNodeIDSet = THashSet<ui32>;
@@ -163,7 +167,7 @@ class TGuardian : public TClusterMap {
public:
explicit TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio = 100, ui32 roomRatio = 100, ui32 rackRatio = 100);
- TPDiskIDSet GetAllowedPDisks(const TClusterMap& all, TString& issues, TPDiskIDSet& disallowed) const;
+ TPDiskIDSet GetAllowedPDisks(const TClusterMap& all, TString& issues, TPDiskIgnoredMap& disallowed) const;
private:
const ui32 DataCenterRatio;
diff --git a/ydb/core/cms/sentinel_ut.cpp b/ydb/core/cms/sentinel_ut.cpp
index 0e1f33ae94d..1867ef3d032 100644
--- a/ydb/core/cms/sentinel_ut.cpp
+++ b/ydb/core/cms/sentinel_ut.cpp
@@ -174,6 +174,16 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
return {state, sentinelState};
}
+ // Test helper: collects the keys (PDisk ids) of an ignored-PDisk map into a set,
+ // dropping the ignore reasons, so the result can be compared with a TPDiskIDSet.
+ // The map is only read, hence it is taken by const reference.
+ THashSet<TPDiskID, TPDiskIDHash> MapKeys(const TClusterMap::TPDiskIgnoredMap& map) {
+ THashSet<TPDiskID, TPDiskIDHash> result;
+
+ for (const auto& [k, _] : map) {
+ result.insert(k);
+ }
+
+ return result;
+ }
+
void GuardianDataCenterRatio(ui16 numDataCenter, const TVector<ui16>& nodesPerDataCenterVariants, bool anyDC = false) {
UNIT_ASSERT(!anyDC || numDataCenter == 1);
@@ -198,7 +208,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
}
TString issues;
- THashSet<TPDiskID, TPDiskIDHash> disallowed;
+ TClusterMap::TPDiskIgnoredMap disallowed;
UNIT_ASSERT_VALUES_EQUAL(changed.GetAllowedPDisks(all, issues, disallowed), changedSet);
UNIT_ASSERT(disallowed.empty());
@@ -218,7 +228,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
disallowed.clear();
if (!anyDC) {
UNIT_ASSERT(changed.GetAllowedPDisks(all, issues, disallowed).empty());
- UNIT_ASSERT_VALUES_EQUAL(disallowed, changedSet);
+ UNIT_ASSERT_VALUES_EQUAL(MapKeys(disallowed), changedSet);
UNIT_ASSERT_STRING_CONTAINS(issues, "due to DataCenterRatio");
} else {
UNIT_ASSERT_VALUES_EQUAL(changed.GetAllowedPDisks(all, issues, disallowed), changedSet);
@@ -259,7 +269,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
}
TString issues;
- THashSet<TPDiskID, TPDiskIDHash> disallowed;
+ TClusterMap::TPDiskIgnoredMap disallowed;
UNIT_ASSERT_VALUES_EQUAL(changed.GetAllowedPDisks(all, issues, disallowed), changedSet);
UNIT_ASSERT(disallowed.empty());
@@ -287,7 +297,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
UNIT_ASSERT(issues.empty());
} else {
UNIT_ASSERT_VALUES_EQUAL(allowed, decltype(allowed){});
- UNIT_ASSERT_VALUES_EQUAL(disallowed, changedSet);
+ UNIT_ASSERT_VALUES_EQUAL(MapKeys(disallowed), changedSet);
UNIT_ASSERT_STRING_CONTAINS(issues, "due to RackRatio");
}
}
diff --git a/ydb/core/cms/ui/sentinel_state.js b/ydb/core/cms/ui/sentinel_state.js
index d151441b059..9ae740493d4 100644
--- a/ydb/core/cms/ui/sentinel_state.js
+++ b/ydb/core/cms/ui/sentinel_state.js
@@ -44,6 +44,7 @@ const PDiskHeaders = [
"PrevDesiredStatus",
"PrevStatusChangeAttempts",
"LastStatusChange",
+ "IgnoreReason",
];
class CmsSentinelState {
@@ -145,6 +146,7 @@ class CmsSentinelState {
"PrevDesiredStatus": this.id.bind(this),
"PrevStatusChangeAttempts": this.id.bind(this),
"LastStatusChange": this.id.bind(this),
+ "IgnoreReason": this.id.bind(this),
};
}
diff --git a/ydb/core/protos/cms.proto b/ydb/core/protos/cms.proto
index 85168eab00a..05ae41eac08 100644
--- a/ydb/core/protos/cms.proto
+++ b/ydb/core/protos/cms.proto
@@ -601,11 +601,20 @@ message TGetSentinelStateRequest {
SUSPICIOUS = 2;
ALL = 3;
}
+
optional EShow Show = 1;
repeated TFilterRange Ranges = 2;
}
message TPDiskInfo {
+ // Reason recorded by Sentinel for a PDisk it ignores; reported to clients
+ // through the IgnoreReason field of TPDiskInfo.
+ enum EIgnoreReason {
+ // Initial value of the in-memory PDisk info; also re-assigned for every
+ // PDisk returned as allowed by the guardian check.
+ NOT_IGNORED = 1;
+ // Sentinel state has no entry for the node that hosts the PDisk.
+ MISSING_NODE = 2;
+ // The PDisk's data center did not pass the guardian's DataCenterRatio check.
+ RATIO_BY_DATACENTER = 3;
+ // The PDisk's room did not pass the guardian's RoomRatio check.
+ RATIO_BY_ROOM = 4;
+ // The PDisk's rack did not pass the guardian's RackRatio check.
+ RATIO_BY_RACK = 5;
+ }
+
optional uint32 State = 1; // EPDiskState
optional uint32 PrevState = 2; // EPDiskState
optional uint64 StateCounter = 3;
@@ -617,6 +626,7 @@ message TPDiskInfo {
optional uint32 PrevDesiredStatus = 9;
optional uint32 PrevStatusChangeAttempts = 10;
optional string LastStatusChange = 11;
+ optional EIgnoreReason IgnoreReason = 12;
}
message TPDisk {