diff options
author | innokentii <innokentii@yandex-team.com> | 2022-11-23 18:01:44 +0300 |
---|---|---|
committer | innokentii <innokentii@yandex-team.com> | 2022-11-23 18:01:44 +0300 |
commit | 5acd8c75835f61b89d3fb33d96a84d8c07ccdbb4 (patch) | |
tree | 0dfb9689f809372aff490851dd10588ee73df616 | |
parent | 852c6a25f38f4967fa9a5a5d999f50cac0974b8e (diff) | |
download | ydb-5acd8c75835f61b89d3fb33d96a84d8c07ccdbb4.tar.gz |
Sentinel add ignore reason
add ignore reason
-rw-r--r-- | ydb/core/cms/sentinel.cpp | 26 | ||||
-rw-r--r-- | ydb/core/cms/sentinel_impl.h | 6 | ||||
-rw-r--r-- | ydb/core/cms/sentinel_ut.cpp | 18 | ||||
-rw-r--r-- | ydb/core/cms/ui/sentinel_state.js | 2 | ||||
-rw-r--r-- | ydb/core/protos/cms.proto | 10 |
5 files changed, 50 insertions, 12 deletions
diff --git a/ydb/core/cms/sentinel.cpp b/ydb/core/cms/sentinel.cpp index e7ccde90a9a..a818da004f4 100644 --- a/ydb/core/cms/sentinel.cpp +++ b/ydb/core/cms/sentinel.cpp @@ -213,7 +213,7 @@ TGuardian::TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio, ui32 room } TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TString& issues, - TPDiskIDSet& disallowed) const { + TPDiskIgnoredMap& disallowed) const { TPDiskIDSet result; TStringBuilder issuesBuilder; @@ -232,7 +232,9 @@ TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TSt result.insert(kv.second.begin(), kv.second.end()); } else { LOG_IGNORED(DataCenter); - disallowed.insert(kv.second.begin(), kv.second.end()); + for (auto& pdisk : kv.second) { + disallowed.emplace(pdisk, NKikimrCms::TPDiskInfo::RATIO_BY_DATACENTER); + } } } @@ -241,7 +243,9 @@ TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TSt if (kv.first && !CheckRatio(kv, all.ByRoom, RoomRatio)) { LOG_IGNORED(Room); - disallowed.insert(kv.second.begin(), kv.second.end()); + for (auto& pdisk : kv.second) { + disallowed.emplace(pdisk, NKikimrCms::TPDiskInfo::RATIO_BY_ROOM); + } EraseNodesIf(result, [&room = kv.second](const TPDiskID& id) { return room.contains(id); }); @@ -257,7 +261,9 @@ TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TSt } if (kv.first && !CheckRatio(kv, all.ByRack, RackRatio)) { LOG_IGNORED(Rack); - disallowed.insert(kv.second.begin(), kv.second.end()); + for (auto& pdisk : kv.second) { + disallowed.emplace(pdisk, NKikimrCms::TPDiskInfo::RATIO_BY_RACK); + } EraseNodesIf(result, [&rack = kv.second](const TPDiskID& id) { return rack.contains(id); }); @@ -967,6 +973,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> { if (!SentinelState->Nodes.contains(id.NodeId)) { LOG_E("Missing node info" << ": pdiskId# " << id); + info.IgnoreReason = NKikimrCms::TPDiskInfo::MISSING_NODE; continue; } @@ -983,7 +990,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> { } TString issues; - THashSet<TPDiskID, TPDiskIDHash> disallowed; + TClusterMap::TPDiskIgnoredMap disallowed; TClusterMap::TPDiskIDSet allowed = changed.GetAllowedPDisks(all, issues, disallowed); std::move(alwaysAllowed.begin(), alwaysAllowed.end(), std::inserter(allowed, allowed.begin())); @@ -991,6 +998,8 @@ class TSentinel: public TActorBootstrapped<TSentinel> { Y_VERIFY(SentinelState->PDisks.contains(id)); TPDiskInfo::TPtr info = SentinelState->PDisks.at(id); + info->IgnoreReason = NKikimrCms::TPDiskInfo::NOT_IGNORED; + if (!info->IsChangingAllowed()) { info->AllowChanging(); continue; @@ -1019,9 +1028,11 @@ class TSentinel: public TActorBootstrapped<TSentinel> { } } - for (const auto& id : disallowed) { + for (const auto& [id, reason] : disallowed) { Y_VERIFY(SentinelState->PDisks.contains(id)); - SentinelState->PDisks.at(id)->DisallowChanging(); + auto& pdisk = SentinelState->PDisks.at(id); + pdisk->DisallowChanging(); + pdisk->IgnoreReason = reason; } if (issues) { @@ -1127,6 +1138,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> { entry.MutableInfo()->SetPrevDesiredStatus(info->PrevStatusChangerState->Status); entry.MutableInfo()->SetPrevStatusChangeAttempts(info->PrevStatusChangerState->Attempt); } + entry.MutableInfo()->SetIgnoreReason(info->IgnoreReason); } } } diff --git a/ydb/core/cms/sentinel_impl.h b/ydb/core/cms/sentinel_impl.h index 00029ed6169..bdb380858b1 100644 --- a/ydb/core/cms/sentinel_impl.h +++ b/ydb/core/cms/sentinel_impl.h @@ -88,10 +88,13 @@ struct TPDiskInfo { using TPtr = TIntrusivePtr<TPDiskInfo>; + using EIgnoreReason = NKikimrCms::TPDiskInfo::EIgnoreReason; + TActorId StatusChanger; TInstant LastStatusChange; TStatusChangerState::TPtr StatusChangerState; TStatusChangerState::TPtr PrevStatusChangerState; + EIgnoreReason IgnoreReason = NKikimrCms::TPDiskInfo::NOT_IGNORED; explicit TPDiskInfo(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits); @@ -137,6 +140,7 @@ struct TSentinelState: public TSimpleRefCount<TSentinelState> { class TClusterMap { public: using TPDiskIDSet = THashSet<TPDiskID, TPDiskIDHash>; + using TPDiskIgnoredMap = THashMap<TPDiskID, TPDiskInfo::EIgnoreReason, TPDiskIDHash>; using TDistribution = THashMap<TString, TPDiskIDSet>; using TNodeIDSet = THashSet<ui32>; @@ -163,7 +167,7 @@ class TGuardian : public TClusterMap { public: explicit TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio = 100, ui32 roomRatio = 100, ui32 rackRatio = 100); - TPDiskIDSet GetAllowedPDisks(const TClusterMap& all, TString& issues, TPDiskIDSet& disallowed) const; + TPDiskIDSet GetAllowedPDisks(const TClusterMap& all, TString& issues, TPDiskIgnoredMap& disallowed) const; private: const ui32 DataCenterRatio; diff --git a/ydb/core/cms/sentinel_ut.cpp b/ydb/core/cms/sentinel_ut.cpp index 0e1f33ae94d..1867ef3d032 100644 --- a/ydb/core/cms/sentinel_ut.cpp +++ b/ydb/core/cms/sentinel_ut.cpp @@ -174,6 +174,16 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) { return {state, sentinelState}; } + THashSet<TPDiskID, TPDiskIDHash> MapKeys(TClusterMap::TPDiskIgnoredMap& map) { + THashSet<TPDiskID, TPDiskIDHash> result; + + for (auto& [k, _] : map) { + result.insert(k); + } + + return result; + }; + void GuardianDataCenterRatio(ui16 numDataCenter, const TVector<ui16>& nodesPerDataCenterVariants, bool anyDC = false) { UNIT_ASSERT(!anyDC || numDataCenter == 1); @@ -198,7 +208,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) { } TString issues; - THashSet<TPDiskID, TPDiskIDHash> disallowed; + TClusterMap::TPDiskIgnoredMap disallowed; UNIT_ASSERT_VALUES_EQUAL(changed.GetAllowedPDisks(all, issues, disallowed), changedSet); UNIT_ASSERT(disallowed.empty()); @@ -218,7 +228,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) { disallowed.clear(); if (!anyDC) { UNIT_ASSERT(changed.GetAllowedPDisks(all, issues, disallowed).empty()); - UNIT_ASSERT_VALUES_EQUAL(disallowed, changedSet); + UNIT_ASSERT_VALUES_EQUAL(MapKeys(disallowed), changedSet); UNIT_ASSERT_STRING_CONTAINS(issues, "due to DataCenterRatio"); } else { UNIT_ASSERT_VALUES_EQUAL(changed.GetAllowedPDisks(all, issues, disallowed), changedSet); @@ -259,7 +269,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) { } TString issues; - THashSet<TPDiskID, TPDiskIDHash> disallowed; + TClusterMap::TPDiskIgnoredMap disallowed; UNIT_ASSERT_VALUES_EQUAL(changed.GetAllowedPDisks(all, issues, disallowed), changedSet); UNIT_ASSERT(disallowed.empty()); @@ -287,7 +297,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) { UNIT_ASSERT(issues.empty()); } else { UNIT_ASSERT_VALUES_EQUAL(allowed, decltype(allowed){}); - UNIT_ASSERT_VALUES_EQUAL(disallowed, changedSet); + UNIT_ASSERT_VALUES_EQUAL(MapKeys(disallowed), changedSet); UNIT_ASSERT_STRING_CONTAINS(issues, "due to RackRatio"); } } diff --git a/ydb/core/cms/ui/sentinel_state.js b/ydb/core/cms/ui/sentinel_state.js index d151441b059..9ae740493d4 100644 --- a/ydb/core/cms/ui/sentinel_state.js +++ b/ydb/core/cms/ui/sentinel_state.js @@ -44,6 +44,7 @@ const PDiskHeaders = [ "PrevDesiredStatus", "PrevStatusChangeAttempts", "LastStatusChange", + "IgnoreReason", ]; class CmsSentinelState { @@ -145,6 +146,7 @@ class CmsSentinelState { "PrevDesiredStatus": this.id.bind(this), "PrevStatusChangeAttempts": this.id.bind(this), "LastStatusChange": this.id.bind(this), + "IgnoreReason": this.id.bind(this), }; } diff --git a/ydb/core/protos/cms.proto b/ydb/core/protos/cms.proto index 85168eab00a..05ae41eac08 100644 --- a/ydb/core/protos/cms.proto +++ b/ydb/core/protos/cms.proto @@ -601,11 +601,20 @@ message TGetSentinelStateRequest { SUSPICIOUS = 2; ALL = 3; } + optional EShow Show = 1; repeated TFilterRange Ranges = 2; } message TPDiskInfo { + enum EIgnoreReason { + NOT_IGNORED = 1; + MISSING_NODE = 2; + RATIO_BY_DATACENTER = 3; + RATIO_BY_ROOM = 4; + RATIO_BY_RACK = 5; + } + optional uint32 State = 1; // EPDiskState optional uint32 PrevState = 2; // EPDiskState optional uint64 StateCounter = 3; @@ -617,6 +626,7 @@ message TPDiskInfo { optional uint32 PrevDesiredStatus = 9; optional uint32 PrevStatusChangeAttempts = 10; optional string LastStatusChange = 11; + optional EIgnoreReason IgnoreReason = 12; } message TPDisk { |