diff options
author | serg-belyakov <serg-belyakov@yandex-team.com> | 2023-05-15 18:04:22 +0300 |
---|---|---|
committer | serg-belyakov <serg-belyakov@yandex-team.com> | 2023-05-15 18:04:22 +0300 |
commit | 515a7f2286e7559ea89864f64a1ed6f50a552d0b (patch) | |
tree | fa3b108796d0625b928b372eead67edc80986cdd | |
parent | b8c3fc044a5af9565cb6135e80a103bf3717ce63 (diff) | |
download | ydb-515a7f2286e7559ea89864f64a1ed6f50a552d0b.tar.gz |
Add diagnostic information to GroupLayoutSanitizer,
Add error reason to reassigner actor
Add operation log to html page
Add TOperationLog for GroupLayoutSanitizer to SelfHeal actor
-rw-r--r-- | ydb/core/mind/bscontroller/self_heal.cpp | 72 |
1 files changed, 65 insertions, 7 deletions
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp index 095efb5b5fa..dd2a3aa6c29 100644 --- a/ydb/core/mind/bscontroller/self_heal.cpp +++ b/ydb/core/mind/bscontroller/self_heal.cpp @@ -6,7 +6,13 @@ #include "group_layout_checker.h" #include "layout_helpers.h" +#include <ydb/core/debug_tools/operation_log.h> + namespace NKikimr::NBsController { + enum class EGroupRepairOperation { + SelfHeal = 0, + GroupLayoutSanitizer, + }; enum { EvReassignerDone = EventSpaceBegin(TKikimrEvents::ES_PRIVATE), @@ -15,10 +21,14 @@ namespace NKikimr::NBsController { struct TEvReassignerDone : TEventLocal<TEvReassignerDone, EvReassignerDone> { TGroupId GroupId; bool Success; + EGroupRepairOperation Operation; + TString ErrorReason; - TEvReassignerDone(TGroupId groupId, bool success) + TEvReassignerDone(TGroupId groupId, bool success, EGroupRepairOperation operation, TString errorReason = "") : GroupId(groupId) , Success(success) + , Operation(operation) + , ErrorReason(errorReason) {} }; @@ -120,7 +130,7 @@ namespace NKikimr::NBsController { bool diskIsOk = false; if (record.GetStatus() == NKikimrProto::RACE) { - return Finish(false); // group reconfigured while we were querying it + return Finish(false, "Race occured"); // group reconfigured while we were querying it } else if (record.GetStatus() == NKikimrProto::OK) { diskIsOk = record.GetJoinedGroup() && record.GetReplicated(); } @@ -150,7 +160,7 @@ namespace NKikimr::NBsController { auto& checker = Topology.GetQuorumChecker(); if (!checker.CheckFailModelForGroup(*FailedGroupDisks)) { STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH06, "Reassigner ProcessResult quorum checker failed", (GroupId, GroupId)); - return Finish(false); // this change will render group unusable + return Finish(false, "Reassigner ProcessResult quorum checker failed"); // this change will render group unusable } auto ev = MakeHolder<TEvBlobStorage::TEvControllerConfigRequest>(); @@ -179,6 +189,7 @@ namespace NKikimr::NBsController { if (!record.GetResponse().GetSuccess()) { STLOG(PRI_WARN, BS_SELFHEAL, BSSH07, "Reassigner ReassignGroupDisk request failed", (GroupId, GroupId), (VDiskToReplace, VDiskToReplace), (Response, record)); + Finish(false, record.GetResponse().GetErrorDescription()); } else { TString items = "none"; for (const auto& item : record.GetResponse().GetStatus(0).GetReassignedItem()) { @@ -186,20 +197,21 @@ namespace NKikimr::NBsController { << TVSlotId(item.GetFrom()) << " -> " << TVSlotId(item.GetTo()); } STLOG(PRI_INFO, BS_SELFHEAL, BSSH09, "Reassigner succeeded", (GroupId, GroupId), (Items, items)); + Finish(true); } - Finish(record.GetResponse().GetSuccess()); } - void Finish(bool success) { + void Finish(bool success, TString errorReason = "") { STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH08, "Reassigner finished", (GroupId, GroupId), (Success, success)); - Send(SelfHealId, new TEvReassignerDone(GroupId, success)); + auto operation = VDiskToReplace ? EGroupRepairOperation::SelfHeal : EGroupRepairOperation::GroupLayoutSanitizer; + Send(SelfHealId, new TEvReassignerDone(GroupId, success, operation, errorReason)); PassAway(); } void HandleWakeup() { // actually it is watchdog timer for VDisk status query if (PendingVDisks) { - Finish(false); + Finish(false, "VDisk status query timer expired"); } } @@ -256,6 +268,9 @@ namespace NKikimr::NBsController { static constexpr TDuration SelfHealWakeupPeriod = TDuration::Seconds(10); + static constexpr uint32_t GroupLayoutSanitizerOperationLogSize = 128; + TOperationLog<GroupLayoutSanitizerOperationLogSize> GroupLayoutSanitizerOperationLog; + public: TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups, THostRecordMap hostRecords) : TabletId(tabletId) @@ -361,6 +376,8 @@ namespace NKikimr::NBsController { if (group.ReassignerActorId || now < group.NextRetryTimestamp) { // nothing to do } else { + ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG(GroupLayoutSanitizerOperationLog, + "Start sanitizing GroupId# " << group.GroupId); group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, std::nullopt)); } } @@ -451,9 +468,17 @@ namespace NKikimr::NBsController { if (ev->Get()->Success) { group.NextRetryTimestamp = now; group.RetryTimeout = MinRetryTimeout; + if (ev->Get()->Operation == EGroupRepairOperation::GroupLayoutSanitizer) { + ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG(GroupLayoutSanitizerOperationLog, + "Sanitizing succeeded GroupId# " << group.GroupId); + } } else { group.NextRetryTimestamp = now + group.RetryTimeout; group.RetryTimeout = std::min(MaxRetryTimeout, group.RetryTimeout * 3 / 2); + if (ev->Get()->Operation == EGroupRepairOperation::GroupLayoutSanitizer) { + ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG(GroupLayoutSanitizerOperationLog, + "Sanitizing failed GroupId# " << group.GroupId << " ErrorReason# " << ev->Get()->ErrorReason); + } } CheckGroups(); @@ -621,6 +646,39 @@ namespace NKikimr::NBsController { } DIV_CLASS("panel-body") { out << "Status: " << (GroupLayoutSanitizerEnabled ? "enabled" : "disabled"); + + out << "<br/>"; + + out << "<button type='button' class='btn btn-default' data-toggle='collapse' style='margin:5px' \ + data-target='#operationLogCollapse'>Operation Log</button>"; + out << "<div id='operationLogCollapse' class='collapse'>"; + TABLE_CLASS("table") { + TABLEHEAD() { + TABLER() { + TABLEH() { out << "Index"; } + TABLEH() { out << "Record"; } + } + } + + ui32 logSize = GroupLayoutSanitizerOperationLog.Size(); + TABLEBODY() { + for (ui32 i = 0; i < logSize; ++i) { + TABLER() { + TABLED() { + out << i; + } + TABLED() { + auto record = GroupLayoutSanitizerOperationLog.BorrowByIdx(i); + if (record) { + out << *record; + GroupLayoutSanitizerOperationLog.ReturnBorrowedRecord(record); + } + } + } + } + } + } + out << "<div/>"; } } |