aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorserg-belyakov <serg-belyakov@yandex-team.com>2023-05-15 18:04:22 +0300
committerserg-belyakov <serg-belyakov@yandex-team.com>2023-05-15 18:04:22 +0300
commit515a7f2286e7559ea89864f64a1ed6f50a552d0b (patch)
treefa3b108796d0625b928b372eead67edc80986cdd
parentb8c3fc044a5af9565cb6135e80a103bf3717ce63 (diff)
downloadydb-515a7f2286e7559ea89864f64a1ed6f50a552d0b.tar.gz
Add diagnostic information to GroupLayoutSanitizer,
Add error reason to reassigner actor Add operation log to html page Add TOperationLog for GroupLayoutSanitizer to SelfHeal actor
-rw-r--r--ydb/core/mind/bscontroller/self_heal.cpp72
1 files changed, 65 insertions, 7 deletions
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index 095efb5b5fa..dd2a3aa6c29 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -6,7 +6,13 @@
#include "group_layout_checker.h"
#include "layout_helpers.h"
+#include <ydb/core/debug_tools/operation_log.h>
+
namespace NKikimr::NBsController {
+ enum class EGroupRepairOperation {
+ SelfHeal = 0,
+ GroupLayoutSanitizer,
+ };
enum {
EvReassignerDone = EventSpaceBegin(TKikimrEvents::ES_PRIVATE),
@@ -15,10 +21,14 @@ namespace NKikimr::NBsController {
struct TEvReassignerDone : TEventLocal<TEvReassignerDone, EvReassignerDone> {
TGroupId GroupId;
bool Success;
+ EGroupRepairOperation Operation;
+ TString ErrorReason;
- TEvReassignerDone(TGroupId groupId, bool success)
+ TEvReassignerDone(TGroupId groupId, bool success, EGroupRepairOperation operation, TString errorReason = "")
: GroupId(groupId)
, Success(success)
+ , Operation(operation)
+ , ErrorReason(errorReason)
{}
};
@@ -120,7 +130,7 @@ namespace NKikimr::NBsController {
bool diskIsOk = false;
if (record.GetStatus() == NKikimrProto::RACE) {
- return Finish(false); // group reconfigured while we were querying it
+ return Finish(false, "Race occured"); // group reconfigured while we were querying it
} else if (record.GetStatus() == NKikimrProto::OK) {
diskIsOk = record.GetJoinedGroup() && record.GetReplicated();
}
@@ -150,7 +160,7 @@ namespace NKikimr::NBsController {
auto& checker = Topology.GetQuorumChecker();
if (!checker.CheckFailModelForGroup(*FailedGroupDisks)) {
STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH06, "Reassigner ProcessResult quorum checker failed", (GroupId, GroupId));
- return Finish(false); // this change will render group unusable
+ return Finish(false, "Reassigner ProcessResult quorum checker failed"); // this change will render group unusable
}
auto ev = MakeHolder<TEvBlobStorage::TEvControllerConfigRequest>();
@@ -179,6 +189,7 @@ namespace NKikimr::NBsController {
if (!record.GetResponse().GetSuccess()) {
STLOG(PRI_WARN, BS_SELFHEAL, BSSH07, "Reassigner ReassignGroupDisk request failed", (GroupId, GroupId),
(VDiskToReplace, VDiskToReplace), (Response, record));
+ Finish(false, record.GetResponse().GetErrorDescription());
} else {
TString items = "none";
for (const auto& item : record.GetResponse().GetStatus(0).GetReassignedItem()) {
@@ -186,20 +197,21 @@ namespace NKikimr::NBsController {
<< TVSlotId(item.GetFrom()) << " -> " << TVSlotId(item.GetTo());
}
STLOG(PRI_INFO, BS_SELFHEAL, BSSH09, "Reassigner succeeded", (GroupId, GroupId), (Items, items));
+ Finish(true);
}
- Finish(record.GetResponse().GetSuccess());
}
- void Finish(bool success) {
+ void Finish(bool success, TString errorReason = "") {
STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH08, "Reassigner finished", (GroupId, GroupId), (Success, success));
- Send(SelfHealId, new TEvReassignerDone(GroupId, success));
+ auto operation = VDiskToReplace ? EGroupRepairOperation::SelfHeal : EGroupRepairOperation::GroupLayoutSanitizer;
+ Send(SelfHealId, new TEvReassignerDone(GroupId, success, operation, errorReason));
PassAway();
}
void HandleWakeup() {
// actually it is watchdog timer for VDisk status query
if (PendingVDisks) {
- Finish(false);
+ Finish(false, "VDisk status query timer expired");
}
}
@@ -256,6 +268,9 @@ namespace NKikimr::NBsController {
static constexpr TDuration SelfHealWakeupPeriod = TDuration::Seconds(10);
+ static constexpr uint32_t GroupLayoutSanitizerOperationLogSize = 128;
+ TOperationLog<GroupLayoutSanitizerOperationLogSize> GroupLayoutSanitizerOperationLog;
+
public:
TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups, THostRecordMap hostRecords)
: TabletId(tabletId)
@@ -361,6 +376,8 @@ namespace NKikimr::NBsController {
if (group.ReassignerActorId || now < group.NextRetryTimestamp) {
// nothing to do
} else {
+ ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG(GroupLayoutSanitizerOperationLog,
+ "Start sanitizing GroupId# " << group.GroupId);
group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, std::nullopt));
}
}
@@ -451,9 +468,17 @@ namespace NKikimr::NBsController {
if (ev->Get()->Success) {
group.NextRetryTimestamp = now;
group.RetryTimeout = MinRetryTimeout;
+ if (ev->Get()->Operation == EGroupRepairOperation::GroupLayoutSanitizer) {
+ ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG(GroupLayoutSanitizerOperationLog,
+ "Sanitizing succeeded GroupId# " << group.GroupId);
+ }
} else {
group.NextRetryTimestamp = now + group.RetryTimeout;
group.RetryTimeout = std::min(MaxRetryTimeout, group.RetryTimeout * 3 / 2);
+ if (ev->Get()->Operation == EGroupRepairOperation::GroupLayoutSanitizer) {
+ ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG(GroupLayoutSanitizerOperationLog,
+ "Sanitizing failed GroupId# " << group.GroupId << " ErrorReason# " << ev->Get()->ErrorReason);
+ }
}
CheckGroups();
@@ -621,6 +646,39 @@ namespace NKikimr::NBsController {
}
DIV_CLASS("panel-body") {
out << "Status: " << (GroupLayoutSanitizerEnabled ? "enabled" : "disabled");
+
+ out << "<br/>";
+
+ out << "<button type='button' class='btn btn-default' data-toggle='collapse' style='margin:5px' \
+ data-target='#operationLogCollapse'>Operation Log</button>";
+ out << "<div id='operationLogCollapse' class='collapse'>";
+ TABLE_CLASS("table") {
+ TABLEHEAD() {
+ TABLER() {
+ TABLEH() { out << "Index"; }
+ TABLEH() { out << "Record"; }
+ }
+ }
+
+ ui32 logSize = GroupLayoutSanitizerOperationLog.Size();
+ TABLEBODY() {
+ for (ui32 i = 0; i < logSize; ++i) {
+ TABLER() {
+ TABLED() {
+ out << i;
+ }
+ TABLED() {
+ auto record = GroupLayoutSanitizerOperationLog.BorrowByIdx(i);
+ if (record) {
+ out << *record;
+ GroupLayoutSanitizerOperationLog.ReturnBorrowedRecord(record);
+ }
+ }
+ }
+ }
+ }
+ }
+ out << "<div/>";
}
}