about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: andrew-rykov <arykov@ydb.tech> 2023-10-04 23:20:41 +0300
committer: andrew-rykov <arykov@ydb.tech> 2023-10-04 23:51:02 +0300
commit: e1f68d95129f42a52fadccbf1f72ae33767c5f9e (patch)
tree: 619837ffa019de52033e6da966575bb3bd078e43
parent: 506c0937db5b51d261b135b42d0b4bd7459a7dec (diff)
download: ydb-e1f68d95129f42a52fadccbf1f72ae33767c5f9e.tar.gz
KIKIMR-14204 healthcheck report groups status using bsc info
-rw-r--r-- ydb/core/health_check/health_check.cpp 76
-rw-r--r-- ydb/core/health_check/health_check_ut.cpp 128
2 files changed, 44 insertions, 160 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp
index 9a0e0925d2..524c33c833 100644
--- a/ydb/core/health_check/health_check.cpp
+++ b/ydb/core/health_check/health_check.cpp
@@ -1589,17 +1589,6 @@ public:
static const inline TString MIRROR_3_DC = "mirror-3-dc";
static const int MERGING_IGNORE_SIZE = 4;
- static void IncrementFor(TStackVec<std::pair<ui32, int>>& realms, ui32 realm) {
- auto itRealm = FindIf(realms, [realm](const std::pair<ui32, int>& p) -> bool {
- return p.first == realm;
- });
- if (itRealm == realms.end()) {
- itRealm = realms.insert(realms.end(), { realm, 1 });
- } else {
- itRealm->second++;
- }
- }
-
struct TMergeIssuesContext {
std::unordered_map<ETags, TList<TSelfCheckContext::TIssueRecord>> recordsMap;
std::unordered_set<TString> removeIssuesIds;
@@ -1866,66 +1855,43 @@ public:
}
auto& groupInfo = *itGroup->second;
- int disksColors[Ydb::Monitoring::StatusFlag::Status_ARRAYSIZE] = {};
- TStackVec<std::pair<ui32, int>> failedRealms;
- int failedDisks = 0;
+ bool onlyGoodDisks = true;
for (const auto& vSlotIdProto : groupInfo.vslotid()) {
TString vDiskId = GetVSlotId(vSlotIdProto);
Ydb::Monitoring::StorageVDiskStatus& vDiskStatus = *storageGroupStatus.add_vdisks();
FillVDiskStatus(vDiskId, vDiskStatus, {&context, "VDISK"});
- ++disksColors[vDiskStatus.overall()];
- switch (vDiskStatus.overall()) {
- case Ydb::Monitoring::StatusFlag::BLUE: // disk is good, but not available
- case Ydb::Monitoring::StatusFlag::RED: // disk is bad, probably not available
- case Ydb::Monitoring::StatusFlag::GREY: { // the status is absent, the disk is not available
- auto itVDisk = BSConfigVSlots.find(vDiskId);
- if (itVDisk != BSConfigVSlots.end()) {
- IncrementFor(failedRealms, itVDisk->second->failrealmidx());
- ++failedDisks;
- }
- break;
- }
- default:
- break;
- }
+ onlyGoodDisks &= vDiskStatus.overall() != Ydb::Monitoring::StatusFlag::RED
+ && vDiskStatus.overall() != Ydb::Monitoring::StatusFlag::GREY;
}
context.Location.mutable_storage()->clear_node(); // group doesn't have node
context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::YELLOW);
- if (groupInfo.erasurespecies() == NONE) {
- if (failedDisks > 0) {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState});
- } else if (disksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
+ switch (groupInfo.operatingstatus()) {
+ case NKikimrBlobStorage::TGroupStatus::FULL: { // all VDisks of the group are READY for specific period of time
+ context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
+ break;
}
- } else if (groupInfo.erasurespecies() == BLOCK_4_2) {
- if (failedDisks > 2) {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState});
- } else if (failedDisks > 1) {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Group has no redundancy", ETags::GroupState, {ETags::VDiskState});
- } else if (failedDisks > 0) {
- if (disksColors[Ydb::Monitoring::StatusFlag::BLUE] == failedDisks) {
+ case NKikimrBlobStorage::TGroupStatus::PARTIAL: { // some of VDisks are operational, but group is not yet DEGRADED
+ if ((groupInfo.erasurespecies() == BLOCK_4_2 || groupInfo.erasurespecies() == MIRROR_3_DC) && onlyGoodDisks) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, "Group degraded", ETags::GroupState, {ETags::VDiskState});
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
}
- } else if (disksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
+ break;
}
- } else if (groupInfo.erasurespecies() == MIRROR_3_DC) {
- if (failedRealms.size() > 2 || (failedRealms.size() == 2 && failedRealms[0].second > 1 && failedRealms[1].second > 1)) {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState});
- } else if (failedRealms.size() == 2) {
+ case NKikimrBlobStorage::TGroupStatus::DEGRADED: { // group is DEGRADED -- one random failure may lead to group loss (but may not lead too)
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Group has no redundancy", ETags::GroupState, {ETags::VDiskState});
- } else if (failedDisks > 0) {
- if (disksColors[Ydb::Monitoring::StatusFlag::BLUE] == failedDisks) {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, "Group degraded", ETags::GroupState, {ETags::VDiskState});
- } else {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
- }
- } else if (disksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) {
- context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState});
+ break;
+ }
+ case NKikimrBlobStorage::TGroupStatus::DISINTEGRATED: { // group is not available for operation
+ context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState});
+ break;
+ }
+ case NKikimrBlobStorage::TGroupStatus::UNKNOWN: { // default value, can't happen
+ default:
+ context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "System tablet BSC provided unexpected group status", ETags::GroupState, {ETags::VDiskState});
+ break;
}
}
diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp
index 5aecd442a9..96a53a13b3 100644
--- a/ydb/core/health_check/health_check_ut.cpp
+++ b/ydb/core/health_check/health_check_ut.cpp
@@ -79,7 +79,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
};
- void AddGroupVSlotInControllerConfigResponse(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, int groupCount, int vslotCount, TString erasurespecies = NHealthCheck::TSelfCheckRequest::BLOCK_4_2) {
+ void AddGroupVSlotInControllerConfigResponse(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, const int groupCount, const int vslotCount) {
auto& pbRecord = (*ev)->Get()->Record;
auto pbConfig = pbRecord.mutable_response()->mutable_status(0)->mutable_baseconfig();
@@ -98,7 +98,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
auto group = pbConfig->add_group();
group->CopyFrom(groupSample);
group->set_groupid(groupId);
- group->set_erasurespecies(erasurespecies);
+ group->set_erasurespecies(NHealthCheck::TSelfCheckRequest::BLOCK_4_2);
+ group->set_operatingstatus(NKikimrBlobStorage::TGroupStatus::DISINTEGRATED);
group->clear_vslotid();
auto vslotId = VCARD_START_ID;
@@ -121,7 +122,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
};
- void AddGroupVSlotInControllerConfigResponse(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, const TVector<Ydb::Monitoring::StatusFlag::Status>& vdiskStatuses, TString erasurespecies = NHealthCheck::TSelfCheckRequest::BLOCK_4_2, bool oneFailRealmIdx = false) {
+ void AddGroupVSlotInControllerConfigResponse(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses) {
auto& pbRecord = (*ev)->Get()->Record;
auto pbConfig = pbRecord.mutable_response()->mutable_status(0)->mutable_baseconfig();
@@ -139,7 +140,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
auto group = pbConfig->add_group();
group->CopyFrom(groupSample);
group->set_groupid(groupId);
- group->set_erasurespecies(erasurespecies);
+ group->set_operatingstatus(groupStatus);
+ group->set_erasurespecies(NHealthCheck::TSelfCheckRequest::BLOCK_4_2);
group->clear_vslotid();
auto vslotId = VCARD_START_ID;
@@ -149,29 +151,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
vslot->CopyFrom(vslotSample);
vslot->set_vdiskidx(vslotId);
vslot->set_groupid(groupId);
- vslot->set_failrealmidx(oneFailRealmIdx ? VCARD_START_ID : vslotId);
+ vslot->set_failrealmidx(vslotId);
vslot->mutable_vslotid()->set_vslotid(vslotId);
auto slotId = group->add_vslotid();
slotId->CopyFrom(vslotIdSample);
slotId->set_vslotid(vslotId);
- switch (status) {
- case Ydb::Monitoring::StatusFlag::GREEN:
- vslot->set_status("READY");
- break;
- case Ydb::Monitoring::StatusFlag::YELLOW:
- vslot->set_status("INIT_PENDING");
- break;
- case Ydb::Monitoring::StatusFlag::BLUE:
- vslot->set_status("REPLICATING");
- break;
- case Ydb::Monitoring::StatusFlag::RED:
- vslot->set_status("ERROR");
- break;
- default:
- break;
- }
+ const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor();
+ vslot->set_status(descriptor->FindValueByNumber(status)->name());
vslotId++;
}
@@ -293,7 +281,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
}
- Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(TString erasurespecies, const TVector<Ydb::Monitoring::StatusFlag::Status>& vdiskStatuses, bool oneFailRealmIdx = false) {
+ Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
@@ -323,7 +311,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
case TEvBlobStorage::EvControllerConfigResponse: {
auto *x = reinterpret_cast<TEvBlobStorage::TEvControllerConfigResponse::TPtr*>(&ev);
- AddGroupVSlotInControllerConfigResponse(x, vdiskStatuses, erasurespecies, oneFailRealmIdx);
+ AddGroupVSlotInControllerConfigResponse(x, groupStatus, vdiskStatuses);
break;
}
}
@@ -431,101 +419,31 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
ListingTest(100, 100, true);
}
- Y_UNIT_TEST(NoneRedGroupWhenRedVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::RED});
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
- }
-
- Y_UNIT_TEST(NoneRedGroupWhenBlueVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::BLUE});
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
+ Y_UNIT_TEST(YellowroupIssueWhenPartialGroupStatus) {
+ auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, {NKikimrBlobStorage::ERROR});
+ CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
}
- Y_UNIT_TEST(NoneYellowGroupWhenYellowVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::YELLOW});
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
+ Y_UNIT_TEST(BlueGroupIssueWhenPartialGroupStatusAndReplicationDisks) {
+ auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, {NKikimrBlobStorage::REPLICATING});
+ CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::BLUE, 1);
}
- Y_UNIT_TEST(Block42OrangeGroupWhen100YellowAnd2RedVdisks) {
- TVector<Ydb::Monitoring::StatusFlag::Status> vdiskStatuses(100, Ydb::Monitoring::StatusFlag::YELLOW);
- vdiskStatuses.emplace_back(Ydb::Monitoring::StatusFlag::RED);
- vdiskStatuses.emplace_back(Ydb::Monitoring::StatusFlag::RED);
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, vdiskStatuses);
-
+ Y_UNIT_TEST(OrangeGroupIssueWhenDegradedGroupStatus) {
+ auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::DEGRADED, {});
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 1);
- CheckHcResultHasIssuesWithStatus(result, "VDISK", Ydb::Monitoring::StatusFlag::RED, 1);
}
- Y_UNIT_TEST(Block42RedGroupWhen3RedVdisks) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED});
+ Y_UNIT_TEST(RedGroupIssueWhenDisintegratedGroupStatus) {
+ auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::DISINTEGRATED, {});
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
}
- Y_UNIT_TEST(Block42RedGroupWhen2RedBlueVdisks) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::BLUE});
+ Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
+ auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
}
- Y_UNIT_TEST(Block42OrangeGroupWhen2RedVdisks) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED});
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 1);
- }
-
- Y_UNIT_TEST(Block42OrangeGroupWhenRedBlueVdisks) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::BLUE});
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 1);
- }
-
- Y_UNIT_TEST(Block42YellowGroupWhenRedVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED});
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
- }
-
- Y_UNIT_TEST(Block42BlueGroupWhenBlueVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::BLUE});
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::BLUE, 1);
- }
-
- Y_UNIT_TEST(Block42YellowGroupWhenYellowVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::YELLOW});
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
- }
-
- Y_UNIT_TEST(Mirrot3dcYellowGroupWhen3RedVdisks) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED}, true);
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
- }
-
- Y_UNIT_TEST(Mirrot3dcYellowGroupWhen2RedBlueVdisks) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::BLUE}, true);
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
- }
-
- Y_UNIT_TEST(Mirrot3dcYellowGroupWhen2RedVdisks) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED}, true);
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
- }
-
- Y_UNIT_TEST(Mirrot3dcYellowGroupWhenRedBlueVdisks) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::BLUE}, true);
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
- }
-
- Y_UNIT_TEST(Mirrot3dcYellowGroupWhenRedVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED}, true);
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
- }
-
- Y_UNIT_TEST(Mirrot3dcBlueGroupWhenBlueVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::BLUE}, true);
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::BLUE, 1);
- }
-
- Y_UNIT_TEST(Mirrot3dcYellowGroupWhenYellowVdisk) {
- auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::YELLOW}, true);
- CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
- }
-
Y_UNIT_TEST(StorageLimit95) {
StorageTest(95, 100, 1, Ydb::Monitoring::StatusFlag::RED);
}