diff options
author | andrew-rykov <arykov@ydb.tech> | 2023-10-04 23:20:41 +0300 |
---|---|---|
committer | andrew-rykov <arykov@ydb.tech> | 2023-10-04 23:51:02 +0300 |
commit | e1f68d95129f42a52fadccbf1f72ae33767c5f9e (patch) | |
tree | 619837ffa019de52033e6da966575bb3bd078e43 | |
parent | 506c0937db5b51d261b135b42d0b4bd7459a7dec (diff) | |
download | ydb-e1f68d95129f42a52fadccbf1f72ae33767c5f9e.tar.gz |
KIKIMR-14204 healthcheck: report group status using BSC info
-rw-r--r-- | ydb/core/health_check/health_check.cpp | 76 | ||||
-rw-r--r-- | ydb/core/health_check/health_check_ut.cpp | 128 |
2 files changed, 44 insertions, 160 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 9a0e0925d2..524c33c833 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -1589,17 +1589,6 @@ public: static const inline TString MIRROR_3_DC = "mirror-3-dc"; static const int MERGING_IGNORE_SIZE = 4; - static void IncrementFor(TStackVec<std::pair<ui32, int>>& realms, ui32 realm) { - auto itRealm = FindIf(realms, [realm](const std::pair<ui32, int>& p) -> bool { - return p.first == realm; - }); - if (itRealm == realms.end()) { - itRealm = realms.insert(realms.end(), { realm, 1 }); - } else { - itRealm->second++; - } - } - struct TMergeIssuesContext { std::unordered_map<ETags, TList<TSelfCheckContext::TIssueRecord>> recordsMap; std::unordered_set<TString> removeIssuesIds; @@ -1866,66 +1855,43 @@ public: } auto& groupInfo = *itGroup->second; - int disksColors[Ydb::Monitoring::StatusFlag::Status_ARRAYSIZE] = {}; - TStackVec<std::pair<ui32, int>> failedRealms; - int failedDisks = 0; + bool onlyGoodDisks = true; for (const auto& vSlotIdProto : groupInfo.vslotid()) { TString vDiskId = GetVSlotId(vSlotIdProto); Ydb::Monitoring::StorageVDiskStatus& vDiskStatus = *storageGroupStatus.add_vdisks(); FillVDiskStatus(vDiskId, vDiskStatus, {&context, "VDISK"}); - ++disksColors[vDiskStatus.overall()]; - switch (vDiskStatus.overall()) { - case Ydb::Monitoring::StatusFlag::BLUE: // disk is good, but not available - case Ydb::Monitoring::StatusFlag::RED: // disk is bad, probably not available - case Ydb::Monitoring::StatusFlag::GREY: { // the status is absent, the disk is not available - auto itVDisk = BSConfigVSlots.find(vDiskId); - if (itVDisk != BSConfigVSlots.end()) { - IncrementFor(failedRealms, itVDisk->second->failrealmidx()); - ++failedDisks; - } - break; - } - default: - break; - } + onlyGoodDisks &= vDiskStatus.overall() != Ydb::Monitoring::StatusFlag::RED + && vDiskStatus.overall() != Ydb::Monitoring::StatusFlag::GREY; } 
context.Location.mutable_storage()->clear_node(); // group doesn't have node context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::YELLOW); - if (groupInfo.erasurespecies() == NONE) { - if (failedDisks > 0) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState}); - } else if (disksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); + switch (groupInfo.operatingstatus()) { + case NKikimrBlobStorage::TGroupStatus::FULL: { // all VDisks of the group are READY for specific period of time + context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); + break; } - } else if (groupInfo.erasurespecies() == BLOCK_4_2) { - if (failedDisks > 2) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState}); - } else if (failedDisks > 1) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Group has no redundancy", ETags::GroupState, {ETags::VDiskState}); - } else if (failedDisks > 0) { - if (disksColors[Ydb::Monitoring::StatusFlag::BLUE] == failedDisks) { + case NKikimrBlobStorage::TGroupStatus::PARTIAL: { // some of VDisks are operational, but group is not yet DEGRADED + if ((groupInfo.erasurespecies() == BLOCK_4_2 || groupInfo.erasurespecies() == MIRROR_3_DC) && onlyGoodDisks) { context.ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, "Group degraded", ETags::GroupState, {ETags::VDiskState}); } else { context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); } - } else if (disksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); + break; } - } else if (groupInfo.erasurespecies() == MIRROR_3_DC) { - if (failedRealms.size() > 2 || 
(failedRealms.size() == 2 && failedRealms[0].second > 1 && failedRealms[1].second > 1)) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState}); - } else if (failedRealms.size() == 2) { + case NKikimrBlobStorage::TGroupStatus::DEGRADED: { // group is DEGRADED -- one random failure may lead to group loss (but may not lead too) context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Group has no redundancy", ETags::GroupState, {ETags::VDiskState}); - } else if (failedDisks > 0) { - if (disksColors[Ydb::Monitoring::StatusFlag::BLUE] == failedDisks) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::BLUE, "Group degraded", ETags::GroupState, {ETags::VDiskState}); - } else { - context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); - } - } else if (disksColors[Ydb::Monitoring::StatusFlag::YELLOW] > 0) { - context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Group degraded", ETags::GroupState, {ETags::VDiskState}); + break; + } + case NKikimrBlobStorage::TGroupStatus::DISINTEGRATED: { // group is not available for operation + context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Group failed", ETags::GroupState, {ETags::VDiskState}); + break; + } + case NKikimrBlobStorage::TGroupStatus::UNKNOWN: { // default value, can't happen + default: + context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "System tablet BSC provided unexpected group status", ETags::GroupState, {ETags::VDiskState}); + break; } } diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index 5aecd442a9..96a53a13b3 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -79,7 +79,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } }; - void AddGroupVSlotInControllerConfigResponse(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, int groupCount, int vslotCount, TString 
erasurespecies = NHealthCheck::TSelfCheckRequest::BLOCK_4_2) { + void AddGroupVSlotInControllerConfigResponse(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, const int groupCount, const int vslotCount) { auto& pbRecord = (*ev)->Get()->Record; auto pbConfig = pbRecord.mutable_response()->mutable_status(0)->mutable_baseconfig(); @@ -98,7 +98,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { auto group = pbConfig->add_group(); group->CopyFrom(groupSample); group->set_groupid(groupId); - group->set_erasurespecies(erasurespecies); + group->set_erasurespecies(NHealthCheck::TSelfCheckRequest::BLOCK_4_2); + group->set_operatingstatus(NKikimrBlobStorage::TGroupStatus::DISINTEGRATED); group->clear_vslotid(); auto vslotId = VCARD_START_ID; @@ -121,7 +122,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } }; - void AddGroupVSlotInControllerConfigResponse(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, const TVector<Ydb::Monitoring::StatusFlag::Status>& vdiskStatuses, TString erasurespecies = NHealthCheck::TSelfCheckRequest::BLOCK_4_2, bool oneFailRealmIdx = false) { + void AddGroupVSlotInControllerConfigResponse(TEvBlobStorage::TEvControllerConfigResponse::TPtr* ev, const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses) { auto& pbRecord = (*ev)->Get()->Record; auto pbConfig = pbRecord.mutable_response()->mutable_status(0)->mutable_baseconfig(); @@ -139,7 +140,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { auto group = pbConfig->add_group(); group->CopyFrom(groupSample); group->set_groupid(groupId); - group->set_erasurespecies(erasurespecies); + group->set_operatingstatus(groupStatus); + group->set_erasurespecies(NHealthCheck::TSelfCheckRequest::BLOCK_4_2); group->clear_vslotid(); auto vslotId = VCARD_START_ID; @@ -149,29 +151,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { vslot->CopyFrom(vslotSample); vslot->set_vdiskidx(vslotId); vslot->set_groupid(groupId); - vslot->set_failrealmidx(oneFailRealmIdx ? 
VCARD_START_ID : vslotId); + vslot->set_failrealmidx(vslotId); vslot->mutable_vslotid()->set_vslotid(vslotId); auto slotId = group->add_vslotid(); slotId->CopyFrom(vslotIdSample); slotId->set_vslotid(vslotId); - switch (status) { - case Ydb::Monitoring::StatusFlag::GREEN: - vslot->set_status("READY"); - break; - case Ydb::Monitoring::StatusFlag::YELLOW: - vslot->set_status("INIT_PENDING"); - break; - case Ydb::Monitoring::StatusFlag::BLUE: - vslot->set_status("REPLICATING"); - break; - case Ydb::Monitoring::StatusFlag::RED: - vslot->set_status("ERROR"); - break; - default: - break; - } + const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor(); + vslot->set_status(descriptor->FindValueByNumber(status)->name()); vslotId++; } @@ -293,7 +281,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords); } - Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(TString erasurespecies, const TVector<Ydb::Monitoring::StatusFlag::Status>& vdiskStatuses, bool oneFailRealmIdx = false) { + Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses) { TPortManager tp; ui16 port = tp.GetPort(2134); ui16 grpcPort = tp.GetPort(2135); @@ -323,7 +311,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } case TEvBlobStorage::EvControllerConfigResponse: { auto *x = reinterpret_cast<TEvBlobStorage::TEvControllerConfigResponse::TPtr*>(&ev); - AddGroupVSlotInControllerConfigResponse(x, vdiskStatuses, erasurespecies, oneFailRealmIdx); + AddGroupVSlotInControllerConfigResponse(x, groupStatus, vdiskStatuses); break; } } @@ -431,101 +419,31 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { ListingTest(100, 100, true); } - Y_UNIT_TEST(NoneRedGroupWhenRedVdisk) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::RED}); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", 
Ydb::Monitoring::StatusFlag::RED, 1); - } - - Y_UNIT_TEST(NoneRedGroupWhenBlueVdisk) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::BLUE}); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1); + Y_UNIT_TEST(YellowroupIssueWhenPartialGroupStatus) { + auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, {NKikimrBlobStorage::ERROR}); + CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); } - Y_UNIT_TEST(NoneYellowGroupWhenYellowVdisk) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::YELLOW}); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); + Y_UNIT_TEST(BlueGroupIssueWhenPartialGroupStatusAndReplicationDisks) { + auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, {NKikimrBlobStorage::REPLICATING}); + CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::BLUE, 1); } - Y_UNIT_TEST(Block42OrangeGroupWhen100YellowAnd2RedVdisks) { - TVector<Ydb::Monitoring::StatusFlag::Status> vdiskStatuses(100, Ydb::Monitoring::StatusFlag::YELLOW); - vdiskStatuses.emplace_back(Ydb::Monitoring::StatusFlag::RED); - vdiskStatuses.emplace_back(Ydb::Monitoring::StatusFlag::RED); - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, vdiskStatuses); - + Y_UNIT_TEST(OrangeGroupIssueWhenDegradedGroupStatus) { + auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::DEGRADED, {}); CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 1); - CheckHcResultHasIssuesWithStatus(result, "VDISK", Ydb::Monitoring::StatusFlag::RED, 1); } - Y_UNIT_TEST(Block42RedGroupWhen3RedVdisks) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, 
{Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED}); + Y_UNIT_TEST(RedGroupIssueWhenDisintegratedGroupStatus) { + auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::DISINTEGRATED, {}); CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1); } - Y_UNIT_TEST(Block42RedGroupWhen2RedBlueVdisks) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::BLUE}); + Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) { + auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {}); CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1); } - Y_UNIT_TEST(Block42OrangeGroupWhen2RedVdisks) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED}); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 1); - } - - Y_UNIT_TEST(Block42OrangeGroupWhenRedBlueVdisks) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::BLUE}); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 1); - } - - Y_UNIT_TEST(Block42YellowGroupWhenRedVdisk) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::RED}); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); - } - - Y_UNIT_TEST(Block42BlueGroupWhenBlueVdisk) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::BLUE}); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::BLUE, 1); - } - - 
Y_UNIT_TEST(Block42YellowGroupWhenYellowVdisk) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::BLOCK_4_2, {Ydb::Monitoring::StatusFlag::YELLOW}); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); - } - - Y_UNIT_TEST(Mirrot3dcYellowGroupWhen3RedVdisks) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED}, true); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); - } - - Y_UNIT_TEST(Mirrot3dcYellowGroupWhen2RedBlueVdisks) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::BLUE}, true); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); - } - - Y_UNIT_TEST(Mirrot3dcYellowGroupWhen2RedVdisks) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::RED}, true); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); - } - - Y_UNIT_TEST(Mirrot3dcYellowGroupWhenRedBlueVdisks) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED, Ydb::Monitoring::StatusFlag::BLUE}, true); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); - } - - Y_UNIT_TEST(Mirrot3dcYellowGroupWhenRedVdisk) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::RED}, true); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); - } - - Y_UNIT_TEST(Mirrot3dcBlueGroupWhenBlueVdisk) { - auto result = 
RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::BLUE}, true); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::BLUE, 1); - } - - Y_UNIT_TEST(Mirrot3dcYellowGroupWhenYellowVdisk) { - auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::MIRROR_3_DC, {Ydb::Monitoring::StatusFlag::YELLOW}, true); - CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1); - } - Y_UNIT_TEST(StorageLimit95) { StorageTest(95, 100, 1, Ydb::Monitoring::StatusFlag::RED); } |