diff options
author | andrew-rykov <arykov@ydb.tech> | 2023-09-15 14:35:23 +0300 |
---|---|---|
committer | andrew-rykov <arykov@ydb.tech> | 2023-09-15 14:58:12 +0300 |
commit | a11ce9b2b81c12f969dc88aff250f64d412e547a (patch) | |
tree | ddbdc647566e15fe25e6296350c6f4f793073ca6 | |
parent | d8446ab9751ae8ba96fec17140cf3e07f0fcf1ab (diff) | |
download | ydb-a11ce9b2b81c12f969dc88aff250f64d412e547a.tar.gz |
KIKIMR-19304 add sorting before truncation
-rw-r--r-- | ydb/core/health_check/health_check.cpp | 129 | ||||
-rw-r--r-- | ydb/core/health_check/health_check_ut.cpp | 34 | ||||
-rw-r--r-- | ydb/core/viewer/json_healthcheck.h | 2 | ||||
-rw-r--r-- | ydb/public/api/protos/ydb_monitoring.proto | 1 |
4 files changed, 49 insertions, 117 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index af88ede2a25..f27fe91babc 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -515,7 +515,6 @@ public: TTabletRequestsState TabletRequests; TDuration Timeout = TDuration::MilliSeconds(10000); - ui32 ChildrenRecordsLimit = 0; static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = "static"; bool IsSpecificDatabaseFilter() { @@ -527,7 +526,6 @@ public: if (Request->Request.operation_params().has_operation_timeout()) { Timeout = GetDuration(Request->Request.operation_params().operation_timeout()); } - ChildrenRecordsLimit = Request->Request.records_limit(); TIntrusivePtr<TDomainsInfo> domains = AppData()->DomainsInfo; TIntrusivePtr<TDomainsInfo::TDomain> domain = domains->Domains.begin()->second; DomainPath = "/" + domain->Name; @@ -1990,80 +1988,6 @@ public: record.IssueLog.set_listed(value); } - void RemoveRecordsAboveLimit(TMergeIssuesContext& context, TList<TSelfCheckContext::TIssueRecord>& records) { - records.sort([](TSelfCheckContext::TIssueRecord& a, TSelfCheckContext::TIssueRecord& b) { return b.IssueLog.status() < a.IssueLog.status(); }); - ui32 commonListed = 0; - for (auto it = records.begin(); it != records.end(); it++) { - if (commonListed == ChildrenRecordsLimit) { - auto removeIt = it; - it--; - SetIssueCount(*it, GetIssueCount(*it) + GetIssueCount(*removeIt)); - - auto reasons = removeIt->IssueLog.reason(); - for (auto reasonIt = reasons.begin(); reasonIt != reasons.end(); reasonIt++) { - context.removeIssuesIds.insert(*reasonIt); - } - context.removeIssuesIds.insert(removeIt->IssueLog.id()); - records.erase(removeIt); - } else if (commonListed + GetIssueListed(*it) > ChildrenRecordsLimit) { - auto aboveLimit = commonListed + GetIssueListed(*it) - ChildrenRecordsLimit; - SetIssueListed(*it, GetIssueListed(*it) - aboveLimit); - - switch (it->Tag) { - case ETags::GroupState: { - auto groupIds = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_id(); - while (aboveLimit > 0) { - groupIds->RemoveLast(); - aboveLimit--; - } - break; - } - case ETags::VDiskState: { - auto vdiscIds = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_vdisk()->mutable_id(); - while (aboveLimit > 0) { - vdiscIds->RemoveLast(); - aboveLimit--; - } - break; - } - case ETags::PDiskState: { - auto pdiscs = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_vdisk()->mutable_pdisk(); - while (aboveLimit > 0) { - pdiscs->RemoveLast(); - aboveLimit--; - } - break; - } - default: {} - } - commonListed = ChildrenRecordsLimit; - } else { - commonListed += GetIssueListed(*it); - } - } - } - - void RemoveRecordsAboveLimit(TMergeIssuesContext& context, ETags levelTag) { - auto& records = context.GetRecords(levelTag); - if (records.size() > 0) { - RemoveRecordsAboveLimit(context, records); - } - } - - void RemoveRecordsAboveLimit(TMergeIssuesContext& context, ETags levelTag, ETags upperTag) { - auto& levelRecords = context.GetRecords(levelTag); - auto& upperRecords = context.GetRecords(upperTag); - - TList<TSelfCheckResult::TIssueRecord> handled; - for (auto it = upperRecords.begin(); it != upperRecords.end(); it++) { - auto children = FindChildrenRecords(levelRecords, *it); - - RemoveRecordsAboveLimit(context, *children); - handled.splice(handled.end(), *children); - } - levelRecords.splice(levelRecords.end(), handled); - } - void FillGroupStatus(TGroupId groupId, const NKikimrWhiteboard::TBSGroupStateInfo& groupInfo, Ydb::Monitoring::StorageGroupStatus& storageGroupStatus, TSelfCheckContext context) { if (context.Location.mutable_storage()->mutable_pool()->mutable_group()->mutable_id()->empty()) { context.Location.mutable_storage()->mutable_pool()->mutable_group()->add_id(); @@ -2158,11 +2082,6 @@ public: MergeLevelRecords(mergeContext, ETags::VDiskState, ETags::GroupState); MergeLevelRecords(mergeContext, ETags::PDiskState, ETags::VDiskState); } - if (ChildrenRecordsLimit != 0) { - RemoveRecordsAboveLimit(mergeContext, ETags::PDiskState, ETags::VDiskState); - RemoveRecordsAboveLimit(mergeContext, ETags::VDiskState, ETags::GroupState); - RemoveRecordsAboveLimit(mergeContext, ETags::GroupState); - } mergeContext.FillRecords(records); } @@ -2405,6 +2324,43 @@ public: context.FillSelfCheckResult(); } + bool TruncateIssuesWithStatusWhileBeyondLimit(Ydb::Monitoring::SelfCheckResult& result, ui64 byteLimit, Ydb::Monitoring::StatusFlag::Status status) { + auto byteResult = result.ByteSizeLong(); + if (byteResult <= byteLimit) { + return true; + } + + int totalIssues = result.issue_log_size(); + TVector<int> indexesToRemove; + for (int i = 0; i < totalIssues && byteResult > byteLimit; ++i) { + if (result.issue_log(i).status() == status) { + byteResult -= result.issue_log(i).ByteSizeLong(); + indexesToRemove.push_back(i); + } + } + + for (auto it = indexesToRemove.rbegin(); it != indexesToRemove.rend(); ++it) { + result.mutable_issue_log()->SwapElements(*it, result.issue_log_size() - 1); + result.mutable_issue_log()->RemoveLast(); + } + + return byteResult <= byteLimit; + } + + void TruncateIssuesIfBeyondLimit(Ydb::Monitoring::SelfCheckResult& result, ui64 byteLimit) { + auto truncateStatusPriority = { + Ydb::Monitoring::StatusFlag::BLUE, + Ydb::Monitoring::StatusFlag::YELLOW, + Ydb::Monitoring::StatusFlag::ORANGE, + Ydb::Monitoring::StatusFlag::RED + }; + for (Ydb::Monitoring::StatusFlag::Status truncateStatus: truncateStatusPriority) { + if (TruncateIssuesWithStatusWhileBeyondLimit(result, byteLimit, truncateStatus)) { + break; + } + } + } + void ReplyAndPassAway() { THolder<TEvSelfCheckResult> response = MakeHolder<TEvSelfCheckResult>(); Ydb::Monitoring::SelfCheckResult& result = response->Result; @@ -2423,15 +2379,8 @@ public: auto byteSize = result.ByteSizeLong(); auto byteLimit = 50_MB - 1_KB; // 1_KB - for HEALTHCHECK STATUS issue going last - if (byteSize > byteLimit) { - do { - ui32 total = result.issue_log_size(); - ui32 to_remove = total / 20; - for (ui32 i = 0; i < to_remove; ++i) { - result.mutable_issue_log()->RemoveLast(); - } - } while (result.ByteSizeLong() > byteLimit); - } + TruncateIssuesIfBeyondLimit(result, byteLimit); + if (byteSize > 30_MB) { auto* issue = result.add_issue_log(); issue->set_type("HEALTHCHECK_STATUS"); diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index 1465a2e8da8..a64bb9ee899 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -182,7 +182,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } } - Ydb::Monitoring::SelfCheckResult RequestHc(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) { + Ydb::Monitoring::SelfCheckResult RequestHc(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) { TPortManager tp; ui16 port = tp.GetPort(2134); ui16 grpcPort = tp.GetPort(2135); @@ -228,12 +228,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { auto *request = new NHealthCheck::TEvSelfCheckRequest; request->Request.set_merge_records(isMergeRecords); - request->Request.set_records_limit(recordsLimit); runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0)); return runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result; } - void CheckHcResult(Ydb::Monitoring::SelfCheckResult& result, int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) { + void CheckHcResult(Ydb::Monitoring::SelfCheckResult& result, int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) { int groupIssuesCount = 0; int groupIssuesNumber = !isMergeRecords ? groupNumber : 1; for (const auto& issue_log : result.Getissue_log()) { @@ -243,9 +242,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), 0); UNIT_ASSERT_VALUES_EQUAL(issue_log.count(), 0); } else { - int groupListed = recordsLimit == 0 ? groupNumber : std::min<int>(groupNumber, recordsLimit); - UNIT_ASSERT_VALUES_EQUAL(issue_log.location().storage().pool().group().id_size(), groupListed); - UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), groupListed); + UNIT_ASSERT_VALUES_EQUAL(issue_log.location().storage().pool().group().id_size(), groupNumber); + UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), groupNumber); UNIT_ASSERT_VALUES_EQUAL(issue_log.count(), groupNumber); } groupIssuesCount++; @@ -264,9 +262,9 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { UNIT_ASSERT_VALUES_EQUAL(issueVdiscCount, issueVdiscNumber); } - void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) { - auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords, recordsLimit); - CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords, recordsLimit); + void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) { + auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords); + CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords); } Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(TString erasurespecies, const TVector<Ydb::Monitoring::StatusFlag::Status>& vdiskStatuses) { @@ -320,7 +318,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { auto *request = new NHealthCheck::TEvSelfCheckRequest; request->Request.set_merge_records(true); - request->Request.set_records_limit(10); runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0)); return runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result; @@ -418,18 +415,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { ListingTest(100, 100, true); } - Y_UNIT_TEST(Issues100GroupsMergingAndLimiting) { - ListingTest(100, 1, true, 10); - } - - Y_UNIT_TEST(Issues100VCardMergingAndLimiting) { - ListingTest(1, 100, true, 10); - } - - Y_UNIT_TEST(Issues100Groups100VCardMergingAndLimiting) { - ListingTest(100, 100, true, 10); - } - Y_UNIT_TEST(NoneRedGroupWhenRedVdisk) { auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::RED}); CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1); @@ -555,16 +540,17 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { UNIT_ASSERT_VALUES_EQUAL(issuesCount, total); } - Y_UNIT_TEST(ProtobufUnderLimit50Mb) { + Y_UNIT_TEST(ProtobufUnderLimitFor5000Groups) { auto result = RequestHc(3000, 1); CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::RED, 0); CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::ORANGE, 0); CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::YELLOW, 0); } - Y_UNIT_TEST(ProtobufBeyondLimit50Mb) { + Y_UNIT_TEST(ProtobufUnderLimitFor350000Groups) { auto result = RequestHc(350000, 1); CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::RED, 1); + UNIT_ASSERT_LT(result.ByteSizeLong(), 50_MB); } } diff --git a/ydb/core/viewer/json_healthcheck.h b/ydb/core/viewer/json_healthcheck.h index a98ef7b7e80..99c63da3bbb 100644 --- a/ydb/core/viewer/json_healthcheck.h +++ b/ydb/core/viewer/json_healthcheck.h @@ -72,7 +72,6 @@ public: request->Request.set_return_verbose_status(FromStringWithDefault<bool>(params.Get("verbose"), false)); request->Request.set_maximum_level(FromStringWithDefault<ui32>(params.Get("max_level"), 0)); request->Request.set_merge_records(FromStringWithDefault<bool>(params.Get("merge_records"), false)); - request->Request.set_records_limit(FromStringWithDefault<ui32>(params.Get("records_limit"), 0)); SetDuration(TDuration::MilliSeconds(Timeout), *request->Request.mutable_operation_params()->mutable_operation_timeout()); if (params.Has("min_status")) { Ydb::Monitoring::StatusFlag::Status minStatus; @@ -210,7 +209,6 @@ struct TJsonRequestParameters<TJsonHealthCheck> { {"name":"tenant","in":"query","description":"path to database","required":false,"type":"string"}, {"name":"verbose","in":"query","description":"return verbose status","required":false,"type":"boolean"}, {"name":"merge_records","in":"query","description":"merge records","required":false,"type":"boolean"}, - {"name":"records_limit","in":"query","description":"children records limit","required":false,"type":"integer"}, {"name":"max_level","in":"query","description":"max depth of issues to return","required":false,"type":"integer"}, {"name":"min_status","in":"query","description":"min status of issues to return","required":false,"type":"string"}, {"name":"format","in":"query","description":"format of reply","required":false,"type":"string"}])___"; diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto index 91a4ffbb5d1..d58cd6c9988 100644 --- a/ydb/public/api/protos/ydb_monitoring.proto +++ b/ydb/public/api/protos/ydb_monitoring.proto @@ -29,7 +29,6 @@ message SelfCheckRequest { uint32 maximum_level = 4; // maximum level of issues to return bool do_not_cache = 5; // by default database health state is taken from metadata cache; this option can be used to force bypassing that cache bool merge_records = 6; // combine similar records with similar status, message and level into one issue - uint32 records_limit = 7; // limit the number of records that have one parent record, default - without limit } message SelfCheckResponse { |