aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorandrew-rykov <arykov@ydb.tech>2023-09-15 14:35:23 +0300
committerandrew-rykov <arykov@ydb.tech>2023-09-15 14:58:12 +0300
commita11ce9b2b81c12f969dc88aff250f64d412e547a (patch)
treeddbdc647566e15fe25e6296350c6f4f793073ca6
parentd8446ab9751ae8ba96fec17140cf3e07f0fcf1ab (diff)
downloadydb-a11ce9b2b81c12f969dc88aff250f64d412e547a.tar.gz
KIKIMR-19304 add sorting before truncation
-rw-r--r--ydb/core/health_check/health_check.cpp129
-rw-r--r--ydb/core/health_check/health_check_ut.cpp34
-rw-r--r--ydb/core/viewer/json_healthcheck.h2
-rw-r--r--ydb/public/api/protos/ydb_monitoring.proto1
4 files changed, 49 insertions, 117 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp
index af88ede2a25..f27fe91babc 100644
--- a/ydb/core/health_check/health_check.cpp
+++ b/ydb/core/health_check/health_check.cpp
@@ -515,7 +515,6 @@ public:
TTabletRequestsState TabletRequests;
TDuration Timeout = TDuration::MilliSeconds(10000);
- ui32 ChildrenRecordsLimit = 0;
static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = "static";
bool IsSpecificDatabaseFilter() {
@@ -527,7 +526,6 @@ public:
if (Request->Request.operation_params().has_operation_timeout()) {
Timeout = GetDuration(Request->Request.operation_params().operation_timeout());
}
- ChildrenRecordsLimit = Request->Request.records_limit();
TIntrusivePtr<TDomainsInfo> domains = AppData()->DomainsInfo;
TIntrusivePtr<TDomainsInfo::TDomain> domain = domains->Domains.begin()->second;
DomainPath = "/" + domain->Name;
@@ -1990,80 +1988,6 @@ public:
record.IssueLog.set_listed(value);
}
- void RemoveRecordsAboveLimit(TMergeIssuesContext& context, TList<TSelfCheckContext::TIssueRecord>& records) {
- records.sort([](TSelfCheckContext::TIssueRecord& a, TSelfCheckContext::TIssueRecord& b) { return b.IssueLog.status() < a.IssueLog.status(); });
- ui32 commonListed = 0;
- for (auto it = records.begin(); it != records.end(); it++) {
- if (commonListed == ChildrenRecordsLimit) {
- auto removeIt = it;
- it--;
- SetIssueCount(*it, GetIssueCount(*it) + GetIssueCount(*removeIt));
-
- auto reasons = removeIt->IssueLog.reason();
- for (auto reasonIt = reasons.begin(); reasonIt != reasons.end(); reasonIt++) {
- context.removeIssuesIds.insert(*reasonIt);
- }
- context.removeIssuesIds.insert(removeIt->IssueLog.id());
- records.erase(removeIt);
- } else if (commonListed + GetIssueListed(*it) > ChildrenRecordsLimit) {
- auto aboveLimit = commonListed + GetIssueListed(*it) - ChildrenRecordsLimit;
- SetIssueListed(*it, GetIssueListed(*it) - aboveLimit);
-
- switch (it->Tag) {
- case ETags::GroupState: {
- auto groupIds = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_id();
- while (aboveLimit > 0) {
- groupIds->RemoveLast();
- aboveLimit--;
- }
- break;
- }
- case ETags::VDiskState: {
- auto vdiscIds = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_vdisk()->mutable_id();
- while (aboveLimit > 0) {
- vdiscIds->RemoveLast();
- aboveLimit--;
- }
- break;
- }
- case ETags::PDiskState: {
- auto pdiscs = it->IssueLog.mutable_location()->mutable_storage()->mutable_pool()->mutable_group()->mutable_vdisk()->mutable_pdisk();
- while (aboveLimit > 0) {
- pdiscs->RemoveLast();
- aboveLimit--;
- }
- break;
- }
- default: {}
- }
- commonListed = ChildrenRecordsLimit;
- } else {
- commonListed += GetIssueListed(*it);
- }
- }
- }
-
- void RemoveRecordsAboveLimit(TMergeIssuesContext& context, ETags levelTag) {
- auto& records = context.GetRecords(levelTag);
- if (records.size() > 0) {
- RemoveRecordsAboveLimit(context, records);
- }
- }
-
- void RemoveRecordsAboveLimit(TMergeIssuesContext& context, ETags levelTag, ETags upperTag) {
- auto& levelRecords = context.GetRecords(levelTag);
- auto& upperRecords = context.GetRecords(upperTag);
-
- TList<TSelfCheckResult::TIssueRecord> handled;
- for (auto it = upperRecords.begin(); it != upperRecords.end(); it++) {
- auto children = FindChildrenRecords(levelRecords, *it);
-
- RemoveRecordsAboveLimit(context, *children);
- handled.splice(handled.end(), *children);
- }
- levelRecords.splice(levelRecords.end(), handled);
- }
-
void FillGroupStatus(TGroupId groupId, const NKikimrWhiteboard::TBSGroupStateInfo& groupInfo, Ydb::Monitoring::StorageGroupStatus& storageGroupStatus, TSelfCheckContext context) {
if (context.Location.mutable_storage()->mutable_pool()->mutable_group()->mutable_id()->empty()) {
context.Location.mutable_storage()->mutable_pool()->mutable_group()->add_id();
@@ -2158,11 +2082,6 @@ public:
MergeLevelRecords(mergeContext, ETags::VDiskState, ETags::GroupState);
MergeLevelRecords(mergeContext, ETags::PDiskState, ETags::VDiskState);
}
- if (ChildrenRecordsLimit != 0) {
- RemoveRecordsAboveLimit(mergeContext, ETags::PDiskState, ETags::VDiskState);
- RemoveRecordsAboveLimit(mergeContext, ETags::VDiskState, ETags::GroupState);
- RemoveRecordsAboveLimit(mergeContext, ETags::GroupState);
- }
mergeContext.FillRecords(records);
}
@@ -2405,6 +2324,43 @@ public:
context.FillSelfCheckResult();
}
+ bool TruncateIssuesWithStatusWhileBeyondLimit(Ydb::Monitoring::SelfCheckResult& result, ui64 byteLimit, Ydb::Monitoring::StatusFlag::Status status) {
+ auto byteResult = result.ByteSizeLong();
+ if (byteResult <= byteLimit) {
+ return true;
+ }
+
+ int totalIssues = result.issue_log_size();
+ TVector<int> indexesToRemove;
+ for (int i = 0; i < totalIssues && byteResult > byteLimit; ++i) {
+ if (result.issue_log(i).status() == status) {
+ byteResult -= result.issue_log(i).ByteSizeLong();
+ indexesToRemove.push_back(i);
+ }
+ }
+
+ for (auto it = indexesToRemove.rbegin(); it != indexesToRemove.rend(); ++it) {
+ result.mutable_issue_log()->SwapElements(*it, result.issue_log_size() - 1);
+ result.mutable_issue_log()->RemoveLast();
+ }
+
+ return byteResult <= byteLimit;
+ }
+
+ void TruncateIssuesIfBeyondLimit(Ydb::Monitoring::SelfCheckResult& result, ui64 byteLimit) {
+ auto truncateStatusPriority = {
+ Ydb::Monitoring::StatusFlag::BLUE,
+ Ydb::Monitoring::StatusFlag::YELLOW,
+ Ydb::Monitoring::StatusFlag::ORANGE,
+ Ydb::Monitoring::StatusFlag::RED
+ };
+ for (Ydb::Monitoring::StatusFlag::Status truncateStatus: truncateStatusPriority) {
+ if (TruncateIssuesWithStatusWhileBeyondLimit(result, byteLimit, truncateStatus)) {
+ break;
+ }
+ }
+ }
+
void ReplyAndPassAway() {
THolder<TEvSelfCheckResult> response = MakeHolder<TEvSelfCheckResult>();
Ydb::Monitoring::SelfCheckResult& result = response->Result;
@@ -2423,15 +2379,8 @@ public:
auto byteSize = result.ByteSizeLong();
auto byteLimit = 50_MB - 1_KB; // 1_KB - for HEALTHCHECK STATUS issue going last
- if (byteSize > byteLimit) {
- do {
- ui32 total = result.issue_log_size();
- ui32 to_remove = total / 20;
- for (ui32 i = 0; i < to_remove; ++i) {
- result.mutable_issue_log()->RemoveLast();
- }
- } while (result.ByteSizeLong() > byteLimit);
- }
+ TruncateIssuesIfBeyondLimit(result, byteLimit);
+
if (byteSize > 30_MB) {
auto* issue = result.add_issue_log();
issue->set_type("HEALTHCHECK_STATUS");
diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp
index 1465a2e8da8..a64bb9ee899 100644
--- a/ydb/core/health_check/health_check_ut.cpp
+++ b/ydb/core/health_check/health_check_ut.cpp
@@ -182,7 +182,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
}
- Ydb::Monitoring::SelfCheckResult RequestHc(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) {
+ Ydb::Monitoring::SelfCheckResult RequestHc(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
@@ -228,12 +228,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
auto *request = new NHealthCheck::TEvSelfCheckRequest;
request->Request.set_merge_records(isMergeRecords);
- request->Request.set_records_limit(recordsLimit);
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
return runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
}
- void CheckHcResult(Ydb::Monitoring::SelfCheckResult& result, int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) {
+ void CheckHcResult(Ydb::Monitoring::SelfCheckResult& result, int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
int groupIssuesCount = 0;
int groupIssuesNumber = !isMergeRecords ? groupNumber : 1;
for (const auto& issue_log : result.Getissue_log()) {
@@ -243,9 +242,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), 0);
UNIT_ASSERT_VALUES_EQUAL(issue_log.count(), 0);
} else {
- int groupListed = recordsLimit == 0 ? groupNumber : std::min<int>(groupNumber, recordsLimit);
- UNIT_ASSERT_VALUES_EQUAL(issue_log.location().storage().pool().group().id_size(), groupListed);
- UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), groupListed);
+ UNIT_ASSERT_VALUES_EQUAL(issue_log.location().storage().pool().group().id_size(), groupNumber);
+ UNIT_ASSERT_VALUES_EQUAL(issue_log.listed(), groupNumber);
UNIT_ASSERT_VALUES_EQUAL(issue_log.count(), groupNumber);
}
groupIssuesCount++;
@@ -264,9 +262,9 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT_VALUES_EQUAL(issueVdiscCount, issueVdiscNumber);
}
- void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false, int const recordsLimit = 0) {
- auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords, recordsLimit);
- CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords, recordsLimit);
+ void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
+ auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords);
+ CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
}
Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(TString erasurespecies, const TVector<Ydb::Monitoring::StatusFlag::Status>& vdiskStatuses) {
@@ -320,7 +318,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
auto *request = new NHealthCheck::TEvSelfCheckRequest;
request->Request.set_merge_records(true);
- request->Request.set_records_limit(10);
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
return runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
@@ -418,18 +415,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
ListingTest(100, 100, true);
}
- Y_UNIT_TEST(Issues100GroupsMergingAndLimiting) {
- ListingTest(100, 1, true, 10);
- }
-
- Y_UNIT_TEST(Issues100VCardMergingAndLimiting) {
- ListingTest(1, 100, true, 10);
- }
-
- Y_UNIT_TEST(Issues100Groups100VCardMergingAndLimiting) {
- ListingTest(100, 100, true, 10);
- }
-
Y_UNIT_TEST(NoneRedGroupWhenRedVdisk) {
auto result = RequestHcWithVdisks(NHealthCheck::TSelfCheckRequest::NONE, {Ydb::Monitoring::StatusFlag::RED});
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
@@ -555,16 +540,17 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT_VALUES_EQUAL(issuesCount, total);
}
- Y_UNIT_TEST(ProtobufUnderLimit50Mb) {
+ Y_UNIT_TEST(ProtobufUnderLimitFor5000Groups) {
auto result = RequestHc(3000, 1);
CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::RED, 0);
CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::ORANGE, 0);
CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::YELLOW, 0);
}
- Y_UNIT_TEST(ProtobufBeyondLimit50Mb) {
+ Y_UNIT_TEST(ProtobufUnderLimitFor350000Groups) {
auto result = RequestHc(350000, 1);
CheckHcProtobufSize(result, Ydb::Monitoring::StatusFlag::RED, 1);
+ UNIT_ASSERT_LT(result.ByteSizeLong(), 50_MB);
}
}
diff --git a/ydb/core/viewer/json_healthcheck.h b/ydb/core/viewer/json_healthcheck.h
index a98ef7b7e80..99c63da3bbb 100644
--- a/ydb/core/viewer/json_healthcheck.h
+++ b/ydb/core/viewer/json_healthcheck.h
@@ -72,7 +72,6 @@ public:
request->Request.set_return_verbose_status(FromStringWithDefault<bool>(params.Get("verbose"), false));
request->Request.set_maximum_level(FromStringWithDefault<ui32>(params.Get("max_level"), 0));
request->Request.set_merge_records(FromStringWithDefault<bool>(params.Get("merge_records"), false));
- request->Request.set_records_limit(FromStringWithDefault<ui32>(params.Get("records_limit"), 0));
SetDuration(TDuration::MilliSeconds(Timeout), *request->Request.mutable_operation_params()->mutable_operation_timeout());
if (params.Has("min_status")) {
Ydb::Monitoring::StatusFlag::Status minStatus;
@@ -210,7 +209,6 @@ struct TJsonRequestParameters<TJsonHealthCheck> {
{"name":"tenant","in":"query","description":"path to database","required":false,"type":"string"},
{"name":"verbose","in":"query","description":"return verbose status","required":false,"type":"boolean"},
{"name":"merge_records","in":"query","description":"merge records","required":false,"type":"boolean"},
- {"name":"records_limit","in":"query","description":"children records limit","required":false,"type":"integer"},
{"name":"max_level","in":"query","description":"max depth of issues to return","required":false,"type":"integer"},
{"name":"min_status","in":"query","description":"min status of issues to return","required":false,"type":"string"},
{"name":"format","in":"query","description":"format of reply","required":false,"type":"string"}])___";
diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto
index 91a4ffbb5d1..d58cd6c9988 100644
--- a/ydb/public/api/protos/ydb_monitoring.proto
+++ b/ydb/public/api/protos/ydb_monitoring.proto
@@ -29,7 +29,6 @@ message SelfCheckRequest {
uint32 maximum_level = 4; // maximum level of issues to return
bool do_not_cache = 5; // by default database health state is taken from metadata cache; this option can be used to force bypassing that cache
bool merge_records = 6; // combine similar records with similar status, message and level into one issue
- uint32 records_limit = 7; // limit the number of records that have one parent record, default - without limit
}
message SelfCheckResponse {