about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author andrew-rykov <arykov@ydb.tech> 2022-09-20 17:51:36 +0300
committer andrew-rykov <arykov@ydb.tech> 2022-09-20 17:51:36 +0300
commit 2997e47e591189dc9a36311a44bc6aecb4685687 (patch)
tree 537524ee1200f688b2700cd9d48d4886dcee2cb4
parent 388e291e7fe19e4f128775a7bdfe5e366e47bf7c (diff)
download ydb-2997e47e591189dc9a36311a44bc6aecb4685687.tar.gz
PR from branch users/andrew-rykov//merge-issues
added merging vdisks removed TSelfCheckContext made node repeated merge issues
-rw-r--r-- ydb/core/health_check/health_check.cpp 111
-rw-r--r-- ydb/public/api/protos/ydb_monitoring.proto 2
2 files changed, 87 insertions, 26 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp
index 1a55044d5e3..6d6e946fc26 100644
--- a/ydb/core/health_check/health_check.cpp
+++ b/ydb/core/health_check/health_check.cpp
@@ -187,7 +187,7 @@ public:
};
Ydb::Monitoring::StatusFlag::Status OverallStatus = Ydb::Monitoring::StatusFlag::GREY;
- TList<TIssueRecord> IssueLog;
+ TList<TIssueRecord> IssueRecords;
Ydb::Monitoring::Location Location;
int Level = 1;
TString Type;
@@ -205,14 +205,16 @@ public:
id << Ydb::Monitoring::StatusFlag_Status_Name(issueLog.status());
id << '-' << crc16(issueLog.message());
const Ydb::Monitoring::Location& location(issueLog.location());
- if (location.storage().node().id()) {
- id << '-' << location.storage().node().id();
- } else {
- if (location.storage().node().host()) {
- id << '-' << location.storage().node().host();
- }
- if (location.storage().node().port()) {
- id << '-' << location.storage().node().port();
+ if (!location.storage().node().empty()) {
+ if (location.storage().node()[0].id()) {
+ id << '-' << location.storage().node()[0].id();
+ } else {
+ if (location.storage().node()[0].host()) {
+ id << '-' << location.storage().node()[0].host();
+ }
+ if (location.storage().node()[0].port()) {
+ id << '-' << location.storage().node()[0].port();
+ }
}
}
if (location.storage().pool().group().vdisk().id()) {
@@ -256,7 +258,7 @@ public:
if (IsErrorStatus(status)) {
std::vector<TString> reason;
if (includeTags.size() != 0) {
- for (const TIssueRecord& record : IssueLog) {
+ for (const TIssueRecord& record : IssueRecords) {
for (const TString& tag : includeTags) {
if (record.Tag == tag) {
reason.push_back(record.IssueLog.id());
@@ -267,7 +269,7 @@ public:
}
std::sort(reason.begin(), reason.end());
reason.erase(std::unique(reason.begin(), reason.end()), reason.end());
- TIssueRecord& issueRecord(*IssueLog.emplace(IssueLog.begin()));
+ TIssueRecord& issueRecord(*IssueRecords.emplace(IssueRecords.begin()));
Ydb::Monitoring::IssueLog& issueLog(issueRecord.IssueLog);
issueLog.set_status(status);
issueLog.set_message(message);
@@ -291,7 +293,7 @@ public:
}
bool HasTags(std::initializer_list<TString> tags) const {
- for (const TIssueRecord& record : IssueLog) {
+ for (const TIssueRecord& record : IssueRecords) {
for (const TString& tag : tags) {
if (record.Tag == tag) {
return true;
@@ -308,12 +310,12 @@ public:
void SetOverallStatus(Ydb::Monitoring::StatusFlag::Status status) {
OverallStatus = status;
}
-
+
void InheritFrom(TSelfCheckResult& lower) {
if (lower.GetOverallStatus() >= OverallStatus) {
OverallStatus = lower.GetOverallStatus();
}
- IssueLog.splice(IssueLog.end(), std::move(lower.IssueLog));
+ IssueRecords.splice(IssueRecords.end(), std::move(lower.IssueRecords));
}
};
@@ -1554,15 +1556,19 @@ public:
if (UnavailableStorageNodes.count(pDiskInfo.nodeid()) != 0) {
TSelfCheckContext nodeContext(&context, "STORAGE_NODE");
nodeContext.Location.mutable_storage()->clear_pool();
- nodeContext.Location.mutable_storage()->mutable_node()->set_id(pDiskInfo.nodeid());
+
+ if (context.Location.mutable_storage()->mutable_node()->empty()) {
+ context.Location.mutable_storage()->mutable_node()->Add();
+ }
+ nodeContext.Location.mutable_storage()->mutable_node(0)->set_id(pDiskInfo.nodeid());
const TEvInterconnect::TNodeInfo* nodeInfo = nullptr;
auto itNodeInfo = MergedNodeInfo.find(pDiskInfo.nodeid());
if (itNodeInfo != MergedNodeInfo.end()) {
nodeInfo = itNodeInfo->second;
}
if (nodeInfo) {
- nodeContext.Location.mutable_storage()->mutable_node()->set_host(nodeInfo->Host);
- nodeContext.Location.mutable_storage()->mutable_node()->set_port(nodeInfo->Port);
+ nodeContext.Location.mutable_storage()->mutable_node(0)->set_host(nodeInfo->Host);
+ nodeContext.Location.mutable_storage()->mutable_node(0)->set_port(nodeInfo->Port);
}
nodeContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
TStringBuilder() << "Storage node is not available",
@@ -1660,6 +1666,8 @@ public:
static const inline TString NONE = "none";
static const inline TString BLOCK_4_2 = "block-4-2";
static const inline TString MIRROR_3_DC = "mirror-3-dc";
+ static const int COMPRESS_SIZE_TRIGGER = 10;
+ static const int COMPRESS_GROUP_LIMIT = 10;
static void IncrementFor(TStackVec<std::pair<ui32, int>>& realms, ui32 realm) {
auto itRealm = FindIf(realms, [realm](const std::pair<ui32, int>& p) -> bool {
@@ -1678,27 +1686,33 @@ public:
int disksColors[Ydb::Monitoring::StatusFlag::Status_ARRAYSIZE] = {};
TStackVec<std::pair<ui32, int>> failedRealms;
int failedDisks = 0;
+
+ if (context.Location.mutable_storage()->mutable_node()->empty()) {
+ context.Location.mutable_storage()->mutable_node()->Add();
+ }
for (const auto& protoVDiskId : groupInfo.vdiskids()) {
TString vDiskId = GetVDiskId(protoVDiskId);
auto itVDisk = MergedVDiskState.find(vDiskId);
const TEvInterconnect::TNodeInfo* nodeInfo = nullptr;
+
if (itVDisk != MergedVDiskState.end()) {
TNodeId nodeId = itVDisk->second->nodeid();
auto itNodeInfo = MergedNodeInfo.find(nodeId);
if (itNodeInfo != MergedNodeInfo.end()) {
nodeInfo = itNodeInfo->second;
}
- context.Location.mutable_storage()->mutable_node()->set_id(nodeId);
+ context.Location.mutable_storage()->mutable_node(0)->set_id(nodeId);
} else {
- context.Location.mutable_storage()->mutable_node()->clear_id();
+ context.Location.mutable_storage()->mutable_node(0)->clear_id();
}
if (nodeInfo) {
- context.Location.mutable_storage()->mutable_node()->set_host(nodeInfo->Host);
- context.Location.mutable_storage()->mutable_node()->set_port(nodeInfo->Port);
+ context.Location.mutable_storage()->mutable_node(0)->set_host(nodeInfo->Host);
+ context.Location.mutable_storage()->mutable_node(0)->set_port(nodeInfo->Port);
} else {
- context.Location.mutable_storage()->mutable_node()->clear_host();
- context.Location.mutable_storage()->mutable_node()->clear_port();
+ context.Location.mutable_storage()->mutable_node(0)->clear_host();
+ context.Location.mutable_storage()->mutable_node(0)->clear_port();
}
+
Ydb::Monitoring::StorageVDiskStatus& vDiskStatus = *storageGroupStatus.add_vdisks();
FillVDiskStatus(vDiskId, itVDisk != MergedVDiskState.end() ? *itVDisk->second : NKikimrWhiteboard::TVDiskStateInfo(), vDiskStatus, {&context, "VDISK"});
++disksColors[vDiskStatus.overall()];
@@ -1713,6 +1727,53 @@ public:
break;
}
}
+
+ TList<TSelfCheckContext::TIssueRecord> compress;
+ while (!context.IssueRecords.empty()) {
+ TList<TSelfCheckContext::TIssueRecord> group;
+ group.splice(group.begin(), context.IssueRecords, context.IssueRecords.begin());
+ for (auto it = context.IssueRecords.begin(); it != context.IssueRecords.end(); ) {
+ if (it->IssueLog.Getstatus() == group.begin()->IssueLog.Getstatus() &&
+ it->IssueLog.Getmessage() == group.begin()->IssueLog.Getmessage() &&
+ it->IssueLog.Getlevel() == group.begin()->IssueLog.Getlevel() &&
+ it->Tag == "vdisk-state") {
+
+ auto move = it++;
+ group.splice(group.begin(), context.IssueRecords, move);
+ } else {
+ ++it;
+ }
+ }
+
+ if (group.size() >= COMPRESS_SIZE_TRIGGER) {
+ int compressCount = 0;
+ for (auto it = std::next(group.begin(), 1); it != group.end(); ) {
+ if (compressCount < COMPRESS_GROUP_LIMIT) {
+ auto nodesA = group.begin()->IssueLog.mutable_location()->mutable_storage()->mutable_node();
+ auto nodesB = it->IssueLog.mutable_location()->mutable_storage()->mutable_node();
+ nodesA->Add(nodesB->begin(), nodesB->end());
+ nodesB->Clear();
+
+ auto reasonA = group.begin()->IssueLog.mutable_reason();
+ auto reasonB = it->IssueLog.mutable_reason();
+ reasonA->Add(reasonB->begin(), reasonB->end());
+ reasonB->Clear();
+
+ compressCount++;
+ } else {
+ it->IssueLog.mutable_location()->mutable_storage()->mutable_node()->Clear();
+ it->IssueLog.mutable_reason()->Clear();
+ }
+
+ it = context.IssueRecords.erase(it);
+ }
+ compress.emplace_back(group.front());
+ } else {
+ compress.splice(compress.end(), group);
+ }
+ }
+
+ context.IssueRecords.splice(context.IssueRecords.begin(), compress);
context.Location.mutable_storage()->clear_node(); // group doesn't have node
context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::YELLOW);
@@ -1832,7 +1893,7 @@ public:
}
databaseStatus.set_overall(context.GetOverallStatus());
overall = MaxStatus(overall, context.GetOverallStatus());
- for (auto& issueRecord : context.IssueLog) {
+ for (auto& issueRecord : context.IssueRecords) {
std::pair<TString, TString> key{issueRecord.IssueLog.location().database().name(), issueRecord.IssueLog.id()};
if (issueIds.emplace(key).second) {
result.mutable_issue_log()->Add()->CopyFrom(issueRecord.IssueLog);
@@ -1865,7 +1926,7 @@ public:
FillStorage(unknownDatabase, *databaseStatus.mutable_storage(), {&context, "STORAGE"});
databaseStatus.set_overall(context.GetOverallStatus());
overall = MaxStatus(overall, context.GetOverallStatus());
- for (auto& issueRecord : context.IssueLog) {
+ for (auto& issueRecord : context.IssueRecords) {
std::pair<TString, TString> key{issueRecord.IssueLog.location().database().name(), issueRecord.IssueLog.id()};
if (issueIds.emplace(key).second) {
result.mutable_issue_log()->Add()->CopyFrom(issueRecord.IssueLog);
diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto
index d279e415d4a..95f31b6d85c 100644
--- a/ydb/public/api/protos/ydb_monitoring.proto
+++ b/ydb/public/api/protos/ydb_monitoring.proto
@@ -145,7 +145,7 @@ message LocationStoragePool {
}
message LocationStorage {
- LocationNode node = 1;
+ repeated LocationNode node = 1;
LocationStoragePool pool = 2;
}