summary refs log tree commit diff stats
diff options
context:
space:
mode:
author andrew-rykov <[email protected]> 2023-04-11 13:03:48 +0300
committer andrew-rykov <[email protected]> 2023-04-11 13:03:48 +0300
commit b06f9bece6a6eb7dd6f1e112b46092c22c0ab3dc (patch)
tree 2a1cc3e4adcd8d59b51047c09ce2d6c8fa8a4bdb
parent deba2b73f2b94b0d032fc4d41099f79b69504d51 (diff)
hc add node time diff
hc add nodes time diff. hc возвращает конкретику по ClockSkew, когда verbose=true; если нет verbose, то только в alert на расхождение в 5 и 25 мс.
-rw-r--r-- ydb/core/health_check/health_check.cpp 43
-rw-r--r-- ydb/core/protos/node_whiteboard.proto 1
-rw-r--r-- ydb/core/tablet/node_whiteboard.cpp 15
-rw-r--r-- ydb/public/api/protos/ydb_monitoring.proto 2
4 files changed, 56 insertions, 5 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp
index 409b3594db0..1f87def2507 100644
--- a/ydb/core/health_check/health_check.cpp
+++ b/ydb/core/health_check/health_check.cpp
@@ -330,6 +330,26 @@ public:
return false;
}
+ Ydb::Monitoring::StatusFlag::Status FindMaxStatus(std::initializer_list<ETags> tags) const { // Folds, through MaxStatus, the IssueLog status of every IssueRecords entry whose Tag equals one of `tags`.
+ Ydb::Monitoring::StatusFlag::Status status = Ydb::Monitoring::StatusFlag::GREY; // Starting value; returned unchanged when no record matches.
+ for (const TIssueRecord& record : IssueRecords) {
+ for (const ETags tag : tags) { // No break on a match: every entry of `tags` is compared against the record.
+ if (record.Tag == tag) {
+ status = MaxStatus(status, record.IssueLog.status()); // Combine the running result with this record's status.
+ }
+ }
+ }
+ return status;
+ }
+
+ void ReportWithMaxChildStatus(const TString& message = {}, // Calls ReportStatus with the status computed by FindMaxStatus(includeTags); `message` is forwarded as-is.
+ ETags setTag = ETags::None, // Forwarded to ReportStatus unchanged.
+ std::initializer_list<ETags> includeTags = {}) { // Tags passed to HasTags, FindMaxStatus and ReportStatus.
+ if (HasTags(includeTags)) { // Nothing is reported when HasTags returns false.
+ ReportStatus(FindMaxStatus(includeTags), message, setTag, includeTags);
+ }
+ }
+
Ydb::Monitoring::StatusFlag::Status GetOverallStatus() const { // Read-only accessor.
return OverallStatus; // Returned as stored; no recomputation happens here.
}
@@ -1397,6 +1417,16 @@ public:
}
loadAverageStatus.set_overall(laContext.GetOverallStatus());
}
+
+ TSelfCheckContext clockSkewContext(&context, "CLOCK_SKEW");
+ computeNodeStatus.set_maxclockskewmicrosec(nodeSystemState.clockskewmicrosec());
+ if (nodeSystemState.clockskewmicrosec() > 25000) {
+ clockSkewContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "ClockSkew above 25 ms", ETags::NodeState);
+ } else if (nodeSystemState.clockskewmicrosec() > 5000) {
+ clockSkewContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "ClockSkew above 5 ms", ETags::NodeState);
+ } else {
+ clockSkewContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
+ }
} else {
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
// TStringBuilder() << "Compute node is not available",
@@ -1426,15 +1456,18 @@ public:
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
}
- Ydb::Monitoring::StatusFlag::Status nodesStatus = Ydb::Monitoring::StatusFlag::GREEN;
+ ui64 clockSkew = 0;
for (TNodeId nodeId : *computeNodeIds) {
auto& computeNode = *computeStatus.add_nodes();
FillComputeNodeStatus(nodeId, computeNode, {&context, "COMPUTE_NODE"});
- nodesStatus = MaxStatus(nodesStatus, computeNode.overall());
- }
- if (nodesStatus != Ydb::Monitoring::StatusFlag::GREEN) {
- context.ReportStatus(nodesStatus, "Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
+ ui64 skew = computeNode.maxclockskewmicrosec();
+ if (skew > clockSkew) {
+ clockSkew = skew;
+ }
}
+ computeStatus.set_maxclockskewmicrosec(clockSkew);
+ context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
+ context.ReportWithMaxChildStatus("ClockSkew exceeded", ETags::ComputeState, {ETags::NodeState});
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
computeNodeIds->push_back(0); // for tablets without node
for (TNodeId nodeId : *computeNodeIds) {
diff --git a/ydb/core/protos/node_whiteboard.proto b/ydb/core/protos/node_whiteboard.proto
index 82f4b547ca4..e7142c4cd31 100644
--- a/ydb/core/protos/node_whiteboard.proto
+++ b/ydb/core/protos/node_whiteboard.proto
@@ -306,6 +306,7 @@ message TSystemStateInfo {
optional uint64 MemoryUsedInAlloc = 29;
optional double MaxDiskUsage = 30;
optional NActorsInterconnect.TNodeLocation Location = 31;
+ optional uint64 ClockSkewMicrosec = 32;
}
message TEvSystemStateRequest {
diff --git a/ydb/core/tablet/node_whiteboard.cpp b/ydb/core/tablet/node_whiteboard.cpp
index 289e67c7881..6a9d552d271 100644
--- a/ydb/core/tablet/node_whiteboard.cpp
+++ b/ydb/core/tablet/node_whiteboard.cpp
@@ -653,6 +653,21 @@ protected:
SystemStateInfo.SetSystemState(eFlag);
SystemStateInfo.SetChangeTime(ctx.Now().MilliSeconds());
}
+
+ const TIntrusivePtr<::NMonitoring::TDynamicCounters> &counters = AppData(ctx)->Counters;
+ TIntrusivePtr<::NMonitoring::TDynamicCounters> interconnectCounters = GetServiceCounters(counters, "interconnect");
+ ui64 clockSkew = 0;
+ interconnectCounters->EnumerateSubgroups([&interconnectCounters, &clockSkew](const TString &name, const TString &value) -> void {
+ if (name == "peer") {
+ TIntrusivePtr<::NMonitoring::TDynamicCounters> peerCounters = interconnectCounters->GetSubgroup(name, value);
+ ::NMonitoring::TDynamicCounters::TCounterPtr connectedCounter = peerCounters->GetCounter("ClockSkewMicrosec");
+ ui64 skew = abs(connectedCounter->Val());
+ if (skew > clockSkew) {
+ clockSkew = skew;
+ }
+ }
+ });
+ SystemStateInfo.SetClockSkewMicrosec(clockSkew);
}
static void CopyTabletStateInfo(
diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto
index 96908f7963c..b7942f7952a 100644
--- a/ydb/public/api/protos/ydb_monitoring.proto
+++ b/ydb/public/api/protos/ydb_monitoring.proto
@@ -110,12 +110,14 @@ message ComputeNodeStatus {
repeated ComputeTabletStatus tablets = 3;
repeated ThreadPoolStatus pools = 4;
LoadAverageStatus load = 5;
+ uint64 maxClockSkewMicrosec = 6;
}
message ComputeStatus { // Filled in health_check.cpp: one `nodes` entry is added per compute node id.
StatusFlag.Status overall = 1;
repeated ComputeNodeStatus nodes = 2;
repeated ComputeTabletStatus tablets = 3;
+ uint64 maxClockSkewMicrosec = 4; // Largest maxClockSkewMicrosec among the `nodes` entries, in microseconds (health_check.cpp takes the maximum over all compute nodes).
}
message LocationNode {