diff options
author | andrew-rykov <[email protected]> | 2023-04-11 13:03:48 +0300 |
---|---|---|
committer | andrew-rykov <[email protected]> | 2023-04-11 13:03:48 +0300 |
commit | b06f9bece6a6eb7dd6f1e112b46092c22c0ab3dc (patch) | |
tree | 2a1cc3e4adcd8d59b51047c09ce2d6c8fa8a4bdb | |
parent | deba2b73f2b94b0d032fc4d41099f79b69504d51 (diff) |
hc add node time diff
hc add nodes time diff
The health check (hc) returns detailed ClockSkew information when verbose=true;
without verbose, clock skew is reported only as an alert when the difference exceeds 5 ms or 25 ms
-rw-r--r-- | ydb/core/health_check/health_check.cpp | 43 | ||||
-rw-r--r-- | ydb/core/protos/node_whiteboard.proto | 1 | ||||
-rw-r--r-- | ydb/core/tablet/node_whiteboard.cpp | 15 | ||||
-rw-r--r-- | ydb/public/api/protos/ydb_monitoring.proto | 2 |
4 files changed, 56 insertions, 5 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 409b3594db0..1f87def2507 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -330,6 +330,26 @@ public: return false; } + Ydb::Monitoring::StatusFlag::Status FindMaxStatus(std::initializer_list<ETags> tags) const { + Ydb::Monitoring::StatusFlag::Status status = Ydb::Monitoring::StatusFlag::GREY; + for (const TIssueRecord& record : IssueRecords) { + for (const ETags tag : tags) { + if (record.Tag == tag) { + status = MaxStatus(status, record.IssueLog.status()); + } + } + } + return status; + } + + void ReportWithMaxChildStatus(const TString& message = {}, + ETags setTag = ETags::None, + std::initializer_list<ETags> includeTags = {}) { + if (HasTags(includeTags)) { + ReportStatus(FindMaxStatus(includeTags), message, setTag, includeTags); + } + } + Ydb::Monitoring::StatusFlag::Status GetOverallStatus() const { return OverallStatus; } @@ -1397,6 +1417,16 @@ public: } loadAverageStatus.set_overall(laContext.GetOverallStatus()); } + + TSelfCheckContext clockSkewContext(&context, "CLOCK_SKEW"); + computeNodeStatus.set_maxclockskewmicrosec(nodeSystemState.clockskewmicrosec()); + if (nodeSystemState.clockskewmicrosec() > 25000) { + clockSkewContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "ClockSkew above 25 ms", ETags::NodeState); + } else if (nodeSystemState.clockskewmicrosec() > 5000) { + clockSkewContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "ClockSkew above 5 ms", ETags::NodeState); + } else { + clockSkewContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); + } } else { // context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, // TStringBuilder() << "Compute node is not available", @@ -1426,15 +1456,18 @@ public: if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) { context.ReportStatus(systemStatus, "Compute has issues with system tablets", 
ETags::ComputeState, {ETags::SystemTabletState}); } - Ydb::Monitoring::StatusFlag::Status nodesStatus = Ydb::Monitoring::StatusFlag::GREEN; + ui64 clockSkew = 0; for (TNodeId nodeId : *computeNodeIds) { auto& computeNode = *computeStatus.add_nodes(); FillComputeNodeStatus(nodeId, computeNode, {&context, "COMPUTE_NODE"}); - nodesStatus = MaxStatus(nodesStatus, computeNode.overall()); - } - if (nodesStatus != Ydb::Monitoring::StatusFlag::GREEN) { - context.ReportStatus(nodesStatus, "Compute is overloaded", ETags::ComputeState, {ETags::OverloadState}); + ui64 skew = computeNode.maxclockskewmicrosec(); + if (skew > clockSkew) { + clockSkew = skew; + } } + computeStatus.set_maxclockskewmicrosec(clockSkew); + context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState}); + context.ReportWithMaxChildStatus("ClockSkew exceeded", ETags::ComputeState, {ETags::NodeState}); Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN; computeNodeIds->push_back(0); // for tablets without node for (TNodeId nodeId : *computeNodeIds) { diff --git a/ydb/core/protos/node_whiteboard.proto b/ydb/core/protos/node_whiteboard.proto index 82f4b547ca4..e7142c4cd31 100644 --- a/ydb/core/protos/node_whiteboard.proto +++ b/ydb/core/protos/node_whiteboard.proto @@ -306,6 +306,7 @@ message TSystemStateInfo { optional uint64 MemoryUsedInAlloc = 29; optional double MaxDiskUsage = 30; optional NActorsInterconnect.TNodeLocation Location = 31; + optional uint64 ClockSkewMicrosec = 32; } message TEvSystemStateRequest { diff --git a/ydb/core/tablet/node_whiteboard.cpp b/ydb/core/tablet/node_whiteboard.cpp index 289e67c7881..6a9d552d271 100644 --- a/ydb/core/tablet/node_whiteboard.cpp +++ b/ydb/core/tablet/node_whiteboard.cpp @@ -653,6 +653,21 @@ protected: SystemStateInfo.SetSystemState(eFlag); SystemStateInfo.SetChangeTime(ctx.Now().MilliSeconds()); } + + const TIntrusivePtr<::NMonitoring::TDynamicCounters> &counters = 
AppData(ctx)->Counters; + TIntrusivePtr<::NMonitoring::TDynamicCounters> interconnectCounters = GetServiceCounters(counters, "interconnect"); + ui64 clockSkew = 0; + interconnectCounters->EnumerateSubgroups([&interconnectCounters, &clockSkew](const TString &name, const TString &value) -> void { + if (name == "peer") { + TIntrusivePtr<::NMonitoring::TDynamicCounters> peerCounters = interconnectCounters->GetSubgroup(name, value); + ::NMonitoring::TDynamicCounters::TCounterPtr connectedCounter = peerCounters->GetCounter("ClockSkewMicrosec"); + ui64 skew = abs(connectedCounter->Val()); + if (skew > clockSkew) { + clockSkew = skew; + } + } + }); + SystemStateInfo.SetClockSkewMicrosec(clockSkew); } static void CopyTabletStateInfo( diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto index 96908f7963c..b7942f7952a 100644 --- a/ydb/public/api/protos/ydb_monitoring.proto +++ b/ydb/public/api/protos/ydb_monitoring.proto @@ -110,12 +110,14 @@ message ComputeNodeStatus { repeated ComputeTabletStatus tablets = 3; repeated ThreadPoolStatus pools = 4; LoadAverageStatus load = 5; + uint64 maxClockSkewMicrosec = 6; } message ComputeStatus { StatusFlag.Status overall = 1; repeated ComputeNodeStatus nodes = 2; repeated ComputeTabletStatus tablets = 3; + uint64 maxClockSkewMicrosec = 4; } message LocationNode { |