diff options
author | Andrei Rykov <arykov@ydb.tech> | 2024-07-03 07:50:02 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-03 07:50:02 +0200 |
commit | 2fe919ac3cdde544d69d862bbd543a7a9e674c7b (patch) | |
tree | f35a018de70f2b87cc03e1d5b9128de402a7ab18 | |
parent | 4c547a3cd21814dbc33f731d55884733f2f1b83b (diff) | |
download | ydb-2fe919ac3cdde544d69d862bbd543a7a9e674c7b.tar.gz |
move time difference issue under database level (#5859)
-rw-r--r-- | ydb/core/health_check/health_check.cpp | 80 | ||||
-rw-r--r-- | ydb/public/api/protos/ydb_monitoring.proto | 8 |
2 files changed, 51 insertions, 37 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp
index 8851c2697b5..46460e4de84 100644
--- a/ydb/core/health_check/health_check.cpp
+++ b/ydb/core/health_check/health_check.cpp
@@ -1450,7 +1450,7 @@ public:
         }
     }
 
-    void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
+    void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference) {
         FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
 
         TSelfCheckContext rrContext(&context, "NODE_UPTIME");
@@ -1488,6 +1488,34 @@ public:
                 }
                 loadAverageStatus.set_overall(laContext.GetOverallStatus());
             }
+
+            if (nodeSystemState.HasMaxClockSkewPeerId()) {
+                TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
+                long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
+                TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
+                Ydb::Monitoring::StatusFlag::Status status;
+                if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
+                    status = Ydb::Monitoring::StatusFlag::ORANGE;
+                } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
+                    status = Ydb::Monitoring::StatusFlag::YELLOW;
+                } else {
+                    status = Ydb::Monitoring::StatusFlag::GREEN;
+                }
+
+                computeNodeStatus.mutable_max_time_difference()->set_peer(ToString(peerId));
+                computeNodeStatus.mutable_max_time_difference()->set_difference_ms(timeDifferenceDuration.MilliSeconds());
+                computeNodeStatus.set_overall(status);
+
+                if (reportTimeDifference) {
+                    TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
+                    FillNodeInfo(peerId, tdContext.Location.mutable_compute()->mutable_peer());
+                    if (status == Ydb::Monitoring::StatusFlag::GREEN) {
+                        tdContext.ReportStatus(status);
+                    } else {
+                        tdContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds() << " ms", ETags::SyncState);
+                    }
+                }
+            }
         } else {
             // context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
             // TStringBuilder() << "Compute node is not available",
@@ -1552,14 +1580,27 @@ public:
             if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
                 context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
             }
+            long maxClockSkewUs = 0;
+            TNodeId maxClockSkewNodeId = 0;
+            for (TNodeId nodeId : *computeNodeIds) {
+                auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
+                if (itNodeSystemState != MergedNodeSystemState.end()) {
+                    if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0
+                        && abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
+                        maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
+                        maxClockSkewNodeId = nodeId;
+                    }
+                }
+            }
             for (TNodeId nodeId : *computeNodeIds) {
                 auto& computeNode = *computeStatus.add_nodes();
-                FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
+                FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}, maxClockSkewNodeId == nodeId);
             }
             FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
             context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
             context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
             context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
+            context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
             Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
             computeNodeIds->push_back(0); // for tablets without node
             for (TNodeId nodeId : *computeNodeIds) {
@@ -2599,40 +2640,6 @@ public:
     const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
     const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);
 
-    void FillNodesSyncStatus(TOverallStateContext& context) {
-        long maxClockSkewUs = 0;
-        TNodeId maxClockSkewPeerId = 0;
-        TNodeId maxClockSkewNodeId = 0;
-        for (auto& [nodeId, nodeSystemState] : MergedNodeSystemState) {
-            if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(nodeSystemState->GetMaxClockSkewPeerId())
-                && abs(nodeSystemState->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
-                maxClockSkewUs = abs(nodeSystemState->GetMaxClockSkewWithPeerUs());
-                maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId();
-                maxClockSkewNodeId = nodeId;
-            }
-        }
-        if (!maxClockSkewNodeId) {
-            return;
-        }
-
-        TSelfCheckResult syncContext;
-        syncContext.Type = "NODES_TIME_DIFFERENCE";
-        FillNodeInfo(maxClockSkewNodeId, syncContext.Location.mutable_node());
-        FillNodeInfo(maxClockSkewPeerId, syncContext.Location.mutable_peer());
-
-        TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs);
-        if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
-            syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
-        } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
-            syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
-        } else {
-            syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
-        }
-
-        context.UpdateMaxStatus(syncContext.GetOverallStatus());
-        context.AddIssues(syncContext.IssueRecords);
-    }
-
     void FillResult(TOverallStateContext context) {
         if (IsSpecificDatabaseFilter()) {
             FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -2641,7 +2648,6 @@ public:
                 FillDatabaseResult(context, path, state);
             }
         }
-        FillNodesSyncStatus(context);
         if (DatabaseState.empty()) {
             Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status());
             TSelfCheckResult tabletContext;
diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto
index dd99eb583f2..dc47c4ecfb8 100644
--- a/ydb/public/api/protos/ydb_monitoring.proto
+++ b/ydb/public/api/protos/ydb_monitoring.proto
@@ -106,12 +106,19 @@ message LoadAverageStatus {
     uint32 cores = 3;
 }
 
+message TimeDifferenceStatus {
+    StatusFlag.Status overall = 1;
+    int64 difference_ms = 2;
+    string peer = 3;
+}
+
 message ComputeNodeStatus {
     string id = 1;
     StatusFlag.Status overall = 2;
     repeated ComputeTabletStatus tablets = 3;
     repeated ThreadPoolStatus pools = 4;
     LoadAverageStatus load = 5;
+    TimeDifferenceStatus max_time_difference = 6;
 }
 
 message ComputeStatus {
@@ -167,6 +174,7 @@ message LocationCompute {
     LocationNode node = 1;
     LocationComputePool pool = 2;
     LocationComputeTablet tablet = 3;
+    LocationNode peer = 4;
 }
 
 message LocationDatabase {