aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Andrei Rykov <arykov@ydb.tech> 2024-07-03 07:50:02 +0200
committer GitHub <noreply@github.com> 2024-07-03 07:50:02 +0200
commit 2fe919ac3cdde544d69d862bbd543a7a9e674c7b (patch)
tree f35a018de70f2b87cc03e1d5b9128de402a7ab18
parent 4c547a3cd21814dbc33f731d55884733f2f1b83b (diff)
download ydb-2fe919ac3cdde544d69d862bbd543a7a9e674c7b.tar.gz
move time difference issue under database level (#5859)
-rw-r--r-- ydb/core/health_check/health_check.cpp 80
-rw-r--r-- ydb/public/api/protos/ydb_monitoring.proto 8
2 files changed, 51 insertions, 37 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp
index 8851c2697b5..46460e4de84 100644
--- a/ydb/core/health_check/health_check.cpp
+++ b/ydb/core/health_check/health_check.cpp
@@ -1450,7 +1450,7 @@ public:
}
}
- void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
+ void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference) {
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
@@ -1488,6 +1488,34 @@ public:
}
loadAverageStatus.set_overall(laContext.GetOverallStatus());
}
+
+ if (nodeSystemState.HasMaxClockSkewPeerId()) {
+     const TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
+     const long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
+     const TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs)); // only the magnitude is graded
+     Ydb::Monitoring::StatusFlag::Status status;
+     if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
+         status = Ydb::Monitoring::StatusFlag::ORANGE;
+     } else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
+         status = Ydb::Monitoring::StatusFlag::YELLOW;
+     } else {
+         status = Ydb::Monitoring::StatusFlag::GREEN;
+     }
+     // The grade belongs to the max_time_difference sub-message (like the load status above), not to the node's own overall status.
+     computeNodeStatus.mutable_max_time_difference()->set_overall(status);
+     computeNodeStatus.mutable_max_time_difference()->set_peer(ToString(peerId));
+     computeNodeStatus.mutable_max_time_difference()->set_difference_ms(timeDifferenceDuration.MilliSeconds());
+     // Only the node selected by the caller (reportTimeDifference == true) emits the issue record.
+     if (reportTimeDifference) {
+         TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
+         FillNodeInfo(peerId, tdContext.Location.mutable_compute()->mutable_peer());
+         if (status == Ydb::Monitoring::StatusFlag::GREEN) {
+             tdContext.ReportStatus(status);
+         } else {
+             tdContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds() << " ms", ETags::SyncState);
+         }
+     }
+ }
} else {
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
// TStringBuilder() << "Compute node is not available",
@@ -1552,14 +1580,27 @@ public:
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
}
+ long maxClockSkewUs = 0; // largest absolute skew seen so far between two nodes of this compute node list
+ TNodeId maxClockSkewNodeId = 0; // stays 0 when no node qualifies, so no node is asked to report
+ for (const TNodeId nodeId : *computeNodeIds) {
+     const auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
+     if (itNodeSystemState != MergedNodeSystemState.end() && itNodeSystemState->second->HasMaxClockSkewPeerId()) { // FillComputeNodeStatus reports nothing without a peer id
+         const long clockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
+         if (clockSkewUs > maxClockSkewUs // cheap comparison first; the peer-membership scan below stops at the first match
+                 && std::find(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) != computeNodeIds->end()) {
+             maxClockSkewUs = clockSkewUs;
+             maxClockSkewNodeId = nodeId;
+         }
+     }
+ }
for (TNodeId nodeId : *computeNodeIds) {
auto& computeNode = *computeStatus.add_nodes();
- FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
+ FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}, maxClockSkewNodeId == nodeId);
}
FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
+ context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
computeNodeIds->push_back(0); // for tablets without node
for (TNodeId nodeId : *computeNodeIds) {
@@ -2599,40 +2640,6 @@ public:
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);
- void FillNodesSyncStatus(TOverallStateContext& context) {
- long maxClockSkewUs = 0;
- TNodeId maxClockSkewPeerId = 0;
- TNodeId maxClockSkewNodeId = 0;
- for (auto& [nodeId, nodeSystemState] : MergedNodeSystemState) {
- if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(nodeSystemState->GetMaxClockSkewPeerId())
- && abs(nodeSystemState->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
- maxClockSkewUs = abs(nodeSystemState->GetMaxClockSkewWithPeerUs());
- maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId();
- maxClockSkewNodeId = nodeId;
- }
- }
- if (!maxClockSkewNodeId) {
- return;
- }
-
- TSelfCheckResult syncContext;
- syncContext.Type = "NODES_TIME_DIFFERENCE";
- FillNodeInfo(maxClockSkewNodeId, syncContext.Location.mutable_node());
- FillNodeInfo(maxClockSkewPeerId, syncContext.Location.mutable_peer());
-
- TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs);
- if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
- syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
- } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
- syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
- } else {
- syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
- }
-
- context.UpdateMaxStatus(syncContext.GetOverallStatus());
- context.AddIssues(syncContext.IssueRecords);
- }
-
void FillResult(TOverallStateContext context) {
if (IsSpecificDatabaseFilter()) {
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
@@ -2641,7 +2648,6 @@ public:
FillDatabaseResult(context, path, state);
}
}
- FillNodesSyncStatus(context);
if (DatabaseState.empty()) {
Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status());
TSelfCheckResult tabletContext;
diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto
index dd99eb583f2..dc47c4ecfb8 100644
--- a/ydb/public/api/protos/ydb_monitoring.proto
+++ b/ydb/public/api/protos/ydb_monitoring.proto
@@ -106,12 +106,19 @@ message LoadAverageStatus {
uint32 cores = 3;
}
+message TimeDifferenceStatus {
+ StatusFlag.Status overall = 1;
+ int64 difference_ms = 2;
+ string peer = 3;
+}
+
message ComputeNodeStatus {
string id = 1;
StatusFlag.Status overall = 2;
repeated ComputeTabletStatus tablets = 3;
repeated ThreadPoolStatus pools = 4;
LoadAverageStatus load = 5;
+ TimeDifferenceStatus max_time_difference = 6;
}
message ComputeStatus {
@@ -167,6 +174,7 @@ message LocationCompute {
LocationNode node = 1;
LocationComputePool pool = 2;
LocationComputeTablet tablet = 3;
+ LocationNode peer = 4;
}
message LocationDatabase {