about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Andrei Rykov <arykov@ydb.tech> 2025-03-14 08:56:27 +0100
committer GitHub <noreply@github.com> 2025-03-14 08:56:27 +0100
commit d843851178e39f1c590e0b4fe5adb96a4eaee240 (patch)
tree a0137e87e7b06454efe0db7cb7af71c3718194b6
parent c39dd93abac1bfedd69222295717b2d519a8dd84 (diff)
download ydb-d843851178e39f1c590e0b4fe5adb96a4eaee240.tar.gz
changed healthcheck config (#15693)
-rw-r--r-- ydb/core/health_check/health_check.cpp 12
-rw-r--r-- ydb/core/health_check/health_check_ut.cpp 4
-rw-r--r-- ydb/core/protos/config.proto 14
3 files changed, 17 insertions, 13 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp
index 82d8119a09..b1afa3d2f0 100644
--- a/ydb/core/health_check/health_check.cpp
+++ b/ydb/core/health_check/health_check.cpp
@@ -773,7 +773,7 @@ public:
TTabletRequestsState TabletRequests;
- TDuration Timeout = TDuration::MilliSeconds(20000);
+ TDuration Timeout = TDuration::MilliSeconds(HealthCheckConfig.GetTimeout());
bool ReturnHints = false;
static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = "static";
@@ -1644,7 +1644,7 @@ public:
for (const auto& [hiveId, hiveResponse] : HiveInfo) {
if (hiveResponse.IsOk()) {
settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5);
- settings.MaxRestartsPerPeriod = HealthCheckConfig.GetTabletsRestartsPerPeriodOrangeThreshold();
+ settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds().GetTabletsRestartsOrange();
for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) {
TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain());
auto itDomain = FilterDomainKey.find(tenantId);
@@ -1870,9 +1870,9 @@ public:
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
- if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetNodeRestartsPerPeriodOrangeThreshold()) {
+ if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsOrange()) {
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Node is restarting too often", ETags::Uptime);
- } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetNodeRestartsPerPeriodYellowThreshold()) {
+ } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsYellow()) {
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime);
} else {
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
@@ -1910,9 +1910,9 @@ public:
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
Ydb::Monitoring::StatusFlag::Status status;
- if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetNodesTimeDifferenceUsOrangeThreshold())) {
+ if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange())) {
status = Ydb::Monitoring::StatusFlag::ORANGE;
- } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetNodesTimeDifferenceUsYellowThreshold())) {
+ } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow())) {
status = Ydb::Monitoring::StatusFlag::YELLOW;
} else {
status = Ydb::Monitoring::StatusFlag::GREEN;
diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp
index bb5062cd03..d42658305b 100644
--- a/ydb/core/health_check/health_check_ut.cpp
+++ b/ydb/core/health_check/health_check_ut.cpp
@@ -1982,8 +1982,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
void ChangeNodeRestartsPerPeriod(TTestActorRuntime &runtime, const TActorId& sender, const ui32 restartsYellow, const ui32 restartsOrange) {
NKikimrConfig::TAppConfig ext;
auto &cfg = *ext.MutableHealthCheckConfig();
- cfg.SetNodeRestartsPerPeriodYellowThreshold(restartsYellow);
- cfg.SetNodeRestartsPerPeriodOrangeThreshold(restartsOrange);
+ cfg.MutableThresholds()->SetNodeRestartsYellow(restartsYellow);
+ cfg.MutableThresholds()->SetNodeRestartsOrange(restartsOrange);
SendHealthCheckConfigUpdate(runtime, sender, cfg);
}
diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto
index b20401f92d..9197664a76 100644
--- a/ydb/core/protos/config.proto
+++ b/ydb/core/protos/config.proto
@@ -1781,11 +1781,15 @@ message THiveConfig {
}
message THealthCheckConfig {
- optional uint32 NodeRestartsPerPeriodYellowThreshold = 1 [default = 10];
- optional uint32 NodeRestartsPerPeriodOrangeThreshold = 2 [default = 30];
- optional uint64 NodesTimeDifferenceUsYellowThreshold = 3 [default = 5000];
- optional uint64 NodesTimeDifferenceUsOrangeThreshold = 4 [default = 25000];
- optional uint32 TabletsRestartsPerPeriodOrangeThreshold = 5 [default = 30];
+ message TThresholds {
+ optional uint32 NodeRestartsYellow = 1 [default = 10]; // per period, see HiveConfig.NodeRestartWatchPeriod
+ optional uint32 NodeRestartsOrange = 2 [default = 30]; // per period, see HiveConfig.NodeRestartWatchPeriod
+ optional uint64 NodesTimeDifferenceYellow = 3 [default = 5000]; // microseconds
+ optional uint64 NodesTimeDifferenceOrange = 4 [default = 25000]; // microseconds
+ optional uint32 TabletsRestartsOrange = 5 [default = 30]; // per period, see HiveConfig.TabletRestartWatchPeriod
+ }
+ optional TThresholds Thresholds = 1;
+ optional uint32 Timeout = 2 [default = 20000]; // milliseconds
}
message TBlobCacheConfig {