diff options
author | Andrei Rykov <arykov@ydb.tech> | 2025-03-14 08:56:27 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-03-14 08:56:27 +0100 |
commit | d843851178e39f1c590e0b4fe5adb96a4eaee240 (patch) | |
tree | a0137e87e7b06454efe0db7cb7af71c3718194b6 | |
parent | c39dd93abac1bfedd69222295717b2d519a8dd84 (diff) | |
download | ydb-d843851178e39f1c590e0b4fe5adb96a4eaee240.tar.gz |
changed healthcheck config (#15693)
-rw-r--r-- | ydb/core/health_check/health_check.cpp | 12 | ||||
-rw-r--r-- | ydb/core/health_check/health_check_ut.cpp | 4 | ||||
-rw-r--r-- | ydb/core/protos/config.proto | 14 |
3 files changed, 17 insertions, 13 deletions
diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 82d8119a09..b1afa3d2f0 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -773,7 +773,7 @@ public: TTabletRequestsState TabletRequests; - TDuration Timeout = TDuration::MilliSeconds(20000); + TDuration Timeout = TDuration::MilliSeconds(HealthCheckConfig.GetTimeout()); bool ReturnHints = false; static constexpr TStringBuf STATIC_STORAGE_POOL_NAME = "static"; @@ -1644,7 +1644,7 @@ public: for (const auto& [hiveId, hiveResponse] : HiveInfo) { if (hiveResponse.IsOk()) { settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5); - settings.MaxRestartsPerPeriod = HealthCheckConfig.GetTabletsRestartsPerPeriodOrangeThreshold(); + settings.MaxRestartsPerPeriod = HealthCheckConfig.GetThresholds().GetTabletsRestartsOrange(); for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) { TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain()); auto itDomain = FilterDomainKey.find(tenantId); @@ -1870,9 +1870,9 @@ public: FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node()); TSelfCheckContext rrContext(&context, "NODE_UPTIME"); - if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetNodeRestartsPerPeriodOrangeThreshold()) { + if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsOrange()) { rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Node is restarting too often", ETags::Uptime); - } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetNodeRestartsPerPeriodYellowThreshold()) { + } else if (databaseState.NodeRestartsPerPeriod[nodeId] >= HealthCheckConfig.GetThresholds().GetNodeRestartsYellow()) { rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime); } else { rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); @@ -1910,9 +1910,9 @@ public: long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs(); TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs)); Ydb::Monitoring::StatusFlag::Status status; - if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetNodesTimeDifferenceUsOrangeThreshold())) { + if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceOrange())) { status = Ydb::Monitoring::StatusFlag::ORANGE; - } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetNodesTimeDifferenceUsYellowThreshold())) { + } else if (timeDifferenceDuration > TDuration::MicroSeconds(HealthCheckConfig.GetThresholds().GetNodesTimeDifferenceYellow())) { status = Ydb::Monitoring::StatusFlag::YELLOW; } else { status = Ydb::Monitoring::StatusFlag::GREEN; diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index bb5062cd03..d42658305b 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -1982,8 +1982,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { void ChangeNodeRestartsPerPeriod(TTestActorRuntime &runtime, const TActorId& sender, const ui32 restartsYellow, const ui32 restartsOrange) { NKikimrConfig::TAppConfig ext; auto &cfg = *ext.MutableHealthCheckConfig(); - cfg.SetNodeRestartsPerPeriodYellowThreshold(restartsYellow); - cfg.SetNodeRestartsPerPeriodOrangeThreshold(restartsOrange); + cfg.MutableThresholds()->SetNodeRestartsYellow(restartsYellow); + cfg.MutableThresholds()->SetNodeRestartsOrange(restartsOrange); SendHealthCheckConfigUpdate(runtime, sender, cfg); } diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index b20401f92d..9197664a76 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1781,11 +1781,15 @@ message THiveConfig { } message THealthCheckConfig { - optional uint32 NodeRestartsPerPeriodYellowThreshold = 1 [default = 10]; - optional uint32 NodeRestartsPerPeriodOrangeThreshold = 2 [default = 30]; - optional uint64 NodesTimeDifferenceUsYellowThreshold = 3 [default = 5000]; - optional uint64 NodesTimeDifferenceUsOrangeThreshold = 4 [default = 25000]; - optional uint32 TabletsRestartsPerPeriodOrangeThreshold = 5 [default = 30]; + message TThresholds { + optional uint32 NodeRestartsYellow = 1 [default = 10]; // per period, see HiveConfig.NodeRestartWatchPeriod + optional uint32 NodeRestartsOrange = 2 [default = 30]; // per period, see HiveConfig.NodeRestartWatchPeriod + optional uint64 NodesTimeDifferenceYellow = 3 [default = 5000]; // microseconds + optional uint64 NodesTimeDifferenceOrange = 4 [default = 25000]; // microseconds + optional uint32 TabletsRestartsOrange = 5 [default = 30]; // per period, see HiveConfig.TabletRestartWatchPeriod + } + optional TThresholds Thresholds = 1; + optional uint32 Timeout = 2 [default = 20000]; // milliseconds } message TBlobCacheConfig { |