diff options
author | zalyalov <[email protected]> | 2023-08-22 09:38:22 +0300 |
---|---|---|
committer | zalyalov <[email protected]> | 2023-08-22 09:56:17 +0300 |
commit | b9cdc33a8132729df65c4cd731bafeb24da9a11c (patch) | |
tree | 928268abd9b948c567d698b1c05db781a7a69c3d | |
parent | 3dbb3ac028228f187ca2b256a2b93e4f44de4ca4 (diff) |
pessimize tablets with frequent restarts in boot queue KIKIMR-15303
-rw-r--r-- | ydb/core/mind/hive/boot_queue.h | 3 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 7 | ||||
-rw-r--r-- | ydb/core/mind/hive/tablet_info.cpp | 8 | ||||
-rw-r--r-- | ydb/core/mind/hive/tablet_info.h | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__update_tablet_status.cpp | 2 | ||||
-rw-r--r-- | ydb/core/protos/config.proto | 3 |
6 files changed, 20 insertions, 4 deletions
diff --git a/ydb/core/mind/hive/boot_queue.h b/ydb/core/mind/hive/boot_queue.h index 320ddb63b58..5b13e08f6d7 100644 --- a/ydb/core/mind/hive/boot_queue.h +++ b/ydb/core/mind/hive/boot_queue.h @@ -32,6 +32,9 @@ struct TBootQueue { break; } priority += tablet.Weight; + if (tablet.RestartsOften()) { + priority -= 5; + } return priority; } diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 4ab7ff5bd15..5c63f0cbae6 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -769,8 +769,11 @@ public: return TDuration::MilliSeconds(CurrentConfig.GetTabletRestartsPeriod()); } - ui64 GetTabletRestarsMaxCount() const { - return CurrentConfig.GetTabletRestarsMaxCount(); + ui64 GetTabletRestartsMaxCount() const { + if (CurrentConfig.HasTabletRestarsMaxCount() && !CurrentConfig.HasTabletRestartsMaxCount()) { + return CurrentConfig.GetTabletRestarsMaxCount(); + } + return CurrentConfig.GetTabletRestartsMaxCount(); } TDuration GetPostponeStartPeriod() const { diff --git a/ydb/core/mind/hive/tablet_info.cpp b/ydb/core/mind/hive/tablet_info.cpp index 81ad491b22e..d64e70101c7 100644 --- a/ydb/core/mind/hive/tablet_info.cpp +++ b/ydb/core/mind/hive/tablet_info.cpp @@ -498,5 +498,13 @@ ui64 TTabletInfo::GetRestartsPerPeriod(TInstant barrier) { return restarts; } +bool TTabletInfo::RestartsOften() const { + // Statistics.RestartTimestamp is a repeated proto field that gets trimmed + // upon each update of tablet metrics (or restart). + // If its current size is >= RestartsMaxCount, it means the tablet was restarting + // often at the time of last update, and thus deserves low booting priority + return Statistics.RestartTimestampSize() >= Hive.GetTabletRestartsMaxCount(); +} + } // NHive } // NKikimr diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index 5ab320e7591..023437edae5 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -277,6 +277,7 @@ public: void ActualizeTabletStatistics(TInstant now); ui64 GetRestartsPerPeriod(TInstant barrier); + bool RestartsOften() const; }; diff --git a/ydb/core/mind/hive/tx__update_tablet_status.cpp b/ydb/core/mind/hive/tx__update_tablet_status.cpp index c736be00aa2..388992312ea 100644 --- a/ydb/core/mind/hive/tx__update_tablet_status.cpp +++ b/ydb/core/mind/hive/tx__update_tablet_status.cpp @@ -132,7 +132,7 @@ public: if (Generation < leader.KnownGeneration) { return true; } - if (leader.GetRestartsPerPeriod(now - Self->GetTabletRestartsPeriod()) >= Self->GetTabletRestarsMaxCount()) { + if (leader.GetRestartsPerPeriod(now - Self->GetTabletRestartsPeriod()) >= Self->GetTabletRestartsMaxCount()) { if (IsGoodStatusForPostpone()) { leader.PostponeStart(now + Self->GetPostponeStartPeriod()); BLOG_D("THive::TTxUpdateTabletStatus::Execute for tablet " << tablet->ToString() diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index b72face3ebc..e6a080785a2 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1711,7 +1711,7 @@ message THiveConfig { optional uint64 SystemTabletCategoryId = 39 [default = 1]; optional bool EnableFastTabletMove = 40 [default = true]; optional uint64 TabletRestartsPeriod = 42 [default = 1000]; // milliseconds - optional uint64 TabletRestarsMaxCount = 43 [default = 2]; // number + optional uint64 TabletRestarsMaxCount = 43 [default = 2]; // deprecated, use TabletRestartsMaxCount instead optional uint64 PostponeStartPeriod = 44 [default = 1000]; // milliseconds optional EHiveNodeSelectStrategy NodeSelectStrategy = 45 [default = HIVE_NODE_SELECT_STRATEGY_RANDOM_MIN_7P]; optional bool CheckMoveExpediency = 46 [default = true]; @@ -1729,6 +1729,7 @@ message THiveConfig { optional bool ContinueEmergencyBalancer = 58 [default = true]; optional double MinPeriodBetweenEmergencyBalance = 59 [default = 0.1]; // seconds optional EHiveBootStrategy BootStrategy = 60 [default = HIVE_BOOT_STRATEGY_BALANCED]; + optional uint64 TabletRestartsMaxCount = 61 [default = 2]; } message TDataShardConfig { |