summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorzalyalov <[email protected]>2023-08-22 09:38:22 +0300
committerzalyalov <[email protected]>2023-08-22 09:56:17 +0300
commitb9cdc33a8132729df65c4cd731bafeb24da9a11c (patch)
tree928268abd9b948c567d698b1c05db781a7a69c3d
parent3dbb3ac028228f187ca2b256a2b93e4f44de4ca4 (diff)
pessimize tablets with frequent restarts in boot queue KIKIMR-15303
-rw-r--r--ydb/core/mind/hive/boot_queue.h3
-rw-r--r--ydb/core/mind/hive/hive_impl.h7
-rw-r--r--ydb/core/mind/hive/tablet_info.cpp8
-rw-r--r--ydb/core/mind/hive/tablet_info.h1
-rw-r--r--ydb/core/mind/hive/tx__update_tablet_status.cpp2
-rw-r--r--ydb/core/protos/config.proto3
6 files changed, 20 insertions, 4 deletions
diff --git a/ydb/core/mind/hive/boot_queue.h b/ydb/core/mind/hive/boot_queue.h
index 320ddb63b58..5b13e08f6d7 100644
--- a/ydb/core/mind/hive/boot_queue.h
+++ b/ydb/core/mind/hive/boot_queue.h
@@ -32,6 +32,9 @@ struct TBootQueue {
break;
}
priority += tablet.Weight;
+ if (tablet.RestartsOften()) {
+ priority -= 5;
+ }
return priority;
}
diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h
index 4ab7ff5bd15..5c63f0cbae6 100644
--- a/ydb/core/mind/hive/hive_impl.h
+++ b/ydb/core/mind/hive/hive_impl.h
@@ -769,8 +769,11 @@ public:
return TDuration::MilliSeconds(CurrentConfig.GetTabletRestartsPeriod());
}
- ui64 GetTabletRestarsMaxCount() const {
- return CurrentConfig.GetTabletRestarsMaxCount();
+ ui64 GetTabletRestartsMaxCount() const {
+ if (CurrentConfig.HasTabletRestarsMaxCount() && !CurrentConfig.HasTabletRestartsMaxCount()) {
+ return CurrentConfig.GetTabletRestarsMaxCount();
+ }
+ return CurrentConfig.GetTabletRestartsMaxCount();
}
TDuration GetPostponeStartPeriod() const {
diff --git a/ydb/core/mind/hive/tablet_info.cpp b/ydb/core/mind/hive/tablet_info.cpp
index 81ad491b22e..d64e70101c7 100644
--- a/ydb/core/mind/hive/tablet_info.cpp
+++ b/ydb/core/mind/hive/tablet_info.cpp
@@ -498,5 +498,13 @@ ui64 TTabletInfo::GetRestartsPerPeriod(TInstant barrier) {
return restarts;
}
+bool TTabletInfo::RestartsOften() const {
+ // Statistics.RestartTimestamp is a repeated proto field that gets trimmed
+ // on each update of the tablet metrics (or on restart).
+ // If its current size is >= RestartsMaxCount, the tablet was restarting
+ // often as of the last update, and thus deserves a lower boot priority.
+ return Statistics.RestartTimestampSize() >= Hive.GetTabletRestartsMaxCount();
+}
+
} // NHive
} // NKikimr
diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h
index 5ab320e7591..023437edae5 100644
--- a/ydb/core/mind/hive/tablet_info.h
+++ b/ydb/core/mind/hive/tablet_info.h
@@ -277,6 +277,7 @@ public:
void ActualizeTabletStatistics(TInstant now);
ui64 GetRestartsPerPeriod(TInstant barrier);
+ bool RestartsOften() const;
};
diff --git a/ydb/core/mind/hive/tx__update_tablet_status.cpp b/ydb/core/mind/hive/tx__update_tablet_status.cpp
index c736be00aa2..388992312ea 100644
--- a/ydb/core/mind/hive/tx__update_tablet_status.cpp
+++ b/ydb/core/mind/hive/tx__update_tablet_status.cpp
@@ -132,7 +132,7 @@ public:
if (Generation < leader.KnownGeneration) {
return true;
}
- if (leader.GetRestartsPerPeriod(now - Self->GetTabletRestartsPeriod()) >= Self->GetTabletRestarsMaxCount()) {
+ if (leader.GetRestartsPerPeriod(now - Self->GetTabletRestartsPeriod()) >= Self->GetTabletRestartsMaxCount()) {
if (IsGoodStatusForPostpone()) {
leader.PostponeStart(now + Self->GetPostponeStartPeriod());
BLOG_D("THive::TTxUpdateTabletStatus::Execute for tablet " << tablet->ToString()
diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto
index b72face3ebc..e6a080785a2 100644
--- a/ydb/core/protos/config.proto
+++ b/ydb/core/protos/config.proto
@@ -1711,7 +1711,7 @@ message THiveConfig {
optional uint64 SystemTabletCategoryId = 39 [default = 1];
optional bool EnableFastTabletMove = 40 [default = true];
optional uint64 TabletRestartsPeriod = 42 [default = 1000]; // milliseconds
- optional uint64 TabletRestarsMaxCount = 43 [default = 2]; // number
+ optional uint64 TabletRestarsMaxCount = 43 [default = 2]; // deprecated, use TabletRestartsMaxCount instead
optional uint64 PostponeStartPeriod = 44 [default = 1000]; // milliseconds
optional EHiveNodeSelectStrategy NodeSelectStrategy = 45 [default = HIVE_NODE_SELECT_STRATEGY_RANDOM_MIN_7P];
optional bool CheckMoveExpediency = 46 [default = true];
@@ -1729,6 +1729,7 @@ message THiveConfig {
optional bool ContinueEmergencyBalancer = 58 [default = true];
optional double MinPeriodBetweenEmergencyBalance = 59 [default = 0.1]; // seconds
optional EHiveBootStrategy BootStrategy = 60 [default = HIVE_BOOT_STRATEGY_BALANCED];
+ optional uint64 TabletRestartsMaxCount = 61 [default = 2];
}
message TDataShardConfig {