aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author zalyalov <zalyalov@yandex-team.com> 2023-05-02 10:20:58 +0300
committer zalyalov <zalyalov@yandex-team.com> 2023-05-02 10:20:58 +0300
commit 966d37c5d0af38a01513800a699b10e71d4ca05b (patch)
tree c8b366b0f26265428caf5b71dc49b13f0b35a909
parent 90ddf78c37a7fe594e2f333ba85dba51a8b8aa83 (diff)
download ydb-966d37c5d0af38a01513800a699b10e71d4ca05b.tar.gz
separate settings for different balancer triggers
-rw-r--r-- ydb/core/mind/hive/balancer.cpp 10
-rw-r--r-- ydb/core/mind/hive/hive_impl.cpp 4
-rw-r--r-- ydb/core/mind/hive/hive_impl.h 6
-rw-r--r-- ydb/core/mind/hive/hive_ut.cpp 3
-rw-r--r-- ydb/core/mind/hive/monitoring.cpp 10
-rw-r--r-- ydb/core/protos/config.proto 3
6 files changed, 26 insertions, 10 deletions
diff --git a/ydb/core/mind/hive/balancer.cpp b/ydb/core/mind/hive/balancer.cpp
index b8084d6468d..f539d8a4d3e 100644
--- a/ydb/core/mind/hive/balancer.cpp
+++ b/ydb/core/mind/hive/balancer.cpp
@@ -118,6 +118,7 @@ protected:
THive* Hive;
using TTabletId = TFullTabletId;
ui64 KickInFlight;
+ ui64 MaxKickInFlight;
int Movements;
int MaxMovements;
bool RecheckOnFinish;
@@ -157,7 +158,7 @@ protected:
}
bool CanKickNextTablet() const {
- return KickInFlight < Hive->GetBalancerInflight();
+ return KickInFlight < MaxKickInFlight;
}
void UpdateProgress() {
@@ -300,9 +301,10 @@ public:
return NKikimrServices::TActivity::HIVE_BALANCER_ACTOR;
}
- THiveBalancer(THive* hive, int maxMovements = 0, bool recheckOnFinish = false, const std::vector<TNodeId>& filterNodeIds = {})
+ THiveBalancer(THive* hive, int maxMovements = 0, bool recheckOnFinish = false, ui64 maxInFlight = 1, const std::vector<TNodeId>& filterNodeIds = {})
: Hive(hive)
, KickInFlight(0)
+ , MaxKickInFlight(maxInFlight)
, Movements(0)
, MaxMovements(maxMovements)
, RecheckOnFinish(recheckOnFinish)
@@ -325,9 +327,9 @@ public:
}
};
-void THive::StartHiveBalancer(int maxMovements, bool recheckOnFinish, const std::vector<TNodeId>& filterNodeIds) {
+void THive::StartHiveBalancer(int maxMovements, bool recheckOnFinish, ui64 maxInFlight, const std::vector<TNodeId>& filterNodeIds) {
if (BalancerProgress == -1) {
- auto* balancer = new THiveBalancer(this, maxMovements, recheckOnFinish, filterNodeIds);
+ auto* balancer = new THiveBalancer(this, maxMovements, recheckOnFinish, maxInFlight, filterNodeIds);
SubActors.emplace_back(balancer);
BalancerProgress = -2;
RegisterWithSameMailbox(balancer);
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp
index dc46080b8a6..9177ff85ddf 100644
--- a/ydb/core/mind/hive/hive_impl.cpp
+++ b/ydb/core/mind/hive/hive_impl.cpp
@@ -2131,7 +2131,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
if (!overloadedNodes.empty()) {
BLOG_D("Nodes " << overloadedNodes << " with usage over limit " << GetMaxNodeUsageToKick() << " - starting balancer");
- StartHiveBalancer(CurrentConfig.GetMaxMovementsOnAutoBalancer(), CurrentConfig.GetContinueAutoBalancer(), overloadedNodes);
+ StartHiveBalancer(CurrentConfig.GetMaxMovementsOnEmergencyBalancer(), CurrentConfig.GetContinueEmergencyBalancer(), GetEmergencyBalancerInflight(), overloadedNodes);
return;
}
}
@@ -2143,7 +2143,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
if (stats.Scatter >= GetMinScatterToBalance()) {
BLOG_TRACE("Scatter " << stats.Scatter << " over limit "
<< GetMinScatterToBalance() << " - starting balancer");
- StartHiveBalancer(CurrentConfig.GetMaxMovementsOnAutoBalancer(), CurrentConfig.GetContinueAutoBalancer());
+ StartHiveBalancer(CurrentConfig.GetMaxMovementsOnAutoBalancer(), CurrentConfig.GetContinueAutoBalancer(), GetBalancerInflight());
}
}
diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h
index a91910badaf..7e0462c8eff 100644
--- a/ydb/core/mind/hive/hive_impl.h
+++ b/ydb/core/mind/hive/hive_impl.h
@@ -221,7 +221,7 @@ protected:
friend struct TStoragePoolInfo;
- void StartHiveBalancer(int maxMovements = 0, bool recheckOnFinish = false, const std::vector<TNodeId>& filterNodeIds = {});
+ void StartHiveBalancer(int maxMovements = 0, bool recheckOnFinish = false, ui64 maxInFlight = 1, const std::vector<TNodeId>& filterNodeIds = {});
void StartHiveDrain(TNodeId nodeId, TDrainSettings settings);
void StartHiveFill(TNodeId nodeId, const TActorId& initiator);
void CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx);
@@ -665,6 +665,10 @@ public:
return CurrentConfig.GetBalancerInflight();
}
+ ui64 GetEmergencyBalancerInflight() const {
+ return CurrentConfig.GetEmergencyBalancerInflight();
+ }
+
ui64 GetMaxBootBatchSize() const {
return CurrentConfig.GetMaxBootBatchSize();
}
diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp
index 6283822441a..884c0e1a754 100644
--- a/ydb/core/mind/hive/hive_ut.cpp
+++ b/ydb/core/mind/hive/hive_ut.cpp
@@ -3728,13 +3728,14 @@ Y_UNIT_TEST_SUITE(THiveTest) {
TTestBasicRuntime runtime(NUM_NODES, false);
Setup(runtime, true, 1, [](TAppPrepare& app) {
- app.HiveConfig.SetMaxMovementsOnAutoBalancer(100);
+ app.HiveConfig.SetMaxMovementsOnEmergencyBalancer(100);
app.HiveConfig.SetMinPeriodBetweenBalance(0.1);
app.HiveConfig.SetTabletKickCooldownPeriod(0);
app.HiveConfig.SetResourceChangeReactionPeriod(0);
// this value of MaxNodeUsageToKick is selected specifically to make test scenario work
// in link with number of tablets and values of network usage metrics used below
app.HiveConfig.SetMaxNodeUsageToKick(0.01);
+ app.HiveConfig.SetEmergencyBalancerInflight(1); // to ensure fair distribution
});
TActorId senderA = runtime.AllocateEdgeActor();
diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp
index 944e1667f98..74d53201363 100644
--- a/ydb/core/mind/hive/monitoring.cpp
+++ b/ydb/core/mind/hive/monitoring.cpp
@@ -782,12 +782,15 @@ public:
UpdateConfig(db, "MaxRequestSequenceSize", TSchemeIds::State::MaxRequestSequenceSize);
UpdateConfig(db, "MetricsWindowSize", TSchemeIds::State::MetricsWindowSize);
UpdateConfig(db, "ResourceOvercommitment", TSchemeIds::State::ResourceOvercommitment);
- UpdateConfig(db, "BalancerInflight");
UpdateConfig(db, "MinPeriodBetweenBalance");
UpdateConfig(db, "NodeBalanceStrategy");
UpdateConfig(db, "TabletBalanceStrategy");
+ UpdateConfig(db, "BalancerInflight");
UpdateConfig(db, "MaxMovementsOnAutoBalancer");
UpdateConfig(db, "ContinueAutoBalancer");
+ UpdateConfig(db, "EmergencyBalancerInflight");
+ UpdateConfig(db, "MaxMovementsOnEmergencyBalancer");
+ UpdateConfig(db, "ContinueEmergencyBalancer");
UpdateConfig(db, "MinNodeUsageToBalance");
UpdateConfig(db, "MinPeriodBetweenReassign");
UpdateConfig(db, "NodeSelectStrategy");
@@ -1061,10 +1064,13 @@ public:
ShowConfig(out, "ResourceOvercommitment");
ShowConfig(out, "NodeBalanceStrategy");
ShowConfig(out, "TabletBalanceStrategy");
- ShowConfig(out, "BalancerInflight");
ShowConfig(out, "MinPeriodBetweenBalance");
+ ShowConfig(out, "BalancerInflight");
ShowConfig(out, "MaxMovementsOnAutoBalancer");
ShowConfig(out, "ContinueAutoBalancer");
+ ShowConfig(out, "EmergencyBalancerInflight");
+ ShowConfig(out, "MaxMovementsOnEmergencyBalancer");
+ ShowConfig(out, "ContinueEmergencyBalancer");
ShowConfig(out, "CheckMoveExpediency");
ShowConfig(out, "SpaceUsagePenaltyThreshold");
ShowConfig(out, "SpaceUsagePenalty");
diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto
index c286c0d4d5c..9905bfe8ac5 100644
--- a/ydb/core/protos/config.proto
+++ b/ydb/core/protos/config.proto
@@ -1591,6 +1591,9 @@ message THiveConfig {
optional uint64 NodeRestartsToIgnoreInWarmup = 51 [default = 10];
optional double MaxWarmUpPeriod = 52 [default = 30.0]; // seconds
optional bool WarmUpEnabled = 55 [default = false];
+ optional uint64 EmergencyBalancerInflight = 56 [default = 5]; // tablets
+ optional uint64 MaxMovementsOnEmergencyBalancer = 57 [default = 1000];
+ optional bool ContinueEmergencyBalancer = 58 [default = true];
}
message TDataShardConfig {