diff options
author | zalyalov <zalyalov@yandex-team.com> | 2023-05-02 10:20:58 +0300 |
---|---|---|
committer | zalyalov <zalyalov@yandex-team.com> | 2023-05-02 10:20:58 +0300 |
commit | 966d37c5d0af38a01513800a699b10e71d4ca05b (patch) | |
tree | c8b366b0f26265428caf5b71dc49b13f0b35a909 | |
parent | 90ddf78c37a7fe594e2f333ba85dba51a8b8aa83 (diff) | |
download | ydb-966d37c5d0af38a01513800a699b10e71d4ca05b.tar.gz |
separate settings for different balancer triggers
-rw-r--r-- | ydb/core/mind/hive/balancer.cpp | 10 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.cpp | 4 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 6 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_ut.cpp | 3 | ||||
-rw-r--r-- | ydb/core/mind/hive/monitoring.cpp | 10 | ||||
-rw-r--r-- | ydb/core/protos/config.proto | 3 |
6 files changed, 26 insertions, 10 deletions
diff --git a/ydb/core/mind/hive/balancer.cpp b/ydb/core/mind/hive/balancer.cpp index b8084d6468d..f539d8a4d3e 100644 --- a/ydb/core/mind/hive/balancer.cpp +++ b/ydb/core/mind/hive/balancer.cpp @@ -118,6 +118,7 @@ protected: THive* Hive; using TTabletId = TFullTabletId; ui64 KickInFlight; + ui64 MaxKickInFlight; int Movements; int MaxMovements; bool RecheckOnFinish; @@ -157,7 +158,7 @@ protected: } bool CanKickNextTablet() const { - return KickInFlight < Hive->GetBalancerInflight(); + return KickInFlight < MaxKickInFlight; } void UpdateProgress() { @@ -300,9 +301,10 @@ public: return NKikimrServices::TActivity::HIVE_BALANCER_ACTOR; } - THiveBalancer(THive* hive, int maxMovements = 0, bool recheckOnFinish = false, const std::vector<TNodeId>& filterNodeIds = {}) + THiveBalancer(THive* hive, int maxMovements = 0, bool recheckOnFinish = false, ui64 maxInFlight = 1, const std::vector<TNodeId>& filterNodeIds = {}) : Hive(hive) , KickInFlight(0) + , MaxKickInFlight(maxInFlight) , Movements(0) , MaxMovements(maxMovements) , RecheckOnFinish(recheckOnFinish) @@ -325,9 +327,9 @@ public: } }; -void THive::StartHiveBalancer(int maxMovements, bool recheckOnFinish, const std::vector<TNodeId>& filterNodeIds) { +void THive::StartHiveBalancer(int maxMovements, bool recheckOnFinish, ui64 maxInFlight, const std::vector<TNodeId>& filterNodeIds) { if (BalancerProgress == -1) { - auto* balancer = new THiveBalancer(this, maxMovements, recheckOnFinish, filterNodeIds); + auto* balancer = new THiveBalancer(this, maxMovements, recheckOnFinish, maxInFlight, filterNodeIds); SubActors.emplace_back(balancer); BalancerProgress = -2; RegisterWithSameMailbox(balancer); diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index dc46080b8a6..9177ff85ddf 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -2131,7 +2131,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) { if (!overloadedNodes.empty()) { BLOG_D("Nodes " << overloadedNodes << " with usage over limit " << GetMaxNodeUsageToKick() << " - starting balancer"); - StartHiveBalancer(CurrentConfig.GetMaxMovementsOnAutoBalancer(), CurrentConfig.GetContinueAutoBalancer(), overloadedNodes); + StartHiveBalancer(CurrentConfig.GetMaxMovementsOnEmergencyBalancer(), CurrentConfig.GetContinueEmergencyBalancer(), GetEmergencyBalancerInflight(), overloadedNodes); return; } } @@ -2143,7 +2143,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) { if (stats.Scatter >= GetMinScatterToBalance()) { BLOG_TRACE("Scatter " << stats.Scatter << " over limit " << GetMinScatterToBalance() << " - starting balancer"); - StartHiveBalancer(CurrentConfig.GetMaxMovementsOnAutoBalancer(), CurrentConfig.GetContinueAutoBalancer()); + StartHiveBalancer(CurrentConfig.GetMaxMovementsOnAutoBalancer(), CurrentConfig.GetContinueAutoBalancer(), GetBalancerInflight()); } } diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index a91910badaf..7e0462c8eff 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -221,7 +221,7 @@ protected: friend struct TStoragePoolInfo; - void StartHiveBalancer(int maxMovements = 0, bool recheckOnFinish = false, const std::vector<TNodeId>& filterNodeIds = {}); + void StartHiveBalancer(int maxMovements = 0, bool recheckOnFinish = false, ui64 maxInFlight = 1, const std::vector<TNodeId>& filterNodeIds = {}); void StartHiveDrain(TNodeId nodeId, TDrainSettings settings); void StartHiveFill(TNodeId nodeId, const TActorId& initiator); void CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx); @@ -665,6 +665,10 @@ public: return CurrentConfig.GetBalancerInflight(); } + ui64 GetEmergencyBalancerInflight() const { + return CurrentConfig.GetEmergencyBalancerInflight(); + } + ui64 GetMaxBootBatchSize() const { return CurrentConfig.GetMaxBootBatchSize(); } diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index 6283822441a..884c0e1a754 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -3728,13 +3728,14 @@ Y_UNIT_TEST_SUITE(THiveTest) { TTestBasicRuntime runtime(NUM_NODES, false); Setup(runtime, true, 1, [](TAppPrepare& app) { - app.HiveConfig.SetMaxMovementsOnAutoBalancer(100); + app.HiveConfig.SetMaxMovementsOnEmergencyBalancer(100); app.HiveConfig.SetMinPeriodBetweenBalance(0.1); app.HiveConfig.SetTabletKickCooldownPeriod(0); app.HiveConfig.SetResourceChangeReactionPeriod(0); // this value of MaxNodeUsageToKick is selected specifically to make test scenario work // in link with number of tablets and values of network usage metrics used below app.HiveConfig.SetMaxNodeUsageToKick(0.01); + app.HiveConfig.SetEmergencyBalancerInflight(1); // to ensure fair distribution }); TActorId senderA = runtime.AllocateEdgeActor(); diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index 944e1667f98..74d53201363 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -782,12 +782,15 @@ public: UpdateConfig(db, "MaxRequestSequenceSize", TSchemeIds::State::MaxRequestSequenceSize); UpdateConfig(db, "MetricsWindowSize", TSchemeIds::State::MetricsWindowSize); UpdateConfig(db, "ResourceOvercommitment", TSchemeIds::State::ResourceOvercommitment); - UpdateConfig(db, "BalancerInflight"); UpdateConfig(db, "MinPeriodBetweenBalance"); UpdateConfig(db, "NodeBalanceStrategy"); UpdateConfig(db, "TabletBalanceStrategy"); + UpdateConfig(db, "BalancerInflight"); UpdateConfig(db, "MaxMovementsOnAutoBalancer"); UpdateConfig(db, "ContinueAutoBalancer"); + UpdateConfig(db, "EmergencyBalancerInflight"); + UpdateConfig(db, "MaxMovementsOnEmergencyBalancer"); + UpdateConfig(db, "ContinueEmergencyBalancer"); UpdateConfig(db, "MinNodeUsageToBalance"); UpdateConfig(db, "MinPeriodBetweenReassign"); UpdateConfig(db, "NodeSelectStrategy"); @@ -1061,10 +1064,13 @@ public: ShowConfig(out, "ResourceOvercommitment"); ShowConfig(out, "NodeBalanceStrategy"); ShowConfig(out, "TabletBalanceStrategy"); - ShowConfig(out, "BalancerInflight"); ShowConfig(out, "MinPeriodBetweenBalance"); + ShowConfig(out, "BalancerInflight"); ShowConfig(out, "MaxMovementsOnAutoBalancer"); ShowConfig(out, "ContinueAutoBalancer"); + ShowConfig(out, "EmergencyBalancerInflight"); + ShowConfig(out, "MaxMovementsOnEmergencyBalancer"); + ShowConfig(out, "ContinueEmergencyBalancer"); ShowConfig(out, "CheckMoveExpediency"); ShowConfig(out, "SpaceUsagePenaltyThreshold"); ShowConfig(out, "SpaceUsagePenalty"); diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index c286c0d4d5c..9905bfe8ac5 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1591,6 +1591,9 @@ message THiveConfig { optional uint64 NodeRestartsToIgnoreInWarmup = 51 [default = 10]; optional double MaxWarmUpPeriod = 52 [default = 30.0]; // seconds optional bool WarmUpEnabled = 55 [default = false]; + optional uint64 EmergencyBalancerInflight = 56 [default = 5]; // tablets + optional uint64 MaxMovementsOnEmergencyBalancer = 57 [default = 1000]; + optional bool ContinueEmergencyBalancer = 58 [default = true]; } message TDataShardConfig { |