diff options
author | vporyadke <zalyalov@ydb.tech> | 2024-01-09 18:25:16 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-09 16:25:16 +0100 |
commit | 8c1f7ef7a4091b7bb300ee3030ec339b6cf77a6e (patch) | |
tree | c20e74ec3caf2e5fa331650ece0ca5b7483e3ca9 | |
parent | eee3fb563609cf04464365724d4badf54b84c771 (diff) | |
download | ydb-8c1f7ef7a4091b7bb300ee3030ec339b6cf77a6e.tar.gz |
Storage Balancer KIKIMR-20636 (#770)
-rw-r--r-- | ydb/core/mind/hive/balancer.h | 4 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive.cpp | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive.h | 9 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_events.h | 14 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.cpp | 9 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 12 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_ut.cpp | 90 | ||||
-rw-r--r-- | ydb/core/mind/hive/leader_tablet_info.cpp | 6 | ||||
-rw-r--r-- | ydb/core/mind/hive/leader_tablet_info.h | 27 | ||||
-rw-r--r-- | ydb/core/mind/hive/monitoring.cpp | 32 | ||||
-rw-r--r-- | ydb/core/mind/hive/storage_balancer.cpp | 193 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__update_tablet_groups.cpp | 6 | ||||
-rw-r--r-- | ydb/core/mind/hive/ya.make | 1 | ||||
-rw-r--r-- | ydb/core/protos/config.proto | 8 | ||||
-rw-r--r-- | ydb/core/protos/counters_hive.proto | 1 | ||||
-rw-r--r-- | ydb/core/protos/hive.proto | 1 |
16 files changed, 412 insertions, 2 deletions
diff --git a/ydb/core/mind/hive/balancer.h b/ydb/core/mind/hive/balancer.h index 267d827fa1..62289c4f24 100644 --- a/ydb/core/mind/hive/balancer.h +++ b/ydb/core/mind/hive/balancer.h @@ -1,6 +1,7 @@ #pragma once #include "hive_impl.h" +#include "leader_tablet_info.h" namespace NKikimr { namespace NHive { @@ -11,5 +12,8 @@ void BalanceNodes(std::vector<TNodeInfo*>& nodes, EResourceToBalance resourceTob template<NKikimrConfig::THiveConfig::EHiveTabletBalanceStrategy EHiveTabletBalanceStrategy> void BalanceTablets(std::vector<TTabletInfo*>& tablets, EResourceToBalance resourceToBalance); +template <NKikimrConfig::THiveConfig::EHiveChannelBalanceStrategy> +void BalanceChannels(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance); + } } diff --git a/ydb/core/mind/hive/hive.cpp b/ydb/core/mind/hive/hive.cpp index 803704e162..cd0798d471 100644 --- a/ydb/core/mind/hive/hive.cpp +++ b/ydb/core/mind/hive/hive.cpp @@ -41,6 +41,7 @@ TString EBalancerTypeName(EBalancerType value) { case EBalancerType::Emergency: return "Emergency"; case EBalancerType::SpreadNeighbours: return "Spread"; case EBalancerType::Manual: return "Manual"; + case EBalancerType::Storage: return "Storage"; } } diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index 860f1124f0..66034f4b0a 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -85,8 +85,9 @@ enum class EBalancerType { ScatterNetwork, Emergency, SpreadNeighbours, + Storage, - Last = SpreadNeighbours, + Last = Storage, }; constexpr std::size_t EBalancerTypeSize = static_cast<std::size_t>(EBalancerType::Last) + 1; @@ -261,6 +262,12 @@ struct TBalancerSettings { std::optional<TFullObjectId> FilterObjectId; }; +struct TStorageBalancerSettings { + ui64 NumReassigns; + ui64 MaxInFlight; + TString StoragePool; +}; + struct TBalancerStats { ui64 TotalRuns = 0; ui64 TotalMovements = 0; diff --git a/ydb/core/mind/hive/hive_events.h b/ydb/core/mind/hive/hive_events.h index c6fa33e057..875bf731be 100644 --- a/ydb/core/mind/hive/hive_events.h +++ b/ydb/core/mind/hive/hive_events.h @@ -27,6 +27,8 @@ struct TEvPrivate { EvProcessIncomingEvent, EvRefreshStorageInfo, EvLogTabletMoves, + EvStartStorageBalancer, + EvRestartCancelled, EvEnd }; @@ -90,6 +92,18 @@ struct TEvPrivate { struct TEvRefreshStorageInfo : TEventLocal<TEvRefreshStorageInfo, EvRefreshStorageInfo> {}; struct TEvLogTabletMoves : TEventLocal<TEvLogTabletMoves, EvLogTabletMoves> {}; + + struct TEvStartStorageBalancer : TEventLocal<TEvStartStorageBalancer, EvStartStorageBalancer> { + TStorageBalancerSettings Settings; + + TEvStartStorageBalancer(TStorageBalancerSettings settings) : Settings(settings) {} + }; + + struct TEvRestartCancelled : TEventLocal<TEvRestartCancelled, EvRestartCancelled> { + TFullTabletId TabletId; + + TEvRestartCancelled(TFullTabletId tabletId) : TabletId(tabletId) {} + }; }; } // NHive diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index b2fc1bf6c9..9ec9320a4c 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -350,6 +350,12 @@ void THive::Handle(TEvPrivate::TEvBalancerOut::TPtr&) { BLOG_D("Handle BalancerOut"); } + +void THive::Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev) { + BLOG_D("Handle StartStorageBalancer"); + StartHiveStorageBalancer(std::move(ev->Get()->Settings)); +} + void THive::Handle(TEvHive::TEvBootTablet::TPtr& ev) { TTabletId tabletId = ev->Get()->Record.GetTabletID(); TTabletInfo* tablet = FindTablet(tabletId); @@ -2650,6 +2656,7 @@ TDuration THive::GetBalancerCooldown() const { case EBalancerType::ScatterMemory: case EBalancerType::ScatterNetwork: case EBalancerType::SpreadNeighbours: + case EBalancerType::Storage: return GetMinPeriodBetweenBalance(); case EBalancerType::Emergency: return GetMinPeriodBetweenEmergencyBalance(); @@ -2860,6 +2867,7 @@ void THive::ProcessEvent(std::unique_ptr<IEventHandle> event) { hFunc(TEvHive::TEvUpdateTabletsObject, Handle); hFunc(TEvPrivate::TEvRefreshStorageInfo, Handle); hFunc(TEvPrivate::TEvLogTabletMoves, Handle); + hFunc(TEvPrivate::TEvStartStorageBalancer, Handle); hFunc(TEvHive::TEvUpdateDomain, Handle); } } @@ -2958,6 +2966,7 @@ STFUNC(THive::StateWork) { fFunc(TEvHive::TEvUpdateTabletsObject::EventType, EnqueueIncomingEvent); fFunc(TEvPrivate::TEvRefreshStorageInfo::EventType, EnqueueIncomingEvent); fFunc(TEvPrivate::TEvLogTabletMoves::EventType, EnqueueIncomingEvent); + fFunc(TEvPrivate::TEvStartStorageBalancer::EventType, EnqueueIncomingEvent); fFunc(TEvHive::TEvUpdateDomain::EventType, EnqueueIncomingEvent); hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle); default: diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index f1408d78cd..e2894adb5f 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -169,6 +169,7 @@ protected: friend class TQueryMigrationWaitActor; friend class TReleaseTabletsWaitActor; friend class TDrainNodeWaitActor; + friend class THiveStorageBalancer;; friend struct TNodeInfo; friend class TTxInitScheme; @@ -204,6 +205,7 @@ protected: friend class TTxMonEvent_QueryMigration; friend class TTxMonEvent_RebalanceFromScratch; friend class TTxMonEvent_ObjectStats; + friend class TTxMonEvent_StorageRebalance; friend class TTxKillNode; friend class TTxLoadEverything; friend class TTxRestartTablet; @@ -239,6 +241,7 @@ protected: void StartHiveBalancer(TBalancerSettings&& settings); void StartHiveDrain(TNodeId nodeId, TDrainSettings settings); void StartHiveFill(TNodeId nodeId, const TActorId& initiator); + void StartHiveStorageBalancer(TStorageBalancerSettings settings); void CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx); NJson::TJsonValue GetBalancerProgressJson(); ITransaction* CreateDeleteTablet(TEvHive::TEvDeleteTablet::TPtr& ev); @@ -551,6 +554,7 @@ protected: void Handle(TEvHive::TEvUpdateTabletsObject::TPtr& ev); void Handle(TEvPrivate::TEvRefreshStorageInfo::TPtr& ev); void Handle(TEvPrivate::TEvLogTabletMoves::TPtr& ev); + void Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev); void Handle(TEvPrivate::TEvProcessIncomingEvent::TPtr& ev); void Handle(TEvHive::TEvUpdateDomain::TPtr& ev); @@ -901,6 +905,14 @@ public: return CurrentConfig.GetBootStrategy(); } + NKikimrConfig::THiveConfig::EHiveChannelBalanceStrategy GetChannelBalanceStrategy() const { + return CurrentConfig.GetChannelBalanceStrategy(); + } + + ui64 GetMaxChannelHistorySize() const { + return CurrentConfig.GetMaxChannelHistorySize(); + } + static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier); static bool IsSystemTablet(TTabletTypes::EType type); diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index 67c5ed29a7..984674a398 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -2780,6 +2780,96 @@ Y_UNIT_TEST_SUITE(THiveTest) { UNIT_ASSERT_VALUES_EQUAL(getGroup(tabletId), goodGroup); } + Y_UNIT_TEST(TestStorageBalancer) { + static constexpr ui64 NUM_TABLETS = 4; + TTestBasicRuntime runtime(1, false); + Setup(runtime, true, 2, [](TAppPrepare& app) { + app.HiveConfig.SetMinPeriodBetweenReassign(0); + }); + const ui64 hiveTablet = MakeDefaultHiveID(0); + const ui64 testerTablet = MakeDefaultHiveID(1); + CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive); + + TTabletTypes::EType tabletType = TTabletTypes::Dummy; + TVector<ui64> tablets; + for (ui64 i = 0; i < NUM_TABLETS; ++i) { + THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS)); + ev->Record.SetObjectId(i); + ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true); + tablets.emplace_back(tabletId); + MakeSureTabletIsUp(runtime, tabletId, 0); + } + ui64 tabletBase = tablets.front(); + + TActorId sender = runtime.AllocateEdgeActor(); + auto getGroup = [&runtime, sender, hiveTablet](ui64 tabletId) { + runtime.SendToPipe(hiveTablet, sender, new TEvHive::TEvRequestHiveInfo({ + .TabletId = tabletId, + .ReturnChannelHistory = true, + })); + TAutoPtr<IEventHandle> handle; + TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle); + + const auto& tablet = response->Record.GetTablets().Get(0); + const auto& channel = tablet.GetTabletChannels().Get(0); + const auto& history = channel.GetHistory(); + return history.Get(history.size() - 1).GetGroup(); + }; + + std::unordered_map<ui64, std::vector<ui64>> groupToTablets; + for (auto tablet : tablets) { + groupToTablets[getGroup(tablet)].push_back(tablet); + } + ui64 tabletA; + ui64 tabletB; + for (const auto& [group, tablets] : groupToTablets) { + if (tablets.size() >= 2) { + tabletA = tablets[0]; + tabletB = tablets[1]; + } + } + + // If assured space is not set, usage is always set to 1 + auto groupMetricsExchange = MakeHolder<TEvBlobStorage::TEvControllerGroupMetricsExchange>(); + for (const auto& [group, tablets] : groupToTablets) { + NKikimrBlobStorage::TGroupMetrics* metrics = groupMetricsExchange->Record.AddGroupMetrics(); + + metrics->SetGroupId(group); + metrics->MutableGroupParameters()->SetGroupID(group); + metrics->MutableGroupParameters()->SetStoragePoolName("def1"); + metrics->MutableGroupParameters()->MutableAssuredResources()->SetSpace(300'000'000); + } + + runtime.SendToPipe(MakeBSControllerID(0), sender, groupMetricsExchange.Release(), 0, GetPipeConfigWithRetries()); + { + TDispatchOptions options; + options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerGroupMetricsExchange)); + runtime.DispatchEvents(options); + } + + TChannelsBindings channels = BINDED_CHANNELS; + for (auto& bind : channels) { + bind.SetSize(200'000'000); + } + for (auto tablet : {tabletA, tabletB}) { + TAutoPtr<TEvHive::TEvCreateTablet> updateTablet(new TEvHive::TEvCreateTablet(testerTablet, 100500 + (tablet - tabletBase), tabletType, channels)); + SendCreateTestTablet(runtime, hiveTablet, testerTablet, updateTablet, 0, true); + } + runtime.SendToPipe(hiveTablet, sender, new NHive::TEvPrivate::TEvStartStorageBalancer({ + .NumReassigns = 100, + .MaxInFlight = 1, + .StoragePool = "def1", + })); + + { + TDispatchOptions options; + options.FinalEvents.emplace_back(NHive::TEvPrivate::EvRestartComplete, 4); // should actually be less than 4 + runtime.DispatchEvents(options, TDuration::Seconds(10)); + } + + UNIT_ASSERT_VALUES_UNEQUAL(getGroup(tabletA), getGroup(tabletB)); + } + // Y_UNIT_TEST(TestCreateTabletAndChangeProfiles) { // TTestBasicRuntime runtime(1, false); // Setup(runtime, true); diff --git a/ydb/core/mind/hive/leader_tablet_info.cpp b/ydb/core/mind/hive/leader_tablet_info.cpp index 8a53d228c1..43a6e260eb 100644 --- a/ydb/core/mind/hive/leader_tablet_info.cpp +++ b/ydb/core/mind/hive/leader_tablet_info.cpp @@ -261,6 +261,12 @@ const NKikimrBlobStorage::TEvControllerSelectGroupsResult::TGroupParameters* TLe }); break; } + case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE: { + return storagePool->FindFreeAllocationUnit([¶ms](const TStorageGroupInfo& newGroup) -> bool { + return newGroup.IsMatchesParameters(*params); + }); + break; + } case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_SPACE: { NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy balanceStrategy = Hive.CurrentConfig.GetStorageBalanceStrategy(); Hive.CurrentConfig.SetStorageBalanceStrategy(NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_SIZE); diff --git a/ydb/core/mind/hive/leader_tablet_info.h b/ydb/core/mind/hive/leader_tablet_info.h index 77a2b1c2a8..dba2b570d5 100644 --- a/ydb/core/mind/hive/leader_tablet_info.h +++ b/ydb/core/mind/hive/leader_tablet_info.h @@ -26,6 +26,25 @@ protected: static TString DEFAULT_STORAGE_POOL_NAME; public: + struct TChannel { + TTabletId TabletId; + ui32 ChannelId; + const TChannelBind* ChannelInfo; + + double GetWeight(NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) const { + Y_DEBUG_ABORT_UNLESS(ChannelInfo); + switch (metricToBalance) { + case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_IOPS: + return ChannelInfo->GetIOPS(); + case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_THROUGHPUT: + return ChannelInfo->GetThroughput(); + default: + case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_SIZE: + return ChannelInfo->GetSize(); + } + } + }; + TTabletId Id; ETabletState State; TTabletTypes::EType Type; @@ -298,6 +317,14 @@ public: return BoundChannels.size(); } + TChannel GetChannel(ui32 channelId) const { + TChannel channel{.TabletId = Id, .ChannelId = channelId, .ChannelInfo = nullptr}; + if (channelId < BoundChannels.size()) { + channel.ChannelInfo = &BoundChannels[channelId]; + } + return channel; + } + void AcquireAllocationUnits(); void ReleaseAllocationUnits(); bool AcquireAllocationUnit(ui32 channelId); diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index 949809d3c2..36de3b6aa8 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -1424,7 +1424,8 @@ public: EBalancerType::Emergency, EBalancerType::SpreadNeighbours, EBalancerType::Scatter, - EBalancerType::Manual + EBalancerType::Manual, + EBalancerType::Storage, }) { int balancer = static_cast<int>(type); out << "<tr id='balancer" << balancer << "'><td>" << EBalancerTypeName(type) << "</td><td></td><td></td><td></td><td></td><td></td></tr>"; @@ -2505,6 +2506,32 @@ public: } }; +class TTxMonEvent_StorageRebalance : public TTransactionBase<THive> { +public: + const TActorId Source; + TStorageBalancerSettings Settings; + + TTxMonEvent_StorageRebalance(const TActorId& source, NMon::TEvRemoteHttpInfo::TPtr& ev, TSelf* hive) + : TBase(hive) + , Source(source) + { + Settings.NumReassigns = FromStringWithDefault(ev->Get()->Cgi().Get("reassigns"), 1000); + Settings.MaxInFlight = FromStringWithDefault(ev->Get()->Cgi().Get("inflight"), 1); + Settings.StoragePool = ev->Get()->Cgi().Get("pool"); + } + + TTxType GetTxType() const override { return NHive::TXTYPE_MON_REBALANCE; } + + bool Execute(TTransactionContext&, const TActorContext&) override { + Self->StartHiveStorageBalancer(Settings); + return true; + } + + void Complete(const TActorContext& ctx) override { + ctx.Send(Source, new NMon::TEvRemoteJsonInfoRes("{}")); + } +}; + class TTxMonEvent_RebalanceFromScratch : public TTransactionBase<THive> { public: const TActorId Source; @@ -4072,6 +4099,9 @@ void THive::CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorCo if (page == "Storage") { return Execute(new TTxMonEvent_Storage(ev->Sender, ev, this), ctx); } + if (page == "StorageRebalance") { + return Execute(new TTxMonEvent_StorageRebalance(ev->Sender, ev, this), ctx); + } return Execute(new TTxMonEvent_Landing(ev->Sender, ev, this), ctx); } diff --git a/ydb/core/mind/hive/storage_balancer.cpp b/ydb/core/mind/hive/storage_balancer.cpp new file mode 100644 index 0000000000..eb8528b546 --- /dev/null +++ b/ydb/core/mind/hive/storage_balancer.cpp @@ -0,0 +1,193 @@ +#include <ydb/library/actors/core/actor_bootstrapped.h> +#include "hive_impl.h" +#include "hive_log.h" +#include "balancer.h" + +namespace NKikimr { +namespace NHive { + +template<> +void BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM>(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) { + auto& randGen = *TAppData::RandomProvider.Get(); + std::vector<double> weights; + std::vector<size_t> order; + weights.reserve(channels.size()); + order.reserve(channels.size()); + for (size_t i = 0; i < channels.size(); ++i) { + double weight = channels[i].GetWeight(metricToBalance); + weights.push_back(weight * randGen()); + order.push_back(i); + } + std::sort(order.begin(), order.end(), [&weights](size_t i, size_t j) -> bool { + return weights[i] > weights[j]; + }); + std::vector<TLeaderTabletInfo::TChannel> result; + result.reserve(channels.size()); + for (size_t i : order) { + result.push_back(channels[i]); + } + result.swap(channels); +} + +template<> +void BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST>(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) { + std::sort(channels.begin(), channels.end(), [metricToBalance](const TLeaderTabletInfo::TChannel& a, const TLeaderTabletInfo::TChannel& b) -> bool { + return a.GetWeight(metricToBalance) > b.GetWeight(metricToBalance); + }); +} + +template<> +void BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM>(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy) { + auto& randGen = *TAppData::RandomProvider.Get(); + std::shuffle(channels.begin(), channels.end(), randGen); +} + +class THiveStorageBalancer : public NActors::TActorBootstrapped<THiveStorageBalancer>, public ISubActor { +protected: + static constexpr TDuration TIMEOUT = TDuration::Minutes(10); + THive* Hive; + TStorageBalancerSettings Settings; + using TOperations = std::unordered_map<TTabletId, std::unique_ptr<TEvHive::TEvReassignTablet>>; + TOperations Operations; + TOperations::iterator NextReassign; + ui64 ReassignInFlight = 0; + ui64 Reassigns = 0; + std::unordered_set<TFullTabletId> SkippedTablets; + TBalancerStats& Stats; + + TString GetLogPrefix() const { + return Hive->GetLogPrefix(); + } + + void PassAway() override { + BLOG_I("StorageBalancer finished"); + Stats.TotalRuns++; + Stats.TotalMovements += Reassigns; + Stats.LastRunMovements = Reassigns; + Stats.IsRunningNow = false; + Hive->RemoveSubActor(this); + return IActor::PassAway(); + } + + void Cleanup() override { + return PassAway(); + } + + void ReassignNextTablet() { + while (NextReassign != Operations.end() && ReassignInFlight < Settings.MaxInFlight) { + auto tablet = Hive->FindTablet(NextReassign->first); + if (!tablet) { + continue; + } + tablet->ActorsToNotifyOnRestart.emplace_back(SelfId()); + BLOG_D("StorageBalancer initiating reassign for tablet " << NextReassign->first); + Send(Hive->SelfId(), NextReassign->second.release()); + ++NextReassign; + ++ReassignInFlight; + } + if (ReassignInFlight == 0) { + return PassAway(); + } + } + + void Handle(TEvPrivate::TEvRestartComplete::TPtr& ev) { + auto tabletId = ev->Get()->TabletId; + BLOG_D("StorageBalancer received " << ev->Get()->Status << " for tablet " << tabletId); + if (SkippedTablets.contains(tabletId)) { + return; + } + --ReassignInFlight; + Stats.CurrentMovements = ++Reassigns; + ReassignNextTablet(); + } + + void Handle(TEvPrivate::TEvRestartCancelled::TPtr& ev) { + auto tabletId = ev->Get()->TabletId; + BLOG_D("StorageBalancer received RestartCancelled for tablet " << tabletId); + SkippedTablets.insert(tabletId); + auto tablet = Hive->FindTablet(tabletId); + if (tablet) { + std::erase(tablet->ActorsToNotifyOnRestart, SelfId()); + } + --ReassignInFlight; + ReassignNextTablet(); + } + +public: + static constexpr NKikimrServices::TActivity::EType ActorActivityType() { + return NKikimrServices::TActivity::HIVE_BALANCER_ACTOR; + } + + THiveStorageBalancer(THive* hive, TStorageBalancerSettings&& settings) + : Hive(hive) + , Settings(settings) + , Stats(Hive->BalancerStats[static_cast<size_t>(EBalancerType::Storage)]) + { + Stats.IsRunningNow = true; + Stats.CurrentMaxMovements = Settings.NumReassigns; + Stats.CurrentMovements = 0; + Stats.LastRunTimestamp = TActivationContext::Now(); + } + + void Bootstrap() { + Hive->TabletCounters->Cumulative()[NHive::COUNTER_STORAGE_BALANCER_EXECUTED].Increment(1); + Become(&TThis::StateWork, TIMEOUT, new TEvents::TEvPoison()); + TInstant now = TActivationContext::Now(); + std::vector<TLeaderTabletInfo::TChannel> channels; + for (const auto& [tabletId, tablet] : Hive->Tablets) { + if (!tablet.IsReadyToReassignTablet()) { + continue; + } + for (ui32 channelId = 0; channelId < tablet.GetChannelCount(); ++channelId) { + const auto* channelInfo = tablet.TabletStorageInfo->ChannelInfo(channelId); + + if (channelInfo + && (Settings.StoragePool.empty() || channelInfo->StoragePool == Settings.StoragePool) + && channelInfo->History.back().Timestamp + Hive->GetMinPeriodBetweenReassign() <= now + && channelInfo->History.size() < Hive->GetMaxChannelHistorySize()) { + channels.push_back(tablet.GetChannel(channelId)); + } + } + } + BLOG_D("StorageBalancer for pool " << Settings.StoragePool << ": " << channels.size() << " tablet channels suitable for balancing"); + auto metricToBalance = Hive->GetStorageBalanceStrategy(); + switch (Hive->GetChannelBalanceStrategy()) { + case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM: + BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM>(channels, metricToBalance); + case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST: + BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST>(channels, metricToBalance); + case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM: + BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM>(channels, metricToBalance); + } + for (size_t i = 0; i < channels.size() && Operations.size() < Settings.NumReassigns; ++i) { + const auto& channel = channels[i]; + auto& ev = Operations[channel.TabletId]; + if (!ev) { + ev = std::make_unique<TEvHive::TEvReassignTablet>(channel.TabletId); + ev->Record.SetReassignReason(NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE); + } + ev->Record.AddChannels(channel.ChannelId); + } + NextReassign = Operations.begin(); + ReassignNextTablet(); + } + + STATEFN(StateWork) { + switch(ev->GetTypeRewrite()) { + cFunc(TEvents::TSystem::PoisonPill, PassAway); + hFunc(TEvPrivate::TEvRestartComplete, Handle); + hFunc(TEvPrivate::TEvRestartCancelled, Handle); + } + } +}; + +void THive::StartHiveStorageBalancer(TStorageBalancerSettings settings) { + if (IsItPossibleToStartBalancer(EBalancerType::Storage)) { + auto* balancer = new THiveStorageBalancer(this, std::move(settings)); + SubActors.emplace_back(balancer); + RegisterWithSameMailbox(balancer); + } +} + +} // NHive +} // NKikimr diff --git a/ydb/core/mind/hive/tx__update_tablet_groups.cpp b/ydb/core/mind/hive/tx__update_tablet_groups.cpp index 4396b448aa..be2900c19f 100644 --- a/ydb/core/mind/hive/tx__update_tablet_groups.cpp +++ b/ydb/core/mind/hive/tx__update_tablet_groups.cpp @@ -160,6 +160,9 @@ public: ui32 fromGeneration; if (channel->History.empty()) { fromGeneration = 0; + } else if (channel->History.back().GroupID == group->GetGroupID()) { + // We decided to keep the group the same + continue; } else { needToIncreaseGeneration = true; fromGeneration = tablet->KnownGeneration + 1; @@ -250,6 +253,9 @@ public: tablet->ChannelProfileNewGroup.reset(channelId); } } + for (const TActorId& actor : tablet->ActorsToNotifyOnRestart) { + SideEffects.Send(actor, new TEvPrivate::TEvRestartCancelled(tablet->GetFullTabletId())); + } newTabletState = ETabletState::ReadyToWork; } } diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make index 57cc970768..4580b49092 100644 --- a/ydb/core/mind/hive/ya.make +++ b/ydb/core/mind/hive/ya.make @@ -33,6 +33,7 @@ SRCS( follower_group.h follower_tablet_info.cpp follower_tablet_info.h + storage_balancer.cpp storage_group_info.cpp storage_group_info.h storage_pool_info.cpp diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index 91b546f145..36b5743a93 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1323,6 +1323,12 @@ message THiveConfig { HIVE_TABLET_BALANCE_STRATEGY_WEIGHTED_RANDOM = 3; } + enum EHiveChannelBalanceStrategy { + HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST = 1; + HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM = 2; + HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM = 3; + } + enum EHiveNodeSelectStrategy { HIVE_NODE_SELECT_STRATEGY_WEIGHTED_RANDOM = 0; HIVE_NODE_SELECT_STRATEGY_EXACT_MIN = 1; @@ -1400,6 +1406,8 @@ message THiveConfig { optional double MinCounterScatterToBalance = 65 [default = 0.02]; reserved 66; optional double ObjectImbalanceToBalance = 67 [default = 0.02]; + optional EHiveChannelBalanceStrategy ChannelBalanceStrategy = 68 [default = HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM]; + optional uint64 MaxChannelHistorySize = 69 [default = 200]; } message TColumnShardConfig { diff --git a/ydb/core/protos/counters_hive.proto b/ydb/core/protos/counters_hive.proto index c2d984c94b..dff2855cbc 100644 --- a/ydb/core/protos/counters_hive.proto +++ b/ydb/core/protos/counters_hive.proto @@ -43,6 +43,7 @@ enum ECumulativeCounters { COUNTER_TABLETS_MOVED = 9 [(CounterOpts) = {Name: "TabletsMoved"}]; COUNTER_SUGGESTED_SCALE_UP = 10 [(CounterOpts) = {Name: "SuggestedScaleUp"}]; COUNTER_SUGGESTED_SCALE_DOWN = 11 [(CounterOpts) = {Name: "SuggestedScaleDown"}]; + COUNTER_STORAGE_BALANCER_EXECUTED = 12 [(CounterOpts) = {Name: "StorageBalancerExecuted"}]; } enum EPercentileCounters { diff --git a/ydb/core/protos/hive.proto b/ydb/core/protos/hive.proto index 575e0154d4..b7fb1324a6 100644 --- a/ydb/core/protos/hive.proto +++ b/ydb/core/protos/hive.proto @@ -246,6 +246,7 @@ message TEvReassignTablet { enum EHiveReassignReason { HIVE_REASSIGN_REASON_NO = 0; HIVE_REASSIGN_REASON_SPACE = 1; + HIVE_REASSIGN_REASON_BALANCE = 2; // internal to Hive } optional fixed64 TabletID = 1; |