aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorvporyadke <zalyalov@ydb.tech>2024-01-09 18:25:16 +0300
committerGitHub <noreply@github.com>2024-01-09 16:25:16 +0100
commit8c1f7ef7a4091b7bb300ee3030ec339b6cf77a6e (patch)
treec20e74ec3caf2e5fa331650ece0ca5b7483e3ca9
parenteee3fb563609cf04464365724d4badf54b84c771 (diff)
downloadydb-8c1f7ef7a4091b7bb300ee3030ec339b6cf77a6e.tar.gz
Storage Balancer KIKIMR-20636 (#770)
-rw-r--r--ydb/core/mind/hive/balancer.h4
-rw-r--r--ydb/core/mind/hive/hive.cpp1
-rw-r--r--ydb/core/mind/hive/hive.h9
-rw-r--r--ydb/core/mind/hive/hive_events.h14
-rw-r--r--ydb/core/mind/hive/hive_impl.cpp9
-rw-r--r--ydb/core/mind/hive/hive_impl.h12
-rw-r--r--ydb/core/mind/hive/hive_ut.cpp90
-rw-r--r--ydb/core/mind/hive/leader_tablet_info.cpp6
-rw-r--r--ydb/core/mind/hive/leader_tablet_info.h27
-rw-r--r--ydb/core/mind/hive/monitoring.cpp32
-rw-r--r--ydb/core/mind/hive/storage_balancer.cpp193
-rw-r--r--ydb/core/mind/hive/tx__update_tablet_groups.cpp6
-rw-r--r--ydb/core/mind/hive/ya.make1
-rw-r--r--ydb/core/protos/config.proto8
-rw-r--r--ydb/core/protos/counters_hive.proto1
-rw-r--r--ydb/core/protos/hive.proto1
16 files changed, 412 insertions, 2 deletions
diff --git a/ydb/core/mind/hive/balancer.h b/ydb/core/mind/hive/balancer.h
index 267d827fa1..62289c4f24 100644
--- a/ydb/core/mind/hive/balancer.h
+++ b/ydb/core/mind/hive/balancer.h
@@ -1,6 +1,7 @@
#pragma once
#include "hive_impl.h"
+#include "leader_tablet_info.h"
namespace NKikimr {
namespace NHive {
@@ -11,5 +12,8 @@ void BalanceNodes(std::vector<TNodeInfo*>& nodes, EResourceToBalance resourceTob
template<NKikimrConfig::THiveConfig::EHiveTabletBalanceStrategy EHiveTabletBalanceStrategy>
void BalanceTablets(std::vector<TTabletInfo*>& tablets, EResourceToBalance resourceToBalance);
+template <NKikimrConfig::THiveConfig::EHiveChannelBalanceStrategy>
+void BalanceChannels(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance);
+
}
}
diff --git a/ydb/core/mind/hive/hive.cpp b/ydb/core/mind/hive/hive.cpp
index 803704e162..cd0798d471 100644
--- a/ydb/core/mind/hive/hive.cpp
+++ b/ydb/core/mind/hive/hive.cpp
@@ -41,6 +41,7 @@ TString EBalancerTypeName(EBalancerType value) {
case EBalancerType::Emergency: return "Emergency";
case EBalancerType::SpreadNeighbours: return "Spread";
case EBalancerType::Manual: return "Manual";
+ case EBalancerType::Storage: return "Storage";
}
}
diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h
index 860f1124f0..66034f4b0a 100644
--- a/ydb/core/mind/hive/hive.h
+++ b/ydb/core/mind/hive/hive.h
@@ -85,8 +85,9 @@ enum class EBalancerType {
ScatterNetwork,
Emergency,
SpreadNeighbours,
+ Storage,
- Last = SpreadNeighbours,
+ Last = Storage,
};
constexpr std::size_t EBalancerTypeSize = static_cast<std::size_t>(EBalancerType::Last) + 1;
@@ -261,6 +262,12 @@ struct TBalancerSettings {
std::optional<TFullObjectId> FilterObjectId;
};
+struct TStorageBalancerSettings {
+ ui64 NumReassigns;
+ ui64 MaxInFlight;
+ TString StoragePool;
+};
+
struct TBalancerStats {
ui64 TotalRuns = 0;
ui64 TotalMovements = 0;
diff --git a/ydb/core/mind/hive/hive_events.h b/ydb/core/mind/hive/hive_events.h
index c6fa33e057..875bf731be 100644
--- a/ydb/core/mind/hive/hive_events.h
+++ b/ydb/core/mind/hive/hive_events.h
@@ -27,6 +27,8 @@ struct TEvPrivate {
EvProcessIncomingEvent,
EvRefreshStorageInfo,
EvLogTabletMoves,
+ EvStartStorageBalancer,
+ EvRestartCancelled,
EvEnd
};
@@ -90,6 +92,18 @@ struct TEvPrivate {
struct TEvRefreshStorageInfo : TEventLocal<TEvRefreshStorageInfo, EvRefreshStorageInfo> {};
struct TEvLogTabletMoves : TEventLocal<TEvLogTabletMoves, EvLogTabletMoves> {};
+
+ struct TEvStartStorageBalancer : TEventLocal<TEvStartStorageBalancer, EvStartStorageBalancer> {
+ TStorageBalancerSettings Settings;
+
+ TEvStartStorageBalancer(TStorageBalancerSettings settings) : Settings(settings) {}
+ };
+
+ struct TEvRestartCancelled : TEventLocal<TEvRestartCancelled, EvRestartCancelled> {
+ TFullTabletId TabletId;
+
+ TEvRestartCancelled(TFullTabletId tabletId) : TabletId(tabletId) {}
+ };
};
} // NHive
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp
index b2fc1bf6c9..9ec9320a4c 100644
--- a/ydb/core/mind/hive/hive_impl.cpp
+++ b/ydb/core/mind/hive/hive_impl.cpp
@@ -350,6 +350,12 @@ void THive::Handle(TEvPrivate::TEvBalancerOut::TPtr&) {
BLOG_D("Handle BalancerOut");
}
+
+void THive::Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev) {
+ BLOG_D("Handle StartStorageBalancer");
+ StartHiveStorageBalancer(std::move(ev->Get()->Settings));
+}
+
void THive::Handle(TEvHive::TEvBootTablet::TPtr& ev) {
TTabletId tabletId = ev->Get()->Record.GetTabletID();
TTabletInfo* tablet = FindTablet(tabletId);
@@ -2650,6 +2656,7 @@ TDuration THive::GetBalancerCooldown() const {
case EBalancerType::ScatterMemory:
case EBalancerType::ScatterNetwork:
case EBalancerType::SpreadNeighbours:
+ case EBalancerType::Storage:
return GetMinPeriodBetweenBalance();
case EBalancerType::Emergency:
return GetMinPeriodBetweenEmergencyBalance();
@@ -2860,6 +2867,7 @@ void THive::ProcessEvent(std::unique_ptr<IEventHandle> event) {
hFunc(TEvHive::TEvUpdateTabletsObject, Handle);
hFunc(TEvPrivate::TEvRefreshStorageInfo, Handle);
hFunc(TEvPrivate::TEvLogTabletMoves, Handle);
+ hFunc(TEvPrivate::TEvStartStorageBalancer, Handle);
hFunc(TEvHive::TEvUpdateDomain, Handle);
}
}
@@ -2958,6 +2966,7 @@ STFUNC(THive::StateWork) {
fFunc(TEvHive::TEvUpdateTabletsObject::EventType, EnqueueIncomingEvent);
fFunc(TEvPrivate::TEvRefreshStorageInfo::EventType, EnqueueIncomingEvent);
fFunc(TEvPrivate::TEvLogTabletMoves::EventType, EnqueueIncomingEvent);
+ fFunc(TEvPrivate::TEvStartStorageBalancer::EventType, EnqueueIncomingEvent);
fFunc(TEvHive::TEvUpdateDomain::EventType, EnqueueIncomingEvent);
hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle);
default:
diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h
index f1408d78cd..e2894adb5f 100644
--- a/ydb/core/mind/hive/hive_impl.h
+++ b/ydb/core/mind/hive/hive_impl.h
@@ -169,6 +169,7 @@ protected:
friend class TQueryMigrationWaitActor;
friend class TReleaseTabletsWaitActor;
friend class TDrainNodeWaitActor;
+ friend class THiveStorageBalancer;;
friend struct TNodeInfo;
friend class TTxInitScheme;
@@ -204,6 +205,7 @@ protected:
friend class TTxMonEvent_QueryMigration;
friend class TTxMonEvent_RebalanceFromScratch;
friend class TTxMonEvent_ObjectStats;
+ friend class TTxMonEvent_StorageRebalance;
friend class TTxKillNode;
friend class TTxLoadEverything;
friend class TTxRestartTablet;
@@ -239,6 +241,7 @@ protected:
void StartHiveBalancer(TBalancerSettings&& settings);
void StartHiveDrain(TNodeId nodeId, TDrainSettings settings);
void StartHiveFill(TNodeId nodeId, const TActorId& initiator);
+ void StartHiveStorageBalancer(TStorageBalancerSettings settings);
void CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx);
NJson::TJsonValue GetBalancerProgressJson();
ITransaction* CreateDeleteTablet(TEvHive::TEvDeleteTablet::TPtr& ev);
@@ -551,6 +554,7 @@ protected:
void Handle(TEvHive::TEvUpdateTabletsObject::TPtr& ev);
void Handle(TEvPrivate::TEvRefreshStorageInfo::TPtr& ev);
void Handle(TEvPrivate::TEvLogTabletMoves::TPtr& ev);
+ void Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev);
void Handle(TEvPrivate::TEvProcessIncomingEvent::TPtr& ev);
void Handle(TEvHive::TEvUpdateDomain::TPtr& ev);
@@ -901,6 +905,14 @@ public:
return CurrentConfig.GetBootStrategy();
}
+ NKikimrConfig::THiveConfig::EHiveChannelBalanceStrategy GetChannelBalanceStrategy() const {
+ return CurrentConfig.GetChannelBalanceStrategy();
+ }
+
+ ui64 GetMaxChannelHistorySize() const {
+ return CurrentConfig.GetMaxChannelHistorySize();
+ }
+
static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static bool IsSystemTablet(TTabletTypes::EType type);
diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp
index 67c5ed29a7..984674a398 100644
--- a/ydb/core/mind/hive/hive_ut.cpp
+++ b/ydb/core/mind/hive/hive_ut.cpp
@@ -2780,6 +2780,96 @@ Y_UNIT_TEST_SUITE(THiveTest) {
UNIT_ASSERT_VALUES_EQUAL(getGroup(tabletId), goodGroup);
}
+ Y_UNIT_TEST(TestStorageBalancer) {
+ static constexpr ui64 NUM_TABLETS = 4;
+ TTestBasicRuntime runtime(1, false);
+ Setup(runtime, true, 2, [](TAppPrepare& app) {
+ app.HiveConfig.SetMinPeriodBetweenReassign(0);
+ });
+ const ui64 hiveTablet = MakeDefaultHiveID(0);
+ const ui64 testerTablet = MakeDefaultHiveID(1);
+ CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
+
+ TTabletTypes::EType tabletType = TTabletTypes::Dummy;
+ TVector<ui64> tablets;
+ for (ui64 i = 0; i < NUM_TABLETS; ++i) {
+ THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
+ ev->Record.SetObjectId(i);
+ ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
+ tablets.emplace_back(tabletId);
+ MakeSureTabletIsUp(runtime, tabletId, 0);
+ }
+ ui64 tabletBase = tablets.front();
+
+ TActorId sender = runtime.AllocateEdgeActor();
+ auto getGroup = [&runtime, sender, hiveTablet](ui64 tabletId) {
+ runtime.SendToPipe(hiveTablet, sender, new TEvHive::TEvRequestHiveInfo({
+ .TabletId = tabletId,
+ .ReturnChannelHistory = true,
+ }));
+ TAutoPtr<IEventHandle> handle;
+ TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
+
+ const auto& tablet = response->Record.GetTablets().Get(0);
+ const auto& channel = tablet.GetTabletChannels().Get(0);
+ const auto& history = channel.GetHistory();
+ return history.Get(history.size() - 1).GetGroup();
+ };
+
+ std::unordered_map<ui64, std::vector<ui64>> groupToTablets;
+ for (auto tablet : tablets) {
+ groupToTablets[getGroup(tablet)].push_back(tablet);
+ }
+ ui64 tabletA;
+ ui64 tabletB;
+ for (const auto& [group, tablets] : groupToTablets) {
+ if (tablets.size() >= 2) {
+ tabletA = tablets[0];
+ tabletB = tablets[1];
+ }
+ }
+
+ // If assured space is not set, usage is always set to 1
+ auto groupMetricsExchange = MakeHolder<TEvBlobStorage::TEvControllerGroupMetricsExchange>();
+ for (const auto& [group, tablets] : groupToTablets) {
+ NKikimrBlobStorage::TGroupMetrics* metrics = groupMetricsExchange->Record.AddGroupMetrics();
+
+ metrics->SetGroupId(group);
+ metrics->MutableGroupParameters()->SetGroupID(group);
+ metrics->MutableGroupParameters()->SetStoragePoolName("def1");
+ metrics->MutableGroupParameters()->MutableAssuredResources()->SetSpace(300'000'000);
+ }
+
+ runtime.SendToPipe(MakeBSControllerID(0), sender, groupMetricsExchange.Release(), 0, GetPipeConfigWithRetries());
+ {
+ TDispatchOptions options;
+ options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerGroupMetricsExchange));
+ runtime.DispatchEvents(options);
+ }
+
+ TChannelsBindings channels = BINDED_CHANNELS;
+ for (auto& bind : channels) {
+ bind.SetSize(200'000'000);
+ }
+ for (auto tablet : {tabletA, tabletB}) {
+ TAutoPtr<TEvHive::TEvCreateTablet> updateTablet(new TEvHive::TEvCreateTablet(testerTablet, 100500 + (tablet - tabletBase), tabletType, channels));
+ SendCreateTestTablet(runtime, hiveTablet, testerTablet, updateTablet, 0, true);
+ }
+ runtime.SendToPipe(hiveTablet, sender, new NHive::TEvPrivate::TEvStartStorageBalancer({
+ .NumReassigns = 100,
+ .MaxInFlight = 1,
+ .StoragePool = "def1",
+ }));
+
+ {
+ TDispatchOptions options;
+ options.FinalEvents.emplace_back(NHive::TEvPrivate::EvRestartComplete, 4); // should actually be less than 4
+ runtime.DispatchEvents(options, TDuration::Seconds(10));
+ }
+
+ UNIT_ASSERT_VALUES_UNEQUAL(getGroup(tabletA), getGroup(tabletB));
+ }
+
// Y_UNIT_TEST(TestCreateTabletAndChangeProfiles) {
// TTestBasicRuntime runtime(1, false);
// Setup(runtime, true);
diff --git a/ydb/core/mind/hive/leader_tablet_info.cpp b/ydb/core/mind/hive/leader_tablet_info.cpp
index 8a53d228c1..43a6e260eb 100644
--- a/ydb/core/mind/hive/leader_tablet_info.cpp
+++ b/ydb/core/mind/hive/leader_tablet_info.cpp
@@ -261,6 +261,12 @@ const NKikimrBlobStorage::TEvControllerSelectGroupsResult::TGroupParameters* TLe
});
break;
}
+ case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE: {
+ return storagePool->FindFreeAllocationUnit([&params](const TStorageGroupInfo& newGroup) -> bool {
+ return newGroup.IsMatchesParameters(*params);
+ });
+ break;
+ }
case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_SPACE: {
NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy balanceStrategy = Hive.CurrentConfig.GetStorageBalanceStrategy();
Hive.CurrentConfig.SetStorageBalanceStrategy(NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_SIZE);
diff --git a/ydb/core/mind/hive/leader_tablet_info.h b/ydb/core/mind/hive/leader_tablet_info.h
index 77a2b1c2a8..dba2b570d5 100644
--- a/ydb/core/mind/hive/leader_tablet_info.h
+++ b/ydb/core/mind/hive/leader_tablet_info.h
@@ -26,6 +26,25 @@ protected:
static TString DEFAULT_STORAGE_POOL_NAME;
public:
+ struct TChannel {
+ TTabletId TabletId;
+ ui32 ChannelId;
+ const TChannelBind* ChannelInfo;
+
+ double GetWeight(NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) const {
+ Y_DEBUG_ABORT_UNLESS(ChannelInfo);
+ switch (metricToBalance) {
+ case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_IOPS:
+ return ChannelInfo->GetIOPS();
+ case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_THROUGHPUT:
+ return ChannelInfo->GetThroughput();
+ default:
+ case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_SIZE:
+ return ChannelInfo->GetSize();
+ }
+ }
+ };
+
TTabletId Id;
ETabletState State;
TTabletTypes::EType Type;
@@ -298,6 +317,14 @@ public:
return BoundChannels.size();
}
+ TChannel GetChannel(ui32 channelId) const {
+ TChannel channel{.TabletId = Id, .ChannelId = channelId, .ChannelInfo = nullptr};
+ if (channelId < BoundChannels.size()) {
+ channel.ChannelInfo = &BoundChannels[channelId];
+ }
+ return channel;
+ }
+
void AcquireAllocationUnits();
void ReleaseAllocationUnits();
bool AcquireAllocationUnit(ui32 channelId);
diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp
index 949809d3c2..36de3b6aa8 100644
--- a/ydb/core/mind/hive/monitoring.cpp
+++ b/ydb/core/mind/hive/monitoring.cpp
@@ -1424,7 +1424,8 @@ public:
EBalancerType::Emergency,
EBalancerType::SpreadNeighbours,
EBalancerType::Scatter,
- EBalancerType::Manual
+ EBalancerType::Manual,
+ EBalancerType::Storage,
}) {
int balancer = static_cast<int>(type);
out << "<tr id='balancer" << balancer << "'><td>" << EBalancerTypeName(type) << "</td><td></td><td></td><td></td><td></td><td></td></tr>";
@@ -2505,6 +2506,32 @@ public:
}
};
+class TTxMonEvent_StorageRebalance : public TTransactionBase<THive> {
+public:
+ const TActorId Source;
+ TStorageBalancerSettings Settings;
+
+ TTxMonEvent_StorageRebalance(const TActorId& source, NMon::TEvRemoteHttpInfo::TPtr& ev, TSelf* hive)
+ : TBase(hive)
+ , Source(source)
+ {
+ Settings.NumReassigns = FromStringWithDefault(ev->Get()->Cgi().Get("reassigns"), 1000);
+ Settings.MaxInFlight = FromStringWithDefault(ev->Get()->Cgi().Get("inflight"), 1);
+ Settings.StoragePool = ev->Get()->Cgi().Get("pool");
+ }
+
+ TTxType GetTxType() const override { return NHive::TXTYPE_MON_REBALANCE; }
+
+ bool Execute(TTransactionContext&, const TActorContext&) override {
+ Self->StartHiveStorageBalancer(Settings);
+ return true;
+ }
+
+ void Complete(const TActorContext& ctx) override {
+ ctx.Send(Source, new NMon::TEvRemoteJsonInfoRes("{}"));
+ }
+};
+
class TTxMonEvent_RebalanceFromScratch : public TTransactionBase<THive> {
public:
const TActorId Source;
@@ -4072,6 +4099,9 @@ void THive::CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorCo
if (page == "Storage") {
return Execute(new TTxMonEvent_Storage(ev->Sender, ev, this), ctx);
}
+ if (page == "StorageRebalance") {
+ return Execute(new TTxMonEvent_StorageRebalance(ev->Sender, ev, this), ctx);
+ }
return Execute(new TTxMonEvent_Landing(ev->Sender, ev, this), ctx);
}
diff --git a/ydb/core/mind/hive/storage_balancer.cpp b/ydb/core/mind/hive/storage_balancer.cpp
new file mode 100644
index 0000000000..eb8528b546
--- /dev/null
+++ b/ydb/core/mind/hive/storage_balancer.cpp
@@ -0,0 +1,193 @@
+#include <ydb/library/actors/core/actor_bootstrapped.h>
+#include "hive_impl.h"
+#include "hive_log.h"
+#include "balancer.h"
+
+namespace NKikimr {
+namespace NHive {
+
+template<>
+void BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM>(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) {
+ auto& randGen = *TAppData::RandomProvider.Get();
+ std::vector<double> weights;
+ std::vector<size_t> order;
+ weights.reserve(channels.size());
+ order.reserve(channels.size());
+ for (size_t i = 0; i < channels.size(); ++i) {
+ double weight = channels[i].GetWeight(metricToBalance);
+ weights.push_back(weight * randGen());
+ order.push_back(i);
+ }
+ std::sort(order.begin(), order.end(), [&weights](size_t i, size_t j) -> bool {
+ return weights[i] > weights[j];
+ });
+ std::vector<TLeaderTabletInfo::TChannel> result;
+ result.reserve(channels.size());
+ for (size_t i : order) {
+ result.push_back(channels[i]);
+ }
+ result.swap(channels);
+}
+
+template<>
+void BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST>(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) {
+ std::sort(channels.begin(), channels.end(), [metricToBalance](const TLeaderTabletInfo::TChannel& a, const TLeaderTabletInfo::TChannel& b) -> bool {
+ return a.GetWeight(metricToBalance) > b.GetWeight(metricToBalance);
+ });
+}
+
+template<>
+void BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM>(std::vector<TLeaderTabletInfo::TChannel>& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy) {
+ auto& randGen = *TAppData::RandomProvider.Get();
+ std::shuffle(channels.begin(), channels.end(), randGen);
+}
+
+class THiveStorageBalancer : public NActors::TActorBootstrapped<THiveStorageBalancer>, public ISubActor {
+protected:
+ static constexpr TDuration TIMEOUT = TDuration::Minutes(10);
+ THive* Hive;
+ TStorageBalancerSettings Settings;
+ using TOperations = std::unordered_map<TTabletId, std::unique_ptr<TEvHive::TEvReassignTablet>>;
+ TOperations Operations;
+ TOperations::iterator NextReassign;
+ ui64 ReassignInFlight = 0;
+ ui64 Reassigns = 0;
+ std::unordered_set<TFullTabletId> SkippedTablets;
+ TBalancerStats& Stats;
+
+ TString GetLogPrefix() const {
+ return Hive->GetLogPrefix();
+ }
+
+ void PassAway() override {
+ BLOG_I("StorageBalancer finished");
+ Stats.TotalRuns++;
+ Stats.TotalMovements += Reassigns;
+ Stats.LastRunMovements = Reassigns;
+ Stats.IsRunningNow = false;
+ Hive->RemoveSubActor(this);
+ return IActor::PassAway();
+ }
+
+ void Cleanup() override {
+ return PassAway();
+ }
+
+ void ReassignNextTablet() {
+ while (NextReassign != Operations.end() && ReassignInFlight < Settings.MaxInFlight) {
+ auto tablet = Hive->FindTablet(NextReassign->first);
+ if (!tablet) {
+ continue;
+ }
+ tablet->ActorsToNotifyOnRestart.emplace_back(SelfId());
+ BLOG_D("StorageBalancer initiating reassign for tablet " << NextReassign->first);
+ Send(Hive->SelfId(), NextReassign->second.release());
+ ++NextReassign;
+ ++ReassignInFlight;
+ }
+ if (ReassignInFlight == 0) {
+ return PassAway();
+ }
+ }
+
+ void Handle(TEvPrivate::TEvRestartComplete::TPtr& ev) {
+ auto tabletId = ev->Get()->TabletId;
+ BLOG_D("StorageBalancer received " << ev->Get()->Status << " for tablet " << tabletId);
+ if (SkippedTablets.contains(tabletId)) {
+ return;
+ }
+ --ReassignInFlight;
+ Stats.CurrentMovements = ++Reassigns;
+ ReassignNextTablet();
+ }
+
+ void Handle(TEvPrivate::TEvRestartCancelled::TPtr& ev) {
+ auto tabletId = ev->Get()->TabletId;
+ BLOG_D("StorageBalancer received RestartCancelled for tablet " << tabletId);
+ SkippedTablets.insert(tabletId);
+ auto tablet = Hive->FindTablet(tabletId);
+ if (tablet) {
+ std::erase(tablet->ActorsToNotifyOnRestart, SelfId());
+ }
+ --ReassignInFlight;
+ ReassignNextTablet();
+ }
+
+public:
+ static constexpr NKikimrServices::TActivity::EType ActorActivityType() {
+ return NKikimrServices::TActivity::HIVE_BALANCER_ACTOR;
+ }
+
+ THiveStorageBalancer(THive* hive, TStorageBalancerSettings&& settings)
+ : Hive(hive)
+ , Settings(settings)
+ , Stats(Hive->BalancerStats[static_cast<size_t>(EBalancerType::Storage)])
+ {
+ Stats.IsRunningNow = true;
+ Stats.CurrentMaxMovements = Settings.NumReassigns;
+ Stats.CurrentMovements = 0;
+ Stats.LastRunTimestamp = TActivationContext::Now();
+ }
+
+ void Bootstrap() {
+ Hive->TabletCounters->Cumulative()[NHive::COUNTER_STORAGE_BALANCER_EXECUTED].Increment(1);
+ Become(&TThis::StateWork, TIMEOUT, new TEvents::TEvPoison());
+ TInstant now = TActivationContext::Now();
+ std::vector<TLeaderTabletInfo::TChannel> channels;
+ for (const auto& [tabletId, tablet] : Hive->Tablets) {
+ if (!tablet.IsReadyToReassignTablet()) {
+ continue;
+ }
+ for (ui32 channelId = 0; channelId < tablet.GetChannelCount(); ++channelId) {
+ const auto* channelInfo = tablet.TabletStorageInfo->ChannelInfo(channelId);
+
+ if (channelInfo
+ && (Settings.StoragePool.empty() || channelInfo->StoragePool == Settings.StoragePool)
+ && channelInfo->History.back().Timestamp + Hive->GetMinPeriodBetweenReassign() <= now
+ && channelInfo->History.size() < Hive->GetMaxChannelHistorySize()) {
+ channels.push_back(tablet.GetChannel(channelId));
+ }
+ }
+ }
+ BLOG_D("StorageBalancer for pool " << Settings.StoragePool << ": " << channels.size() << " tablet channels suitable for balancing");
+ auto metricToBalance = Hive->GetStorageBalanceStrategy();
+ switch (Hive->GetChannelBalanceStrategy()) {
+ case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM:
+ BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM>(channels, metricToBalance);
+ case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST:
+ BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST>(channels, metricToBalance);
+ case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM:
+ BalanceChannels<NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM>(channels, metricToBalance);
+ }
+ for (size_t i = 0; i < channels.size() && Operations.size() < Settings.NumReassigns; ++i) {
+ const auto& channel = channels[i];
+ auto& ev = Operations[channel.TabletId];
+ if (!ev) {
+ ev = std::make_unique<TEvHive::TEvReassignTablet>(channel.TabletId);
+ ev->Record.SetReassignReason(NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE);
+ }
+ ev->Record.AddChannels(channel.ChannelId);
+ }
+ NextReassign = Operations.begin();
+ ReassignNextTablet();
+ }
+
+ STATEFN(StateWork) {
+ switch(ev->GetTypeRewrite()) {
+ cFunc(TEvents::TSystem::PoisonPill, PassAway);
+ hFunc(TEvPrivate::TEvRestartComplete, Handle);
+ hFunc(TEvPrivate::TEvRestartCancelled, Handle);
+ }
+ }
+};
+
+void THive::StartHiveStorageBalancer(TStorageBalancerSettings settings) {
+ if (IsItPossibleToStartBalancer(EBalancerType::Storage)) {
+ auto* balancer = new THiveStorageBalancer(this, std::move(settings));
+ SubActors.emplace_back(balancer);
+ RegisterWithSameMailbox(balancer);
+ }
+}
+
+} // NHive
+} // NKikimr
diff --git a/ydb/core/mind/hive/tx__update_tablet_groups.cpp b/ydb/core/mind/hive/tx__update_tablet_groups.cpp
index 4396b448aa..be2900c19f 100644
--- a/ydb/core/mind/hive/tx__update_tablet_groups.cpp
+++ b/ydb/core/mind/hive/tx__update_tablet_groups.cpp
@@ -160,6 +160,9 @@ public:
ui32 fromGeneration;
if (channel->History.empty()) {
fromGeneration = 0;
+ } else if (channel->History.back().GroupID == group->GetGroupID()) {
+ // We decided to keep the group the same
+ continue;
} else {
needToIncreaseGeneration = true;
fromGeneration = tablet->KnownGeneration + 1;
@@ -250,6 +253,9 @@ public:
tablet->ChannelProfileNewGroup.reset(channelId);
}
}
+ for (const TActorId& actor : tablet->ActorsToNotifyOnRestart) {
+ SideEffects.Send(actor, new TEvPrivate::TEvRestartCancelled(tablet->GetFullTabletId()));
+ }
newTabletState = ETabletState::ReadyToWork;
}
}
diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make
index 57cc970768..4580b49092 100644
--- a/ydb/core/mind/hive/ya.make
+++ b/ydb/core/mind/hive/ya.make
@@ -33,6 +33,7 @@ SRCS(
follower_group.h
follower_tablet_info.cpp
follower_tablet_info.h
+ storage_balancer.cpp
storage_group_info.cpp
storage_group_info.h
storage_pool_info.cpp
diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto
index 91b546f145..36b5743a93 100644
--- a/ydb/core/protos/config.proto
+++ b/ydb/core/protos/config.proto
@@ -1323,6 +1323,12 @@ message THiveConfig {
HIVE_TABLET_BALANCE_STRATEGY_WEIGHTED_RANDOM = 3;
}
+ enum EHiveChannelBalanceStrategy {
+ HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST = 1;
+ HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM = 2;
+ HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM = 3;
+ }
+
enum EHiveNodeSelectStrategy {
HIVE_NODE_SELECT_STRATEGY_WEIGHTED_RANDOM = 0;
HIVE_NODE_SELECT_STRATEGY_EXACT_MIN = 1;
@@ -1400,6 +1406,8 @@ message THiveConfig {
optional double MinCounterScatterToBalance = 65 [default = 0.02];
reserved 66;
optional double ObjectImbalanceToBalance = 67 [default = 0.02];
+ optional EHiveChannelBalanceStrategy ChannelBalanceStrategy = 68 [default = HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM];
+ optional uint64 MaxChannelHistorySize = 69 [default = 200];
}
message TColumnShardConfig {
diff --git a/ydb/core/protos/counters_hive.proto b/ydb/core/protos/counters_hive.proto
index c2d984c94b..dff2855cbc 100644
--- a/ydb/core/protos/counters_hive.proto
+++ b/ydb/core/protos/counters_hive.proto
@@ -43,6 +43,7 @@ enum ECumulativeCounters {
COUNTER_TABLETS_MOVED = 9 [(CounterOpts) = {Name: "TabletsMoved"}];
COUNTER_SUGGESTED_SCALE_UP = 10 [(CounterOpts) = {Name: "SuggestedScaleUp"}];
COUNTER_SUGGESTED_SCALE_DOWN = 11 [(CounterOpts) = {Name: "SuggestedScaleDown"}];
+ COUNTER_STORAGE_BALANCER_EXECUTED = 12 [(CounterOpts) = {Name: "StorageBalancerExecuted"}];
}
enum EPercentileCounters {
diff --git a/ydb/core/protos/hive.proto b/ydb/core/protos/hive.proto
index 575e0154d4..b7fb1324a6 100644
--- a/ydb/core/protos/hive.proto
+++ b/ydb/core/protos/hive.proto
@@ -246,6 +246,7 @@ message TEvReassignTablet {
enum EHiveReassignReason {
HIVE_REASSIGN_REASON_NO = 0;
HIVE_REASSIGN_REASON_SPACE = 1;
+ HIVE_REASSIGN_REASON_BALANCE = 2; // internal to Hive
}
optional fixed64 TabletID = 1;