aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorsharpeye <sharpeye@yandex-team.com>2023-03-31 06:18:00 +0300
committersharpeye <sharpeye@yandex-team.com>2023-03-31 06:18:00 +0300
commit6be8bf780352147bcac5afec43505883b69229a0 (patch)
treed134636431096e9eca7746a989c38e75f1b80746
parentccb2e27952e4fe4baa1c8946c641e56f281aa063 (diff)
downloadydb-6be8bf780352147bcac5afec43505883b69229a0.tar.gz
spare nodes for tablets
-rw-r--r--ydb/core/mind/hive/hive_impl.cpp18
-rw-r--r--ydb/core/mind/hive/hive_impl.h1
-rw-r--r--ydb/core/mind/hive/hive_ut.cpp166
-rw-r--r--ydb/core/mind/hive/node_info.cpp9
-rw-r--r--ydb/core/mind/hive/node_info.h1
-rw-r--r--ydb/core/mind/local.cpp1
-rw-r--r--ydb/core/mind/local.h4
-rw-r--r--ydb/core/protos/local.proto1
8 files changed, 200 insertions, 1 deletions
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp
index 3bde213490..77a443f53b 100644
--- a/ydb/core/mind/hive/hive_impl.cpp
+++ b/ydb/core/mind/hive/hive_impl.cpp
@@ -1082,6 +1082,22 @@ TNodeInfo* THive::SelectNode<NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATE
return itNode->Node;
}
+TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const
+{
+ i32 priority = std::numeric_limits<i32>::min();
+ for (const TSelectedNode& selectedNode : selectedNodes) {
+ priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet));
+ }
+
+ auto it = std::partition(selectedNodes.begin(), selectedNodes.end(), [&] (const TSelectedNode& selectedNode) {
+ return selectedNode.Node->GetPriorityForTablet(tablet) == priority;
+ });
+
+ selectedNodes.erase(it, selectedNodes.end());
+
+ return selectedNodes;
+}
+
THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet) {
BLOG_D("[FBN] Finding best node for tablet " << tablet.ToString());
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " family " << tablet.FamilyString());
@@ -1233,6 +1249,8 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet) {
TNodeInfo* selectedNode = nullptr;
if (!selectedNodes.empty()) {
+ selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet);
+
switch (GetNodeSelectStrategy()) {
case NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATEGY_WEIGHTED_RANDOM:
selectedNode = SelectNode<NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATEGY_WEIGHTED_RANDOM>(selectedNodes);
diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h
index 87083a2c4e..cc3bd8f562 100644
--- a/ydb/core/mind/hive/hive_impl.h
+++ b/ydb/core/mind/hive/hive_impl.h
@@ -551,6 +551,7 @@ protected:
template <NKikimrConfig::THiveConfig::EHiveNodeSelectStrategy Strategy>
TNodeInfo* SelectNode(const std::vector<TSelectedNode>& selectedNodes);
+ TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const;
public:
void AssignTabletGroups(TLeaderTabletInfo& tablet);
diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp
index 2974c99828..bf9fc07c53 100644
--- a/ydb/core/mind/hive/hive_ut.cpp
+++ b/ydb/core/mind/hive/hive_ut.cpp
@@ -4986,6 +4986,172 @@ Y_UNIT_TEST_SUITE(THiveTest) {
UNIT_ASSERT(storageInfo.ChannelsSize() > 0);
}
}
+
+ Y_UNIT_TEST(TestHiveBalancerWithSpareNodes) {
+ static const int NUM_NODES = 6;
+ static const int NUM_TABLETS = 9;
+ TTestBasicRuntime runtime(NUM_NODES, false);
+ runtime.LocationCallback = GetLocation;
+ Setup(runtime, true);
+ SendKillLocal(runtime, 0);
+ SendKillLocal(runtime, 1);
+ SendKillLocal(runtime, 3);
+ SendKillLocal(runtime, 4);
+ SendKillLocal(runtime, 5);
+ {
+ TLocalConfig::TPtr local = new TLocalConfig();
+ local->TabletClassInfo[TTabletTypes::Dummy].SetupInfo = new TTabletSetupInfo(&CreateFlatDummyTablet,
+ TMailboxType::Simple, 0,
+ TMailboxType::Simple, 0);
+ local->TabletClassInfo[TTabletTypes::Dummy].MaxCount = 2;
+ CreateLocal(runtime, 0, local); // max 2 dummies on 0
+ }
+ {
+ TLocalConfig::TPtr local = new TLocalConfig();
+ // it can't be empty, otherwise it will fallback to default behavior
+ local->TabletClassInfo[TTabletTypes::Unknown].SetupInfo = nullptr;
+ CreateLocal(runtime, 1, local); // no tablets on 1
+ }
+
+ // 3, 4 & 5 are spare nodes for Dummy
+
+ for (int i = 3; i != 5; ++i) {
+ TLocalConfig::TPtr local = new TLocalConfig();
+ local->TabletClassInfo[TTabletTypes::Dummy].SetupInfo = new TTabletSetupInfo(&CreateFlatDummyTablet,
+ TMailboxType::Simple, 0,
+ TMailboxType::Simple, 0);
+ local->TabletClassInfo[TTabletTypes::Dummy].MaxCount = 3;
+ local->TabletClassInfo[TTabletTypes::Dummy].Priority = -1;
+ CreateLocal(runtime, i, local);
+ }
+
+ {
+ TLocalConfig::TPtr local = new TLocalConfig();
+ local->TabletClassInfo[TTabletTypes::Dummy].SetupInfo = new TTabletSetupInfo(&CreateFlatDummyTablet,
+ TMailboxType::Simple, 0,
+ TMailboxType::Simple, 0);
+ local->TabletClassInfo[TTabletTypes::Dummy].Priority = -2;
+ CreateLocal(runtime, 5, local);
+ }
+
+ const int nodeBase = runtime.GetNodeId(0);
+ TActorId senderA = runtime.AllocateEdgeActor();
+ const ui64 hiveTablet = MakeDefaultHiveID(0);
+ const ui64 testerTablet = MakeDefaultHiveID(1);
+ CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
+ {
+ TDispatchOptions options;
+ options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
+ runtime.DispatchEvents(options);
+ }
+ for (int nodeIdx = 0; nodeIdx < NUM_NODES; ++nodeIdx) {
+ TActorId senderLocal = runtime.AllocateEdgeActor(nodeIdx);
+ THolder<TEvHive::TEvTabletMetrics> ev = MakeHolder<TEvHive::TEvTabletMetrics>();
+ ev->Record.MutableTotalResourceUsage()->SetCPU(999); // KIKIMR-9870
+ runtime.SendToPipe(hiveTablet, senderLocal, ev.Release(), nodeIdx, GetPipeConfigWithRetries());
+ TAutoPtr<IEventHandle> handle;
+ TEvLocal::TEvTabletMetricsAck* response = runtime.GrabEdgeEvent<TEvLocal::TEvTabletMetricsAck>(handle);
+ Y_UNUSED(response);
+ }
+
+ // creating NUM_TABLETS tablets
+ TTabletTypes::EType tabletType = TTabletTypes::Dummy;
+ TVector<ui64> tablets;
+ for (int i = 0; i < NUM_TABLETS; ++i) {
+ THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
+ ev->Record.SetObjectId(i);
+ ev->Record.MutableDataCentersPreference()->AddDataCentersGroups()->AddDataCenter(ToString(1));
+ ev->Record.MutableDataCentersPreference()->AddDataCentersGroups()->AddDataCenter(ToString(2));
+ ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
+ tablets.emplace_back(tabletId);
+ MakeSureTabletIsUp(runtime, tabletId, 0);
+ }
+
+ auto getNodeTablets = [&] {
+ std::array<int, NUM_NODES> nodeTablets = {};
+ runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
+ TAutoPtr<IEventHandle> handle;
+ TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
+ for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
+ UNIT_ASSERT_C(((int)tablet.GetNodeID() - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES),
+ "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
+ nodeTablets[tablet.GetNodeID() - nodeBase]++;
+ }
+
+ return nodeTablets;
+ };
+
+ auto shutdownNode = [&] (ui32 nodeIndex, int expectedDrainMovements) {
+ const ui32 nodeId = runtime.GetNodeId(nodeIndex);
+ runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvDrainNode(nodeId));
+ TAutoPtr<IEventHandle> handle;
+ auto drainResponse = runtime.GrabEdgeEventRethrow<TEvHive::TEvDrainNodeResult>(handle, TDuration::Seconds(30));
+ UNIT_ASSERT_VALUES_EQUAL(drainResponse->Record.GetStatus(), NKikimrProto::EReplyStatus::OK);
+ int drainMovements = drainResponse->Record.GetMovements();
+ UNIT_ASSERT_VALUES_EQUAL(drainMovements, expectedDrainMovements);
+
+ SendKillLocal(runtime, nodeIndex);
+
+ WaitForEvServerDisconnected(runtime);
+
+ for (TTabletId tabletId : tablets) {
+ MakeSureTabletIsUp(runtime, tabletId, 0);
+ }
+ };
+
+ auto nodeTablets = getNodeTablets();
+
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 2);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], NUM_TABLETS - 2);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], 0);
+
+ shutdownNode(0, 2);
+
+ nodeTablets = getNodeTablets();
+
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], NUM_TABLETS);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], 0);
+
+ shutdownNode(2, NUM_TABLETS);
+
+ nodeTablets = getNodeTablets();
+
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 3);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 3);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], 3);
+
+ shutdownNode(3, 3);
+
+ nodeTablets = getNodeTablets();
+
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 3);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], NUM_TABLETS - 3);
+
+ shutdownNode(4, 3);
+
+ nodeTablets = getNodeTablets();
+
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 0);
+ UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], NUM_TABLETS);
+ }
}
}
diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp
index 8b6913c981..bd5c0368f4 100644
--- a/ydb/core/mind/hive/node_info.cpp
+++ b/ydb/core/mind/hive/node_info.cpp
@@ -175,6 +175,15 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat
return true;
}
+i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {
+ auto it = TabletAvailability.find(tablet.GetTabletType());
+ if (it == TabletAvailability.end()) {
+ return 0;
+ }
+
+ return it->second.GetPriority();
+}
+
bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState) const {
if (tablet.IsAliveOnLocal(Local)) {
return !IsOverloaded();
diff --git a/ydb/core/mind/hive/node_info.h b/ydb/core/mind/hive/node_info.h
index fa7400fd25..d615b44209 100644
--- a/ydb/core/mind/hive/node_info.h
+++ b/ydb/core/mind/hive/node_info.h
@@ -137,6 +137,7 @@ public:
bool IsAllowedToRunTablet(TTabletDebugState* debugState = nullptr) const;
bool IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const;
bool IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const;
+ i32 GetPriorityForTablet(const TTabletInfo& tablet) const;
ui64 GetMaxTabletsScheduled() const;
bool IsAbleToScheduleTablet() const {
diff --git a/ydb/core/mind/local.cpp b/ydb/core/mind/local.cpp
index 854045bd60..30b4a34aec 100644
--- a/ydb/core/mind/local.cpp
+++ b/ydb/core/mind/local.cpp
@@ -196,6 +196,7 @@ class TLocalNodeRegistrar : public TActorBootstrapped<TLocalNodeRegistrar> {
if (tabletInfo.MaxCount != 0) {
tabletAvailability->SetMaxCount(tabletInfo.MaxCount);
}
+ tabletAvailability->SetPriority(tabletInfo.Priority);
}
NTabletPipe::SendData(ctx, HivePipeClient, request.Release());
diff --git a/ydb/core/mind/local.h b/ydb/core/mind/local.h
index 0738bc20f6..466e88dd33 100644
--- a/ydb/core/mind/local.h
+++ b/ydb/core/mind/local.h
@@ -356,12 +356,14 @@ struct TLocalConfig : public TThrRefBase {
struct TTabletClassInfo {
TTabletSetupInfo::TPtr SetupInfo;
ui64 MaxCount = 0; // maximum allowed number of running tablets, 0 means unlimited
+ i32 Priority = 0;
TTabletClassInfo()
{}
- TTabletClassInfo(TTabletSetupInfo::TPtr setupInfo)
+ TTabletClassInfo(TTabletSetupInfo::TPtr setupInfo, i32 priority = 0)
: SetupInfo(setupInfo)
+ , Priority(priority)
{}
};
diff --git a/ydb/core/protos/local.proto b/ydb/core/protos/local.proto
index 8178c4f10a..3ba2f5b24b 100644
--- a/ydb/core/protos/local.proto
+++ b/ydb/core/protos/local.proto
@@ -11,6 +11,7 @@ option java_package = "ru.yandex.kikimr.proto";
message TTabletAvailability {
optional NKikimrTabletBase.TTabletTypes.EType Type = 1;
optional uint64 MaxCount = 2;
+ optional int32 Priority = 3;
}
message TEvRegisterNode {