diff options
author | sharpeye <sharpeye@yandex-team.com> | 2023-03-31 06:18:00 +0300 |
---|---|---|
committer | sharpeye <sharpeye@yandex-team.com> | 2023-03-31 06:18:00 +0300 |
commit | 6be8bf780352147bcac5afec43505883b69229a0 (patch) | |
tree | d134636431096e9eca7746a989c38e75f1b80746 | |
parent | ccb2e27952e4fe4baa1c8946c641e56f281aa063 (diff) | |
download | ydb-6be8bf780352147bcac5afec43505883b69229a0.tar.gz |
spare nodes for tablets
-rw-r--r-- | ydb/core/mind/hive/hive_impl.cpp | 18 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_ut.cpp | 166 | ||||
-rw-r--r-- | ydb/core/mind/hive/node_info.cpp | 9 | ||||
-rw-r--r-- | ydb/core/mind/hive/node_info.h | 1 | ||||
-rw-r--r-- | ydb/core/mind/local.cpp | 1 | ||||
-rw-r--r-- | ydb/core/mind/local.h | 4 | ||||
-rw-r--r-- | ydb/core/protos/local.proto | 1 |
8 files changed, 200 insertions, 1 deletions
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 3bde213490..77a443f53b 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -1082,6 +1082,22 @@ TNodeInfo* THive::SelectNode<NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATE return itNode->Node; } +TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const +{ + i32 priority = std::numeric_limits<i32>::min(); + for (const TSelectedNode& selectedNode : selectedNodes) { + priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet)); + } + + auto it = std::partition(selectedNodes.begin(), selectedNodes.end(), [&] (const TSelectedNode& selectedNode) { + return selectedNode.Node->GetPriorityForTablet(tablet) == priority; + }); + + selectedNodes.erase(it, selectedNodes.end()); + + return selectedNodes; +} + THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet) { BLOG_D("[FBN] Finding best node for tablet " << tablet.ToString()); BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " family " << tablet.FamilyString()); @@ -1233,6 +1249,8 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet) { TNodeInfo* selectedNode = nullptr; if (!selectedNodes.empty()) { + selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet); + switch (GetNodeSelectStrategy()) { case NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATEGY_WEIGHTED_RANDOM: selectedNode = SelectNode<NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATEGY_WEIGHTED_RANDOM>(selectedNodes); diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 87083a2c4e..cc3bd8f562 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -551,6 +551,7 @@ protected: template <NKikimrConfig::THiveConfig::EHiveNodeSelectStrategy Strategy> TNodeInfo* SelectNode(const std::vector<TSelectedNode>& selectedNodes); + TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const; public: void AssignTabletGroups(TLeaderTabletInfo& tablet); diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index 2974c99828..bf9fc07c53 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -4986,6 +4986,172 @@ Y_UNIT_TEST_SUITE(THiveTest) { UNIT_ASSERT(storageInfo.ChannelsSize() > 0); } } + + Y_UNIT_TEST(TestHiveBalancerWithSpareNodes) { + static const int NUM_NODES = 6; + static const int NUM_TABLETS = 9; + TTestBasicRuntime runtime(NUM_NODES, false); + runtime.LocationCallback = GetLocation; + Setup(runtime, true); + SendKillLocal(runtime, 0); + SendKillLocal(runtime, 1); + SendKillLocal(runtime, 3); + SendKillLocal(runtime, 4); + SendKillLocal(runtime, 5); + { + TLocalConfig::TPtr local = new TLocalConfig(); + local->TabletClassInfo[TTabletTypes::Dummy].SetupInfo = new TTabletSetupInfo(&CreateFlatDummyTablet, + TMailboxType::Simple, 0, + TMailboxType::Simple, 0); + local->TabletClassInfo[TTabletTypes::Dummy].MaxCount = 2; + CreateLocal(runtime, 0, local); // max 2 dummies on 0 + } + { + TLocalConfig::TPtr local = new TLocalConfig(); + // it can't be empty, otherwise it will fallback to default behavior + local->TabletClassInfo[TTabletTypes::Unknown].SetupInfo = nullptr; + CreateLocal(runtime, 1, local); // no tablets on 1 + } + + // 3, 4 & 5 are spare nodes for Dummy + + for (int i = 3; i != 5; ++i) { + TLocalConfig::TPtr local = new TLocalConfig(); + local->TabletClassInfo[TTabletTypes::Dummy].SetupInfo = new TTabletSetupInfo(&CreateFlatDummyTablet, + TMailboxType::Simple, 0, + TMailboxType::Simple, 0); + local->TabletClassInfo[TTabletTypes::Dummy].MaxCount = 3; + local->TabletClassInfo[TTabletTypes::Dummy].Priority = -1; + CreateLocal(runtime, i, local); + } + + { + TLocalConfig::TPtr local = new TLocalConfig(); + local->TabletClassInfo[TTabletTypes::Dummy].SetupInfo = new TTabletSetupInfo(&CreateFlatDummyTablet, + TMailboxType::Simple, 0, + TMailboxType::Simple, 0); + local->TabletClassInfo[TTabletTypes::Dummy].Priority = -2; + CreateLocal(runtime, 5, local); + } + + const int nodeBase = runtime.GetNodeId(0); + TActorId senderA = runtime.AllocateEdgeActor(); + const ui64 hiveTablet = MakeDefaultHiveID(0); + const ui64 testerTablet = MakeDefaultHiveID(1); + CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive); + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES); + runtime.DispatchEvents(options); + } + for (int nodeIdx = 0; nodeIdx < NUM_NODES; ++nodeIdx) { + TActorId senderLocal = runtime.AllocateEdgeActor(nodeIdx); + THolder<TEvHive::TEvTabletMetrics> ev = MakeHolder<TEvHive::TEvTabletMetrics>(); + ev->Record.MutableTotalResourceUsage()->SetCPU(999); // KIKIMR-9870 + runtime.SendToPipe(hiveTablet, senderLocal, ev.Release(), nodeIdx, GetPipeConfigWithRetries()); + TAutoPtr<IEventHandle> handle; + TEvLocal::TEvTabletMetricsAck* response = runtime.GrabEdgeEvent<TEvLocal::TEvTabletMetricsAck>(handle); + Y_UNUSED(response); + } + + // creating NUM_TABLETS tablets + TTabletTypes::EType tabletType = TTabletTypes::Dummy; + TVector<ui64> tablets; + for (int i = 0; i < NUM_TABLETS; ++i) { + THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS)); + ev->Record.SetObjectId(i); + ev->Record.MutableDataCentersPreference()->AddDataCentersGroups()->AddDataCenter(ToString(1)); + ev->Record.MutableDataCentersPreference()->AddDataCentersGroups()->AddDataCenter(ToString(2)); + ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true); + tablets.emplace_back(tabletId); + MakeSureTabletIsUp(runtime, tabletId, 0); + } + + auto getNodeTablets = [&] { + std::array<int, NUM_NODES> nodeTablets = {}; + runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo()); + TAutoPtr<IEventHandle> handle; + TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle); + for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) { + UNIT_ASSERT_C(((int)tablet.GetNodeID() - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES), + "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase); + nodeTablets[tablet.GetNodeID() - nodeBase]++; + } + + return nodeTablets; + }; + + auto shutdownNode = [&] (ui32 nodeIndex, int expectedDrainMovements) { + const ui32 nodeId = runtime.GetNodeId(nodeIndex); + runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvDrainNode(nodeId)); + TAutoPtr<IEventHandle> handle; + auto drainResponse = runtime.GrabEdgeEventRethrow<TEvHive::TEvDrainNodeResult>(handle, TDuration::Seconds(30)); + UNIT_ASSERT_VALUES_EQUAL(drainResponse->Record.GetStatus(), NKikimrProto::EReplyStatus::OK); + int drainMovements = drainResponse->Record.GetMovements(); + UNIT_ASSERT_VALUES_EQUAL(drainMovements, expectedDrainMovements); + + SendKillLocal(runtime, nodeIndex); + + WaitForEvServerDisconnected(runtime); + + for (TTabletId tabletId : tablets) { + MakeSureTabletIsUp(runtime, tabletId, 0); + } + }; + + auto nodeTablets = getNodeTablets(); + + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 2); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], NUM_TABLETS - 2); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], 0); + + shutdownNode(0, 2); + + nodeTablets = getNodeTablets(); + + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], NUM_TABLETS); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], 0); + + shutdownNode(2, NUM_TABLETS); + + nodeTablets = getNodeTablets(); + + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 3); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 3); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], 3); + + shutdownNode(3, 3); + + nodeTablets = getNodeTablets(); + + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 3); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], NUM_TABLETS - 3); + + shutdownNode(4, 3); + + nodeTablets = getNodeTablets(); + + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[0], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[1], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[2], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[3], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[4], 0); + UNIT_ASSERT_VALUES_EQUAL(nodeTablets[5], NUM_TABLETS); + } } } diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp index 8b6913c981..bd5c0368f4 100644 --- a/ydb/core/mind/hive/node_info.cpp +++ b/ydb/core/mind/hive/node_info.cpp @@ -175,6 +175,15 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat return true; } +i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const { + auto it = TabletAvailability.find(tablet.GetTabletType()); + if (it == TabletAvailability.end()) { + return 0; + } + + return it->second.GetPriority(); +} + bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState) const { if (tablet.IsAliveOnLocal(Local)) { return !IsOverloaded(); diff --git a/ydb/core/mind/hive/node_info.h b/ydb/core/mind/hive/node_info.h index fa7400fd25..d615b44209 100644 --- a/ydb/core/mind/hive/node_info.h +++ b/ydb/core/mind/hive/node_info.h @@ -137,6 +137,7 @@ public: bool IsAllowedToRunTablet(TTabletDebugState* debugState = nullptr) const; bool IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const; bool IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const; + i32 GetPriorityForTablet(const TTabletInfo& tablet) const; ui64 GetMaxTabletsScheduled() const; bool IsAbleToScheduleTablet() const { diff --git a/ydb/core/mind/local.cpp b/ydb/core/mind/local.cpp index 854045bd60..30b4a34aec 100644 --- a/ydb/core/mind/local.cpp +++ b/ydb/core/mind/local.cpp @@ -196,6 +196,7 @@ class TLocalNodeRegistrar : public TActorBootstrapped<TLocalNodeRegistrar> { if (tabletInfo.MaxCount != 0) { tabletAvailability->SetMaxCount(tabletInfo.MaxCount); } + tabletAvailability->SetPriority(tabletInfo.Priority); } NTabletPipe::SendData(ctx, HivePipeClient, request.Release()); diff --git a/ydb/core/mind/local.h b/ydb/core/mind/local.h index 0738bc20f6..466e88dd33 100644 --- a/ydb/core/mind/local.h +++ b/ydb/core/mind/local.h @@ -356,12 +356,14 @@ struct TLocalConfig : public TThrRefBase { struct TTabletClassInfo { TTabletSetupInfo::TPtr SetupInfo; ui64 MaxCount = 0; // maximum allowed number of running tablets, 0 means unlimited + i32 Priority = 0; TTabletClassInfo() {} - TTabletClassInfo(TTabletSetupInfo::TPtr setupInfo) + TTabletClassInfo(TTabletSetupInfo::TPtr setupInfo, i32 priority = 0) : SetupInfo(setupInfo) + , Priority(priority) {} }; diff --git a/ydb/core/protos/local.proto b/ydb/core/protos/local.proto index 8178c4f10a..3ba2f5b24b 100644 --- a/ydb/core/protos/local.proto +++ b/ydb/core/protos/local.proto @@ -11,6 +11,7 @@ option java_package = "ru.yandex.kikimr.proto"; message TTabletAvailability { optional NKikimrTabletBase.TTabletTypes.EType Type = 1; optional uint64 MaxCount = 2; + optional int32 Priority = 3; } message TEvRegisterNode { |