diff options
| -rw-r--r-- | ydb/core/mind/hive/hive.h | 1 | ||||
| -rw-r--r-- | ydb/core/mind/hive/hive_impl.cpp | 61 | ||||
| -rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 6 | ||||
| -rw-r--r-- | ydb/core/mind/hive/hive_ut.cpp | 59 | ||||
| -rw-r--r-- | ydb/core/mind/hive/monitoring.cpp | 2 | ||||
| -rw-r--r-- | ydb/core/mind/hive/node_info.cpp | 5 | ||||
| -rw-r--r-- | ydb/core/mind/hive/node_info.h | 4 | ||||
| -rw-r--r-- | ydb/core/protos/config.proto | 1 |
8 files changed, 89 insertions, 50 deletions
diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index 52bff5dcdf3..b0c13529d14 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -55,6 +55,7 @@ using TResourceRawValues = std::tuple<i64, i64, i64, i64>; // CPU, Memory, Netwo using TResourceNormalizedValues = std::tuple<double, double, double, double>; using TOwnerIdxType = NScheme::TPairUi64Ui64; using TSubActorId = ui64; // = LocalId part of TActorId +using TDataCenterPriority = std::unordered_map<TDataCenterId, i32>; static constexpr std::size_t MAX_TABLET_CHANNELS = 256; diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index d8f5a075079..26be2c38cae 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -1176,15 +1176,15 @@ TNodeInfo* THive::SelectNode<NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATE return itNode->Node; } -TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const +TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const { i32 priority = std::numeric_limits<i32>::min(); for (const TSelectedNode& selectedNode : selectedNodes) { - priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet)); + priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet, dcPriority)); } auto it = std::partition(selectedNodes.begin(), selectedNodes.end(), [&] (const TSelectedNode& selectedNode) { - return selectedNode.Node->GetPriorityForTablet(tablet) == priority; + return selectedNode.Node->GetPriorityForTablet(tablet, dcPriority) == priority; }); selectedNodes.erase(it, selectedNodes.end()); @@ -1279,53 +1279,21 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su } } - std::vector<std::vector<TNodeInfo*>> candidateGroups; - candidateGroups.resize(dataCentersGroups.size() + 1); - std::unordered_map<TDataCenterId, std::vector<TNodeInfo*>*> indexDC2Group; + TDataCenterPriority dcPriority; for (size_t numGroup = 0; numGroup < dataCentersGroups.size(); ++numGroup) { const NKikimrHive::TDataCentersGroup* dcGroup = dataCentersGroups[numGroup]; - if (dcGroup->DataCenterSize()) { - for (TDataCenterId dc : dcGroup->GetDataCenter()) { - indexDC2Group[dc] = candidateGroups.data() + numGroup; - } - } else { - for (const ui64 dcId : dcGroup->GetDataCenterNum()) { - indexDC2Group[DataCenterToString(dcId)] = candidateGroups.data() + numGroup; - } - } - } - for (auto it = Nodes.begin(); it != Nodes.end(); ++it) { - TNodeInfo* nodeInfo = &it->second; - if (nodeInfo->IsAlive()) { - TDataCenterId dataCenterId = nodeInfo->GetDataCenter(); - auto itDataCenter = indexDC2Group.find(dataCenterId); - if (itDataCenter != indexDC2Group.end()) { - itDataCenter->second->push_back(nodeInfo); - } else { - candidateGroups.back().push_back(nodeInfo); - } - } else { - BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " node " << nodeInfo->Id << " is not alive"); - debugState.NodesDead++; + for (TDataCenterId dc : dcGroup->GetDataCenter()) { + // First group gets largest priority, last group gets +1 priority, dcs not in any groups get 0 + dcPriority[dc] = dataCentersGroups.size() - numGroup; } } TVector<TSelectedNode> selectedNodes; + selectedNodes.reserve(Nodes.size()); bool thereAreNodesWithManyStarts = false; - for (auto itCandidateNodes = candidateGroups.begin(); itCandidateNodes != candidateGroups.end(); ++itCandidateNodes) { - const std::vector<TNodeInfo*>& candidateNodes(*itCandidateNodes); - if (candidateGroups.size() > 1) { - BLOG_TRACE("[FBN] Tablet " << tablet.ToString() - << " checking candidates group " << (itCandidateNodes - candidateGroups.begin() + 1) - << " of " << candidateGroups.size()); - } - - selectedNodes.clear(); - selectedNodes.reserve(candidateNodes.size()); - - for (auto it = candidateNodes.begin(); it != candidateNodes.end(); ++it) { - TNodeInfo& nodeInfo = *(*it); + for (auto& [_, nodeInfo] : Nodes) { + if (nodeInfo.IsAlive()) { if (nodeInfo.IsAllowedToRunTablet(tablet, &debugState)) { if (nodeInfo.IsAbleToScheduleTablet()) { if (nodeInfo.IsAbleToRunTablet(tablet, &debugState)) { @@ -1351,11 +1319,12 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su << " tablet allowed domains " << tablet.GetNodeFilter().AllowedDomains << " tablet effective allowed domains " << tablet.GetNodeFilter().GetEffectiveAllowedDomains()); } - } - if (!selectedNodes.empty()) { - break; + } else { + BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " node " << nodeInfo.Id << " is not alive"); + debugState.NodesDead++; } } + BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " selected nodes count " << selectedNodes.size()); if (selectedNodes.empty() && thereAreNodesWithManyStarts) { BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " all available nodes are booting too many tablets"); @@ -1364,7 +1333,7 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su TNodeInfo* selectedNode = nullptr; if (!selectedNodes.empty()) { - selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet); + selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet, dcPriority); BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " selected max priority nodes count " << selectedNodes.size()); switch (GetNodeSelectStrategy()) { diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 7e46e79377e..56cc212260a 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -640,7 +640,7 @@ protected: template <NKikimrConfig::THiveConfig::EHiveNodeSelectStrategy Strategy> TNodeInfo* SelectNode(const std::vector<TSelectedNode>& selectedNodes); - TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const; + TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const; public: void AssignTabletGroups(TLeaderTabletInfo& tablet); @@ -1002,6 +1002,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId return CurrentConfig.GetMaxPingsInFlight(); } + ui64 GetNodeRestartsForPenalty() const { + return CurrentConfig.GetNodeRestartsForPenalty(); + } + static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier); static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier); static bool IsSystemTablet(TTabletTypes::EType type); diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index 788b6736bb6..4e853768cf8 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -3810,6 +3810,65 @@ Y_UNIT_TEST_SUITE(THiveTest) { } } + Y_UNIT_TEST(TestHiveBalancerWithPreferredDC3) { + // Tablet prefers DC 1, but the nodes there are constantly crashing + // Test that it will be eventually launched in DC 2 + static const int NUM_NODES = 4; + TTestBasicRuntime runtime(NUM_NODES, false); + + runtime.LocationCallback = GetLocation; + + Setup(runtime, true); + const int nodeBase = runtime.GetNodeId(0); + TActorId senderA = runtime.AllocateEdgeActor(); + const ui64 hiveTablet = MakeDefaultHiveID(); + const ui64 testerTablet = MakeTabletID(false, 1); + CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive); + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES); + runtime.DispatchEvents(options); + } + + TTabletTypes::EType tabletType = TTabletTypes::Dummy; + THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500, tabletType, BINDED_CHANNELS)); + ev->Record.SetFollowerCount(3); + auto* group = ev->Record.MutableDataCentersPreference()->AddDataCentersGroups(); + group->AddDataCenter(ToString(1)); + ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true); + MakeSureTabletIsUp(runtime, tabletId, 0); + + auto getTabletDC = [&]() -> std::optional<TString> { + std::unique_ptr<TEvHive::TEvRequestHiveInfo> request = std::make_unique<TEvHive::TEvRequestHiveInfo>(); + runtime.SendToPipe(hiveTablet, senderA, request.release()); + TAutoPtr<IEventHandle> handle; + TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle); + for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) { + if (tablet.GetTabletID() == tabletId) { + ui32 nodeId = tablet.GetNodeID(); + if (nodeId == 0) { + return std::nullopt; + } + auto location = GetLocation(nodeId - nodeBase); + return location.GetDataCenterId(); + } + } + return std::nullopt; + }; + + UNIT_ASSERT_VALUES_EQUAL(getTabletDC(), "1"); + for (ui32 i = 0;; ++i) { + // restart node in DC 1 + SendKillLocal(runtime, i % 2); + CreateLocal(runtime, i % 2); + auto dc = getTabletDC(); + Ctest << "tablet is in dc" << dc << Endl; + if (dc == "2") { + break; + } + } + } + Y_UNIT_TEST(TestHiveFollowersWithChangingDC) { static const int NUM_NODES = 6; static const int NUM_TABLETS = 1; diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index becd4522ad2..91fe1caf80d 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -848,6 +848,7 @@ public: UpdateConfig(db, "ScaleInWindowSize", configUpdates); UpdateConfig(db, "TargetTrackingCPUMargin", configUpdates); UpdateConfig(db, "DryRunTargetTrackingCPU", configUpdates); + UpdateConfig(db, "NodeRestartsForPenalty", configUpdates); if (params.contains("BalancerIgnoreTabletTypes")) { auto value = params.Get("BalancerIgnoreTabletTypes"); @@ -1201,6 +1202,7 @@ public: ShowConfig(out, "ScaleInWindowSize"); ShowConfig(out, "TargetTrackingCPUMargin"); ShowConfig(out, "DryRunTargetTrackingCPU"); + ShowConfig(out, "NodeRestartsForPenalty"); out << "<div class='row' style='margin-top:40px'>"; out << "<div class='col-sm-2' style='padding-top:30px;text-align:right'><label for='allowedMetrics'>AllowedMetrics:</label></div>"; diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp index a477c48e709..110233f0e68 100644 --- a/ydb/core/mind/hive/node_info.cpp +++ b/ydb/core/mind/hive/node_info.cpp @@ -209,7 +209,7 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat return true; } -i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const { +i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const { i32 priority = 0; auto it = TabletAvailability.find(tablet.GetTabletType()); @@ -221,6 +221,9 @@ i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const { --priority; } + priority += dcPriority[GetDataCenter()]; + priority -= GetRestartsPerPeriod() / Hive.GetNodeRestartsForPenalty(); + return priority; } diff --git a/ydb/core/mind/hive/node_info.h b/ydb/core/mind/hive/node_info.h index bf43aa0b00f..504c78aec29 100644 --- a/ydb/core/mind/hive/node_info.h +++ b/ydb/core/mind/hive/node_info.h @@ -160,7 +160,7 @@ public: bool IsAllowedToRunTablet(TTabletDebugState* debugState = nullptr) const; bool IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const; bool IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const; - i32 GetPriorityForTablet(const TTabletInfo& tablet) const; + i32 GetPriorityForTablet(const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const; ui64 GetMaxTabletsScheduled() const; ui64 GetMaxCountForTabletType(TTabletTypes::EType tabletType) const; @@ -272,7 +272,7 @@ public: void UpdateResourceTotalUsage(const NKikimrHive::TEvTabletMetrics& metrics); void ActualizeNodeStatistics(TInstant now); - ui64 GetRestartsPerPeriod(TInstant barrier) const; + ui64 GetRestartsPerPeriod(TInstant barrier = {}) const; TDataCenterId GetDataCenter() const { return Location.GetDataCenterId(); diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index 6d0107d4c6d..9783a4507e0 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1763,6 +1763,7 @@ message THiveConfig { optional uint64 ScaleInWindowSize = 82 [default = 5]; // buckets optional double TargetTrackingCPUMargin = 83 [default = 0.1]; // percent optional double DryRunTargetTrackingCPU = 84; // percent + optional uint64 NodeRestartsForPenalty = 85 [default = 3]; } message TBlobCacheConfig { |
