summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- ydb/core/mind/hive/hive.h 1
-rw-r--r-- ydb/core/mind/hive/hive_impl.cpp 61
-rw-r--r-- ydb/core/mind/hive/hive_impl.h 6
-rw-r--r-- ydb/core/mind/hive/hive_ut.cpp 59
-rw-r--r-- ydb/core/mind/hive/monitoring.cpp 2
-rw-r--r-- ydb/core/mind/hive/node_info.cpp 5
-rw-r--r-- ydb/core/mind/hive/node_info.h 4
-rw-r--r-- ydb/core/protos/config.proto 1
8 files changed, 89 insertions, 50 deletions
diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h
index 52bff5dcdf3..b0c13529d14 100644
--- a/ydb/core/mind/hive/hive.h
+++ b/ydb/core/mind/hive/hive.h
@@ -55,6 +55,7 @@ using TResourceRawValues = std::tuple<i64, i64, i64, i64>; // CPU, Memory, Netwo
using TResourceNormalizedValues = std::tuple<double, double, double, double>;
using TOwnerIdxType = NScheme::TPairUi64Ui64;
using TSubActorId = ui64; // = LocalId part of TActorId
+using TDataCenterPriority = std::unordered_map<TDataCenterId, i32>;
static constexpr std::size_t MAX_TABLET_CHANNELS = 256;
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp
index d8f5a075079..26be2c38cae 100644
--- a/ydb/core/mind/hive/hive_impl.cpp
+++ b/ydb/core/mind/hive/hive_impl.cpp
@@ -1176,15 +1176,15 @@ TNodeInfo* THive::SelectNode<NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATE
return itNode->Node;
}
-TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const
+TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const
{
i32 priority = std::numeric_limits<i32>::min();
for (const TSelectedNode& selectedNode : selectedNodes) {
- priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet));
+ priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet, dcPriority));
}
auto it = std::partition(selectedNodes.begin(), selectedNodes.end(), [&] (const TSelectedNode& selectedNode) {
- return selectedNode.Node->GetPriorityForTablet(tablet) == priority;
+ return selectedNode.Node->GetPriorityForTablet(tablet, dcPriority) == priority;
});
selectedNodes.erase(it, selectedNodes.end());
@@ -1279,53 +1279,21 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
}
}
- std::vector<std::vector<TNodeInfo*>> candidateGroups;
- candidateGroups.resize(dataCentersGroups.size() + 1);
- std::unordered_map<TDataCenterId, std::vector<TNodeInfo*>*> indexDC2Group;
+ TDataCenterPriority dcPriority;
for (size_t numGroup = 0; numGroup < dataCentersGroups.size(); ++numGroup) {
const NKikimrHive::TDataCentersGroup* dcGroup = dataCentersGroups[numGroup];
- if (dcGroup->DataCenterSize()) {
- for (TDataCenterId dc : dcGroup->GetDataCenter()) {
- indexDC2Group[dc] = candidateGroups.data() + numGroup;
- }
- } else {
- for (const ui64 dcId : dcGroup->GetDataCenterNum()) {
- indexDC2Group[DataCenterToString(dcId)] = candidateGroups.data() + numGroup;
- }
- }
- }
- for (auto it = Nodes.begin(); it != Nodes.end(); ++it) {
- TNodeInfo* nodeInfo = &it->second;
- if (nodeInfo->IsAlive()) {
- TDataCenterId dataCenterId = nodeInfo->GetDataCenter();
- auto itDataCenter = indexDC2Group.find(dataCenterId);
- if (itDataCenter != indexDC2Group.end()) {
- itDataCenter->second->push_back(nodeInfo);
- } else {
- candidateGroups.back().push_back(nodeInfo);
- }
- } else {
- BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " node " << nodeInfo->Id << " is not alive");
- debugState.NodesDead++;
+        // First group gets largest priority, last group gets +1 priority, dcs not in any groups get 0
+        const i32 groupPriority = static_cast<i32>(dataCentersGroups.size() - numGroup);
+        if (dcGroup->DataCenterSize()) {
+            for (const TDataCenterId& dc : dcGroup->GetDataCenter()) {
+                dcPriority[dc] = groupPriority;
+            }
+        } else {
+            // A group with no named data centers may still list numeric ids;
+            // key those by their string form so the preference is not silently dropped
+            for (const ui64 dcId : dcGroup->GetDataCenterNum()) {
+                dcPriority[DataCenterToString(dcId)] = groupPriority;
+            }
+        }
}
TVector<TSelectedNode> selectedNodes;
+ selectedNodes.reserve(Nodes.size());
bool thereAreNodesWithManyStarts = false;
- for (auto itCandidateNodes = candidateGroups.begin(); itCandidateNodes != candidateGroups.end(); ++itCandidateNodes) {
- const std::vector<TNodeInfo*>& candidateNodes(*itCandidateNodes);
- if (candidateGroups.size() > 1) {
- BLOG_TRACE("[FBN] Tablet " << tablet.ToString()
- << " checking candidates group " << (itCandidateNodes - candidateGroups.begin() + 1)
- << " of " << candidateGroups.size());
- }
-
- selectedNodes.clear();
- selectedNodes.reserve(candidateNodes.size());
-
- for (auto it = candidateNodes.begin(); it != candidateNodes.end(); ++it) {
- TNodeInfo& nodeInfo = *(*it);
+ for (auto& [_, nodeInfo] : Nodes) {
+ if (nodeInfo.IsAlive()) {
if (nodeInfo.IsAllowedToRunTablet(tablet, &debugState)) {
if (nodeInfo.IsAbleToScheduleTablet()) {
if (nodeInfo.IsAbleToRunTablet(tablet, &debugState)) {
@@ -1351,11 +1319,12 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
<< " tablet allowed domains " << tablet.GetNodeFilter().AllowedDomains
<< " tablet effective allowed domains " << tablet.GetNodeFilter().GetEffectiveAllowedDomains());
}
- }
- if (!selectedNodes.empty()) {
- break;
+ } else {
+ BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " node " << nodeInfo.Id << " is not alive");
+ debugState.NodesDead++;
}
}
+
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " selected nodes count " << selectedNodes.size());
if (selectedNodes.empty() && thereAreNodesWithManyStarts) {
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " all available nodes are booting too many tablets");
@@ -1364,7 +1333,7 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
TNodeInfo* selectedNode = nullptr;
if (!selectedNodes.empty()) {
- selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet);
+ selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet, dcPriority);
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " selected max priority nodes count " << selectedNodes.size());
switch (GetNodeSelectStrategy()) {
diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h
index 7e46e79377e..56cc212260a 100644
--- a/ydb/core/mind/hive/hive_impl.h
+++ b/ydb/core/mind/hive/hive_impl.h
@@ -640,7 +640,7 @@ protected:
template <NKikimrConfig::THiveConfig::EHiveNodeSelectStrategy Strategy>
TNodeInfo* SelectNode(const std::vector<TSelectedNode>& selectedNodes);
- TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const;
+ TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const;
public:
void AssignTabletGroups(TLeaderTabletInfo& tablet);
@@ -1002,6 +1002,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
return CurrentConfig.GetMaxPingsInFlight();
}
+ ui64 GetNodeRestartsForPenalty() const { // current value of the NodeRestartsForPenalty setting (proto default: 3)
+ return CurrentConfig.GetNodeRestartsForPenalty(); // used as a divisor in TNodeInfo::GetPriorityForTablet, so 0 is unsafe there
+ }
+
static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static bool IsSystemTablet(TTabletTypes::EType type);
diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp
index 788b6736bb6..4e853768cf8 100644
--- a/ydb/core/mind/hive/hive_ut.cpp
+++ b/ydb/core/mind/hive/hive_ut.cpp
@@ -3810,6 +3810,65 @@ Y_UNIT_TEST_SUITE(THiveTest) {
}
}
+    Y_UNIT_TEST(TestHiveBalancerWithPreferredDC3) {
+        // Tablet prefers DC 1, but the nodes there are constantly crashing
+        // Test that it will be eventually launched in DC 2
+        static const int NUM_NODES = 4;
+        // Upper bound on node restarts: keeps a regression from spinning until the
+        // harness timeout and turns it into an ordinary assertion failure instead
+        static const ui32 MAX_RESTARTS = 100;
+        TTestBasicRuntime runtime(NUM_NODES, false);
+
+        runtime.LocationCallback = GetLocation;
+
+        Setup(runtime, true);
+        const int nodeBase = runtime.GetNodeId(0);
+        TActorId senderA = runtime.AllocateEdgeActor();
+        const ui64 hiveTablet = MakeDefaultHiveID();
+        const ui64 testerTablet = MakeTabletID(false, 1);
+        CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
+        {
+            // Wait until every node's local has reported its status
+            TDispatchOptions options;
+            options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
+            runtime.DispatchEvents(options);
+        }
+
+        TTabletTypes::EType tabletType = TTabletTypes::Dummy;
+        THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500, tabletType, BINDED_CHANNELS));
+        ev->Record.SetFollowerCount(3);
+        // Single preference group containing only DC "1"
+        auto* group = ev->Record.MutableDataCentersPreference()->AddDataCentersGroups();
+        group->AddDataCenter(ToString(1));
+        ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
+        MakeSureTabletIsUp(runtime, tabletId, 0);
+
+        // Asks Hive where the tablet runs; returns the data center of that node,
+        // or nullopt when the tablet is not listed or has no node assigned
+        auto getTabletDC = [&]() -> std::optional<TString> {
+            std::unique_ptr<TEvHive::TEvRequestHiveInfo> request = std::make_unique<TEvHive::TEvRequestHiveInfo>();
+            runtime.SendToPipe(hiveTablet, senderA, request.release());
+            TAutoPtr<IEventHandle> handle;
+            TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
+            for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
+                if (tablet.GetTabletID() == tabletId) {
+                    ui32 nodeId = tablet.GetNodeID();
+                    if (nodeId == 0) {
+                        return std::nullopt;
+                    }
+                    auto location = GetLocation(nodeId - nodeBase);
+                    return location.GetDataCenterId();
+                }
+            }
+            return std::nullopt;
+        };
+
+        UNIT_ASSERT_VALUES_EQUAL(getTabletDC(), "1");
+        std::optional<TString> dc;
+        for (ui32 i = 0; i < MAX_RESTARTS && dc != "2"; ++i) {
+            // restart node in DC 1
+            SendKillLocal(runtime, i % 2);
+            CreateLocal(runtime, i % 2);
+            dc = getTabletDC();
+            Ctest << "tablet is in dc" << dc << Endl;
+        }
+        // Fails with the last observed DC if the tablet never left the crashing data center
+        UNIT_ASSERT_VALUES_EQUAL(dc, "2");
+    }
+
Y_UNIT_TEST(TestHiveFollowersWithChangingDC) {
static const int NUM_NODES = 6;
static const int NUM_TABLETS = 1;
diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp
index becd4522ad2..91fe1caf80d 100644
--- a/ydb/core/mind/hive/monitoring.cpp
+++ b/ydb/core/mind/hive/monitoring.cpp
@@ -848,6 +848,7 @@ public:
UpdateConfig(db, "ScaleInWindowSize", configUpdates);
UpdateConfig(db, "TargetTrackingCPUMargin", configUpdates);
UpdateConfig(db, "DryRunTargetTrackingCPU", configUpdates);
+ UpdateConfig(db, "NodeRestartsForPenalty", configUpdates);
if (params.contains("BalancerIgnoreTabletTypes")) {
auto value = params.Get("BalancerIgnoreTabletTypes");
@@ -1201,6 +1202,7 @@ public:
ShowConfig(out, "ScaleInWindowSize");
ShowConfig(out, "TargetTrackingCPUMargin");
ShowConfig(out, "DryRunTargetTrackingCPU");
+ ShowConfig(out, "NodeRestartsForPenalty");
out << "<div class='row' style='margin-top:40px'>";
out << "<div class='col-sm-2' style='padding-top:30px;text-align:right'><label for='allowedMetrics'>AllowedMetrics:</label></div>";
diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp
index a477c48e709..110233f0e68 100644
--- a/ydb/core/mind/hive/node_info.cpp
+++ b/ydb/core/mind/hive/node_info.cpp
@@ -209,7 +209,7 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat
return true;
}
-i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {
+i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const {
i32 priority = 0;
auto it = TabletAvailability.find(tablet.GetTabletType());
@@ -221,6 +221,9 @@ i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {
--priority;
}
+    // Look the data center up without inserting: operator[] would add a zero entry
+    // for every DC outside the preference groups, which contribute 0 anyway
+    if (const auto itDc = dcPriority.find(GetDataCenter()); itDc != dcPriority.end()) {
+        priority += itDc->second;
+    }
+    // NodeRestartsForPenalty can be changed at runtime; 0 would be an integer
+    // division by zero, so treat it as "restart penalty disabled"
+    if (const ui64 restartsForPenalty = Hive.GetNodeRestartsForPenalty(); restartsForPenalty != 0) {
+        priority -= GetRestartsPerPeriod() / restartsForPenalty;
+    }
+
return priority;
}
diff --git a/ydb/core/mind/hive/node_info.h b/ydb/core/mind/hive/node_info.h
index bf43aa0b00f..504c78aec29 100644
--- a/ydb/core/mind/hive/node_info.h
+++ b/ydb/core/mind/hive/node_info.h
@@ -160,7 +160,7 @@ public:
bool IsAllowedToRunTablet(TTabletDebugState* debugState = nullptr) const;
bool IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const;
bool IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const;
- i32 GetPriorityForTablet(const TTabletInfo& tablet) const;
+ i32 GetPriorityForTablet(const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const;
ui64 GetMaxTabletsScheduled() const;
ui64 GetMaxCountForTabletType(TTabletTypes::EType tabletType) const;
@@ -272,7 +272,7 @@ public:
void UpdateResourceTotalUsage(const NKikimrHive::TEvTabletMetrics& metrics);
void ActualizeNodeStatistics(TInstant now);
- ui64 GetRestartsPerPeriod(TInstant barrier) const;
+ ui64 GetRestartsPerPeriod(TInstant barrier = {}) const;
TDataCenterId GetDataCenter() const {
return Location.GetDataCenterId();
diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto
index 6d0107d4c6d..9783a4507e0 100644
--- a/ydb/core/protos/config.proto
+++ b/ydb/core/protos/config.proto
@@ -1763,6 +1763,7 @@ message THiveConfig {
optional uint64 ScaleInWindowSize = 82 [default = 5]; // buckets
optional double TargetTrackingCPUMargin = 83 [default = 0.1]; // percent
optional double DryRunTargetTrackingCPU = 84; // percent
+ optional uint64 NodeRestartsForPenalty = 85 [default = 3];
}
message TBlobCacheConfig {