diff options
author | zalyalov <zalyalov@yandex-team.com> | 2023-10-24 19:38:39 +0300 |
---|---|---|
committer | zalyalov <zalyalov@yandex-team.com> | 2023-10-24 20:01:15 +0300 |
commit | a37ccbf072fe185a1781f70b8ee55ce9b678dc7d (patch) | |
tree | eb9ea1b20ab5994810e732d65a8601aa4d62ea25 | |
parent | 1c1a58e15405cbb07776a5c15c8530bd06d6f3f5 (diff) | |
download | ydb-a37ccbf072fe185a1781f70b8ee55ce9b678dc7d.tar.gz |
exclude nodes that are not allowed to run tablets from neighbour trigger KIKIMR-19696
-rw-r--r-- | ydb/core/mind/hive/follower_group.h | 9 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive.h | 6 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_domains.cpp | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.cpp | 14 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/leader_tablet_info.cpp | 10 | ||||
-rw-r--r-- | ydb/core/mind/hive/leader_tablet_info.h | 4 | ||||
-rw-r--r-- | ydb/core/mind/hive/monitoring.cpp | 10 | ||||
-rw-r--r-- | ydb/core/mind/hive/node_info.cpp | 95 | ||||
-rw-r--r-- | ydb/core/mind/hive/node_info.h | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/object_distribution.h | 63 | ||||
-rw-r--r-- | ydb/core/mind/hive/object_distribution_ut.cpp | 24 | ||||
-rw-r--r-- | ydb/core/mind/hive/tablet_info.cpp | 18 | ||||
-rw-r--r-- | ydb/core/mind/hive/tablet_info.h | 3 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__create_tablet.cpp | 25 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__kill_node.cpp | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__load_everything.cpp | 12 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__seize_tablets_reply.cpp | 6 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__status.cpp | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__update_tablets_object.cpp | 13 |
20 files changed, 182 insertions, 139 deletions
diff --git a/ydb/core/mind/hive/follower_group.h b/ydb/core/mind/hive/follower_group.h index 33ab3189ffe..68f863e7b34 100644 --- a/ydb/core/mind/hive/follower_group.h +++ b/ydb/core/mind/hive/follower_group.h @@ -12,8 +12,7 @@ struct TFollowerGroup { bool AllowLeaderPromotion = false; bool AllowClientRead = false; bool RequireAllDataCenters = true; - TVector<TNodeId> AllowedNodes; - TVector<TDataCenterId> AllowedDataCenters; + TNodeFilter NodeFilter; bool LocalNodeOnly = true; // run follower on the same node as leader bool RequireDifferentNodes = false; // do not run followers on same nodes as another followers of the same leader bool FollowerCountPerDataCenter = false; // PER_AZ KIKIMR-10443 @@ -35,14 +34,14 @@ struct TFollowerGroup { RequireAllDataCenters = followerGroup.GetRequireAllDataCenters(); { const auto& allowedNodes(followerGroup.GetAllowedNodeIDs()); - std::copy(allowedNodes.begin(), allowedNodes.end(), std::back_inserter(AllowedNodes)); + std::copy(allowedNodes.begin(), allowedNodes.end(), std::back_inserter(NodeFilter.AllowedNodes)); } { if (const auto& x = followerGroup.GetAllowedDataCenters(); !x.empty()) { - AllowedDataCenters.insert(AllowedDataCenters.end(), x.begin(), x.end()); + NodeFilter.AllowedDataCenters.insert(NodeFilter.AllowedDataCenters.end(), x.begin(), x.end()); } else { for (const auto& dataCenterId : followerGroup.GetAllowedDataCenterNumIDs()) { - AllowedDataCenters.push_back(DataCenterToString(dataCenterId)); + NodeFilter.AllowedDataCenters.push_back(DataCenterToString(dataCenterId)); } } } diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index 6e80f782642..79d1d758343 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -269,6 +269,12 @@ struct TBalancerStats { ui64 LastRunMovements = 0; }; +struct TNodeFilter { + TVector<TSubDomainKey> AllowedDomains; + TVector<TNodeId> AllowedNodes; + TVector<TDataCenterId> AllowedDataCenters; +}; + } // NHive } // NKikimr diff --git a/ydb/core/mind/hive/hive_domains.cpp b/ydb/core/mind/hive/hive_domains.cpp index 8cd602d89bb..2b483eafdf5 100644 --- a/ydb/core/mind/hive/hive_domains.cpp +++ b/ydb/core/mind/hive/hive_domains.cpp @@ -6,8 +6,6 @@ namespace NHive { void TDomainsView::RegisterNode(const TNodeInfo& node) { for (auto &domainKey: node.ServicedDomains) { - BLOG_TRACE("Node(" << node.Id << ")" - << " RegisterInDomain (" << domainKey << ") : " << TotalCount[domainKey] << " -> " << TotalCount[domainKey] + 1); ++TotalCount[domainKey]; } } diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 97cb577dfaf..fbe4ad43da7 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -105,6 +105,7 @@ void THive::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev) { if (node != nullptr) { Erase(node->PipeServers, ev->Get()->ServerId); if (node->PipeServers.empty() && node->IsUnknown() && node->CanBeDeleted()) { + ObjectDistributions.RemoveNode(*node); DeleteNode(node->Id); } } @@ -475,7 +476,7 @@ void THive::Handle(TEvPrivate::TEvBootTablets::TPtr&) { << " VolatileState:" << TTabletInfo::EVolatileStateName(tablet.GetVolatileState())); } - for (const auto& domain : tablet.EffectiveAllowedDomains) { + for (const auto& domain : tablet.NodeFilter.AllowedDomains) { SeenDomain(domain); } if (tablet.ObjectDomain) { @@ -1265,7 +1266,7 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet) { << " to run the tablet " << tablet.ToString() << " node domains " << nodeInfo.ServicedDomains << " tablet object domain " << tablet.GetLeader().ObjectDomain - << " tablet allowed domains " << tablet.GetLeader().EffectiveAllowedDomains); + << " tablet allowed domains " << tablet.GetNodeFilter().AllowedDomains); } } if (!selectedNodes.empty()) { @@ -1333,7 +1334,7 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet) { } nodesLeft -= debugState.NodesWithSomeoneFromOurFamily; if (debugState.NodesWithoutDomain == nodesLeft) { - tablet.BootState = TStringBuilder() << "Can't find domain " << tablet.GetLeader().EffectiveAllowedDomains; + tablet.BootState = TStringBuilder() << "Can't find domain " << tablet.GetNodeFilter().AllowedDomains; return TBestNodeResult(true); } nodesLeft -= debugState.NodesWithoutDomain; @@ -2584,14 +2585,14 @@ TDuration THive::GetBalancerCooldown() const { } } -void THive::UpdateObjectCount(TFullObjectId object, TNodeId node, i64 diff) { +void THive::UpdateObjectCount(const TLeaderTabletInfo& tablet, const TNodeInfo& node, i64 diff) { if (!GetSpreadNeighbours()) { return; } - ObjectDistributions.UpdateCount(object, node, diff); + ObjectDistributions.UpdateCountForTablet(tablet, node, diff); TabletCounters->Simple()[NHive::COUNTER_IMBALANCED_OBJECTS].Set(ObjectDistributions.GetImbalancedObjectsCount()); TabletCounters->Simple()[NHive::COUNTER_WORST_OBJECT_VARIANCE].Set(ObjectDistributions.GetWorstObjectVariance()); - BLOG_TRACE("UpdateObjectCount " << "for " << object << " on " << node << " (" << diff << ") ~> Imbalance: " << ObjectDistributions.GetMaxImbalance()); + BLOG_TRACE("UpdateObjectCount " << "for " << tablet.ObjectId << " on " << node.Id << " (" << diff << ") ~> Imbalance: " << ObjectDistributions.GetMaxImbalance()); } ui64 THive::GetObjectImbalance(TFullObjectId object) { @@ -2614,6 +2615,7 @@ THive::THive(TTabletStorageInfo *info, const TActorId &tablet) , PipeClientCache(NTabletPipe::CreateBoundedClientCache(PipeClientCacheConfig)) , PipeTracker(*PipeClientCache) , PipeRetryPolicy() + , ObjectDistributions(Nodes) , ResponsivenessPinger(nullptr) { TabletCountersPtr.Reset(new TProtobufTabletCounters< diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index b91c806454f..05597d13361 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -659,7 +659,7 @@ public: void ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffects); void UpdateTabletFollowersNumber(TLeaderTabletInfo& tablet, NIceDb::TNiceDb& db, TSideEffects& sideEffects); TDuration GetBalancerCooldown() const; - void UpdateObjectCount(TFullObjectId object, TNodeId node, i64 diff); + void UpdateObjectCount(const TLeaderTabletInfo& tablet, const TNodeInfo& node, i64 diff); ui64 GetObjectImbalance(TFullObjectId object); ui32 GetEventPriority(IEventHandle* ev); diff --git a/ydb/core/mind/hive/leader_tablet_info.cpp b/ydb/core/mind/hive/leader_tablet_info.cpp index e2069065a42..04a1892eb57 100644 --- a/ydb/core/mind/hive/leader_tablet_info.cpp +++ b/ydb/core/mind/hive/leader_tablet_info.cpp @@ -64,19 +64,22 @@ TFollowerId TLeaderTabletInfo::GetFollowerPromotableOnNode(TNodeId nodeId) const void TLeaderTabletInfo::AssignDomains(const TSubDomainKey& objectDomain, const TVector<TSubDomainKey>& allowedDomains) { if (!allowedDomains.empty()) { - EffectiveAllowedDomains = allowedDomains; + NodeFilter.AllowedDomains = allowedDomains; if (!objectDomain) { ObjectDomain = allowedDomains.front(); } else { ObjectDomain = objectDomain; } } else if (objectDomain) { - EffectiveAllowedDomains = { objectDomain }; + NodeFilter.AllowedDomains = { objectDomain }; ObjectDomain = objectDomain; } else { - EffectiveAllowedDomains = { Hive.GetRootDomainKey() }; + NodeFilter.AllowedDomains = { Hive.GetRootDomainKey() }; ObjectDomain = { Hive.GetRootDomainKey() }; } + for (auto& followerGroup : FollowerGroups) { + followerGroup.NodeFilter.AllowedDomains = NodeFilter.AllowedDomains; + } } bool TLeaderTabletInfo::InitiateAssignTabletGroups() { @@ -137,6 +140,7 @@ TFollowerGroup& TLeaderTabletInfo::AddFollowerGroup(TFollowerGroupId followerGro } else { followerGroup.Id = followerGroupId; } + followerGroup.NodeFilter.AllowedDomains = NodeFilter.AllowedDomains; return followerGroup; } diff --git a/ydb/core/mind/hive/leader_tablet_info.h b/ydb/core/mind/hive/leader_tablet_info.h index ec43cc2ddc0..a1c3eb3ebbe 100644 --- a/ydb/core/mind/hive/leader_tablet_info.h +++ b/ydb/core/mind/hive/leader_tablet_info.h @@ -30,8 +30,7 @@ public: TTabletTypes::EType Type; TFullObjectId ObjectId; TSubDomainKey ObjectDomain; - TVector<TNodeId> AllowedNodes; - TVector<TDataCenterId> AllowedDataCenters; + TNodeFilter NodeFilter; NKikimrHive::TDataCentersPreference DataCentersPreference; TIntrusivePtr<TTabletStorageInfo> TabletStorageInfo; TChannelsBindings BoundChannels; @@ -42,7 +41,6 @@ public: TList<TFollowerGroup> FollowerGroups; TList<TFollowerTabletInfo> Followers; TOwnerIdxType::TValueType Owner; - TVector<TSubDomainKey> EffectiveAllowedDomains; // AllowedDomains | ObjectDomain NKikimrHive::ETabletBootMode BootMode; TVector<TActorId> StorageInfoSubscribers; TActorId LockedToActor; diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index fe7a8435cf0..b20741d0403 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -3341,8 +3341,8 @@ public: result["AllowLeaderPromotion"] = group.AllowLeaderPromotion; result["AllowClientRead"] = group.AllowClientRead; result["RequireAllDataCenters"] = group.RequireAllDataCenters; - result["AllowedNodes"] = MakeFrom(group.AllowedNodes); - result["AllowedDataCenters"] = MakeFrom(group.AllowedDataCenters); + result["AllowedNodes"] = MakeFrom(group.NodeFilter.AllowedNodes); + result["AllowedDataCenters"] = MakeFrom(group.NodeFilter.AllowedDataCenters); result["LocalNodeOnly"] = group.LocalNodeOnly; result["RequireDifferentNodes"] = group.RequireDifferentNodes; result["FollowerCountPerDataCenter"] = group.FollowerCountPerDataCenter; @@ -3357,8 +3357,8 @@ public: result["Type"] = TTabletTypes::EType_Name(tablet.Type); result["ObjectId"] = TStringBuilder() << tablet.ObjectId; result["ObjectDomain"] = TStringBuilder() << tablet.ObjectDomain; - result["AllowedNodes"] = MakeFrom(tablet.AllowedNodes); - result["AllowedDataCenters"] = MakeFrom(tablet.AllowedDataCenters); + result["AllowedNodes"] = MakeFrom(tablet.NodeFilter.AllowedNodes); + result["AllowedDataCenters"] = MakeFrom(tablet.NodeFilter.AllowedDataCenters); result["DataCenterPreference"] = MakeFrom(tablet.DataCentersPreference); result["TabletStorageInfo"] = MakeFrom(tablet.TabletStorageInfo); result["BoundChannels"] = MakeFrom(tablet.BoundChannels); @@ -3367,7 +3367,7 @@ public: result["KnownGeneration"] = tablet.KnownGeneration; result["BootMode"] = NKikimrHive::ETabletBootMode_Name(tablet.BootMode); result["Owner"] = TStringBuilder() << tablet.Owner; - result["EffectiveAllowedDomain"] = MakeFrom(tablet.EffectiveAllowedDomains); + result["EffectiveAllowedDomain"] = MakeFrom(tablet.NodeFilter.AllowedDomains); result["StorageInfoSubscribers"] = MakeFrom(tablet.StorageInfoSubscribers); result["LockedToActor"] = MakeFrom(tablet.LockedToActor); result["LockedReconnectTimeout"] = tablet.LockedReconnectTimeout.ToString(); diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp index 48b9a77ec09..77d726eaa3e 100644 --- a/ydb/core/mind/hive/node_info.cpp +++ b/ydb/core/mind/hive/node_info.cpp @@ -69,8 +69,8 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV TabletsRunningByType[tablet->GetTabletType()].erase(tablet); TabletsOfObject[tablet->GetObjectId()].erase(tablet); Hive.UpdateCounterTabletsAlive(-1); - if (tablet->HasCounter()) { - Hive.UpdateObjectCount(tablet->GetObjectId(), Id, -1); + if (tablet->HasCounter() && tablet->IsLeader()) { + Hive.UpdateObjectCount(tablet->AsLeader(), *this, -1); } } if (IsResourceDrainingState(newState)) { @@ -84,8 +84,8 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV TabletsRunningByType[tablet->GetTabletType()].emplace(tablet); TabletsOfObject[tablet->GetObjectId()].emplace(tablet); Hive.UpdateCounterTabletsAlive(+1); - if (tablet->HasCounter()) { - Hive.UpdateObjectCount(tablet->GetObjectId(), Id, +1); + if (tablet->HasCounter() && tablet->IsLeader()) { + Hive.UpdateObjectCount(tablet->AsLeader(), *this, +1); } } return true; @@ -101,29 +101,8 @@ void TNodeInfo::UpdateResourceValues(const TTabletInfo* tablet, const NKikimrTab Hive.UpdateTotalResourceValues(this, tablet, before, after, ResourceValues - oldResourceValues, normalizedValues - oldNormalizedValues); } -bool TNodeInfo::IsAllowedToRunTablet(TTabletDebugState* debugState) const { - if (Down) { - if (debugState) { - debugState->NodesDown++; - } - return false; - } - - if (!LocationAcquired) { - if (debugState) { - debugState->NodesWithoutLocation++; - } - return false; - } - return true; -} - -bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState) const { - if (!IsAllowedToRunTablet(debugState)) { - return false; - } - - const TVector<TSubDomainKey>& allowedDomains = tablet.GetLeader().EffectiveAllowedDomains; +bool TNodeInfo::MatchesFilter(const TNodeFilter& filter, TTabletDebugState* debugState) const { + const auto& allowedDomains = filter.AllowedDomains; bool result = false; for (const auto& candidate : allowedDomains) { @@ -143,7 +122,7 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat return false; } - const TVector<TNodeId>& allowedNodes = tablet.GetAllowedNodes(); + const auto& allowedNodes = filter.AllowedNodes; if (!allowedNodes.empty() && std::find(allowedNodes.begin(), allowedNodes.end(), Id) == allowedNodes.end()) { @@ -153,6 +132,48 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat return false; } + const TVector<TDataCenterId>& allowedDataCenters = filter.AllowedDataCenters; + + if (!allowedDataCenters.empty() + && std::find( + allowedDataCenters.begin(), + allowedDataCenters.end(), + GetDataCenter()) == allowedDataCenters.end()) { + if (debugState) { + debugState->NodesInDatacentersNotAllowed++; + } + return false; + } + + return true; +} + +bool TNodeInfo::IsAllowedToRunTablet(TTabletDebugState* debugState) const { + if (Down) { + if (debugState) { + debugState->NodesDown++; + } + return false; + } + + if (!LocationAcquired) { + if (debugState) { + debugState->NodesWithoutLocation++; + } + return false; + } + return true; +} + +bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState) const { + if (!IsAllowedToRunTablet(debugState)) { + return false; + } + + if (!MatchesFilter(tablet.GetNodeFilter(), debugState)) { + return false; + } + if (tablet.IsFollower() && tablet.AsFollower().FollowerGroup.LocalNodeOnly) { const TLeaderTabletInfo& leader = tablet.GetLeader(); if (!leader.IsRunning()) { @@ -169,19 +190,6 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat } } - const TVector<TDataCenterId>& allowedDataCenters = tablet.GetAllowedDataCenters(); - - if (!allowedDataCenters.empty() - && std::find( - allowedDataCenters.begin(), - allowedDataCenters.end(), - GetDataCenter()) == allowedDataCenters.end()) { - if (debugState) { - debugState->NodesInDatacentersNotAllowed++; - } - return false; - } - return true; } @@ -336,7 +344,10 @@ void TNodeInfo::SendReconnect(const TActorId& local) { void TNodeInfo::SetDown(bool down) { Down = down; - if (!Down) { + if (Down) { + Hive.ObjectDistributions.RemoveNode(*this); + } else { + Hive.ObjectDistributions.AddNode(*this); Hive.ProcessWaitQueue(); } } diff --git a/ydb/core/mind/hive/node_info.h b/ydb/core/mind/hive/node_info.h index 603b4c1ab75..2fde2dc505b 100644 --- a/ydb/core/mind/hive/node_info.h +++ b/ydb/core/mind/hive/node_info.h @@ -142,6 +142,7 @@ public: return VolatileState == EVolatileState::Connecting || VolatileState == EVolatileState::Connected; } + bool MatchesFilter(const TNodeFilter& filter, TTabletDebugState* debugState = nullptr) const; bool IsAllowedToRunTablet(TTabletDebugState* debugState = nullptr) const; bool IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const; bool IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const; diff --git a/ydb/core/mind/hive/object_distribution.h b/ydb/core/mind/hive/object_distribution.h index f2814a82055..de02be04a43 100644 --- a/ydb/core/mind/hive/object_distribution.h +++ b/ydb/core/mind/hive/object_distribution.h @@ -2,6 +2,8 @@ #include "hive.h" #include "hive_log.h" +#include "node_info.h" +#include "tablet_info.h" #include <set> #include <unordered_map> @@ -16,8 +18,12 @@ struct TObjectDistribution { const TFullObjectId Id; double Mean = 0; double VarianceNumerator = 0; + TNodeFilter NodeFilter; // We assume all tablets of one object have the same filter - TObjectDistribution(TFullObjectId id) : Id(id) {} + TObjectDistribution(const TLeaderTabletInfo& tablet) : Id(tablet.ObjectId) + , NodeFilter(tablet.NodeFilter) + { + } double GetImbalance() const { if (SortedDistribution.empty()) { @@ -50,8 +56,8 @@ struct TObjectDistribution { Mean = meanWithoutNode; } - void UpdateCount(TNodeId node, i64 diff) { - auto [it, newNode] = Distribution.insert({node, 0}); + void UpdateCount(const TNodeInfo& node, i64 diff) { + auto [it, newNode] = Distribution.insert({node.Id, 0}); i64& value = it->second; i64 numNodes = Distribution.size(); if (!newNode) { @@ -86,9 +92,11 @@ struct TObjectDistributions { std::multiset<TObjectDistribution> SortedDistributions; std::unordered_map<TFullObjectId, std::multiset<TObjectDistribution>::iterator> Distributions; ui64 ImbalancedObjects = 0; - std::unordered_set<TNodeId> Nodes; + const std::unordered_map<TNodeId, TNodeInfo>& Nodes; bool Enabled = true; + TObjectDistributions(const std::unordered_map<TNodeId, TNodeInfo>& nodes) : Nodes(nodes) {} + double GetMaxImbalance() { if (SortedDistributions.empty()) { return 0; @@ -112,9 +120,6 @@ struct TObjectDistributions { i64 maxCnt = *dist.SortedDistribution.rbegin(); TObjectToBalance result(dist.Id); for (const auto& [node, cnt] : dist.Distribution) { - ui64 n = node; - i64 c = cnt; - BLOG_TRACE("Node " << n << "has " << c << ", maximum: " << maxCnt); if (cnt == maxCnt) { result.Nodes.push_back(node); } @@ -162,43 +167,56 @@ struct TObjectDistributions { } - void UpdateCount(TFullObjectId object, TNodeId node, i64 diff) { + bool UpdateCount(TFullObjectId object, const TNodeInfo& node, i64 diff) { + auto updateFunc = [&](TObjectDistribution& dist) { + dist.UpdateCount(node, diff); + }; + return UpdateDistribution(object, updateFunc); + } + + void UpdateCountForTablet(const TLeaderTabletInfo& tablet, const TNodeInfo& node, i64 diff) { if (!Enabled) { return; } - auto updateFunc = [=](TObjectDistribution& dist) { - dist.UpdateCount(node, diff); - }; - if (!UpdateDistribution(object, updateFunc)) { - TObjectDistribution dist(object); - for (auto node : Nodes) { - dist.UpdateCount(node, 0); + if (!node.IsAllowedToRunTablet(tablet) && diff <= 0) { + return; + } + auto object = tablet.ObjectId; + if (!UpdateCount(object, node, diff)) { + if (diff <= 0) { + return; + } + TObjectDistribution dist(tablet); + for (const auto& [nodeId, node] : Nodes) { + if (node.IsAllowedToRunTablet(tablet)) { + dist.UpdateCount(node, 0); + } } dist.UpdateCount(node, diff); auto sortedDistIt = SortedDistributions.insert(std::move(dist)); Distributions.emplace(object, sortedDistIt); - return; } - // std::cerr << object << ": " << diff << " ~>" << GetTotalImbalance() << std::endl; } - void AddNode(TNodeId node) { + void AddNode(const TNodeInfo& node) { if (!Enabled) { return; } - Nodes.insert(node); for (const auto& [obj, it] : Distributions) { UpdateCount(obj, node, 0); } + for (const auto& [obj, tablets] : node.TabletsOfObject) { + UpdateCount(obj, node, tablets.size()); + } } - void RemoveNode(TNodeId node) { + void RemoveNode(const TNodeInfo& node) { if (!Enabled) { return; } - Nodes.erase(node); + TNodeId nodeId = node.Id; auto updateFunc = [=](TObjectDistribution& dist) { - dist.RemoveNode(node); + dist.RemoveNode(nodeId); }; for (auto it = Distributions.begin(); it != Distributions.end();) { UpdateDistribution((it++)->first, updateFunc); @@ -207,7 +225,6 @@ struct TObjectDistributions { void Disable() { Enabled = false; - Nodes.clear(); SortedDistributions.clear(); Distributions.clear(); } diff --git a/ydb/core/mind/hive/object_distribution_ut.cpp b/ydb/core/mind/hive/object_distribution_ut.cpp index d33a58f33e8..94e047c0e81 100644 --- a/ydb/core/mind/hive/object_distribution_ut.cpp +++ b/ydb/core/mind/hive/object_distribution_ut.cpp @@ -1,4 +1,5 @@ #include <library/cpp/testing/unittest/registar.h> +#include "hive_impl.h" #include "object_distribution.h" #include <map> @@ -7,14 +8,17 @@ using namespace NKikimr; using namespace NHive; + Y_UNIT_TEST_SUITE(ObjectDistribuiton) { Y_UNIT_TEST(TestImbalanceCalcualtion) { - TObjectDistributions objectDistributions; - static constexpr size_t NUM_NODES = 8; static constexpr size_t NUM_OBJECTS = 250; static constexpr size_t NUM_OPERATIONS = 10'000; + static constexpr TSubDomainKey TEST_DOMAIN = {1, 1}; + TIntrusivePtr<TTabletStorageInfo> hiveStorage = new TTabletStorageInfo; + hiveStorage->TabletType = TTabletTypes::Hive; + THive hive(hiveStorage.Get(), TActorId()); std::map<std::pair<TNodeId, TFullObjectId>, ui64> trueDistribution; std::mt19937 engine(42); @@ -22,12 +26,20 @@ Y_UNIT_TEST_SUITE(ObjectDistribuiton) { std::uniform_int_distribution<TNodeId> pickNode(0, NUM_NODES - 1); std::bernoulli_distribution subtract(0.2); - for (TNodeId node = 0; node < NUM_NODES; ++node) { - objectDistributions.AddNode(node); + std::unordered_map<TNodeId, TNodeInfo> nodes; + TObjectDistributions objectDistributions(nodes); + for (TNodeId nodeId = 0; nodeId < NUM_NODES; ++nodeId) { + TNodeInfo& node = nodes.emplace(std::piecewise_construct, std::tuple<TNodeId>(nodeId), std::tuple<TNodeId, THive&>(nodeId, hive)).first->second; + node.ServicedDomains.push_back(TEST_DOMAIN); + node.RegisterInDomains(); + node.LocationAcquired = true; } for (size_t i = 0; i < NUM_OPERATIONS; i++) { - TFullObjectId object = {0, pickObject(engine)}; + TLeaderTabletInfo tablet(0, hive); + tablet.AssignDomains(TEST_DOMAIN, {}); + tablet.ObjectId.second = pickObject(engine); + TFullObjectId object = tablet.ObjectId; TNodeId node = pickNode(engine); ui64& curCount = trueDistribution[{node, object}]; i64 diff = 1; @@ -35,7 +47,7 @@ Y_UNIT_TEST_SUITE(ObjectDistribuiton) { diff = -1; } curCount += diff; - objectDistributions.UpdateCount(object, node, diff); + objectDistributions.UpdateCountForTablet(tablet, nodes.at(node), diff); } ui64 imbalancedObjects = 0; diff --git a/ydb/core/mind/hive/tablet_info.cpp b/ydb/core/mind/hive/tablet_info.cpp index bf0950ef11f..d7c162fe0ae 100644 --- a/ydb/core/mind/hive/tablet_info.cpp +++ b/ydb/core/mind/hive/tablet_info.cpp @@ -376,8 +376,8 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics if (Node != nullptr) { Node->UpdateResourceValues(this, before, after); i64 deltaCounter = counterAfter - counterBefore; - if (deltaCounter != 0) { - Hive.UpdateObjectCount(GetObjectId(), Node->Id, deltaCounter); + if (deltaCounter != 0 && IsLeader()) { + Hive.UpdateObjectCount(AsLeader(), *Node, deltaCounter); } } } @@ -446,19 +446,11 @@ void TTabletInfo::ActualizeCounter() { ResourceValues.SetCounter(value); } -const TVector<TNodeId>& TTabletInfo::GetAllowedNodes() const { +const TNodeFilter& TTabletInfo::GetNodeFilter() const { if (IsLeader()) { - return AsLeader().AllowedNodes; + return AsLeader().NodeFilter; } else { - return AsFollower().FollowerGroup.AllowedNodes; - } -} - -const TVector<TDataCenterId>& TTabletInfo::GetAllowedDataCenters() const { - if (IsLeader()) { - return AsLeader().AllowedDataCenters; - } else { - return AsFollower().FollowerGroup.AllowedDataCenters; + return AsFollower().FollowerGroup.NodeFilter; } } diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index c32ae342eba..89ed766a052 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -269,8 +269,7 @@ public: PostponedStart = nextStart; } - const TVector<TNodeId>& GetAllowedNodes() const; - const TVector<TDataCenterId>& GetAllowedDataCenters() const; + const TNodeFilter& GetNodeFilter() const; bool InitiateStart(TNodeInfo* node); const NKikimrTabletBase::TMetrics& GetResourceValues() const { diff --git a/ydb/core/mind/hive/tx__create_tablet.cpp b/ydb/core/mind/hive/tx__create_tablet.cpp index 6e73801529f..27660b5dfd9 100644 --- a/ydb/core/mind/hive/tx__create_tablet.cpp +++ b/ydb/core/mind/hive/tx__create_tablet.cpp @@ -265,8 +265,8 @@ public: db.Table<Schema::Tablet>().Key(TabletId).Update<Schema::Tablet::State>(tablet->State); tablet->ActorsToNotify.push_back(Sender); db.Table<Schema::Tablet>().Key(TabletId).Update<Schema::Tablet::ActorsToNotify>(tablet->ActorsToNotify); - tablet->AllowedNodes = AllowedNodeIds; - db.Table<Schema::Tablet>().Key(TabletId).Update<Schema::Tablet::AllowedNodes>(tablet->AllowedNodes); + tablet->NodeFilter.AllowedNodes = AllowedNodeIds; + db.Table<Schema::Tablet>().Key(TabletId).Update<Schema::Tablet::AllowedNodes>(tablet->NodeFilter.AllowedNodes); tablet->AssignDomains(ObjectDomain, AllowedDomains); db.Table<Schema::Tablet>().Key(TabletId).Update<Schema::Tablet::ObjectDomain>(ObjectDomain); db.Table<Schema::Tablet>().Key(TabletId).Update<Schema::Tablet::AllowedDomains>(AllowedDomains); @@ -292,16 +292,16 @@ public: *followerGroup = srcFollowerGroup; TVector<ui32> allowedDataCenters; - for (const TDataCenterId& dc : followerGroup->AllowedDataCenters) { + for (const TDataCenterId& dc : followerGroup->NodeFilter.AllowedDataCenters) { allowedDataCenters.push_back(DataCenterFromString(dc)); } db.Table<Schema::TabletFollowerGroup>().Key(TabletId, followerGroup->Id).Update( NIceDb::TUpdate<Schema::TabletFollowerGroup::FollowerCount>(followerGroup->GetRawFollowerCount()), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowLeaderPromotion>(followerGroup->AllowLeaderPromotion), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowClientRead>(followerGroup->AllowClientRead), - NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedNodes>(followerGroup->AllowedNodes), + NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedNodes>(followerGroup->NodeFilter.AllowedNodes), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenters>(allowedDataCenters), - NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenterIds>(followerGroup->AllowedDataCenters), + NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenterIds>(followerGroup->NodeFilter.AllowedDataCenters), NIceDb::TUpdate<Schema::TabletFollowerGroup::RequireAllDataCenters>(followerGroup->RequireAllDataCenters), NIceDb::TUpdate<Schema::TabletFollowerGroup::FollowerCountPerDataCenter>(followerGroup->FollowerCountPerDataCenter), NIceDb::TUpdate<Schema::TabletFollowerGroup::RequireDifferentNodes>(followerGroup->RequireDifferentNodes)); @@ -366,9 +366,8 @@ public: tablet.KnownGeneration = 0; // because we will increase it on start tablet.State = ETabletState::GroupAssignment; tablet.ActorsToNotify.push_back(Sender); - tablet.AllowedNodes = AllowedNodeIds; + tablet.NodeFilter = {.AllowedNodes = AllowedNodeIds, .AllowedDataCenters = AllowedDataCenterIds}; tablet.Owner = ownerIdx; - tablet.AllowedDataCenters = AllowedDataCenterIds; tablet.DataCentersPreference = DataCentersPreference; tablet.BootMode = BootMode; tablet.ObjectId = {OwnerId, ObjectId}; @@ -377,7 +376,7 @@ public: tablet.BalancerPolicy = BalancerPolicy; TVector<ui32> allowedDataCenters; - for (const TDataCenterId& dc : tablet.AllowedDataCenters) { + for (const TDataCenterId& dc : tablet.NodeFilter.AllowedDataCenters) { allowedDataCenters.push_back(DataCenterFromString(dc)); } db.Table<Schema::Tablet>().Key(TabletId).Update(NIceDb::TUpdate<Schema::Tablet::Owner>(tablet.Owner), @@ -386,9 +385,9 @@ public: NIceDb::TUpdate<Schema::Tablet::KnownGeneration>(tablet.KnownGeneration), NIceDb::TUpdate<Schema::Tablet::State>(tablet.State), NIceDb::TUpdate<Schema::Tablet::ActorsToNotify>(TVector<TActorId>(1, Sender)), - NIceDb::TUpdate<Schema::Tablet::AllowedNodes>(tablet.AllowedNodes), + NIceDb::TUpdate<Schema::Tablet::AllowedNodes>(tablet.NodeFilter.AllowedNodes), NIceDb::TUpdate<Schema::Tablet::AllowedDataCenters>(allowedDataCenters), - NIceDb::TUpdate<Schema::Tablet::AllowedDataCenterIds>(tablet.AllowedDataCenters), + NIceDb::TUpdate<Schema::Tablet::AllowedDataCenterIds>(tablet.NodeFilter.AllowedDataCenters), NIceDb::TUpdate<Schema::Tablet::DataCentersPreference>(tablet.DataCentersPreference), NIceDb::TUpdate<Schema::Tablet::AllowedDomains>(AllowedDomains), NIceDb::TUpdate<Schema::Tablet::BootMode>(tablet.BootMode), @@ -450,16 +449,16 @@ public: followerGroup = srcFollowerGroup; TVector<ui32> allowedDataCenters; - for (const TDataCenterId& dc : followerGroup.AllowedDataCenters) { + for (const TDataCenterId& dc : followerGroup.NodeFilter.AllowedDataCenters) { allowedDataCenters.push_back(DataCenterFromString(dc)); } db.Table<Schema::TabletFollowerGroup>().Key(TabletId, followerGroup.Id).Update( NIceDb::TUpdate<Schema::TabletFollowerGroup::FollowerCount>(followerGroup.GetRawFollowerCount()), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowLeaderPromotion>(followerGroup.AllowLeaderPromotion), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowClientRead>(followerGroup.AllowClientRead), - NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedNodes>(followerGroup.AllowedNodes), + NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedNodes>(followerGroup.NodeFilter.AllowedNodes), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenters>(allowedDataCenters), - NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenterIds>(followerGroup.AllowedDataCenters), + NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenterIds>(followerGroup.NodeFilter.AllowedDataCenters), NIceDb::TUpdate<Schema::TabletFollowerGroup::RequireAllDataCenters>(followerGroup.RequireAllDataCenters), NIceDb::TUpdate<Schema::TabletFollowerGroup::FollowerCountPerDataCenter>(followerGroup.FollowerCountPerDataCenter)); } diff --git a/ydb/core/mind/hive/tx__kill_node.cpp b/ydb/core/mind/hive/tx__kill_node.cpp index 8cc06e642f6..e773bfc9fb4 100644 --- a/ydb/core/mind/hive/tx__kill_node.cpp +++ b/ydb/core/mind/hive/tx__kill_node.cpp @@ -51,6 +51,7 @@ public: SideEffects.Send(pipeServer, new TEvents::TEvPoisonPill()); } node->PipeServers.clear(); + Self->ObjectDistributions.RemoveNode(*node); if (node->CanBeDeleted()) { db.Table<Schema::Node>().Key(NodeId).Delete(); Self->DeleteNode(NodeId); @@ -58,7 +59,6 @@ public: db.Table<Schema::Node>().Key(NodeId).Update<Schema::Node::Local>(TActorId()); } } - Self->ObjectDistributions.RemoveNode(NodeId); return true; } diff --git a/ydb/core/mind/hive/tx__load_everything.cpp b/ydb/core/mind/hive/tx__load_everything.cpp index b56a7df7531..73525a3752b 100644 --- a/ydb/core/mind/hive/tx__load_everything.cpp +++ b/ydb/core/mind/hive/tx__load_everything.cpp @@ -382,15 +382,15 @@ public: Self->ObjectToTabletMetrics[tablet.ObjectId].IncreaseCount(); Self->TabletTypeToTabletMetrics[tablet.Type].IncreaseCount(); - tablet.AllowedNodes = tabletRowset.GetValue<Schema::Tablet::AllowedNodes>(); + tablet.NodeFilter.AllowedNodes = tabletRowset.GetValue<Schema::Tablet::AllowedNodes>(); if (tabletRowset.HaveValue<Schema::Tablet::AllowedDataCenters>()) { // this is priority format due to migration issues; when migration is complete, this code will // be removed for (const ui32 dcId : tabletRowset.GetValue<Schema::Tablet::AllowedDataCenters>()) { - tablet.AllowedDataCenters.push_back(DataCenterToString(dcId)); + tablet.NodeFilter.AllowedDataCenters.push_back(DataCenterToString(dcId)); } } else { - tablet.AllowedDataCenters = tabletRowset.GetValueOrDefault<Schema::Tablet::AllowedDataCenterIds>(); + tablet.NodeFilter.AllowedDataCenters = tabletRowset.GetValueOrDefault<Schema::Tablet::AllowedDataCenterIds>(); } tablet.DataCentersPreference = tabletRowset.GetValueOrDefault<Schema::Tablet::DataCentersPreference>(); TVector<TSubDomainKey> allowedDomains = tabletRowset.GetValueOrDefault<Schema::Tablet::AllowedDomains>(); @@ -549,16 +549,16 @@ public: followerGroup.SetFollowerCount(tabletFollowerGroupRowset.GetValue<Schema::TabletFollowerGroup::FollowerCount>()); followerGroup.AllowLeaderPromotion = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::AllowLeaderPromotion>(); followerGroup.AllowClientRead = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::AllowClientRead>(); - followerGroup.AllowedNodes = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::AllowedNodes>(); + followerGroup.NodeFilter.AllowedNodes = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::AllowedNodes>(); if (tabletFollowerGroupRowset.HaveValue<Schema::TabletFollowerGroup::AllowedDataCenters>()) { // this is priority format due to migration issues; when migration is complete, this code will // be removed for (const ui32 dcId : tabletFollowerGroupRowset.GetValue<Schema::TabletFollowerGroup::AllowedDataCenters>()) { - followerGroup.AllowedDataCenters.push_back(DataCenterToString(dcId)); + followerGroup.NodeFilter.AllowedDataCenters.push_back(DataCenterToString(dcId)); } } else { - followerGroup.AllowedDataCenters = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::AllowedDataCenterIds>(); + followerGroup.NodeFilter.AllowedDataCenters = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::AllowedDataCenterIds>(); } followerGroup.RequireAllDataCenters = tabletFollowerGroupRowset.GetValueOrDefault<Schema::TabletFollowerGroup::RequireAllDataCenters>(); diff --git a/ydb/core/mind/hive/tx__seize_tablets_reply.cpp b/ydb/core/mind/hive/tx__seize_tablets_reply.cpp index 15114e8acfe..c32074b86fd 100644 --- a/ydb/core/mind/hive/tx__seize_tablets_reply.cpp +++ b/ydb/core/mind/hive/tx__seize_tablets_reply.cpp @@ -116,16 +116,16 @@ public: followerGroup = protoFollowerGroup; TVector<ui32> allowedDataCenters; - for (const TDataCenterId& dc : followerGroup.AllowedDataCenters) { + for (const TDataCenterId& dc : followerGroup.NodeFilter.AllowedDataCenters) { allowedDataCenters.push_back(DataCenterFromString(dc)); } db.Table<Schema::TabletFollowerGroup>().Key(tabletId, followerGroup.Id).Update( NIceDb::TUpdate<Schema::TabletFollowerGroup::FollowerCount>(followerGroup.GetRawFollowerCount()), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowLeaderPromotion>(followerGroup.AllowLeaderPromotion), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowClientRead>(followerGroup.AllowClientRead), - NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedNodes>(followerGroup.AllowedNodes), + NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedNodes>(followerGroup.NodeFilter.AllowedNodes), NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenters>(allowedDataCenters), - NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenterIds>(followerGroup.AllowedDataCenters), + NIceDb::TUpdate<Schema::TabletFollowerGroup::AllowedDataCenterIds>(followerGroup.NodeFilter.AllowedDataCenters), NIceDb::TUpdate<Schema::TabletFollowerGroup::RequireAllDataCenters>(followerGroup.RequireAllDataCenters), NIceDb::TUpdate<Schema::TabletFollowerGroup::RequireDifferentNodes>(followerGroup.RequireDifferentNodes), NIceDb::TUpdate<Schema::TabletFollowerGroup::FollowerCountPerDataCenter>(followerGroup.FollowerCountPerDataCenter)); diff --git a/ydb/core/mind/hive/tx__status.cpp b/ydb/core/mind/hive/tx__status.cpp index 95229f65b21..f1e4f7a9d85 100644 --- a/ydb/core/mind/hive/tx__status.cpp +++ b/ydb/core/mind/hive/tx__status.cpp @@ -47,7 +47,7 @@ public: BLOG_D("THive::TTxStatus(" << nodeId << ")::Complete - continuing node drain"); Self->StartHiveDrain(nodeId, {.Persist = true, .KeepDown = node.Down}); } - Self->ObjectDistributions.AddNode(nodeId); + Self->ObjectDistributions.AddNode(node); } else { BLOG_W("THive::TTxStatus(status=" << static_cast<int>(status) << " node=" << TNodeInfo::EVolatileStateName(node.GetVolatileState()) << ") - killing node " << node.Id); diff --git a/ydb/core/mind/hive/tx__update_tablets_object.cpp b/ydb/core/mind/hive/tx__update_tablets_object.cpp index 83f45ab3ca2..763ec3b3216 100644 --- a/ydb/core/mind/hive/tx__update_tablets_object.cpp +++ b/ydb/core/mind/hive/tx__update_tablets_object.cpp @@ -31,8 +31,17 @@ public: if (tablet == nullptr) { continue; } + auto node = tablet->GetNode(); auto oldObject = tablet->GetObjectId(); + + if (tablet->HasCounter() && node != nullptr) { + Self->UpdateObjectCount(*tablet, *node, -1); + } tablet->ObjectId.second = objectId; + if (tablet->HasCounter() && node != nullptr) { + Self->UpdateObjectCount(*tablet, *node, +1); + } + auto newObject = tablet->GetObjectId(); // It should be the same on every iteration if (oldObject == newObject) { continue; @@ -56,10 +65,6 @@ public: if (auto node = tablet->GetNode(); node != nullptr) { node->TabletsOfObject[oldObject].erase(tablet); node->TabletsOfObject[newObject].emplace(tablet); - if (tablet->HasCounter()) { - Self->UpdateObjectCount(oldObject, node->Id, -1); - Self->UpdateObjectCount(newObject, node->Id, +1); - } } db.Table<Schema::Tablet>().Key(tabletId).Update<Schema::Tablet::ObjectID>(objectId); |