diff options
author | zalyalov <zalyalov@yandex-team.com> | 2023-10-05 15:43:22 +0300 |
---|---|---|
committer | zalyalov <zalyalov@yandex-team.com> | 2023-10-05 16:04:31 +0300 |
commit | d0e657937bb5c52a909ee0997d9b244d8db7d0d5 (patch) | |
tree | 7d680b2fbdf3edad528b2c6f290927ad72776777 | |
parent | c322e8b8a6f08fe2b706f546384716302a305d32 (diff) | |
download | ydb-d0e657937bb5c52a909ee0997d9b244d8db7d0d5.tar.gz |
attempt to better integrate neighbour balancing
-rw-r--r-- | ydb/core/mind/hive/balancer.cpp | 32 | ||||
-rw-r--r-- | ydb/core/mind/hive/drain.cpp | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/fill.cpp | 3 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive.cpp | 7 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive.h | 24 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.cpp | 94 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 13 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_statics.cpp | 54 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_ut.cpp | 10 | ||||
-rw-r--r-- | ydb/core/mind/hive/monitoring.cpp | 926 | ||||
-rw-r--r-- | ydb/core/mind/hive/node_info.cpp | 8 | ||||
-rw-r--r-- | ydb/core/mind/hive/object_distribution.h | 145 | ||||
-rw-r--r-- | ydb/core/mind/hive/object_distribution_ut.cpp | 23 | ||||
-rw-r--r-- | ydb/core/mind/hive/tablet_info.cpp | 13 | ||||
-rw-r--r-- | ydb/core/mind/hive/tablet_info.h | 5 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__kill_node.cpp | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__load_everything.cpp | 6 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__status.cpp | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/tx__update_tablets_object.cpp | 6 | ||||
-rw-r--r-- | ydb/core/protos/config.proto | 11 |
20 files changed, 842 insertions, 542 deletions
diff --git a/ydb/core/mind/hive/balancer.cpp b/ydb/core/mind/hive/balancer.cpp index 37050c6c9a6..6ec197906e0 100644 --- a/ydb/core/mind/hive/balancer.cpp +++ b/ydb/core/mind/hive/balancer.cpp @@ -120,14 +120,18 @@ protected: ui64 KickInFlight; int Movements; TBalancerSettings Settings; + TBalancerStats& Stats; TString GetLogPrefix() const { return Hive->GetLogPrefix(); } void PassAway() override { - Hive->BalancerProgress = -1; BLOG_I("Balancer finished with " << Movements << " movements made"); + Stats.TotalRuns++; + Stats.TotalMovements += Movements; + Stats.LastRunMovements = Movements; + Stats.IsRunningNow = false; Hive->RemoveSubActor(this); if (Movements == 0) { Hive->TabletCounters->Cumulative()[NHive::COUNTER_BALANCER_FAILED].Increment(1); @@ -159,15 +163,7 @@ protected: } void UpdateProgress() { - if (Settings.MaxMovements != 0) { - Hive->BalancerProgress = Movements * 100 / Settings.MaxMovements; - } else { - if (Hive->TabletsTotal != 0) { - Hive->BalancerProgress = Movements * 100 / Hive->TabletsTotal; - } else { - Hive->BalancerProgress = 0; - } - } + Stats.CurrentMovements = Movements; } void KickNextTablet() { @@ -299,12 +295,18 @@ public: return NKikimrServices::TActivity::HIVE_BALANCER_ACTOR; } - THiveBalancer(THive* hive, TBalancerSettings settings) + THiveBalancer(THive* hive, TBalancerSettings&& settings) : Hive(hive) , KickInFlight(0) , Movements(0) , Settings(std::move(settings)) - {} + , Stats(Hive->BalancerStats[static_cast<std::size_t>(Settings.Type)]) + { + Stats.IsRunningNow = true; + Stats.CurrentMaxMovements = Settings.MaxMovements ? 
Settings.MaxMovements : Hive->TabletsTotal; + Stats.CurrentMovements = 0; + Stats.LastRunTimestamp = TActivationContext::Now(); + } void Bootstrap() { UpdateProgress(); @@ -322,11 +324,11 @@ public: } }; -void THive::StartHiveBalancer(TBalancerSettings settings) { - if (BalancerProgress == -1) { +void THive::StartHiveBalancer(TBalancerSettings&& settings) { + if (IsItPossibleToStartBalancer(settings.Type)) { + LastBalancerTrigger = settings.Type; auto* balancer = new THiveBalancer(this, std::move(settings)); SubActors.emplace_back(balancer); - BalancerProgress = -2; RegisterWithSameMailbox(balancer); } } diff --git a/ydb/core/mind/hive/drain.cpp b/ydb/core/mind/hive/drain.cpp index c6080744068..3583a91efd4 100644 --- a/ydb/core/mind/hive/drain.cpp +++ b/ydb/core/mind/hive/drain.cpp @@ -210,6 +210,8 @@ void THive::StartHiveDrain(TNodeId nodeId, TDrainSettings settings) { auto* balancer = new THiveDrain(this, nodeId, std::move(settings)); SubActors.emplace_back(balancer); RegisterWithSameMailbox(balancer); + } else { + BLOG_W("It's not possible to start drain on node " << nodeId << ", the node is already busy"); } } diff --git a/ydb/core/mind/hive/fill.cpp b/ydb/core/mind/hive/fill.cpp index ca821cff578..383c8566362 100644 --- a/ydb/core/mind/hive/fill.cpp +++ b/ydb/core/mind/hive/fill.cpp @@ -134,7 +134,8 @@ void THive::StartHiveFill(TNodeId nodeId, const TActorId& initiator) { SubActors.emplace_back(balancer); RegisterWithSameMailbox(balancer); } else { - Send(initiator, new TEvHive::TEvDrainNodeResult(NKikimrProto::ALREADY)); + BLOG_W("It's not possible to start fill on node " << nodeId << ", the node is already busy"); + Send(initiator, new TEvHive::TEvFillNodeResult(NKikimrProto::ALREADY)); } } diff --git a/ydb/core/mind/hive/hive.cpp b/ydb/core/mind/hive/hive.cpp index 5c30add1b78..1879c2d0fc5 100644 --- a/ydb/core/mind/hive/hive.cpp +++ b/ydb/core/mind/hive/hive.cpp @@ -30,11 +30,14 @@ TString EFollowerStrategyName(EFollowerStrategy value) { TString 
EBalancerTypeName(EBalancerType value) { switch (value) { - case EBalancerType::None: return "???"; case EBalancerType::Scatter: return "Scatter"; + case EBalancerType::ScatterCounter: return "Counter"; + case EBalancerType::ScatterCPU: return "CPU"; + case EBalancerType::ScatterMemory: return "Memory"; + case EBalancerType::ScatterNetwork: return "Network"; case EBalancerType::Emergency: return "Emergency"; + case EBalancerType::SpreadNeighbours: return "Spread"; case EBalancerType::Manual: return "Manual"; - case EBalancerType::SpreadNeighbours: return "SpreadNeighbours"; } } diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index 440e101615e..58e8b145f1e 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -74,21 +74,28 @@ enum class EFollowerStrategy : ui32 { TString EFollowerStrategyName(EFollowerStrategy value); enum class EBalancerType { - None, + Manual, Scatter, + ScatterCounter, + ScatterCPU, + ScatterMemory, + ScatterNetwork, Emergency, - Manual, SpreadNeighbours, + + Last = SpreadNeighbours, }; +constexpr std::size_t EBalancerTypeSize = static_cast<std::size_t>(EBalancerType::Last) + 1; + TString EBalancerTypeName(EBalancerType value); enum class EResourceToBalance { Dominant, + Counter, CPU, Memory, Network, - Counter, }; EResourceToBalance ToResourceToBalance(NMetrics::EResource resource); @@ -242,6 +249,7 @@ struct TDrainSettings { }; struct TBalancerSettings { + EBalancerType Type = EBalancerType::Manual; int MaxMovements = 0; bool RecheckOnFinish = false; ui64 MaxInFlight = 1; @@ -250,6 +258,16 @@ struct TBalancerSettings { std::optional<TObjectId> FilterObjectId; }; +struct TBalancerStats { + ui64 TotalRuns = 0; + ui64 TotalMovements = 0; + bool IsRunningNow = false; + ui64 CurrentMovements = 0; + ui64 CurrentMaxMovements = 0; + TInstant LastRunTimestamp; + ui64 LastRunMovements = 0; +}; + } // NHive } // NKikimr diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 
cfe43d91baa..5f27a0ab6cb 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -2188,14 +2188,13 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) { if (!overloadedNodes.empty()) { BLOG_D("Nodes " << overloadedNodes << " with usage over limit " << GetMaxNodeUsageToKick() << " - starting balancer"); - LastBalancerTrigger = EBalancerType::Emergency; - TBalancerSettings emergencySettings{ + StartHiveBalancer({ + .Type = EBalancerType::Emergency, .MaxMovements = (int)CurrentConfig.GetMaxMovementsOnEmergencyBalancer(), .RecheckOnFinish = CurrentConfig.GetContinueEmergencyBalancer(), .MaxInFlight = GetEmergencyBalancerInflight(), .FilterNodeIds = std::move(overloadedNodes), - }; - StartHiveBalancer(std::move(emergencySettings)); + }); return; } } @@ -2204,32 +2203,56 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) { TabletCounters->Cumulative()[NHive::COUNTER_SUGGESTED_SCALE_DOWN].Increment(1); } + if (ObjectDistributions.GetMaxImbalance() > GetObjectImbalanceToBalance()) { + TInstant now = TActivationContext::Now(); + if (LastBalancerTrigger != EBalancerType::SpreadNeighbours + || BalancerStats[static_cast<std::size_t>(EBalancerType::SpreadNeighbours)].LastRunMovements != 0 + || BalancerStats[static_cast<std::size_t>(EBalancerType::SpreadNeighbours)].LastRunTimestamp + TDuration::Seconds(1) < now) { + auto objectToBalance = ObjectDistributions.GetObjectToBalance(); + BLOG_D("Max imbalance " << ObjectDistributions.GetMaxImbalance() << " - starting balancer for object " << objectToBalance.ObjectId); + StartHiveBalancer({ + .Type = EBalancerType::SpreadNeighbours, + .MaxMovements = (int)CurrentConfig.GetMaxMovementsOnAutoBalancer(), + .RecheckOnFinish = CurrentConfig.GetContinueAutoBalancer(), + .MaxInFlight = GetBalancerInflight(), + .FilterNodeIds = std::move(objectToBalance.Nodes), + .FilterObjectId = objectToBalance.ObjectId, + }); + return; + } else { + BLOG_D("Skipping SpreadNeigbours Balancer, 
now: " << now << ", allowed: " << BalancerStats[static_cast<std::size_t>(EBalancerType::SpreadNeighbours)].LastRunTimestamp + TDuration::Seconds(1)); + } + } + auto scatteredResource = CheckScatter(stats.ScatterByResource); if (scatteredResource) { + EBalancerType balancerType = EBalancerType::Scatter; + switch (*scatteredResource) { + case EResourceToBalance::Counter: + balancerType = EBalancerType::ScatterCounter; + break; + case EResourceToBalance::CPU: + balancerType = EBalancerType::ScatterCPU; + break; + case EResourceToBalance::Memory: + balancerType = EBalancerType::ScatterMemory; + break; + case EResourceToBalance::Network: + balancerType = EBalancerType::ScatterNetwork; + break; + case EResourceToBalance::Dominant: + balancerType = EBalancerType::Scatter; + break; + } BLOG_TRACE("Scatter " << stats.ScatterByResource << " over limit " - << GetMinScatterToBalance() << " - starting balancer"); - LastBalancerTrigger = EBalancerType::Scatter; - TBalancerSettings scatterSettings{ + << GetMinScatterToBalance() << " - starting balancer " << EBalancerTypeName(balancerType)); + StartHiveBalancer({ + .Type = balancerType, .MaxMovements = (int)CurrentConfig.GetMaxMovementsOnAutoBalancer(), .RecheckOnFinish = CurrentConfig.GetContinueAutoBalancer(), .MaxInFlight = GetBalancerInflight(), .ResourceToBalance = *scatteredResource, - }; - StartHiveBalancer(std::move(scatterSettings)); - return; - } - - if (ObjectDistributions.GetTotalImbalance() > GetObjectImbalanceToBalance()) { - LastBalancerTrigger = EBalancerType::SpreadNeighbours; - auto objectToBalance = ObjectDistributions.GetObjectToBalance(); - TBalancerSettings neighboursSettings{ - .MaxMovements = (int)CurrentConfig.GetMaxMovementsOnAutoBalancer(), - .RecheckOnFinish = CurrentConfig.GetContinueAutoBalancer(), - .MaxInFlight = GetBalancerInflight(), - .FilterNodeIds = std::move(objectToBalance.Nodes), - .FilterObjectId = objectToBalance.ObjectId, - }; - StartHiveBalancer(std::move(neighboursSettings)); + }); 
return; } @@ -2296,11 +2319,11 @@ bool THive::IsValidMetrics(const NKikimrTabletBase::TMetrics& metrics) { } bool THive::IsValidMetricsCPU(const NKikimrTabletBase::TMetrics& metrics) { - return metrics.GetCPU() > 1000/*1ms*/; + return metrics.GetCPU() > 1'000/*1ms*/; } bool THive::IsValidMetricsMemory(const NKikimrTabletBase::TMetrics& metrics) { - return metrics.GetMemory() > 1024/*1KB*/; + return metrics.GetMemory() > 128'000/*128KB*/; } bool THive::IsValidMetricsNetwork(const NKikimrTabletBase::TMetrics& metrics) { @@ -2518,9 +2541,11 @@ void THive::UpdateTabletFollowersNumber(TLeaderTabletInfo& tablet, NIceDb::TNice TDuration THive::GetBalancerCooldown() const { switch(LastBalancerTrigger) { - case EBalancerType::None: - return TDuration::Seconds(0); case EBalancerType::Scatter: + case EBalancerType::ScatterCounter: + case EBalancerType::ScatterCPU: + case EBalancerType::ScatterMemory: + case EBalancerType::ScatterNetwork: case EBalancerType::SpreadNeighbours: return GetMinPeriodBetweenBalance(); case EBalancerType::Emergency: @@ -2532,9 +2557,9 @@ TDuration THive::GetBalancerCooldown() const { void THive::UpdateObjectCount(TObjectId object, TNodeId node, i64 diff) { ObjectDistributions.UpdateCount(object, node, diff); - TabletCounters->Simple()[NHive::COUNTER_BALANCE_OBJECT_IMBALANCE].Set(ObjectDistributions.GetTotalImbalance()); TabletCounters->Simple()[NHive::COUNTER_IMBALANCED_OBJECTS].Set(ObjectDistributions.GetImbalancedObjectsCount()); TabletCounters->Simple()[NHive::COUNTER_WORST_OBJECT_VARIANCE].Set(ObjectDistributions.GetWorstObjectVariance()); + BLOG_TRACE("UpdateObjectCount " << "for " << object << " on " << node << " (" << diff << ") ~> Imbalance: " << ObjectDistributions.GetMaxImbalance()); } ui64 THive::GetObjectImbalance(TObjectId object) { @@ -2557,7 +2582,6 @@ THive::THive(TTabletStorageInfo *info, const TActorId &tablet) , PipeClientCache(NTabletPipe::CreateBoundedClientCache(PipeClientCacheConfig)) , PipeTracker(*PipeClientCache) , 
PipeRetryPolicy() - , BalancerProgress(-1) , ResponsivenessPinger(nullptr) { TabletCountersPtr.Reset(new TProtobufTabletCounters< @@ -3136,6 +3160,18 @@ TString THive::GetLogPrefix() const { return TStringBuilder() << "HIVE#" << TabletID() << " "; } +bool THive::IsItPossibleToStartBalancer(EBalancerType balancerType) { + for (std::size_t balancer = 0; balancer < std::size(BalancerStats); ++balancer) { + const auto& stats(BalancerStats[balancer]); + if (stats.IsRunningNow) { + EBalancerType type = static_cast<EBalancerType>(balancer); + BLOG_D("It's not possible to start balancer " << EBalancerTypeName(balancerType) << " because balancer " << EBalancerTypeName(type) << " is already running"); + return false; + } + } + return true; +} + } // NHive IActor* CreateDefaultHive(const TActorId &tablet, TTabletStorageInfo *info) { diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 8c9d0012d02..f935a090e07 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -125,7 +125,9 @@ TResourceRawValues ResourceRawValuesFromMetrics(const NKikimrHive::TTabletMetric TString GetResourceValuesText(const NKikimrTabletBase::TMetrics& values); TString GetResourceValuesText(const TTabletInfo& tablet); TString GetResourceValuesText(const TResourceRawValues& values); +NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values); TString GetResourceValuesText(const TResourceNormalizedValues& values); +NJson::TJsonValue GetResourceValuesJson(const TResourceNormalizedValues& values); TString GetResourceValuesHtml(const TResourceRawValues& values); NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values); NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values, const TResourceRawValues& maximum); @@ -146,7 +148,6 @@ TString LongToShortTabletName(const TString& longTabletName); TString GetLocationString(const NActors::TNodeLocation& location); void 
MakeTabletTypeSet(std::vector<TTabletTypes::EType>& list); bool IsValidTabletType(TTabletTypes::EType type); -TString GetBalancerProgressText(i32 balancerProgress, EBalancerType balancerType); TString GetRunningTabletsText(ui64 runningTablets, ui64 totalTablets, bool warmUp); class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveSharedSettings { @@ -165,6 +166,7 @@ protected: friend class TQueryMigrationWaitActor; friend class TReleaseTabletsWaitActor; friend class TDrainNodeWaitActor; + friend struct TNodeInfo; friend class TTxInitScheme; friend class TTxDeleteBase; @@ -230,10 +232,12 @@ protected: friend struct TStoragePoolInfo; - void StartHiveBalancer(TBalancerSettings settings); + bool IsItPossibleToStartBalancer(EBalancerType balancerType); + void StartHiveBalancer(TBalancerSettings&& settings); void StartHiveDrain(TNodeId nodeId, TDrainSettings settings); void StartHiveFill(TNodeId nodeId, const TActorId& initiator); void CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx); + NJson::TJsonValue GetBalancerProgressJson(); ITransaction* CreateDeleteTablet(TEvHive::TEvDeleteTablet::TPtr& ev); ITransaction* CreateDeleteOwnerTablets(TEvHive::TEvDeleteOwnerTablets::TPtr& ev); ITransaction* CreateDeleteTabletResult(TEvTabletBase::TEvDeleteTabletResult::TPtr& ev); @@ -360,10 +364,9 @@ protected: THashMap<ui32, TEvInterconnect::TNodeInfo> NodesInfo; TTabletCountersBase* TabletCounters; TAutoPtr<TTabletCountersBase> TabletCountersPtr; - i32 BalancerProgress; // all values below 0 mean that balancer is not active (-1 = dead, -2 = starting) std::unordered_set<TNodeId> BalancerNodes; // all nodes, affected by running balancers - EBalancerType LastBalancerTrigger = EBalancerType::None; - + EBalancerType LastBalancerTrigger = EBalancerType::Manual; + std::array<TBalancerStats, EBalancerTypeSize> BalancerStats; NKikimrHive::EMigrationState MigrationState = NKikimrHive::EMigrationState::MIGRATION_UNKNOWN; i32 MigrationProgress 
= 0; NKikimrHive::TEvSeizeTablets MigrationFilter; diff --git a/ydb/core/mind/hive/hive_statics.cpp b/ydb/core/mind/hive/hive_statics.cpp index bf1736e5f67..f9fae013064 100644 --- a/ydb/core/mind/hive/hive_statics.cpp +++ b/ydb/core/mind/hive/hive_statics.cpp @@ -76,6 +76,15 @@ TString GetResourceValuesText(const TResourceRawValues& values) { return str.Str(); } +NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values) { + NJson::TJsonValue json; + json["Counter"] = GetCounter(std::get<NMetrics::EResource::Counter>(values)); + json["CPU"] = GetTimes(std::get<NMetrics::EResource::CPU>(values)); + json["Memory"] = GetBytes(std::get<NMetrics::EResource::Memory>(values)); + json["Network"] = GetBytesPerSecond(std::get<NMetrics::EResource::Network>(values)); + return json; +} + TString GetResourceValuesText(const TResourceNormalizedValues& values) { TStringStream str; str << '('; @@ -90,6 +99,15 @@ TString GetResourceValuesText(const TResourceNormalizedValues& values) { return str.Str(); } +NJson::TJsonValue GetResourceValuesJson(const TResourceNormalizedValues& values) { + NJson::TJsonValue json; + json["Counter"] = Sprintf("%.9f", std::get<NMetrics::EResource::Counter>(values)); + json["CPU"] = Sprintf("%.9f", std::get<NMetrics::EResource::CPU>(values)); + json["Memory"] = Sprintf("%.9f", std::get<NMetrics::EResource::Memory>(values)); + json["Network"] = Sprintf("%.9f", std::get<NMetrics::EResource::Network>(values)); + return json; +} + TString GetResourceValuesText(const TTabletInfo& tablet) { TStringStream str; const auto& values(tablet.GetResourceValues()); @@ -141,14 +159,14 @@ TString GetResourceValuesHtml(const TResourceRawValues& values) { return str.Str(); } -NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values) { - NJson::TJsonValue value; - value.AppendValue(GetCounter(std::get<NMetrics::EResource::Counter>(values))); - value.AppendValue(GetTimes(std::get<NMetrics::EResource::CPU>(values))); - 
value.AppendValue(GetBytes(std::get<NMetrics::EResource::Memory>(values))); - value.AppendValue(GetBytesPerSecond(std::get<NMetrics::EResource::Network>(values))); - return value; -} +// NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values) { +// NJson::TJsonValue value; +// value.AppendValue(GetCounter(std::get<NMetrics::EResource::Counter>(values))); +// value.AppendValue(GetTimes(std::get<NMetrics::EResource::CPU>(values))); +// value.AppendValue(GetBytes(std::get<NMetrics::EResource::Memory>(values))); +// value.AppendValue(GetBytesPerSecond(std::get<NMetrics::EResource::Network>(values))); +// return value; +// } NJson::TJsonValue GetResourceValuesJson(const TResourceRawValues& values, const TResourceRawValues& maximum) { NMetrics::EResource resource = GetDominantResourceType(values, maximum); @@ -201,7 +219,7 @@ TString GetValueWithColoredGlyph(double val, double maxVal) { if (maxVal != 0) { ratio = val / maxVal; } else { - ratio = 1.0; + ratio = val ? 1.0 : 0.0; } TString glyph; if (ratio < 0.9) { @@ -370,12 +388,20 @@ bool IsValidTabletType(TTabletTypes::EType type) { ); } -TString GetBalancerProgressText(i32 balancerProgress, EBalancerType balancerType) { - TStringBuilder str; - if (balancerProgress >= 0) { - str << balancerProgress << "% (" << EBalancerTypeName(balancerType) << ")"; +NJson::TJsonValue THive::GetBalancerProgressJson() { + NJson::TJsonValue result; + for (const auto& stats : BalancerStats) { + NJson::TJsonValue json; + json["TotalRuns"] = stats.TotalRuns; + json["TotalMovements"] = stats.TotalMovements; + json["IsRunningNow"] = stats.IsRunningNow; + json["CurrentMovements"] = stats.CurrentMovements; + json["CurrentMaxMovements"] = stats.CurrentMaxMovements; + json["LastRunTimestamp"] = stats.LastRunTimestamp.ToString(); + json["LastRunMovements"] = stats.LastRunMovements; + result.AppendValue(std::move(json)); } - return str; + return result; } TString GetRunningTabletsText(ui64 runningTablets, ui64 totalTablets, bool 
warmUp) { diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index 770ca31a6ff..445a16aa876 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -4191,6 +4191,16 @@ Y_UNIT_TEST_SUITE(THiveTest) { MakeSureTabletIsUp(runtime, tabletId, 0); } + // report empty metrics to turn neighbour-balancing on + for (auto tablet : tablets) { + THolder<TEvHive::TEvTabletMetrics> metrics = MakeHolder<TEvHive::TEvTabletMetrics>(); + NKikimrHive::TTabletMetrics* metric = metrics->Record.AddTabletMetrics(); + metric->SetTabletID(tablet); + metric->MutableResourceUsage()->SetMemory(0); + + runtime.SendToPipe(hiveTablet, senderA, metrics.Release()); + } + // update objects, so that distribution of objects on nodes becomes {0, 0, 0, 1}, {0, 1, 1, 1} auto initialDistribution = getDistribution(); TVector<ui64> tabletsToUpdate = {initialDistribution[0][0], initialDistribution[1][0], initialDistribution[1][1], initialDistribution[1][2]}; diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index b3e2913d48d..8d4ebbeb104 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -1,4 +1,4 @@ -#include <library/cpp/monlib/service/pages/templates.h> +#include <library/cpp/monlib/service/pages/templates.h> #include <library/cpp/json/json_writer.h> #include <library/cpp/protobuf/json/proto2json.h> #include <util/string/vector.h> @@ -1327,6 +1327,7 @@ public: out << "<head>"; out << "<style>"; + out << "table.simple-table1 th { text-align: center; }"; out << "table.simple-table1 td { padding: 1px 3px; }"; out << "table.simple-table1 td:nth-child(1) { text-align: right; }"; out << "table.simple-table2 th { text-align: right; }"; @@ -1337,41 +1338,73 @@ public: out << "table.simple-table2 td:nth-child(2) { text-align: left; }"; out << "table.simple-table2 td:nth-child(3) { text-align: left; }"; out << "table.simple-table2 td:nth-child(4) { text-align: left; }"; + out << 
"table.simple-table3 td { padding: 1px 3px; text-align: right; }"; + out << "table.simple-table3 th { text-align: center; }"; out << ".table-hover tbody tr:hover > td { background-color: #9dddf2; }"; out << ".blinking { animation:blinkingText 0.8s infinite; }"; out << "@keyframes blinkingText { 0% { color: #000; } 49% { color: #000; } 60% { color: transparent; } 99% { color:transparent; } 100% { color: #000; } }"; out << "</style>"; out << "</head>"; out << "<body>"; - out << "<table class='simple-table1'>"; - + out << "<div style='display:flex'><div style='min-width:220px'><table class='simple-table1'>"; + out << "<tr><th colspan='2'>Info</th></tr>"; TSubDomainKey domainId = Self->GetMySubDomainKey(); if (domainId) { - out << "<tr><td>" << "Tenant:" << "</td>"; + out << "<tr><td><span id='alert-placeholder' class='glyphicon' style='height:14px'></span>" << "Tenant:" << "</td>"; TDomainInfo* domainInfo = Self->FindDomain(domainId); if (domainInfo && domainInfo->Path) { out << "<td>" << domainInfo->Path << "</td>"; } else { out << "<td>" << domainId << "</td>"; } - out << "<td><span id='alert-placeholder' class='glyphicon' style='height:14px'></span></td>"; - out << "</tr>"; + out << "<td></tr>"; } - /*out << "<tr><td>" << "Nodes:" << "</td><td id='aliveNodes'>" << (nodes == 0 ? 
0 : aliveNodes * 100 / nodes) << "% " - << aliveNodes << "/" << nodes << "</td></tr>";*/ - out << "<tr><td>" << "Tablets:" << "</td><td id='runningTablets'>" << GetRunningTabletsText(runningTablets, tablets, Self->WarmUp) << "</td></tr>"; - out << "<tr><td>Balancer:</td><td id='balancerProgress'>" - << GetBalancerProgressText(Self->BalancerProgress, Self->LastBalancerTrigger) << "</td></tr>"; + out << "<tr><td>" << "Nodes:" << "</td><td id='aliveNodes'>" << aliveNodes << "</td></tr>"; + out << "<tr><td>" << "Tablets:" << "</td><td id='runningTablets'>" << runningTablets << "</td></tr>"; out << "<tr><td>" << "Boot Queue:" << "</td><td id='bootQueue'>" << Self->BootQueue.BootQueue.size() << "</td></tr>"; out << "<tr><td>" << "Wait Queue:" << "</td><td id='waitQueue'>" << Self->BootQueue.WaitQueue.size() << "</td></tr>"; - out << "<tr><td>" << "Resource Total: " << "</td><td id='resourceTotal'>" << GetResourceValuesText(Self->TotalRawResourceValues) << "</td></tr>"; - out << "<tr><td>" << "Resource StDev: " << "</td><td id='resourceVariance'>" - << convert(Self->GetStDevResourceValues(), [](double d) -> TString { return Sprintf("%.9f", d); }) << "</td></tr>"; - THive::THiveStats stats = Self->GetStats(); - out << "<tr><td>" << "Max usage:" << "<td id='maxUsage'>" << GetValueWithColoredGlyph(stats.MaxUsage, Self->GetMaxNodeUsageToKick()) << "</td></tr>"; - out << "<tr><td>" << "Scatter:" << "<td id='scatter'>" << convert(stats.ScatterByResource, Self->GetMinScatterToBalance(), GetValueWithColoredGlyph) << "</td></tr>"; - out << "</table>"; + out << "</table></div>"; + out << "<div style='width:180px'><table class='simple-table1'>"; + out << "<tr><th colspan='2'>Totals</th></tr>"; + out << "<tr><td>Counter</td><td id='resourceTotalCounter'></td></tr>"; + out << "<tr><td>CPU</td><td id='resourceTotalCPU'></td></tr>"; + out << "<tr><td>Memory</td><td id='resourceTotalMemory'></td></tr>"; + out << "<tr><td>Network</td><td id='resourceTotalNetwork'></td></tr>"; + out << 
"</table></div>"; + out << "<div style='width:220px'><table class='simple-table1'>"; + out << "<tr><th colspan='2'>Variance</th></tr>"; + out << "<tr><td>Counter</td><td id='resourceStdDevCounter'></td></tr>"; + out << "<tr><td>CPU</td><td id='resourceStdDevCPU'></td></tr>"; + out << "<tr><td>Memory</td><td id='resourceStdDevMemory'></td></tr>"; + out << "<tr><td>Network</td><td id='resourceStdDevNetwork'></td></tr>"; + out << "</table></div>"; + out << "<div style='min-width:220px'><table class='simple-table1'>"; + out << "<tr><th colspan='2'>Triggers</th></tr>"; + out << "<tr><td>Counter</td><td id='resourceScatterCounter'></td></tr>"; + out << "<tr><td>CPU</td><td id='resourceScatterCPU'></td></tr>"; + out << "<tr><td>Memory</td><td id='resourceScatterMemory'></td></tr>"; + out << "<tr><td>Network</td><td id='resourceScatterNetwork'></td></tr>"; + out << "<tr><td>MaxUsage</td><td id='maxUsage'></td></tr>"; + out << "<tr><td>Imbalance</td><td id='objectImbalance'></td></tr>"; + out << "</table></div>"; + out << "<div style='min-width:220px'><table class='simple-table3'>"; + out << "<tr><th>Balancer</th><th style='min-width:50px'>Runs</th><th style='min-width:50px'>Moves</th>"; + out << "<th style='min-width:80px'>Last run</th><th style='min-width:80px'>Last moves</th><th style='min-width:80px'>Progress</th></tr>"; + for (EBalancerType type : { + EBalancerType::ScatterCounter, + EBalancerType::ScatterCPU, + EBalancerType::ScatterMemory, + EBalancerType::ScatterNetwork, + EBalancerType::Emergency, + EBalancerType::SpreadNeighbours, + EBalancerType::Scatter, + EBalancerType::Manual + }) { + int balancer = static_cast<int>(type); + out << "<tr id='balancer" << balancer << "'><td>" << EBalancerTypeName(type) << "</td><td></td><td></td><td></td><td></td><td></td></tr>"; + } + out << "</table></div></div>"; out << "<table id='node_table' class='table simple-table2 table-hover table-condensed'>"; out << "<thead><tr>" @@ -1649,350 +1682,420 @@ public: } out << "];"; out 
<< R"___( - $('.container') - .toggleClass('container container-fluid') - .css('padding-left', '1%') - .css('padding-right', '1%'); - - function initReassignGroups() { - var domTabletType = document.getElementById('tablet_type'); - for (var tab = 0; tab < tablets.length; tab++) { - var opt = document.createElement('option'); - opt.text = tablets[tab].name; - opt.value = tablets[tab].type; - domTabletType.add(opt); - } - } - )___"; +$('.container') + .toggleClass('container container-fluid') + .css('padding-left', '1%') + .css('padding-right', '1%'); - out << R"___( - initReassignGroups(); - - var tablets_found; - var Nodes = {}; - - function queryTablets() { - var storage_pool = $('#tablet_storage_pool').val(); - var storage_group = $('#tablet_storage_group').val(); - var tablet_type = $('#tablet_type').val(); - var channel_from = $('#tablet_from_channel').val(); - var channel_to = $('#tablet_to_channel').val(); - var percent = $('#tablet_percent').val(); - var url = 'app?TabletID=' + hiveId + '&page=FindTablet'; - if (storage_pool) { - url = url + '&storagePool=' + storage_pool; - } - if (storage_group) { - url = url + '&group=' + storage_group; - } - if (tablet_type) { - url = url + '&type=' + tablet_type; - } - if (channel_from) { - url = url + '&channelFrom=' + channel_from; - } - if (channel_to) { - url = url + '&channelTo=' + channel_to; - } - if (percent) { - url = url + '&percent=' + percent; - } - $.ajax({ - url: url, - success: function(result) { - tablets_found = result; - $('#tablets_found_group').parent().css({visibility: 'visible'}); - $('#tablets_found').text(tablets_found.length); - $('#button_reassign').removeClass('disabled'); - }, - error: function(jqXHR, status) { - $('#status_text').text(status); - } - }); - } - - var tables_processed; - var current_inflight; - - function continueReassign() { - var max_inflight = $('#tablet_reassign_inflight').val(); - while (tablets_processed < tablets_found.length && current_inflight < max_inflight) { - var 
tablet = tablets_found[tablets_processed]; - tablets_processed++; - current_inflight++; - $('#current_inflight').text(current_inflight); - $.ajax({ - url: 'app?TabletID=' + hiveId - + '&page=ReassignTablet&tablet=' + tablet.tabletId - + '&channels=' + tablet.channels - + '&wait=1', - success: function() { - - }, - error: function(jqXHR, status) { - $('#status_text').text(status); - }, - complete: function() { - $('#tablets_processed').text(tablets_processed); - var value = Number(tablets_processed * 100 / tablets_found.length).toFixed(); - $('#progress_bar').css('width', value + '%').attr('aria-valuenow', value).text(value + '%'); - current_inflight--; - continueReassign(); - }, - }); - } - if (tablets_processed >= tablets_found.length) { - $('#button_query').removeClass('disabled'); - $('#button_reassign').removeClass('disabled'); - } - } +function initReassignGroups() { + var domTabletType = document.getElementById('tablet_type'); + for (var tab = 0; tab < tablets.length; tab++) { + var opt = document.createElement('option'); + opt.text = tablets[tab].name; + opt.value = tablets[tab].type; + domTabletType.add(opt); + } +} - function cancel() { - tablets_processed = tablets_found.length; - $('#tablets_processed_group').parent().css({visibility: 'hidden'}); - $('#current_inflight_group').parent().css({visibility: 'hidden'}); - $('#time_left_group').parent().css({visibility: 'hidden'}); - $('#progress_bar_group').parent().css({visibility: 'hidden'}); - } +initReassignGroups(); + +var tablets_found; +var Nodes = {}; + +function queryTablets() { + var storage_pool = $('#tablet_storage_pool').val(); + var storage_group = $('#tablet_storage_group').val(); + var tablet_type = $('#tablet_type').val(); + var channel_from = $('#tablet_from_channel').val(); + var channel_to = $('#tablet_to_channel').val(); + var percent = $('#tablet_percent').val(); + var url = 'app?TabletID=' + hiveId + '&page=FindTablet'; + if (storage_pool) { + url = url + '&storagePool=' + storage_pool; 
+ } + if (storage_group) { + url = url + '&group=' + storage_group; + } + if (tablet_type) { + url = url + '&type=' + tablet_type; + } + if (channel_from) { + url = url + '&channelFrom=' + channel_from; + } + if (channel_to) { + url = url + '&channelTo=' + channel_to; + } + if (percent) { + url = url + '&percent=' + percent; + } + $.ajax({ + url: url, + success: function(result) { + tablets_found = result; + $('#tablets_found_group').parent().css({visibility: 'visible'}); + $('#tablets_found').text(tablets_found.length); + $('#button_reassign').removeClass('disabled'); + }, + error: function(jqXHR, status) { + $('#status_text').text(status); + } + }); +} - function reassignGroups() { - $('#tablets_processed_group').parent().css({visibility: 'visible'}); - $('#current_inflight_group').parent().css({visibility: 'visible'}); - //$('#time_left_group').parent().css({visibility: 'visible'}); - $('#progress_bar_group').parent().css({visibility: 'visible'}); - $('#button_query').addClass('disabled'); - $('#button_reassign').addClass('disabled'); - tablets_processed = 0; - current_inflight = 0; +var tables_processed; +var current_inflight; + +function continueReassign() { + var max_inflight = $('#tablet_reassign_inflight').val(); + while (tablets_processed < tablets_found.length && current_inflight < max_inflight) { + var tablet = tablets_found[tablets_processed]; + tablets_processed++; + current_inflight++; + $('#current_inflight').text(current_inflight); + $.ajax({ + url: 'app?TabletID=' + hiveId + + '&page=ReassignTablet&tablet=' + tablet.tabletId + + '&channels=' + tablet.channels + + '&wait=1', + success: function() { + + }, + error: function(jqXHR, status) { + $('#status_text').text(status); + }, + complete: function() { + $('#tablets_processed').text(tablets_processed); + var value = Number(tablets_processed * 100 / tablets_found.length).toFixed(); + $('#progress_bar').css('width', value + '%').attr('aria-valuenow', value).text(value + '%'); + current_inflight--; 
continueReassign(); - } + }, + }); + } + if (tablets_processed >= tablets_found.length) { + $('#button_query').removeClass('disabled'); + $('#button_reassign').removeClass('disabled'); + } +} - function setDown(element, nodeId, down) { - if (down && $(element).hasClass('glyphicon-ok')) { - $(element).removeClass('glyphicon-ok'); - element.inProgress = true; - $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=SetDown&down=1', success: function(){ $(element).addClass('glyphicon-remove'); element.inProgress = false; }}); - } else if (!down && $(element).hasClass('glyphicon-remove')) { - $(element).removeClass('glyphicon-remove'); - element.inProgress = true; - $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=SetDown&down=0', success: function(){ $(element).addClass('glyphicon-ok'); element.inProgress = false; }}); - } - } +function cancel() { + tablets_processed = tablets_found.length; + $('#tablets_processed_group').parent().css({visibility: 'hidden'}); + $('#current_inflight_group').parent().css({visibility: 'hidden'}); + $('#time_left_group').parent().css({visibility: 'hidden'}); + $('#progress_bar_group').parent().css({visibility: 'hidden'}); +} - function toggleDown(element, nodeId) { - setDown(element, nodeId, $(element).hasClass('glyphicon-ok')); - } +function reassignGroups() { + $('#tablets_processed_group').parent().css({visibility: 'visible'}); + $('#current_inflight_group').parent().css({visibility: 'visible'}); + //$('#time_left_group').parent().css({visibility: 'visible'}); + $('#progress_bar_group').parent().css({visibility: 'visible'}); + $('#button_query').addClass('disabled'); + $('#button_reassign').addClass('disabled'); + tablets_processed = 0; + current_inflight = 0; + continueReassign(); +} - function toggleFreeze(element, nodeId) { - if ($(element).hasClass('glyphicon-play')) { - $(element).removeClass('glyphicon-play'); - element.inProgress = true; - $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + 
'&page=SetFreeze&freeze=1', success: function(){ $(element).addClass('glyphicon-pause'); element.inProgress = false; }}); - } else if ($(element).hasClass('glyphicon-pause')) { - $(element).removeClass('glyphicon-pause'); - element.inProgress = true; - $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=SetFreeze&freeze=0', success: function(){ $(element).addClass('glyphicon-play'); element.inProgress = false; }}); - } - } +function setDown(element, nodeId, down) { + if (down && $(element).hasClass('glyphicon-ok')) { + $(element).removeClass('glyphicon-ok'); + element.inProgress = true; + $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=SetDown&down=1', success: function(){ $(element).addClass('glyphicon-remove'); element.inProgress = false; }}); + } else if (!down && $(element).hasClass('glyphicon-remove')) { + $(element).removeClass('glyphicon-remove'); + element.inProgress = true; + $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=SetDown&down=0', success: function(){ $(element).addClass('glyphicon-ok'); element.inProgress = false; }}); + } +} - function kickNode(element, nodeId) { - $(element).removeClass('glyphicon-transfer'); - $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=KickNode', success: function(){ $(element).addClass('glyphicon-transfer'); }}); - } +function toggleDown(element, nodeId) { + setDown(element, nodeId, $(element).hasClass('glyphicon-ok')); +} - function drainNode(element, nodeId) { - $(element).removeClass('glyphicon-transfer'); - $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=DrainNode', success: function(){ $(element).addClass('blinking'); Nodes[nodeId].Drain = true; }}); - } +function toggleFreeze(element, nodeId) { + if ($(element).hasClass('glyphicon-play')) { + $(element).removeClass('glyphicon-play'); + element.inProgress = true; + $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=SetFreeze&freeze=1', success: function(){ 
$(element).addClass('glyphicon-pause'); element.inProgress = false; }}); + } else if ($(element).hasClass('glyphicon-pause')) { + $(element).removeClass('glyphicon-pause'); + element.inProgress = true; + $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=SetFreeze&freeze=0', success: function(){ $(element).addClass('glyphicon-play'); element.inProgress = false; }}); + } +} - function rebalanceTablets() { - $('#balancerProgress').html('o.O'); - var max_movements = $('#balancer_max_movements').val(); - $.ajax({url:'app?TabletID=' + hiveId + '&page=Rebalance&movements=' + max_movements}); - } +function kickNode(element, nodeId) { + $(element).removeClass('glyphicon-transfer'); + $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=KickNode', success: function(){ $(element).addClass('glyphicon-transfer'); }}); +} - function rebalanceTabletsFromScratch(element) { - var tenant_name = $('#tenant_name').val(); - $.ajax({url:'app?TabletID=' + hiveId + '&page=RebalanceFromScratch&tenantName=' + tenant_name}); - } +function drainNode(element, nodeId) { + $(element).removeClass('glyphicon-transfer'); + $.ajax({url:'app?TabletID=' + hiveId + '&node=' + nodeId + '&page=DrainNode', success: function(){ $(element).addClass('blinking'); Nodes[nodeId].Drain = true; }}); +} - function toggleAlert() { - $('#alert-placeholder').toggleClass('glyphicon-refresh'); - } +function rebalanceTablets() { + $('#balancerProgress').html('o.O'); + var max_movements = $('#balancer_max_movements').val(); + $.ajax({url:'app?TabletID=' + hiveId + '&page=Rebalance&movements=' + max_movements}); +} + +function rebalanceTabletsFromScratch(element) { + var tenant_name = $('#tenant_name').val(); + $.ajax({url:'app?TabletID=' + hiveId + '&page=RebalanceFromScratch&tenantName=' + tenant_name}); +} + +function toggleAlert() { + $('#alert-placeholder').toggleClass('glyphicon-refresh'); +} + +function clearAlert() { + $('#alert-placeholder').removeClass('glyphicon-refresh'); +} - 
function clearAlert() { - $('#alert-placeholder').removeClass('glyphicon-refresh'); +var Empty = true; + +function getBalancerString(balancer) { + return 'runs=' + balancer.TotalRuns + ' moves=' + balancer.TotalMovements; +} + +function fillDataShort(result) { + try { + if ("TotalTablets" in result) { + var percent = Math.floor(result.RunningTablets * 100 / result.TotalTablets) + '%'; + var values = result.RunningTablets + ' of ' + result.TotalTablets; + var warmup = result.Warmup ? "<span class='glyphicon glyphicon-fire' style='color:red; margin-right:4px'></span>" : ""; + $('#runningTablets').html(warmup + percent + ' (' + values + ')'); + $('#aliveNodes').html(result.AliveNodes); + $('#bootQueue').html(result.BootQueueSize); + $('#waitQueue').html(result.WaitQueueSize); + $('#maxUsage').html(result.MaxUsage); + $('#objectImbalance').html(result.ObjectImbalance); + + $('#resourceTotalCounter').html(result.ResourceTotal.Counter); + $('#resourceTotalCPU').html(result.ResourceTotal.CPU); + $('#resourceTotalMemory').html(result.ResourceTotal.Memory); + $('#resourceTotalNetwork').html(result.ResourceTotal.Network); + + $('#resourceStdDevCounter').html(result.ResourceVariance.Counter); + $('#resourceStdDevCPU').html(result.ResourceVariance.CPU); + $('#resourceStdDevMemory').html(result.ResourceVariance.Memory); + $('#resourceStdDevNetwork').html(result.ResourceVariance.Network); + + $('#resourceScatterCounter').html(result.ScatterHtml.Counter); + $('#resourceScatterCPU').html(result.ScatterHtml.CPU); + $('#resourceScatterMemory').html(result.ScatterHtml.Memory); + $('#resourceScatterNetwork').html(result.ScatterHtml.Network); + + for (var b = 0; b < result.Balancers.length; b++) { + var balancerObj = result.Balancers[b]; + var balancerHtml = $('#balancer' + b)[0]; + balancerHtml.cells[1].innerHTML = balancerObj.TotalRuns; + balancerHtml.cells[2].innerHTML = balancerObj.TotalMovements; + if (balancerObj.TotalRuns > 0) { + balancerHtml.cells[3].innerHTML = 
balancerObj.LastRunTimestamp; + balancerHtml.cells[4].innerHTML = balancerObj.LastRunMovements; + } else { + balancerHtml.cells[3].innerHTML = ''; + balancerHtml.cells[4].innerHTML = ''; + } + if (balancerObj.IsRunningNow && balancerObj.CurrentMaxMovements > 0) { + balancerHtml.cells[5].innerHTML = Math.floor(balancerObj.CurrentMovements * 100 / balancerObj.CurrentMaxMovements) + '%'; + } else { + balancerHtml.cells[5].innerHTML = ''; + } } - )___"; + } + clearAlert(); + } + catch(err) { + toggleAlert(); + } +} - out << R"___( +function onFreshDataShort(result) { + fillDataShort(result); + setTimeout(function(){updateDataShort();}, 500); +} - var Empty = true; - - function onFreshData(result) { - var nlen; - try { - if ("TotalTablets" in result) { - $('#runningTablets').html(result.RunningTabletsText); - //$('#aliveNodes').html(result.TotalNodes == 0 ? 0 : Math.floor(result.AliveNodes * 100 / result.TotalNodes) + '% ' + result.AliveNodes + '/' + result.TotalNodes); - $('#resourceVariance').html(result.ResourceVariance); - $('#resourceTotal').html(result.ResourceTotal); - $('#bootQueue').html(result.BootQueueSize); - $('#waitQueue').html(result.WaitQueueSize); - $('#balancerProgress').html(result.BalancerProgress); - $('#maxUsage').html(result.MaxUsage); - $('#scatter').html(result.Scatter); - $('#move_history > tbody > tr').remove(); - for (var i in result.Moves) { - $(result.Moves[i]).appendTo('#move_history > tbody'); - } - var old_nodes = {}; - if (Empty) { - // initialization - $('#node_table > tbody > tr').remove(); - Empty = false; +function onFreshDataLong(result) { + var nlen; + try { + fillDataShort(result); + if ("Nodes" in result) { + $('#move_history > tbody > tr').remove(); + for (var i in result.Moves) { + $(result.Moves[i]).appendTo('#move_history > tbody'); + } + var old_nodes = {}; + if (Empty) { + // initialization + $('#node_table > tbody > tr').remove(); + Empty = false; + } else { + for (var id in Nodes) { + old_nodes[id] = true; + } + } + var 
was_append = false; + nlen = result.Nodes.length; + for (i = 0; i < nlen; i++) { + var node = result.Nodes[i]; + var old_node = Nodes[node.Id]; + var nodeElement = $('#node' + node.Id).get(0); + var nodeElement; + if (old_node) { + nodeElement = old_node.NodeElement; + } else { + nodeElement = $('<tr id="node' + node.Id + '"><td>' + node.Id + '</td>' + + '<td></td>' + + '<td></td>' + + '<td></td>' + + '<td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td>' + + '<td style="text-align:center"><span title="Toggle node availability" onclick="toggleDown(this,' + node.Id + ')" style="cursor:pointer" class="active-mark glyphicon glyphicon-ok"></span></td>' + + '<td style="text-align:center"><span title="Toggle node freeze" onclick="toggleFreeze(this,' + node.Id + ')" style="cursor:pointer" class="glyphicon glyphicon-play"></span></td>' + + '<td style="text-align:center"><span title="Kick tablets on this node" onclick="kickNode(this,' + node.Id + ')" style="cursor:pointer" class="glyphicon glyphicon-transfer"></span></td>' + + '<td style="text-align:center"><span title="Drain this node" onclick="drainNode(this,' + node.Id + ')" style="cursor:pointer" class="glyphicon glyphicon-log-out"></span></td>' + + '</tr>').appendTo('#node_table > tbody').get(0); + nodeElement.cells[1].innerHTML = '<a href="' + node.Host + ':8765">' + node.Name + '</a>'; + nodeElement.cells[2].innerHTML = node.DataCenter; + was_append = true; + } + delete old_nodes[node.Id]; + if (!old_node || old_node.Alive != node.Alive) { + if (node.Alive) { + nodeElement.style.color = 'initial'; + } else { + nodeElement.style.color = '#E0E0E0'; + } + } + var element = $(nodeElement.cells[14].children[0]); + if (!element.hasOwnProperty("inProgress") || !element.inProgress) { + if (!old_node || old_node.Down != node.Down) { + if (node.Down) { + element.removeClass('glyphicon-ok'); + element.addClass('glyphicon-remove'); } else { - for (var id in Nodes) { - old_nodes[id] = true; - 
} - } - var was_append = false; - nlen = result.Nodes.length; - for (i = 0; i < nlen; i++) { - var node = result.Nodes[i]; - var old_node = Nodes[node.Id]; - var nodeElement = $('#node' + node.Id).get(0); - var nodeElement; - if (old_node) { - nodeElement = old_node.NodeElement; - } else { - nodeElement = $('<tr id="node' + node.Id + '"><td>' + node.Id + '</td>' - + '<td></td>' - + '<td></td>' - + '<td></td>' - + '<td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td>' - + '<td style="text-align:center"><span title="Toggle node availability" onclick="toggleDown(this,' + node.Id + ')" style="cursor:pointer" class="active-mark glyphicon glyphicon-ok"></span></td>' - + '<td style="text-align:center"><span title="Toggle node freeze" onclick="toggleFreeze(this,' + node.Id + ')" style="cursor:pointer" class="glyphicon glyphicon-play"></span></td>' - + '<td style="text-align:center"><span title="Kick tablets on this node" onclick="kickNode(this,' + node.Id + ')" style="cursor:pointer" class="glyphicon glyphicon-transfer"></span></td>' - + '<td style="text-align:center"><span title="Drain this node" onclick="drainNode(this,' + node.Id + ')" style="cursor:pointer" class="glyphicon glyphicon-log-out"></span></td>' - + '</tr>').appendTo('#node_table > tbody').get(0); - nodeElement.cells[1].innerHTML = '<a href="' + node.Host + ':8765">' + node.Name + '</a>'; - nodeElement.cells[2].innerHTML = node.DataCenter; - was_append = true; - } - delete old_nodes[node.Id]; - if (!old_node || old_node.Alive != node.Alive) { - if (node.Alive) { - nodeElement.style.color = 'initial'; - } else { - nodeElement.style.color = '#E0E0E0'; - } - } - var element = $(nodeElement.cells[14].children[0]); - if (!element.hasOwnProperty("inProgress") || !element.inProgress) { - if (!old_node || old_node.Down != node.Down) { - if (node.Down) { - element.removeClass('glyphicon-ok'); - element.addClass('glyphicon-remove'); - } else { - 
element.removeClass('glyphicon-remove'); - element.addClass('glyphicon-ok'); - } - } - } - element = $(nodeElement.cells[15].children[0]); - if (!element.hasOwnProperty("inProgress") || !element.inProgress) { - if (!old_node || old_node.Freeze != node.Freeze) { - if (node.Freeze) { - element.removeClass('glyphicon-play'); - element.addClass('glyphicon-pause'); - } else { - element.removeClass('glyphicon-pause'); - element.addClass('glyphicon-play'); - } - } - } - element = $(nodeElement.cells[17].children[0]); - if (!element.hasOwnProperty("inProgress") || !element.inProgress) { - if (!old_node || old_node.Drain != node.Drain) { - if (node.Drain) { - element.addClass('blinking'); - } else { - element.removeClass('blinking'); - } - } - } - if (!old_node || old_node.Name != node.Name) { - nodeElement.cells[1].innerHTML = '<a href="' + node.Host + ':8765">' + node.Name + '</a>'; - } - if (!old_node || old_node.DataCenter != node.DataCenter) { - nodeElement.cells[2].innerHTML = node.DataCenter; - } - if (!old_node || old_node.Domain != node.Domain) { - nodeElement.cells[3].innerHTML = node.Domain; - } - if (!old_node || old_node.Uptime != node.Uptime) { - nodeElement.cells[4].innerHTML = node.Uptime; - } - if (!old_node || old_node.Unknown != node.Unknown) { - nodeElement.cells[5].innerHTML = node.Unknown; - } - if (!old_node || old_node.Starting != node.Starting) { - nodeElement.cells[6].innerHTML = node.Starting; - } - if (!old_node || old_node.Running != node.Running) { - nodeElement.cells[7].innerHTML = node.Running; - } - if (!old_node || old_node.Types != node.Types) { - nodeElement.cells[8].innerHTML = node.Types; - } - if (!old_node || old_node.Usage != node.Usage) { - nodeElement.cells[9].innerHTML = node.Usage; - } - if (!old_node || old_node.ResourceValues[0] != node.ResourceValues[0]) { - nodeElement.cells[10].innerHTML = node.ResourceValues[0]; - } - if (!old_node || old_node.ResourceValues[1] != node.ResourceValues[1]) { - nodeElement.cells[11].innerHTML 
= node.ResourceValues[1]; - } - if (!old_node || old_node.ResourceValues[2] != node.ResourceValues[2]) { - nodeElement.cells[12].innerHTML = node.ResourceValues[2]; - } - if (!old_node || old_node.ResourceValues[3] != node.ResourceValues[3]) { - nodeElement.cells[13].innerHTML = node.ResourceValues[3]; - } - node.NodeElement = nodeElement; - Nodes[node.Id] = node; + element.removeClass('glyphicon-remove'); + element.addClass('glyphicon-ok'); } - for (var id in old_nodes) { - $('#node' + id).remove(); - delete Nodes[id]; + } + } + element = $(nodeElement.cells[15].children[0]); + if (!element.hasOwnProperty("inProgress") || !element.inProgress) { + if (!old_node || old_node.Freeze != node.Freeze) { + if (node.Freeze) { + element.removeClass('glyphicon-play'); + element.addClass('glyphicon-pause'); + } else { + element.removeClass('glyphicon-pause'); + element.addClass('glyphicon-play'); } - if (was_append) { - $('#node_table > tbody > tr').sort(function(a,b) { - if (a.cells[3].innerHTML > b.cells[3].innerHTML) - return 1; - if (a.cells[3].innerHTML < b.cells[3].innerHTML) - return -1; - return parseInt(a.cells[0].innerHTML, 10) - parseInt(b.cells[0].innerHTML, 10); - }).appendTo('#node_table > tbody'); + } + } + element = $(nodeElement.cells[17].children[0]); + if (!element.hasOwnProperty("inProgress") || !element.inProgress) { + if (!old_node || old_node.Drain != node.Drain) { + if (node.Drain) { + element.addClass('blinking'); + } else { + element.removeClass('blinking'); } } - clearAlert(); } - catch(err) { - toggleAlert(); + if (!old_node || old_node.Name != node.Name) { + nodeElement.cells[1].innerHTML = '<a href="' + node.Host + ':8765">' + node.Name + '</a>'; + } + if (!old_node || old_node.DataCenter != node.DataCenter) { + nodeElement.cells[2].innerHTML = node.DataCenter; + } + if (!old_node || old_node.Domain != node.Domain) { + nodeElement.cells[3].innerHTML = node.Domain; + } + if (!old_node || old_node.Uptime != node.Uptime) { + 
nodeElement.cells[4].innerHTML = node.Uptime; + } + if (!old_node || old_node.Unknown != node.Unknown) { + nodeElement.cells[5].innerHTML = node.Unknown; + } + if (!old_node || old_node.Starting != node.Starting) { + nodeElement.cells[6].innerHTML = node.Starting; + } + if (!old_node || old_node.Running != node.Running) { + nodeElement.cells[7].innerHTML = node.Running; } - setTimeout(function(){updateData();}, 500 + nlen); + if (!old_node || old_node.Types != node.Types) { + nodeElement.cells[8].innerHTML = node.Types; + } + if (!old_node || old_node.Usage != node.Usage) { + nodeElement.cells[9].innerHTML = node.Usage; + } + if (!old_node || old_node.ResourceValues[0] != node.ResourceValues[0]) { + nodeElement.cells[10].innerHTML = node.ResourceValues[0]; + } + if (!old_node || old_node.ResourceValues[1] != node.ResourceValues[1]) { + nodeElement.cells[11].innerHTML = node.ResourceValues[1]; + } + if (!old_node || old_node.ResourceValues[2] != node.ResourceValues[2]) { + nodeElement.cells[12].innerHTML = node.ResourceValues[2]; + } + if (!old_node || old_node.ResourceValues[3] != node.ResourceValues[3]) { + nodeElement.cells[13].innerHTML = node.ResourceValues[3]; + } + node.NodeElement = nodeElement; + Nodes[node.Id] = node; } - - function updateData() { - $.ajax({url:'app?TabletID=' + hiveId + '&page=LandingData', - success: function(result){ onFreshData(result); }, - error: function(){ toggleAlert(); setTimeout(updateData, 1000); } - }); + for (var id in old_nodes) { + $('#node' + id).remove(); + delete Nodes[id]; + } + if (was_append) { + $('#node_table > tbody > tr').sort(function(a,b) { + if (a.cells[3].innerHTML > b.cells[3].innerHTML) + return 1; + if (a.cells[3].innerHTML < b.cells[3].innerHTML) + return -1; + return parseInt(a.cells[0].innerHTML, 10) - parseInt(b.cells[0].innerHTML, 10); + }).appendTo('#node_table > tbody'); } + } + clearAlert(); + } + catch(err) { + toggleAlert(); + } + setTimeout(function(){updateDataLong();}, 500 + nlen * 10); +} + 
+var switchToLong = false; + +function updateDataShort() { + if (switchToLong) { + updateDataLong(); + return; + } + $.ajax({url:'app?TabletID=' + hiveId + '&page=LandingData', + success: function(result){ onFreshDataShort(result); }, + error: function(){ toggleAlert(); setTimeout(updateDataShort, 1000); } + }); +} + +function updateDataLong() { + $.ajax({url:'app?TabletID=' + hiveId + '&page=LandingData&nodes=1&moves=1', + success: function(result){ onFreshDataLong(result); }, + error: function(){ toggleAlert(); setTimeout(updateDataLong, 1000); } + }); +} + +function updateData() { + switchToLong = true; +} + +updateDataShort(); + )___"; out << "</script>"; @@ -2068,89 +2171,98 @@ public: jsonData["RunningTablets"] = runningTablets; jsonData["TotalNodes"] = nodes; jsonData["AliveNodes"] = aliveNodes; - jsonData["ResourceTotal"] = GetResourceValuesText(Self->TotalRawResourceValues); - jsonData["ResourceVariance"] = GetResourceValuesText(Self->GetStDevResourceValues());//, [](double d) -> TString { return Sprintf("%.9f", d); }); + jsonData["ResourceTotal"] = GetResourceValuesJson(Self->TotalRawResourceValues); + jsonData["ResourceVariance"] = GetResourceValuesJson(Self->GetStDevResourceValues()); jsonData["BootQueueSize"] = Self->BootQueue.BootQueue.size(); jsonData["WaitQueueSize"] = Self->BootQueue.WaitQueue.size(); - jsonData["BalancerProgress"] = GetBalancerProgressText(Self->BalancerProgress, Self->LastBalancerTrigger); + jsonData["Balancers"] = Self->GetBalancerProgressJson(); jsonData["MaxUsage"] = GetValueWithColoredGlyph(stats.MaxUsage, Self->GetMaxNodeUsageToKick()) ; - jsonData["Scatter"] = TStringBuilder() << convert(stats.ScatterByResource, Self->GetMinScatterToBalance(), GetValueWithColoredGlyph); - jsonData["RunningTabletsText"] = GetRunningTabletsText(runningTablets, tablets, Self->WarmUp); - - TVector<TNodeInfo*> nodeInfos; - nodeInfos.reserve(Self->Nodes.size()); - for (auto& pr : Self->Nodes) { - if (!pr.second.IsUnknown()) { - 
nodeInfos.push_back(&pr.second); + auto scatterHtml = convert(stats.ScatterByResource, Self->GetMinScatterToBalance(), GetValueWithColoredGlyph); + jsonData["ScatterHtml"]["Counter"] = std::get<NMetrics::EResource::Counter>(scatterHtml); + jsonData["ScatterHtml"]["CPU"] = std::get<NMetrics::EResource::CPU>(scatterHtml); + jsonData["ScatterHtml"]["Memory"] = std::get<NMetrics::EResource::Memory>(scatterHtml); + jsonData["ScatterHtml"]["Network"] = std::get<NMetrics::EResource::Network>(scatterHtml); + jsonData["ObjectImbalance"] = GetValueWithColoredGlyph(Self->ObjectDistributions.GetMaxImbalance(), Self->GetObjectImbalanceToBalance()); + jsonData["WarmUp"] = Self->WarmUp; + + if (Cgi.Get("nodes") == "1") { + TVector<TNodeInfo*> nodeInfos; + nodeInfos.reserve(Self->Nodes.size()); + for (auto& pr : Self->Nodes) { + if (!pr.second.IsUnknown()) { + nodeInfos.push_back(&pr.second); + } } - } - std::sort(nodeInfos.begin(), nodeInfos.end(), [](TNodeInfo* a, TNodeInfo* b) -> bool { - return std::make_tuple(a->ServicedDomains, a->Id) < std::make_tuple(b->ServicedDomains, b->Id); - }); + std::sort(nodeInfos.begin(), nodeInfos.end(), [](TNodeInfo* a, TNodeInfo* b) -> bool { + return std::make_tuple(a->ServicedDomains, a->Id) < std::make_tuple(b->ServicedDomains, b->Id); + }); - TInstant aliveLine = TInstant::Now() - TDuration::Minutes(10); + TInstant aliveLine = TInstant::Now() - TDuration::Minutes(10); - NJson::TJsonValue& jsonNodes = jsonData["Nodes"]; - for (TNodeInfo* nodeInfo : nodeInfos) { - TNodeInfo& node = *nodeInfo; - TNodeId id = node.Id; + NJson::TJsonValue& jsonNodes = jsonData["Nodes"]; + for (TNodeInfo* nodeInfo : nodeInfos) { + TNodeInfo& node = *nodeInfo; + TNodeId id = node.Id; - if (!node.IsAlive() && TInstant::MilliSeconds(node.Statistics.GetLastAliveTimestamp()) < aliveLine) { - continue; - } + if (!node.IsAlive() && TInstant::MilliSeconds(node.Statistics.GetLastAliveTimestamp()) < aliveLine) { + continue; + } - NJson::TJsonValue& jsonNode = 
jsonNodes.AppendValue(NJson::TJsonValue()); - TString name = ""; - TString host; - auto it = Self->NodesInfo.find(node.Id); - if (it != Self->NodesInfo.end()) { - auto &ni = it->second; - if (ni.Host.empty()) { - name = ni.Address + ":" + ToString(ni.Port); - host = ni.Address; - } else { - name = ni.Host.substr(0, ni.Host.find('.')) + ":" + ToString(ni.Port); - host = ni.Host; + NJson::TJsonValue& jsonNode = jsonNodes.AppendValue(NJson::TJsonValue()); + TString name = ""; + TString host; + auto it = Self->NodesInfo.find(node.Id); + if (it != Self->NodesInfo.end()) { + auto &ni = it->second; + if (ni.Host.empty()) { + name = ni.Address + ":" + ToString(ni.Port); + host = ni.Address; + } else { + name = ni.Host.substr(0, ni.Host.find('.')) + ":" + ToString(ni.Port); + host = ni.Host; + } } - } - jsonNode["Id"] = id; - jsonNode["Host"] = host; - jsonNode["Name"] = name; - if (node.LocationAcquired) { - jsonNode["DataCenter"] = node.Location.GetDataCenterId(); - } - jsonNode["Domain"] = node.ServicedDomains.empty() ? "" : Self->GetDomainName(node.GetServicedDomain()); - jsonNode["Alive"] = node.IsAlive(); - jsonNode["Down"] = node.Down; - jsonNode["Freeze"] = node.Freeze; - jsonNode["Drain"] = node.IsAlive() ? node.Drain : false; - jsonNode["Uptime"] = node.IsAlive() ? 
GetDurationString(node.GetUptime()) : ""; - jsonNode["Unknown"] = node.Tablets[TTabletInfo::EVolatileState::TABLET_VOLATILE_STATE_UNKNOWN].size(); - jsonNode["Starting"] = node.Tablets[TTabletInfo::EVolatileState::TABLET_VOLATILE_STATE_STARTING].size(); - jsonNode["Running"] = node.Tablets[TTabletInfo::EVolatileState::TABLET_VOLATILE_STATE_RUNNING].size(); - { - TString types; - auto nodeTabletTypes = tabletsByNodeByType.find(node.Id); - if (nodeTabletTypes != tabletsByNodeByType.end()) { - for (auto it = nodeTabletTypes->second.begin(); it != nodeTabletTypes->second.end(); ++it) { - if (!types.empty()) { - types += ' '; + jsonNode["Id"] = id; + jsonNode["Host"] = host; + jsonNode["Name"] = name; + if (node.LocationAcquired) { + jsonNode["DataCenter"] = node.Location.GetDataCenterId(); + } + jsonNode["Domain"] = node.ServicedDomains.empty() ? "" : Self->GetDomainName(node.GetServicedDomain()); + jsonNode["Alive"] = node.IsAlive(); + jsonNode["Down"] = node.Down; + jsonNode["Freeze"] = node.Freeze; + jsonNode["Drain"] = node.IsAlive() ? node.Drain : false; + jsonNode["Uptime"] = node.IsAlive() ? 
GetDurationString(node.GetUptime()) : ""; + jsonNode["Unknown"] = node.Tablets[TTabletInfo::EVolatileState::TABLET_VOLATILE_STATE_UNKNOWN].size(); + jsonNode["Starting"] = node.Tablets[TTabletInfo::EVolatileState::TABLET_VOLATILE_STATE_STARTING].size(); + jsonNode["Running"] = node.Tablets[TTabletInfo::EVolatileState::TABLET_VOLATILE_STATE_RUNNING].size(); + { + TString types; + auto nodeTabletTypes = tabletsByNodeByType.find(node.Id); + if (nodeTabletTypes != tabletsByNodeByType.end()) { + for (auto it = nodeTabletTypes->second.begin(); it != nodeTabletTypes->second.end(); ++it) { + if (!types.empty()) { + types += ' '; + } + types += Sprintf("%s:%d", it->first.c_str(), it->second); } - types += Sprintf("%s:%d", it->first.c_str(), it->second); } + jsonNode["Types"] = types; } - jsonNode["Types"] = types; + double nodeUsage = node.GetNodeUsage(); + jsonNode["Usage"] = GetConditionalRedString(Sprintf("%.9f", nodeUsage), nodeUsage >= 1); + jsonNode["ResourceValues"] = GetResourceValuesJson(node.ResourceValues, node.ResourceMaximumValues); + jsonNode["StDevResourceValues"] = GetResourceValuesText(node.GetStDevResourceValues()); } - double nodeUsage = node.GetNodeUsage(); - jsonNode["Usage"] = GetConditionalRedString(Sprintf("%.9f", nodeUsage), nodeUsage >= 1); - jsonNode["ResourceValues"] = GetResourceValuesJson(node.ResourceValues, node.ResourceMaximumValues); - jsonNode["StDevResourceValues"] = GetResourceValuesText(node.GetStDevResourceValues()); } - NJson::TJsonValue& moves = jsonData["Moves"]; - if (Self->TabletMoveHistory.TotalSize()) { - for (int i = Self->TabletMoveHistory.TotalSize() - 1; i >= (int)Self->TabletMoveHistory.FirstIndex(); --i) { - moves.AppendValue(Self->TabletMoveHistory[i].ToHTML()); + if (Cgi.Get("moves") == "1") { + NJson::TJsonValue& moves = jsonData["Moves"]; + if (Self->TabletMoveHistory.TotalSize()) { + for (int i = Self->TabletMoveHistory.TotalSize() - 1; i >= (int)Self->TabletMoveHistory.FirstIndex(); --i) { + 
moves.AppendValue(Self->TabletMoveHistory[i].ToHTML()); + } } } NJson::WriteJson(&out, &jsonData); @@ -2356,8 +2468,10 @@ public: TTxType GetTxType() const override { return NHive::TXTYPE_MON_REBALANCE; } bool Execute(TTransactionContext&, const TActorContext&) override { - Self->LastBalancerTrigger = EBalancerType::Manual; - Self->StartHiveBalancer({.MaxMovements = MaxMovements}); + Self->StartHiveBalancer({ + .Type = EBalancerType::Manual, + .MaxMovements = MaxMovements + }); return true; } diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp index b265ff0128a..aadc3ac627e 100644 --- a/ydb/core/mind/hive/node_info.cpp +++ b/ydb/core/mind/hive/node_info.cpp @@ -69,7 +69,9 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV TabletsRunningByType[tablet->GetTabletType()].erase(tablet); TabletsOfObject[tablet->GetObjectId()].erase(tablet); Hive.UpdateCounterTabletsAlive(-1); - Hive.UpdateObjectCount(tablet->GetObjectId(), Id, -1); + if (tablet->HasCounter()) { + Hive.UpdateObjectCount(tablet->GetObjectId(), Id, -1); + } } if (IsResourceDrainingState(newState)) { if (Tablets[newState].insert(tablet).second) { @@ -82,7 +84,9 @@ bool TNodeInfo::OnTabletChangeVolatileState(TTabletInfo* tablet, TTabletInfo::EV TabletsRunningByType[tablet->GetTabletType()].emplace(tablet); TabletsOfObject[tablet->GetObjectId()].emplace(tablet); Hive.UpdateCounterTabletsAlive(+1); - Hive.UpdateObjectCount(tablet->GetObjectId(), Id, +1); + if (tablet->HasCounter()) { + Hive.UpdateObjectCount(tablet->GetObjectId(), Id, +1); + } } return true; } diff --git a/ydb/core/mind/hive/object_distribution.h b/ydb/core/mind/hive/object_distribution.h index 79dc976790a..857230f3714 100644 --- a/ydb/core/mind/hive/object_distribution.h +++ b/ydb/core/mind/hive/object_distribution.h @@ -11,21 +11,24 @@ namespace NKikimr { namespace NHive { struct TObjectDistribution { - std::multiset<ui64> SortedDistribution; - std::unordered_map<TNodeId, ui64> 
Distribution; + std::multiset<i64> SortedDistribution; + std::unordered_map<TNodeId, i64> Distribution; const TObjectId Id; double Mean = 0; double VarianceNumerator = 0; TObjectDistribution(TObjectId id) : Id(id) {} - ui64 GetImbalance() const { + double GetImbalance() const { if (SortedDistribution.empty()) { return 0; } - ui64 minVal = *SortedDistribution.begin(); - ui64 maxVal = *SortedDistribution.rbegin(); - return std::max<ui64>(maxVal - minVal, 1) - 1; + i64 minVal = *SortedDistribution.begin(); + i64 maxVal = *SortedDistribution.rbegin(); + if (maxVal == 0) { + return 0; + } + return (std::max<double>(maxVal - minVal, 1) - 1) / maxVal; } double GetVariance() const { @@ -35,29 +38,43 @@ struct TObjectDistribution { return VarianceNumerator / Distribution.size(); } - void UpdateCount(TNodeId node, i64 diff) { - ui64& value = Distribution[node]; + void RemoveFromSortedDistribution(i64 value) { + i64 numNodes = Distribution.size(); auto it = SortedDistribution.find(value); + SortedDistribution.erase(it); + double meanWithoutNode = 0; + if (numNodes > 1) { + meanWithoutNode = (Mean * numNodes - value) / (numNodes - 1); + } + VarianceNumerator -= (Mean - value) * (meanWithoutNode - value); + Mean = meanWithoutNode; + } + + void UpdateCount(TNodeId node, i64 diff) { + auto [it, newNode] = Distribution.insert({node, 0}); + i64& value = it->second; i64 numNodes = Distribution.size(); - if (it != SortedDistribution.end()) { - SortedDistribution.erase(it); - double meanWithoutNode = 0; - if (numNodes > 1) { - meanWithoutNode = (Mean * numNodes - value) / (numNodes - 1); - } - VarianceNumerator -= (Mean - value) * (meanWithoutNode - value); - Mean = meanWithoutNode; + if (!newNode) { + RemoveFromSortedDistribution(value); } - Y_VERIFY(diff + value >= 0); + if (diff + value < 0) { + BLOG_ERROR("UpdateObjectCount: new value " << diff + value << " is negative"); + } + Y_VERIFY_DEBUG(diff + value >= 0); value += diff; - if (value > 0) { - SortedDistribution.insert(value); 
- double newMean = (Mean * (numNodes - 1) + value) / numNodes; - VarianceNumerator += (Mean - value) * (newMean - value); - Mean = newMean; - } else { - Distribution.erase(node); + SortedDistribution.insert(value); + double newMean = (Mean * (numNodes - 1) + value) / numNodes; + VarianceNumerator += (Mean - value) * (newMean - value); + Mean = newMean; + } + + void RemoveNode(TNodeId node) { + auto it = Distribution.find(node); + if (it == Distribution.end()) { + return; } + RemoveFromSortedDistribution(it->second); + Distribution.erase(node); } bool operator<(const TObjectDistribution& other) const { @@ -68,11 +85,14 @@ struct TObjectDistribution { struct TObjectDistributions { std::multiset<TObjectDistribution> SortedDistributions; std::unordered_map<TObjectId, std::multiset<TObjectDistribution>::iterator> Distributions; - ui64 TotalImbalance = 0; ui64 ImbalancedObjects = 0; + std::unordered_set<TNodeId> Nodes; - ui64 GetTotalImbalance() { - return TotalImbalance; + double GetMaxImbalance() { + if (SortedDistributions.empty()) { + return 0; + } + return SortedDistributions.rbegin()->GetImbalance(); } struct TObjectToBalance { @@ -85,11 +105,11 @@ struct TObjectDistributions { TObjectToBalance GetObjectToBalance() { Y_VERIFY(!SortedDistributions.empty()); const auto& dist = *SortedDistributions.rbegin(); - ui64 maxCnt = *dist.SortedDistribution.rbegin(); + i64 maxCnt = *dist.SortedDistribution.rbegin(); TObjectToBalance result(dist.Id); for (const auto& [node, cnt] : dist.Distribution) { ui64 n = node; - ui64 c = cnt; + i64 c = cnt; BLOG_TRACE("Node " << n << "has " << c << ", maximum: " << maxCnt); if (cnt == maxCnt) { result.Nodes.push_back(node); @@ -109,30 +129,67 @@ struct TObjectDistributions { return SortedDistributions.rbegin()->GetVariance(); } - void UpdateCount(TObjectId object, TNodeId node, i64 diff) { + template <typename F> + bool UpdateDistribution(TObjectId object, F updateFunc) { auto distIt = Distributions.find(object); if (distIt == 
Distributions.end()) { + return false; + } + auto handle = SortedDistributions.extract(distIt->second); + if (!handle) { + return false; + } + auto& dist = handle.value(); + double imbalanceBefore = dist.GetImbalance(); + updateFunc(dist); + double imbalanceAfter = dist.GetImbalance(); + if (imbalanceBefore <= 1e-7 && imbalanceAfter > 1e-7) { + ++ImbalancedObjects; + } else if (imbalanceBefore > 1e-7 && imbalanceAfter <= 1e-7) { + --ImbalancedObjects; + } + if (!dist.Distribution.empty()) { + auto sortedIt = SortedDistributions.insert(std::move(handle)); + distIt->second = sortedIt; + } else { + Distributions.erase(distIt); + } + return true; + } + + + void UpdateCount(TObjectId object, TNodeId node, i64 diff) { + auto updateFunc = [=](TObjectDistribution& dist) { + dist.UpdateCount(node, diff); + }; + if (!UpdateDistribution(object, updateFunc)) { TObjectDistribution dist(object); + for (auto node : Nodes) { + dist.UpdateCount(node, 0); + } dist.UpdateCount(node, diff); - TotalImbalance += dist.GetImbalance(); auto sortedDistIt = SortedDistributions.insert(std::move(dist)); Distributions.emplace(object, sortedDistIt); return; } - auto handle = SortedDistributions.extract(distIt->second); - Y_VERIFY(handle); - auto& dist = handle.value(); - ui64 imbalanceBefore = dist.GetImbalance(); - dist.UpdateCount(node, diff); - ui64 imbalanceAfter = dist.GetImbalance(); - TotalImbalance += imbalanceAfter - imbalanceBefore; - if (imbalanceBefore == 0 && imbalanceAfter > 0) { - ++ImbalancedObjects; - } else if (imbalanceBefore > 0 && imbalanceAfter == 0) { - --ImbalancedObjects; + // std::cerr << object << ": " << diff << " ~>" << GetTotalImbalance() << std::endl; + } + + void AddNode(TNodeId node) { + Nodes.insert(node); + for (const auto& [obj, it] : Distributions) { + UpdateCount(obj, node, 0); + } + } + + void RemoveNode(TNodeId node) { + Nodes.erase(node); + auto updateFunc = [=](TObjectDistribution& dist) { + dist.RemoveNode(node); + }; + for (auto it = 
Distributions.begin(); it != Distributions.end();) { + UpdateDistribution((it++)->first, updateFunc); } - auto sortedIt = SortedDistributions.insert(std::move(handle)); - distIt->second = sortedIt; } }; diff --git a/ydb/core/mind/hive/object_distribution_ut.cpp b/ydb/core/mind/hive/object_distribution_ut.cpp index c8cefc22eb6..a695d0f03a3 100644 --- a/ydb/core/mind/hive/object_distribution_ut.cpp +++ b/ydb/core/mind/hive/object_distribution_ut.cpp @@ -22,6 +22,10 @@ Y_UNIT_TEST_SUITE(ObjectDistribuiton) { std::uniform_int_distribution<TNodeId> pickNode(0, NUM_NODES - 1); std::bernoulli_distribution subtract(0.2); + for (TNodeId node = 0; node < NUM_NODES; ++node) { + objectDistributions.AddNode(node); + } + for (size_t i = 0; i < NUM_OPERATIONS; i++) { TObjectId object = pickObject(engine); TNodeId node = pickNode(engine); @@ -35,7 +39,6 @@ Y_UNIT_TEST_SUITE(ObjectDistribuiton) { } ui64 imbalancedObjects = 0; - ui64 totalImbalance = 0; for (const auto& [object, it] : objectDistributions.Distributions) { ui64 maxCnt = 0; ui64 minCnt = NUM_OPERATIONS; @@ -43,9 +46,9 @@ Y_UNIT_TEST_SUITE(ObjectDistribuiton) { ui64 nonZeroCount = 0; for (TNodeId node = 0; node < NUM_NODES; ++node) { ui64 cnt = trueDistribution[{node, object}]; - if (cnt == 0) { + /* if (cnt == 0) { continue; - } + }*/ maxCnt = std::max(maxCnt, cnt); minCnt = std::min(minCnt, cnt); total += cnt; @@ -54,28 +57,26 @@ Y_UNIT_TEST_SUITE(ObjectDistribuiton) { if (maxCnt == 0) { continue; } - ui64 trueImbalance = std::max<ui64>(maxCnt - minCnt, 1) - 1; + double trueImbalance = (std::max<double>(maxCnt - minCnt, 1) - 1) / maxCnt; // std::cerr << "imbalance for " << object << " should be " << trueImbalance << std::endl; - ui64 imbalance = it->GetImbalance(); - UNIT_ASSERT_VALUES_EQUAL(trueImbalance, imbalance); + double imbalance = it->GetImbalance(); + UNIT_ASSERT_DOUBLES_EQUAL(trueImbalance, imbalance, 1e-5); - totalImbalance += trueImbalance; - imbalancedObjects += (trueImbalance != 0); + imbalancedObjects 
+= (trueImbalance > 1e-7); double mean = (double)total / nonZeroCount; double varianceNumerator = 0; for (TNodeId node = 0; node < NUM_NODES; ++node) { ui64 cnt = trueDistribution[{node, object}]; - if (cnt == 0) { + /* if (cnt == 0) { continue; - } + }*/ varianceNumerator += (mean - cnt) * (mean - cnt); } double trueVariance = varianceNumerator / nonZeroCount; double variance = it->GetVariance(); UNIT_ASSERT_DOUBLES_EQUAL(trueVariance, variance, 1e-5); } - UNIT_ASSERT_VALUES_EQUAL(totalImbalance, objectDistributions.GetTotalImbalance()); UNIT_ASSERT_VALUES_EQUAL(imbalancedObjects, objectDistributions.GetImbalancedObjectsCount()); } } diff --git a/ydb/core/mind/hive/tablet_info.cpp b/ydb/core/mind/hive/tablet_info.cpp index d64e70101c7..a852abc1d9f 100644 --- a/ydb/core/mind/hive/tablet_info.cpp +++ b/ydb/core/mind/hive/tablet_info.cpp @@ -369,10 +369,16 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics ResourceValues.AddGroupWriteThroughput()->CopyFrom(v); } } - ResourceValues.SetCounter(GetCounterValue(ResourceValues, GetTabletAllowedMetricIds())); + i64 counterBefore = ResourceValues.GetCounter(); + ActualizeCounter(); + i64 counterAfter = ResourceValues.GetCounter(); const auto& after = ResourceValues; if (Node != nullptr) { Node->UpdateResourceValues(this, before, after); + i64 deltaCounter = counterAfter - counterBefore; + if (deltaCounter != 0) { + Hive.UpdateObjectCount(GetObjectId(), Node->Id, deltaCounter); + } } } @@ -435,6 +441,11 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const { } } +void TTabletInfo::ActualizeCounter() { + auto value = GetCounterValue(ResourceValues, GetTabletAllowedMetricIds()); + ResourceValues.SetCounter(value); +} + const TVector<TNodeId>& TTabletInfo::GetAllowedNodes() const { if (IsLeader()) { return AsLeader().AllowedNodes; diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index be4ca7d5865..9d8ea8484e3 100644 --- 
a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -233,6 +233,7 @@ public: static i64 GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds); void FilterRawValues(TResourceRawValues& values) const; void FilterRawValues(TResourceNormalizedValues& values) const; + void ActualizeCounter(); template <typename ResourcesType> static double GetUsage(const ResourcesType& current, const ResourcesType& maximum, EResourceToBalance resource = EResourceToBalance::Dominant) { @@ -295,6 +296,10 @@ public: void ActualizeTabletStatistics(TInstant now); ui64 GetRestartsPerPeriod(TInstant barrier); bool RestartsOften() const; + + bool HasCounter() { + return std::get<NMetrics::EResource::Counter>(GetResourceCurrentValues()) > 0; + } }; diff --git a/ydb/core/mind/hive/tx__kill_node.cpp b/ydb/core/mind/hive/tx__kill_node.cpp index bcfba26feb8..8cc06e642f6 100644 --- a/ydb/core/mind/hive/tx__kill_node.cpp +++ b/ydb/core/mind/hive/tx__kill_node.cpp @@ -58,6 +58,7 @@ public: db.Table<Schema::Node>().Key(NodeId).Update<Schema::Node::Local>(TActorId()); } } + Self->ObjectDistributions.RemoveNode(NodeId); return true; } diff --git a/ydb/core/mind/hive/tx__load_everything.cpp b/ydb/core/mind/hive/tx__load_everything.cpp index 96636d550af..981fde2cd2a 100644 --- a/ydb/core/mind/hive/tx__load_everything.cpp +++ b/ydb/core/mind/hive/tx__load_everything.cpp @@ -423,7 +423,6 @@ public: tablet.NeedToReleaseFromParent = tabletRowset.GetValueOrDefault<Schema::Tablet::NeedToReleaseFromParent>(); tablet.ChannelProfileReassignReason = tabletRowset.GetValueOrDefault<Schema::Tablet::ReassignReason>(); tablet.Statistics = tabletRowset.GetValueOrDefault<Schema::Tablet::Statistics>(); - tablet.InitTabletMetrics(); if (tablet.NodeId == 0) { tablet.BecomeStopped(); @@ -584,7 +583,6 @@ public: TFollowerGroup& followerGroup = tablet->GetFollowerGroup(followerGroupId); TFollowerTabletInfo& follower = tablet->AddFollower(followerGroup, 
followerId); follower.Statistics = tabletFollowerRowset.GetValueOrDefault<Schema::TabletFollowerTablet::Statistics>(); - follower.InitTabletMetrics(); if (nodeId == 0) { follower.BecomeStopped(); } else { @@ -635,6 +633,10 @@ public: << numMissingTablets << " for missing tablets)"); } + for (auto& [tabletId, tablet] : Self->Tablets) { + tablet.ActualizeCounter(); + } + size_t numDeletedNodes = 0; for (auto itNode = Self->Nodes.begin(); itNode != Self->Nodes.end();) { if (itNode->second.CanBeDeleted()) { diff --git a/ydb/core/mind/hive/tx__status.cpp b/ydb/core/mind/hive/tx__status.cpp index 93546a9e08a..95229f65b21 100644 --- a/ydb/core/mind/hive/tx__status.cpp +++ b/ydb/core/mind/hive/tx__status.cpp @@ -47,6 +47,7 @@ public: BLOG_D("THive::TTxStatus(" << nodeId << ")::Complete - continuing node drain"); Self->StartHiveDrain(nodeId, {.Persist = true, .KeepDown = node.Down}); } + Self->ObjectDistributions.AddNode(nodeId); } else { BLOG_W("THive::TTxStatus(status=" << static_cast<int>(status) << " node=" << TNodeInfo::EVolatileStateName(node.GetVolatileState()) << ") - killing node " << node.Id); diff --git a/ydb/core/mind/hive/tx__update_tablets_object.cpp b/ydb/core/mind/hive/tx__update_tablets_object.cpp index 6193bc0938a..3cbca3ed803 100644 --- a/ydb/core/mind/hive/tx__update_tablets_object.cpp +++ b/ydb/core/mind/hive/tx__update_tablets_object.cpp @@ -52,8 +52,10 @@ public: if (auto node = tablet->GetNode(); node != nullptr) { node->TabletsOfObject[oldObject].erase(tablet); node->TabletsOfObject[objectId].emplace(tablet); - Self->UpdateObjectCount(oldObject, node->Id, -1); - Self->UpdateObjectCount(objectId, node->Id, +1); + if (tablet->HasCounter()) { + Self->UpdateObjectCount(oldObject, node->Id, -1); + Self->UpdateObjectCount(objectId, node->Id, +1); + } } db.Table<Schema::Tablet>().Key(tabletId).Update<Schema::Tablet::ObjectID>(objectId); diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index 318c31838be..1036cc5be0f 100644 --- 
a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1726,15 +1726,15 @@ message THiveConfig { optional uint64 MetricsWindowSize = 21 [default = 60000]; // milliseconds optional double MaxNodeUsageToKick = 22 [default = 0.9]; optional uint64 ResourceChangeReactionPeriod = 23 [default = 10]; // seconds - optional uint64 TabletKickCooldownPeriod = 24 [default = 1800]; // seconds + optional uint64 TabletKickCooldownPeriod = 24 [default = 600]; // seconds optional double ResourceOvercommitment = 25 [default = 3.00]; optional uint64 BalancerInflight = 26 [default = 1]; // tablets optional EHiveNodeBalanceStrategy NodeBalanceStrategy = 27 [default = HIVE_NODE_BALANCE_STRATEGY_HEAVIEST]; optional EHiveTabletBalanceStrategy TabletBalanceStrategy = 28 [default = HIVE_TABLET_BALANCE_STRATEGY_WEIGHTED_RANDOM]; - optional double MinPeriodBetweenBalance = 29 [default = 1.0]; // seconds + optional double MinPeriodBetweenBalance = 29 [default = 0.2]; // seconds optional uint64 MaxMovementsOnAutoBalancer = 30 [default = 1]; // tablets optional bool ContinueAutoBalancer = 31 [default = true]; - optional double MinNodeUsageToBalance = 32 [default = 0.3]; + optional double MinNodeUsageToBalance = 32 [default = 0.1]; optional double MinPeriodBetweenReassign = 33 [default = 300.0]; // seconds optional double TabletRestartWatchPeriod = 34 [default = 3600.0]; // seconds optional double NodeRestartWatchPeriod = 35 [default = 3600.0]; // seconds @@ -1766,8 +1766,9 @@ message THiveConfig { optional double MinCPUScatterToBalance = 62 [default = 0.5]; optional double MinMemoryScatterToBalance = 63 [default = 0.5]; optional double MinNetworkScatterToBalance = 64 [default = 0.5]; - optional double MinCounterScatterToBalance = 65 [default = 0.01]; - optional uint64 ObjectImbalanceToBalance = 66 [default = 0]; + optional double MinCounterScatterToBalance = 65 [default = 0.02]; + reserved 66; + optional double ObjectImbalanceToBalance = 67 [default = 0.02]; } message 
TDataShardConfig { |