diff options
author | zalyalov <zalyalov@yandex-team.com> | 2023-07-31 13:03:57 +0300 |
---|---|---|
committer | zalyalov <zalyalov@yandex-team.com> | 2023-07-31 13:03:57 +0300 |
commit | e53b4e5164d3e236957b17a24fcac932ba3cae85 (patch) | |
tree | 7a07899deef6afe68db15ebf3516adefc2950771 | |
parent | db37abc806f241e1a92f5f02d84b09abddc5a3b4 (diff) | |
download | ydb-e53b4e5164d3e236957b17a24fcac932ba3cae85.tar.gz |
more balancer information in hive ui KIKIMR-18706
balancer progress shows what triggered the balancer
max usage and scatter values are shown with coloring based on relation to threshold values
show last 5 tablet moves
-rw-r--r-- | ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/CMakeLists.linux-aarch64.txt | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/CMakeLists.linux-x86_64.txt | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/CMakeLists.windows-x86_64.txt | 1 | ||||
-rw-r--r-- | ydb/core/mind/hive/balancer.cpp | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/drain.cpp | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/fill.cpp | 2 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive.cpp | 9 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive.h | 5 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.cpp | 9 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_impl.h | 20 | ||||
-rw-r--r-- | ydb/core/mind/hive/hive_statics.cpp | 22 | ||||
-rw-r--r-- | ydb/core/mind/hive/monitoring.cpp | 74 | ||||
-rw-r--r-- | ydb/core/mind/hive/ya.make | 1 |
14 files changed, 136 insertions, 14 deletions
diff --git a/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt b/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt index 96414c5a99..245396fdd2 100644 --- a/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt +++ b/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt @@ -14,6 +14,7 @@ target_link_libraries(core-mind-hive PUBLIC yutil cpp-actors-core cpp-actors-interconnect + cpp-containers-ring_buffer library-cpp-json cpp-monlib-dynamic_counters ydb-core-base diff --git a/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt b/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt index 3a4b9dc0a6..df4f5c4075 100644 --- a/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt +++ b/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt @@ -15,6 +15,7 @@ target_link_libraries(core-mind-hive PUBLIC yutil cpp-actors-core cpp-actors-interconnect + cpp-containers-ring_buffer library-cpp-json cpp-monlib-dynamic_counters ydb-core-base diff --git a/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt b/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt index 3a4b9dc0a6..df4f5c4075 100644 --- a/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt +++ b/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt @@ -15,6 +15,7 @@ target_link_libraries(core-mind-hive PUBLIC yutil cpp-actors-core cpp-actors-interconnect + cpp-containers-ring_buffer library-cpp-json cpp-monlib-dynamic_counters ydb-core-base diff --git a/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt b/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt index 96414c5a99..245396fdd2 100644 --- a/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt +++ b/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt @@ -14,6 +14,7 @@ target_link_libraries(core-mind-hive PUBLIC yutil cpp-actors-core cpp-actors-interconnect + cpp-containers-ring_buffer library-cpp-json cpp-monlib-dynamic_counters ydb-core-base diff --git a/ydb/core/mind/hive/balancer.cpp b/ydb/core/mind/hive/balancer.cpp index f539d8a4d3..27261627b1 100644 --- a/ydb/core/mind/hive/balancer.cpp +++ b/ydb/core/mind/hive/balancer.cpp @@ -270,7 +270,7 @@ protected: BLOG_D("Balancer moving tablet " << tablet->ToString() << " " << tablet->GetResourceValues() << " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues << " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues); - Hive->TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1); + Hive->RecordTabletMove({now, tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id}); Hive->Execute(Hive->CreateRestartTablet(tablet->GetFullTabletId(), result.BestNode->Id)); UpdateProgress(); if (!CanKickNextTablet()) { diff --git a/ydb/core/mind/hive/drain.cpp b/ydb/core/mind/hive/drain.cpp index 2e263a7f0e..180cb2dd06 100644 --- a/ydb/core/mind/hive/drain.cpp +++ b/ydb/core/mind/hive/drain.cpp @@ -70,7 +70,7 @@ protected: << " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues << " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues); Hive->TabletCounters->Cumulative()[NHive::COUNTER_DRAIN_EXECUTED].Increment(1); - Hive->TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1); + Hive->RecordTabletMove({TInstant::Now(), tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id}); Hive->Execute(Hive->CreateRestartTablet(tabletId, result.BestNode->Id)); } else { Hive->TabletCounters->Cumulative()[NHive::COUNTER_DRAIN_FAILED].Increment(1); diff --git a/ydb/core/mind/hive/fill.cpp b/ydb/core/mind/hive/fill.cpp index f90fa56f42..ca821cff57 100644 --- a/ydb/core/mind/hive/fill.cpp +++ b/ydb/core/mind/hive/fill.cpp @@ -56,7 +56,7 @@ protected: << " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues << " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues); Hive->TabletCounters->Cumulative()[NHive::COUNTER_FILL_EXECUTED].Increment(1); - Hive->TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1); + Hive->RecordTabletMove({TInstant::Now(), tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id}); Hive->Execute(Hive->CreateRestartTablet(tablet->GetFullTabletId(), result.BestNode->Id), ctx); } } diff --git a/ydb/core/mind/hive/hive.cpp b/ydb/core/mind/hive/hive.cpp index 58c64db832..437bbd477b 100644 --- a/ydb/core/mind/hive/hive.cpp +++ b/ydb/core/mind/hive/hive.cpp @@ -26,6 +26,15 @@ TString EFollowerStrategyName(EFollowerStrategy value) { } } +TString EBalancerTypeName(EBalancerType value) { + switch (value) { + case EBalancerType::None: return "???"; + case EBalancerType::Scatter: return "Scatter"; + case EBalancerType::Emergency: return "Emergency"; + case EBalancerType::Manual: return "Manual"; + } +} + TResourceNormalizedValues NormalizeRawValues(const TResourceRawValues& values, const TResourceRawValues& maximum) { TResourceNormalizedValues normValues = {}; if (std::get<NMetrics::EResource::Counter>(maximum) != 0) { diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index 8791cfb0c4..2031119817 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -71,13 +71,16 @@ enum class EFollowerStrategy : ui32 { Read, }; +TString EFollowerStrategyName(EFollowerStrategy value); + enum class EBalancerType { None, Scatter, Emergency, + Manual, }; -TString EFollowerStrategyName(EFollowerStrategy value); +TString EBalancerTypeName(EBalancerType value); struct ISubActor { virtual void Cleanup() = 0; diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 30ea7e4fc2..e1a11a7860 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -1569,6 +1569,11 @@ void THive::UpdateCounterEventQueueSize(i64 eventQueueSizeDiff) { } } +void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) { + TabletMoveHistory.PushBack(moveInfo); + TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1); +} + bool THive::DomainHasNodes(const TSubDomainKey &domainKey) const { return !DomainsView.IsEmpty(domainKey); } @@ -2109,7 +2114,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) { ProcessTabletBalancerScheduled = false; if (!SubActors.empty()) { BLOG_D("Balancer has been postponed because of sub activity"); - ProcessTabletBalancerPostponed = false; + ProcessTabletBalancerPostponed = true; return; } @@ -2439,6 +2444,8 @@ TDuration THive::GetBalancerCooldown() const { return GetMinPeriodBetweenBalance(); case EBalancerType::Emergency: return GetMinPeriodBetweenEmergencyBalance(); + case EBalancerType::Manual: + return TDuration::Seconds(1); } } diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index be6ea78637..e43b7aec95 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -31,6 +31,7 @@ #include <library/cpp/actors/core/interconnect.h> #include <library/cpp/actors/core/hfunc.h> +#include <library/cpp/containers/ring_buffer/ring_buffer.h> #include <util/generic/queue.h> #include <util/random/random.h> @@ -138,11 +139,13 @@ TString GetTimes(ui64 times, const TString& zero = "0.00%"); TString GetConditionalGreyString(const TString& str, bool condition); TString GetConditionalBoldString(const TString& str, bool condition); TString GetConditionalRedString(const TString& str, bool condition); +TString GetColoredValue(double val, double maxVal); TString GetDataCenterName(ui64 dataCenterId); TString LongToShortTabletName(const TString& longTabletName); TString GetLocationString(const NActors::TNodeLocation& location); void MakeTabletTypeSet(std::vector<TTabletTypes::EType>& list); bool IsValidTabletType(TTabletTypes::EType type); +TString GetBalancerProgressText(i32 balancerProgress, EBalancerType balancerType); class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveSharedSettings { public: @@ -420,6 +423,22 @@ protected: // normalized to be sorted list of unique values std::vector<TTabletTypes::EType> BalancerIgnoreTabletTypes; // built from CurrentConfig + struct TTabletMoveInfo { + TInstant Timestamp; + TFullTabletId Tablet; + TNodeId From; + TNodeId To; + + TString ToHTML() { + TStringBuilder str; + str << "<tr><td>" << Timestamp << "</td><td>" << Tablet + << "</td><td>" << From << "→" << To << "</td><tr>"; + return str; + } + }; + + TStaticRingBuffer<TTabletMoveInfo, 5> TabletMoveHistory; + // to be removed later bool TabletOwnersSynced = false; // to be removed later @@ -592,6 +611,7 @@ public: void UpdateCounterTabletsAlive(i64 tabletsAliveDiff); void UpdateCounterBootQueueSize(ui64 bootQueueSize); void UpdateCounterEventQueueSize(i64 eventQueueSizeDiff); + void RecordTabletMove(const TTabletMoveInfo& info); bool DomainHasNodes(const TSubDomainKey &domainKey) const; void ProcessBootQueue(); void ProcessWaitQueue(); diff --git a/ydb/core/mind/hive/hive_statics.cpp b/ydb/core/mind/hive/hive_statics.cpp index 4eae631b94..f4c19ba58a 100644 --- a/ydb/core/mind/hive/hive_statics.cpp +++ b/ydb/core/mind/hive/hive_statics.cpp @@ -196,6 +196,20 @@ TString GetConditionalRedString(const TString& str, bool condition) { } } +TString GetColoredValue(double val, double maxVal) { + double ratio = val / maxVal; + TString color; + if (ratio < 0.9) { + color = "green"; + } else if (ratio < 1.0) { + color = "yellow"; + } else { + color = "red"; + } + + return Sprintf("<span style='color:%s'>%.2f</span>", color.c_str(), val); +} + ui64 GetReadThroughput(const NKikimrTabletBase::TMetrics& values) { ui64 acc = 0; for (const auto& throughput : values.GetGroupReadThroughput()) { @@ -351,5 +365,13 @@ bool IsValidTabletType(TTabletTypes::EType type) { ); } +TString GetBalancerProgressText(i32 balancerProgress, EBalancerType balancerType) { + TStringBuilder str; + if (balancerProgress >= 0) { + str << balancerProgress << "% (" << EBalancerTypeName(balancerType) << ")"; + } + return str; +} + } // NHive } // NKikimr diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index 3e567133a6..d5eba8979c 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -1352,13 +1352,16 @@ public: << aliveNodes << "/" << nodes << "</td></tr>";*/ out << "<tr><td>" << "Tablets:" << "</td><td id='runningTablets'>" << (tablets == 0 ? 0 : runningTablets * 100 / tablets) << "% " << runningTablets << "/" << tablets << "</td></tr>"; - out << "<tr><td><button type='button' class='btn btn-link' data-toggle='modal' data-target='#rebalance'>Balancer: </button></td><td id='balancerProgress'>" - << (Self->BalancerProgress >= 0 ? Sprintf("%d%%", Self->BalancerProgress) : TString()) << "</td></tr>"; + out << "<tr><td><a role='button' data-toggle='modal' href='#rebalance'>Balancer:</a></td><td id='balancerProgress'>" + << GetBalancerProgressText(Self->BalancerProgress, Self->LastBalancerTrigger) << "</td></tr>"; out << "<tr><td>" << "Boot Queue:" << "</td><td id='bootQueue'>" << Self->BootQueue.BootQueue.size() << "</td></tr>"; out << "<tr><td>" << "Wait Queue:" << "</td><td id='waitQueue'>" << Self->BootQueue.WaitQueue.size() << "</td></tr>"; out << "<tr><td>" << "Resource Total: " << "</td><td id='resourceTotal'>" << GetResourceValuesText(Self->TotalRawResourceValues) << "</td></tr>"; out << "<tr><td>" << "Resource StDev: " << "</td><td id='resourceVariance'>" << convert(Self->GetStDevResourceValues(), [](double d) -> TString { return Sprintf("%.9f", d); }) << "</td></tr>"; + THive::THiveStats stats = Self->GetStats(); + out << "<tr><td>" << "Max usage:" << "<td id='maxUsage'>" << GetColoredValue(stats.MaxUsage, Self->GetMaxNodeUsageToKick()) << "</td></tr>"; + out << "<tr><td>" << "Scatter:" << "<td id='scatter'>" << GetColoredValue(stats.Scatter, Self->GetMinScatterToBalance()) << "</td></tr>"; out << "</table>"; out << "<table id='node_table' class='table simple-table2 table-hover table-condensed'>"; @@ -1546,17 +1549,26 @@ public: <div class='modal-content'> <div class='modal-header'> <button type='button' class='close' data-dismiss='modal'>×</button> - <h4 class='modal-title'>Rebalance tablets</h4> + <h4 class='modal-title'>Balancer</h4> </div> <div class='modal-body'> <div class='row'> <div class='col-md-12'> <h2> Run Balancer</h2> + </div> + </div> + <div class='row'> + <div class='col-md-2'> <label for='balancer_max_movements'>Max movements</label> <div in='balancer_max_movements' class='input-group'> <input id='balancer_max_movements' type='number' value='1000' class='form-control'> </div> - <button type='submit' class='btn btn-primary' onclick='rebalanceTablets()' data-dismiss='modal'>Run</button> + <br> + </div> + </div> + <div class='row'> + <div class='col-md-2'> + <button type='submit' class='btn btn-primary' onclick='rebalanceTablets()' data-dismiss='modal' id='run-balancer'>Run</button> </div> </div> <div class='row'> @@ -1567,13 +1579,45 @@ public: <div class='row'> <div class='col-md-12'> <h2> Rebalance ALL tablets FROM SCRATCH</h2> + </div> + </div> + <div class='row'> + <div class='col-md-8'> <label for='tenant_name'> Please enter the tenant name to confirm you know what you are doing</label> <div in='tenant_name' class='input-group' style='width:100%'> <input id='tenant_name' type='text' class='form-control'> </div> + <br> + </div> + </div> + <div class='row'> + <div class='col-md-2'> <button id='button_rebalance' type='submit' class='btn btn-danger' onclick='rebalanceTabletsFromScratch();' data-dismiss='modal'>Run</button> </div> </div> + <div class='row'> + <div class='col-md-12'> + <hr> + </div> + </div> + <div class='row'> + <div class='col-md-12'> + <h2> Latest tablet moves</h2> + </div> + </div> + <div class='row'> + <div class='col-md-12'> + <table id='move_history' class='table table-stripped'> + <thead> + <th>Timestamp</th> + <th>Tablet</th> + <th>Node</th> + </thead> + <tbody> + </tbody> + </table> + </div> + </div> </div> <div class='modal-footer'> <button type='button' class='btn btn-default' data-dismiss='modal'>Cancel</button> @@ -1784,10 +1828,12 @@ public: $('#resourceTotal').html(result.ResourceTotal); $('#bootQueue').html(result.BootQueueSize); $('#waitQueue').html(result.WaitQueueSize); - if (result.BalancerProgress >= 0) { - $('#balancerProgress').html(result.BalancerProgress + '%'); - } else { - $('#balancerProgress').html(''); + $('#balancerProgress').html(result.BalancerProgress); + $('#maxUsage').html(result.MaxUsage); + $('#scatter').html(result.Scatter); + $('#move_history > tbody > tr').remove(); + for (var i in result.Moves) { + $(result.Moves[i]).appendTo('#move_history > tbody'); } var old_nodes = {}; if (Empty) { @@ -2004,6 +2050,7 @@ public: } NJson::TJsonValue jsonData; + THive::THiveStats stats = Self->GetStats(); jsonData["TotalTablets"] = tablets; jsonData["RunningTablets"] = runningTablets; @@ -2013,7 +2060,9 @@ public: jsonData["ResourceVariance"] = GetResourceValuesText(Self->GetStDevResourceValues());//, [](double d) -> TString { return Sprintf("%.9f", d); }); jsonData["BootQueueSize"] = Self->BootQueue.BootQueue.size(); jsonData["WaitQueueSize"] = Self->BootQueue.WaitQueue.size(); - jsonData["BalancerProgress"] = Self->BalancerProgress; + jsonData["BalancerProgress"] = GetBalancerProgressText(Self->BalancerProgress, Self->LastBalancerTrigger); + jsonData["MaxUsage"] = GetColoredValue(stats.MaxUsage, Self->GetMaxNodeUsageToKick()) ; + jsonData["Scatter"] = GetColoredValue(stats.Scatter, Self->GetMinScatterToBalance()); TVector<TNodeInfo*> nodeInfos; nodeInfos.reserve(Self->Nodes.size()); @@ -2085,6 +2134,12 @@ public: jsonNode["ResourceValues"] = GetResourceValuesJson(node.ResourceValues, node.ResourceMaximumValues); jsonNode["StDevResourceValues"] = GetResourceValuesText(node.GetStDevResourceValues()); } + NJson::TJsonValue& moves = jsonData["Moves"]; + if (Self->TabletMoveHistory.TotalSize()) { + for (int i = Self->TabletMoveHistory.TotalSize() - 1; i >= (int)Self->TabletMoveHistory.FirstIndex(); --i) { + moves.AppendValue(Self->TabletMoveHistory[i].ToHTML()); + } + } NJson::WriteJson(&out, &jsonData); } }; @@ -2288,6 +2343,7 @@ public: TTxType GetTxType() const override { return NHive::TXTYPE_MON_REBALANCE; } bool Execute(TTransactionContext&, const TActorContext&) override { + Self->LastBalancerTrigger = EBalancerType::Manual; Self->StartHiveBalancer(MaxMovements); return true; } diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make index 12034b12a0..de0262a0f7 100644 --- a/ydb/core/mind/hive/ya.make +++ b/ydb/core/mind/hive/ya.make @@ -80,6 +80,7 @@ SRCS( PEERDIR( library/cpp/actors/core library/cpp/actors/interconnect + library/cpp/containers/ring_buffer library/cpp/json library/cpp/monlib/dynamic_counters ydb/core/base |