aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author zalyalov <zalyalov@yandex-team.com> 2023-07-31 13:03:57 +0300
committer zalyalov <zalyalov@yandex-team.com> 2023-07-31 13:03:57 +0300
commit e53b4e5164d3e236957b17a24fcac932ba3cae85 (patch)
tree 7a07899deef6afe68db15ebf3516adefc2950771
parent db37abc806f241e1a92f5f02d84b09abddc5a3b4 (diff)
download ydb-e53b4e5164d3e236957b17a24fcac932ba3cae85.tar.gz
more balancer information in hive ui KIKIMR-18706
Balancer progress shows what triggered the balancer; max usage and scatter values are shown with coloring based on their relation to threshold values; the last 5 tablet moves are shown.
-rw-r--r-- ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt 1
-rw-r--r-- ydb/core/mind/hive/CMakeLists.linux-aarch64.txt 1
-rw-r--r-- ydb/core/mind/hive/CMakeLists.linux-x86_64.txt 1
-rw-r--r-- ydb/core/mind/hive/CMakeLists.windows-x86_64.txt 1
-rw-r--r-- ydb/core/mind/hive/balancer.cpp 2
-rw-r--r-- ydb/core/mind/hive/drain.cpp 2
-rw-r--r-- ydb/core/mind/hive/fill.cpp 2
-rw-r--r-- ydb/core/mind/hive/hive.cpp 9
-rw-r--r-- ydb/core/mind/hive/hive.h 5
-rw-r--r-- ydb/core/mind/hive/hive_impl.cpp 9
-rw-r--r-- ydb/core/mind/hive/hive_impl.h 20
-rw-r--r-- ydb/core/mind/hive/hive_statics.cpp 22
-rw-r--r-- ydb/core/mind/hive/monitoring.cpp 74
-rw-r--r-- ydb/core/mind/hive/ya.make 1
14 files changed, 136 insertions, 14 deletions
diff --git a/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt b/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt
index 96414c5a99..245396fdd2 100644
--- a/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt
+++ b/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt
@@ -14,6 +14,7 @@ target_link_libraries(core-mind-hive PUBLIC
yutil
cpp-actors-core
cpp-actors-interconnect
+ cpp-containers-ring_buffer
library-cpp-json
cpp-monlib-dynamic_counters
ydb-core-base
diff --git a/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt b/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt
index 3a4b9dc0a6..df4f5c4075 100644
--- a/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt
+++ b/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt
@@ -15,6 +15,7 @@ target_link_libraries(core-mind-hive PUBLIC
yutil
cpp-actors-core
cpp-actors-interconnect
+ cpp-containers-ring_buffer
library-cpp-json
cpp-monlib-dynamic_counters
ydb-core-base
diff --git a/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt b/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt
index 3a4b9dc0a6..df4f5c4075 100644
--- a/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt
+++ b/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt
@@ -15,6 +15,7 @@ target_link_libraries(core-mind-hive PUBLIC
yutil
cpp-actors-core
cpp-actors-interconnect
+ cpp-containers-ring_buffer
library-cpp-json
cpp-monlib-dynamic_counters
ydb-core-base
diff --git a/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt b/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt
index 96414c5a99..245396fdd2 100644
--- a/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt
+++ b/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt
@@ -14,6 +14,7 @@ target_link_libraries(core-mind-hive PUBLIC
yutil
cpp-actors-core
cpp-actors-interconnect
+ cpp-containers-ring_buffer
library-cpp-json
cpp-monlib-dynamic_counters
ydb-core-base
diff --git a/ydb/core/mind/hive/balancer.cpp b/ydb/core/mind/hive/balancer.cpp
index f539d8a4d3..27261627b1 100644
--- a/ydb/core/mind/hive/balancer.cpp
+++ b/ydb/core/mind/hive/balancer.cpp
@@ -270,7 +270,7 @@ protected:
BLOG_D("Balancer moving tablet " << tablet->ToString() << " " << tablet->GetResourceValues()
<< " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues
<< " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues);
- Hive->TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);
+ Hive->RecordTabletMove({now, tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id});
Hive->Execute(Hive->CreateRestartTablet(tablet->GetFullTabletId(), result.BestNode->Id));
UpdateProgress();
if (!CanKickNextTablet()) {
diff --git a/ydb/core/mind/hive/drain.cpp b/ydb/core/mind/hive/drain.cpp
index 2e263a7f0e..180cb2dd06 100644
--- a/ydb/core/mind/hive/drain.cpp
+++ b/ydb/core/mind/hive/drain.cpp
@@ -70,7 +70,7 @@ protected:
<< " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues
<< " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues);
Hive->TabletCounters->Cumulative()[NHive::COUNTER_DRAIN_EXECUTED].Increment(1);
- Hive->TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);
+ Hive->RecordTabletMove({TInstant::Now(), tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id});
Hive->Execute(Hive->CreateRestartTablet(tabletId, result.BestNode->Id));
} else {
Hive->TabletCounters->Cumulative()[NHive::COUNTER_DRAIN_FAILED].Increment(1);
diff --git a/ydb/core/mind/hive/fill.cpp b/ydb/core/mind/hive/fill.cpp
index f90fa56f42..ca821cff57 100644
--- a/ydb/core/mind/hive/fill.cpp
+++ b/ydb/core/mind/hive/fill.cpp
@@ -56,7 +56,7 @@ protected:
<< " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues
<< " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues);
Hive->TabletCounters->Cumulative()[NHive::COUNTER_FILL_EXECUTED].Increment(1);
- Hive->TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);
+ Hive->RecordTabletMove({TInstant::Now(), tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id});
Hive->Execute(Hive->CreateRestartTablet(tablet->GetFullTabletId(), result.BestNode->Id), ctx);
}
}
diff --git a/ydb/core/mind/hive/hive.cpp b/ydb/core/mind/hive/hive.cpp
index 58c64db832..437bbd477b 100644
--- a/ydb/core/mind/hive/hive.cpp
+++ b/ydb/core/mind/hive/hive.cpp
@@ -26,6 +26,15 @@ TString EFollowerStrategyName(EFollowerStrategy value) {
}
}
+TString EBalancerTypeName(EBalancerType value) { // label for a balancer trigger type; used by GetBalancerProgressText()
+ switch (value) {
+ case EBalancerType::None: return "???"; // placeholder label for None
+ case EBalancerType::Scatter: return "Scatter";
+ case EBalancerType::Emergency: return "Emergency";
+ case EBalancerType::Manual: return "Manual";
+ } // NOTE(review): no return after the switch; all four enumerators declared in hive.h are handled above
+}
+
TResourceNormalizedValues NormalizeRawValues(const TResourceRawValues& values, const TResourceRawValues& maximum) {
TResourceNormalizedValues normValues = {};
if (std::get<NMetrics::EResource::Counter>(maximum) != 0) {
diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h
index 8791cfb0c4..2031119817 100644
--- a/ydb/core/mind/hive/hive.h
+++ b/ydb/core/mind/hive/hive.h
@@ -71,13 +71,16 @@ enum class EFollowerStrategy : ui32 {
Read,
};
+TString EFollowerStrategyName(EFollowerStrategy value);
+
enum class EBalancerType { // kind of trigger behind a balancer run; converted to text by EBalancerTypeName()
None,
Scatter,
Emergency,
+ Manual, // assigned to LastBalancerTrigger by the monitoring rebalance transaction before StartHiveBalancer()
};
-TString EFollowerStrategyName(EFollowerStrategy value);
+TString EBalancerTypeName(EBalancerType value);
struct ISubActor {
virtual void Cleanup() = 0;
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp
index 30ea7e4fc2..e1a11a7860 100644
--- a/ydb/core/mind/hive/hive_impl.cpp
+++ b/ydb/core/mind/hive/hive_impl.cpp
@@ -1569,6 +1569,11 @@ void THive::UpdateCounterEventQueueSize(i64 eventQueueSizeDiff) {
}
}
+void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) { // called by balancer, drain and fill right before they restart a tablet on another node
+ TabletMoveHistory.PushBack(moveInfo); // TStaticRingBuffer<TTabletMoveInfo, 5> read by the monitoring JSON ("Moves")
+ TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1); // one counter tick per recorded move
+}
+
bool THive::DomainHasNodes(const TSubDomainKey &domainKey) const {
return !DomainsView.IsEmpty(domainKey);
}
@@ -2109,7 +2114,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
ProcessTabletBalancerScheduled = false;
if (!SubActors.empty()) {
BLOG_D("Balancer has been postponed because of sub activity");
- ProcessTabletBalancerPostponed = false;
+ ProcessTabletBalancerPostponed = true;
return;
}
@@ -2439,6 +2444,8 @@ TDuration THive::GetBalancerCooldown() const {
return GetMinPeriodBetweenBalance();
case EBalancerType::Emergency:
return GetMinPeriodBetweenEmergencyBalance();
+ case EBalancerType::Manual:
+ return TDuration::Seconds(1);
}
}
diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h
index be6ea78637..e43b7aec95 100644
--- a/ydb/core/mind/hive/hive_impl.h
+++ b/ydb/core/mind/hive/hive_impl.h
@@ -31,6 +31,7 @@
#include <library/cpp/actors/core/interconnect.h>
#include <library/cpp/actors/core/hfunc.h>
+#include <library/cpp/containers/ring_buffer/ring_buffer.h>
#include <util/generic/queue.h>
#include <util/random/random.h>
@@ -138,11 +139,13 @@ TString GetTimes(ui64 times, const TString& zero = "0.00%");
TString GetConditionalGreyString(const TString& str, bool condition);
TString GetConditionalBoldString(const TString& str, bool condition);
TString GetConditionalRedString(const TString& str, bool condition);
+TString GetColoredValue(double val, double maxVal);
TString GetDataCenterName(ui64 dataCenterId);
TString LongToShortTabletName(const TString& longTabletName);
TString GetLocationString(const NActors::TNodeLocation& location);
void MakeTabletTypeSet(std::vector<TTabletTypes::EType>& list);
bool IsValidTabletType(TTabletTypes::EType type);
+TString GetBalancerProgressText(i32 balancerProgress, EBalancerType balancerType);
class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveSharedSettings {
public:
@@ -420,6 +423,22 @@ protected:
// normalized to be sorted list of unique values
std::vector<TTabletTypes::EType> BalancerIgnoreTabletTypes; // built from CurrentConfig
+ struct TTabletMoveInfo { // one tablet relocation, as passed to RecordTabletMove()
+ TInstant Timestamp; // time supplied by the caller when the move is recorded
+ TFullTabletId Tablet; // tablet being moved
+ TNodeId From; // node the tablet is leaving
+ TNodeId To; // node selected as the new location
+
+ TString ToHTML() const { // renders this record as a single table row; does not modify the record
+ TStringBuilder str;
+ str << "<tr><td>" << Timestamp << "</td><td>" << Tablet
+ << "</td><td>" << From << "&rarr;" << To << "</td></tr>"; // row is closed so each record yields exactly one <tr>
+ return str;
+ }
+ };
+
+ TStaticRingBuffer<TTabletMoveInfo, 5> TabletMoveHistory;
+
// to be removed later
bool TabletOwnersSynced = false;
// to be removed later
@@ -592,6 +611,7 @@ public:
void UpdateCounterTabletsAlive(i64 tabletsAliveDiff);
void UpdateCounterBootQueueSize(ui64 bootQueueSize);
void UpdateCounterEventQueueSize(i64 eventQueueSizeDiff);
+ void RecordTabletMove(const TTabletMoveInfo& info);
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
void ProcessBootQueue();
void ProcessWaitQueue();
diff --git a/ydb/core/mind/hive/hive_statics.cpp b/ydb/core/mind/hive/hive_statics.cpp
index 4eae631b94..f4c19ba58a 100644
--- a/ydb/core/mind/hive/hive_statics.cpp
+++ b/ydb/core/mind/hive/hive_statics.cpp
@@ -196,6 +196,20 @@ TString GetConditionalRedString(const TString& str, bool condition) {
}
}
+TString GetColoredValue(double val, double maxVal) { // returns val formatted as "%.2f" inside a <span> colored by val / maxVal
+ double ratio = val / maxVal; // NOTE(review): maxVal == 0 yields inf/NaN; for val >= 0 that ends in the "red" branch - confirm this is intended
+ TString color;
+ if (ratio < 0.9) {
+ color = "green"; // ratio below 0.9
+ } else if (ratio < 1.0) {
+ color = "yellow"; // ratio in [0.9, 1.0)
+ } else {
+ color = "red"; // ratio of 1.0 or more (also reached when the comparisons above are false for NaN)
+ }
+
+ return Sprintf("<span style='color:%s'>%.2f</span>", color.c_str(), val);
+}
+
ui64 GetReadThroughput(const NKikimrTabletBase::TMetrics& values) {
ui64 acc = 0;
for (const auto& throughput : values.GetGroupReadThroughput()) {
@@ -351,5 +365,13 @@ bool IsValidTabletType(TTabletTypes::EType type) {
);
}
+TString GetBalancerProgressText(i32 balancerProgress, EBalancerType balancerType) { // builds "<progress>% (<trigger name>)" for the balancerProgress cell
+ TStringBuilder str;
+ if (balancerProgress >= 0) { // a negative progress value produces an empty string
+ str << balancerProgress << "% (" << EBalancerTypeName(balancerType) << ")";
+ }
+ return str;
+}
+
} // NHive
} // NKikimr
diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp
index 3e567133a6..d5eba8979c 100644
--- a/ydb/core/mind/hive/monitoring.cpp
+++ b/ydb/core/mind/hive/monitoring.cpp
@@ -1352,13 +1352,16 @@ public:
<< aliveNodes << "/" << nodes << "</td></tr>";*/
out << "<tr><td>" << "Tablets:" << "</td><td id='runningTablets'>" << (tablets == 0 ? 0 : runningTablets * 100 / tablets) << "% "
<< runningTablets << "/" << tablets << "</td></tr>";
- out << "<tr><td><button type='button' class='btn btn-link' data-toggle='modal' data-target='#rebalance'>Balancer: </button></td><td id='balancerProgress'>"
- << (Self->BalancerProgress >= 0 ? Sprintf("%d%%", Self->BalancerProgress) : TString()) << "</td></tr>";
+ out << "<tr><td><a role='button' data-toggle='modal' href='#rebalance'>Balancer:</a></td><td id='balancerProgress'>"
+ << GetBalancerProgressText(Self->BalancerProgress, Self->LastBalancerTrigger) << "</td></tr>";
out << "<tr><td>" << "Boot Queue:" << "</td><td id='bootQueue'>" << Self->BootQueue.BootQueue.size() << "</td></tr>";
out << "<tr><td>" << "Wait Queue:" << "</td><td id='waitQueue'>" << Self->BootQueue.WaitQueue.size() << "</td></tr>";
out << "<tr><td>" << "Resource Total: " << "</td><td id='resourceTotal'>" << GetResourceValuesText(Self->TotalRawResourceValues) << "</td></tr>";
out << "<tr><td>" << "Resource StDev: " << "</td><td id='resourceVariance'>"
<< convert(Self->GetStDevResourceValues(), [](double d) -> TString { return Sprintf("%.9f", d); }) << "</td></tr>";
+ THive::THiveStats stats = Self->GetStats();
+ out << "<tr><td>" << "Max usage:" << "<td id='maxUsage'>" << GetColoredValue(stats.MaxUsage, Self->GetMaxNodeUsageToKick()) << "</td></tr>";
+ out << "<tr><td>" << "Scatter:" << "<td id='scatter'>" << GetColoredValue(stats.Scatter, Self->GetMinScatterToBalance()) << "</td></tr>";
out << "</table>";
out << "<table id='node_table' class='table simple-table2 table-hover table-condensed'>";
@@ -1546,17 +1549,26 @@ public:
<div class='modal-content'>
<div class='modal-header'>
<button type='button' class='close' data-dismiss='modal'>&times;</button>
- <h4 class='modal-title'>Rebalance tablets</h4>
+ <h4 class='modal-title'>Balancer</h4>
</div>
<div class='modal-body'>
<div class='row'>
<div class='col-md-12'>
<h2> Run Balancer</h2>
+ </div>
+ </div>
+ <div class='row'>
+ <div class='col-md-2'>
<label for='balancer_max_movements'>Max movements</label>
<div in='balancer_max_movements' class='input-group'>
<input id='balancer_max_movements' type='number' value='1000' class='form-control'>
</div>
- <button type='submit' class='btn btn-primary' onclick='rebalanceTablets()' data-dismiss='modal'>Run</button>
+ <br>
+ </div>
+ </div>
+ <div class='row'>
+ <div class='col-md-2'>
+ <button type='submit' class='btn btn-primary' onclick='rebalanceTablets()' data-dismiss='modal' id='run-balancer'>Run</button>
</div>
</div>
<div class='row'>
@@ -1567,13 +1579,45 @@ public:
<div class='row'>
<div class='col-md-12'>
<h2> Rebalance ALL tablets FROM SCRATCH</h2>
+ </div>
+ </div>
+ <div class='row'>
+ <div class='col-md-8'>
<label for='tenant_name'> Please enter the tenant name to confirm you know what you are doing</label>
<div in='tenant_name' class='input-group' style='width:100%'>
<input id='tenant_name' type='text' class='form-control'>
</div>
+ <br>
+ </div>
+ </div>
+ <div class='row'>
+ <div class='col-md-2'>
<button id='button_rebalance' type='submit' class='btn btn-danger' onclick='rebalanceTabletsFromScratch();' data-dismiss='modal'>Run</button>
</div>
</div>
+ <div class='row'>
+ <div class='col-md-12'>
+ <hr>
+ </div>
+ </div>
+ <div class='row'>
+ <div class='col-md-12'>
+ <h2> Latest tablet moves</h2>
+ </div>
+ </div>
+ <div class='row'>
+ <div class='col-md-12'>
+ <table id='move_history' class='table table-striped'>
+ <thead>
+ <th>Timestamp</th>
+ <th>Tablet</th>
+ <th>Node</th>
+ </thead>
+ <tbody>
+ </tbody>
+ </table>
+ </div>
+ </div>
</div>
<div class='modal-footer'>
<button type='button' class='btn btn-default' data-dismiss='modal'>Cancel</button>
@@ -1784,10 +1828,12 @@ public:
$('#resourceTotal').html(result.ResourceTotal);
$('#bootQueue').html(result.BootQueueSize);
$('#waitQueue').html(result.WaitQueueSize);
- if (result.BalancerProgress >= 0) {
- $('#balancerProgress').html(result.BalancerProgress + '%');
- } else {
- $('#balancerProgress').html('');
+ $('#balancerProgress').html(result.BalancerProgress);
+ $('#maxUsage').html(result.MaxUsage);
+ $('#scatter').html(result.Scatter);
+ $('#move_history > tbody > tr').remove();
+ for (var i in result.Moves) {
+ $(result.Moves[i]).appendTo('#move_history > tbody');
}
var old_nodes = {};
if (Empty) {
@@ -2004,6 +2050,7 @@ public:
}
NJson::TJsonValue jsonData;
+ THive::THiveStats stats = Self->GetStats();
jsonData["TotalTablets"] = tablets;
jsonData["RunningTablets"] = runningTablets;
@@ -2013,7 +2060,9 @@ public:
jsonData["ResourceVariance"] = GetResourceValuesText(Self->GetStDevResourceValues());//, [](double d) -> TString { return Sprintf("%.9f", d); });
jsonData["BootQueueSize"] = Self->BootQueue.BootQueue.size();
jsonData["WaitQueueSize"] = Self->BootQueue.WaitQueue.size();
- jsonData["BalancerProgress"] = Self->BalancerProgress;
+ jsonData["BalancerProgress"] = GetBalancerProgressText(Self->BalancerProgress, Self->LastBalancerTrigger);
+ jsonData["MaxUsage"] = GetColoredValue(stats.MaxUsage, Self->GetMaxNodeUsageToKick()) ;
+ jsonData["Scatter"] = GetColoredValue(stats.Scatter, Self->GetMinScatterToBalance());
TVector<TNodeInfo*> nodeInfos;
nodeInfos.reserve(Self->Nodes.size());
@@ -2085,6 +2134,12 @@ public:
jsonNode["ResourceValues"] = GetResourceValuesJson(node.ResourceValues, node.ResourceMaximumValues);
jsonNode["StDevResourceValues"] = GetResourceValuesText(node.GetStDevResourceValues());
}
+ NJson::TJsonValue& moves = jsonData["Moves"];
+ if (Self->TabletMoveHistory.TotalSize()) {
+ for (int i = Self->TabletMoveHistory.TotalSize() - 1; i >= (int)Self->TabletMoveHistory.FirstIndex(); --i) {
+ moves.AppendValue(Self->TabletMoveHistory[i].ToHTML());
+ }
+ }
NJson::WriteJson(&out, &jsonData);
}
};
@@ -2288,6 +2343,7 @@ public:
TTxType GetTxType() const override { return NHive::TXTYPE_MON_REBALANCE; }
bool Execute(TTransactionContext&, const TActorContext&) override {
+ Self->LastBalancerTrigger = EBalancerType::Manual;
Self->StartHiveBalancer(MaxMovements);
return true;
}
diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make
index 12034b12a0..de0262a0f7 100644
--- a/ydb/core/mind/hive/ya.make
+++ b/ydb/core/mind/hive/ya.make
@@ -80,6 +80,7 @@ SRCS(
PEERDIR(
library/cpp/actors/core
library/cpp/actors/interconnect
+ library/cpp/containers/ring_buffer
library/cpp/json
library/cpp/monlib/dynamic_counters
ydb/core/base