aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorzalyalov <zalyalov@yandex-team.com>2023-11-08 13:44:09 +0300
committerzalyalov <zalyalov@yandex-team.com>2023-11-08 15:12:38 +0300
commit266288f7033d1afaea8e6ed1da8295cff41cf579 (patch)
tree29f16566288e51c5953a904e1c9727b985fb5176
parent52e23495a9390cac8f528198780a559df1cb5ea3 (diff)
downloadydb-266288f7033d1afaea8e6ed1da8295cff41cf579.tar.gz
write batch logs about tablet moves
-rw-r--r--ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt1
-rw-r--r--ydb/core/mind/hive/CMakeLists.linux-aarch64.txt1
-rw-r--r--ydb/core/mind/hive/CMakeLists.linux-x86_64.txt1
-rw-r--r--ydb/core/mind/hive/CMakeLists.windows-x86_64.txt1
-rw-r--r--ydb/core/mind/hive/balancer.cpp2
-rw-r--r--ydb/core/mind/hive/drain.cpp2
-rw-r--r--ydb/core/mind/hive/fill.cpp2
-rw-r--r--ydb/core/mind/hive/hive_events.h3
-rw-r--r--ydb/core/mind/hive/hive_impl.cpp48
-rw-r--r--ydb/core/mind/hive/hive_impl.h21
-rw-r--r--ydb/core/mind/hive/tablet_move_info.cpp35
-rw-r--r--ydb/core/mind/hive/ya.make1
12 files changed, 109 insertions, 9 deletions
diff --git a/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt b/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt
index 245396fdd2..fef7d8ca01 100644
--- a/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt
+++ b/ydb/core/mind/hive/CMakeLists.darwin-x86_64.txt
@@ -46,6 +46,7 @@ target_sources(core-mind-hive PRIVATE
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/storage_group_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/storage_pool_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tablet_info.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tablet_move_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__adopt_tablet.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__block_storage_result.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__configure_subdomain.cpp
diff --git a/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt b/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt
index df4f5c4075..b6a5f9928f 100644
--- a/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt
+++ b/ydb/core/mind/hive/CMakeLists.linux-aarch64.txt
@@ -47,6 +47,7 @@ target_sources(core-mind-hive PRIVATE
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/storage_group_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/storage_pool_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tablet_info.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tablet_move_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__adopt_tablet.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__block_storage_result.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__configure_subdomain.cpp
diff --git a/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt b/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt
index df4f5c4075..b6a5f9928f 100644
--- a/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt
+++ b/ydb/core/mind/hive/CMakeLists.linux-x86_64.txt
@@ -47,6 +47,7 @@ target_sources(core-mind-hive PRIVATE
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/storage_group_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/storage_pool_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tablet_info.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tablet_move_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__adopt_tablet.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__block_storage_result.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__configure_subdomain.cpp
diff --git a/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt b/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt
index 245396fdd2..fef7d8ca01 100644
--- a/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt
+++ b/ydb/core/mind/hive/CMakeLists.windows-x86_64.txt
@@ -46,6 +46,7 @@ target_sources(core-mind-hive PRIVATE
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/storage_group_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/storage_pool_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tablet_info.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tablet_move_info.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__adopt_tablet.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__block_storage_result.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/hive/tx__configure_subdomain.cpp
diff --git a/ydb/core/mind/hive/balancer.cpp b/ydb/core/mind/hive/balancer.cpp
index 6ec197906e..cf8753afbf 100644
--- a/ydb/core/mind/hive/balancer.cpp
+++ b/ydb/core/mind/hive/balancer.cpp
@@ -264,7 +264,7 @@ protected:
BLOG_D("Balancer moving tablet " << tablet->ToString() << " " << tablet->GetResourceValues()
<< " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues
<< " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues);
- Hive->RecordTabletMove({now, tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id});
+ Hive->RecordTabletMove(THive::TTabletMoveInfo(now, *tablet, tablet->Node->Id, result.BestNode->Id));
Hive->Execute(Hive->CreateRestartTablet(tablet->GetFullTabletId(), result.BestNode->Id));
UpdateProgress();
if (!CanKickNextTablet()) {
diff --git a/ydb/core/mind/hive/drain.cpp b/ydb/core/mind/hive/drain.cpp
index 3583a91efd..55ed697c95 100644
--- a/ydb/core/mind/hive/drain.cpp
+++ b/ydb/core/mind/hive/drain.cpp
@@ -70,7 +70,7 @@ protected:
<< " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues
<< " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues);
Hive->TabletCounters->Cumulative()[NHive::COUNTER_DRAIN_EXECUTED].Increment(1);
- Hive->RecordTabletMove({TInstant::Now(), tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id});
+ Hive->RecordTabletMove(THive::TTabletMoveInfo(TInstant::Now(), *tablet, tablet->Node->Id, result.BestNode->Id));
Hive->Execute(Hive->CreateRestartTablet(tabletId, result.BestNode->Id));
} else {
Hive->TabletCounters->Cumulative()[NHive::COUNTER_DRAIN_FAILED].Increment(1);
diff --git a/ydb/core/mind/hive/fill.cpp b/ydb/core/mind/hive/fill.cpp
index 383c856636..404e490fcb 100644
--- a/ydb/core/mind/hive/fill.cpp
+++ b/ydb/core/mind/hive/fill.cpp
@@ -56,7 +56,7 @@ protected:
<< " from node " << tablet->Node->Id << " " << tablet->Node->ResourceValues
<< " to node " << result.BestNode->Id << " " << result.BestNode->ResourceValues);
Hive->TabletCounters->Cumulative()[NHive::COUNTER_FILL_EXECUTED].Increment(1);
- Hive->RecordTabletMove({TInstant::Now(), tablet->GetFullTabletId(), tablet->Node->Id, result.BestNode->Id});
+ Hive->RecordTabletMove(THive::TTabletMoveInfo(TInstant::Now(), *tablet, tablet->Node->Id, result.BestNode->Id));
Hive->Execute(Hive->CreateRestartTablet(tablet->GetFullTabletId(), result.BestNode->Id), ctx);
}
}
diff --git a/ydb/core/mind/hive/hive_events.h b/ydb/core/mind/hive/hive_events.h
index 8a74a533b9..ce14763450 100644
--- a/ydb/core/mind/hive/hive_events.h
+++ b/ydb/core/mind/hive/hive_events.h
@@ -26,6 +26,7 @@ struct TEvPrivate {
EvBalancerOut,
EvProcessIncomingEvent,
EvRefreshStorageInfo,
+ EvLogTabletMoves,
EvEnd
};
@@ -87,6 +88,8 @@ struct TEvPrivate {
struct TEvProcessIncomingEvent : TEventLocal<TEvProcessIncomingEvent, EvProcessIncomingEvent> {};
struct TEvRefreshStorageInfo : TEventLocal<TEvRefreshStorageInfo, EvRefreshStorageInfo> {};
+
+ struct TEvLogTabletMoves : TEventLocal<TEvLogTabletMoves, EvLogTabletMoves> {};
};
} // NHive
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp
index ba44de4d72..2034028f48 100644
--- a/ydb/core/mind/hive/hive_impl.cpp
+++ b/ydb/core/mind/hive/hive_impl.cpp
@@ -1603,6 +1603,20 @@ void THive::UpdateCounterNodesConnected(i64 nodesConnectedDiff) {
+// Registers one tablet move: appends it to the recent-move history, bumps the
+// moved-tablets counter, and accounts for it in the next batched log entry.
void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
TabletMoveHistory.PushBack(moveInfo);
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);
+    // TabletMoveSamplesForLog is a min-heap by Priority (hence std::greater), so front()
+    // is the lowest-priority retained sample, i.e. the next one to be evicted.
+    if (TabletMoveSamplesForLog.size() < MOVE_SAMPLES_PER_LOG_ENTRY) {
+        TabletMoveSamplesForLog.push_back(moveInfo);
+        std::push_heap(TabletMoveSamplesForLog.begin(), TabletMoveSamplesForLog.end(), std::greater<TTabletMoveInfo>{});
+    } else if (moveInfo.Priority > TabletMoveSamplesForLog.front().Priority) {
+        // Replace the lowest-priority sample. std::pop_heap requires the whole range to be
+        // a valid heap, so evict first and only then sift the new element in; appending
+        // before pop_heap breaks that precondition and may corrupt the heap.
+        std::pop_heap(TabletMoveSamplesForLog.begin(), TabletMoveSamplesForLog.end(), std::greater<TTabletMoveInfo>{});
+        TabletMoveSamplesForLog.back() = moveInfo;
+        std::push_heap(TabletMoveSamplesForLog.begin(), TabletMoveSamplesForLog.end(), std::greater<TTabletMoveInfo>{});
+    }
+    TabletMovesByTypeForLog[moveInfo.TabletType]++;
+    // Only one TEvLogTabletMoves is kept in flight per batch; the first move of a batch
+    // schedules it and records the batch start time for the log message.
+    if (!LogTabletMovesScheduled) {
+        LogTabletMovesScheduled = true;
+        LogTabletMovesSchedulingTime = moveInfo.Timestamp;
+        Schedule(TDuration::Minutes(5), new TEvPrivate::TEvLogTabletMoves());
+    }
}
bool THive::DomainHasNodes(const TSubDomainKey &domainKey) const {
@@ -2793,6 +2807,7 @@ void THive::ProcessEvent(std::unique_ptr<IEventHandle> event) {
hFunc(TEvPrivate::TEvBalancerOut, Handle);
hFunc(TEvHive::TEvUpdateTabletsObject, Handle);
hFunc(TEvPrivate::TEvRefreshStorageInfo, Handle);
+ hFunc(TEvPrivate::TEvLogTabletMoves, Handle);
}
}
@@ -2889,6 +2904,7 @@ STFUNC(THive::StateWork) {
fFunc(TEvPrivate::TEvBalancerOut::EventType, EnqueueIncomingEvent);
fFunc(TEvHive::TEvUpdateTabletsObject::EventType, EnqueueIncomingEvent);
fFunc(TEvPrivate::TEvRefreshStorageInfo::EventType, EnqueueIncomingEvent);
+ fFunc(TEvPrivate::TEvLogTabletMoves::EventType, EnqueueIncomingEvent);
hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle);
default:
if (!HandleDefaultEvents(ev, SelfId())) {
@@ -3121,6 +3137,38 @@ void THive::Handle(TEvPrivate::TEvRefreshStorageInfo::TPtr&) {
RequestPoolsInformation();
}
+// Emits the batched tablet-move log entry: one summary line with per-type counts,
+// followed by one line per retained sample, then resets the accumulated batch state.
+void THive::Handle(TEvPrivate::TEvLogTabletMoves::TPtr&) {
+    LogTabletMovesScheduled = false;
+    if (TabletMovesByTypeForLog.empty()) {
+        return;
+    }
+    // Samples are stored in heap order; present them chronologically in the log.
+    const auto byTimestamp = [](const TTabletMoveInfo& lhs, const TTabletMoveInfo& rhs) {
+        return lhs.Timestamp < rhs.Timestamp;
+    };
+    std::sort(TabletMoveSamplesForLog.begin(), TabletMoveSamplesForLog.end(), byTimestamp);
+    ui64 totalMoves = 0;
+    TStringBuilder perTypeSummary;
+    for (const auto& entry : TabletMovesByTypeForLog) {
+        if (!perTypeSummary.empty()) {
+            perTypeSummary << ", ";
+        }
+        perTypeSummary << entry.second << "x " << TTabletTypes::TypeToStr(entry.first);
+        totalMoves += entry.second;
+    }
+    BLOG_I("Made " << totalMoves << " tablet moves (" << perTypeSummary << ") since " << LogTabletMovesSchedulingTime << ", including:");
+    for (const TTabletMoveInfo& sample : TabletMoveSamplesForLog) {
+        // Prefer the tablet's own description; fall back to the bare id if it is no longer known.
+        auto tablet = FindTablet(sample.Tablet);
+        BLOG_I("tablet " << (tablet ? tablet->ToString() : ToString(sample.Tablet))
+            << " from node " << sample.From
+            << " to node " << sample.To
+            << " at " << sample.Timestamp);
+    }
+    TabletMoveSamplesForLog.clear();
+    TabletMovesByTypeForLog.clear();
+}
+
TVector<TNodeId> THive::GetNodesForWhiteboardBroadcast(size_t maxNodesToReturn) {
TVector<TNodeId> nodes;
TNodeId selfNodeId = SelfId().NodeId();
diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h
index 3d96081846..c3bc3da59b 100644
--- a/ydb/core/mind/hive/hive_impl.h
+++ b/ydb/core/mind/hive/hive_impl.h
@@ -382,6 +382,7 @@ protected:
bool ProcessTabletBalancerScheduled = false;
bool ProcessTabletBalancerPostponed = false;
bool ProcessPendingOperationsScheduled = false;
+ bool LogTabletMovesScheduled = false;
TResourceRawValues TotalRawResourceValues = {};
TResourceNormalizedValues TotalNormalizedResourceValues = {};
TInstant LastResourceChangeReaction;
@@ -436,16 +437,23 @@ protected:
TFullTabletId Tablet;
TNodeId From;
TNodeId To;
+ double Priority;
+ TTabletTypes::EType TabletType;
- TString ToHTML() {
- TStringBuilder str;
- str << "<tr><td>" << Timestamp << "</td><td>" << Tablet
- << "</td><td>" << From << "&rarr;" << To << "</td><tr>";
- return str;
- }
+
+ TTabletMoveInfo(TInstant timestamp, const TTabletInfo& tablet, TNodeId from, TNodeId to);
+
+ TString ToHTML() const;
+
+ std::weak_ordering operator<=>(const TTabletMoveInfo& other) const;
};
TStaticRingBuffer<TTabletMoveInfo, 5> TabletMoveHistory;
+ std::vector<TTabletMoveInfo> TabletMoveSamplesForLog; // stores (at most) MOVE_SAMPLES_PER_LOG_ENTRY highest priority moves in a heap
+ static constexpr size_t MOVE_SAMPLES_PER_LOG_ENTRY = 10;
+ std::unordered_map<TTabletTypes::EType, ui64> TabletMovesByTypeForLog;
+ TInstant LogTabletMovesSchedulingTime;
+
// to be removed later
bool TabletOwnersSynced = false;
@@ -537,6 +545,7 @@ protected:
void Handle(TEvHive::TEvTabletOwnersReply::TPtr& ev);
void Handle(TEvHive::TEvUpdateTabletsObject::TPtr& ev);
void Handle(TEvPrivate::TEvRefreshStorageInfo::TPtr& ev);
+ void Handle(TEvPrivate::TEvLogTabletMoves::TPtr& ev);
void Handle(TEvPrivate::TEvProcessIncomingEvent::TPtr& ev);
protected:
diff --git a/ydb/core/mind/hive/tablet_move_info.cpp b/ydb/core/mind/hive/tablet_move_info.cpp
new file mode 100644
index 0000000000..ca2c290b17
--- /dev/null
+++ b/ydb/core/mind/hive/tablet_move_info.cpp
@@ -0,0 +1,35 @@
+#include "hive_impl.h"
+
+#include <random>
+
+namespace NKikimr {
+namespace NHive {
+
+// Captures a single tablet move (when, which tablet, from/to nodes, tablet type)
+// and assigns it a sampling priority.
+THive::TTabletMoveInfo::TTabletMoveInfo(TInstant timestamp, const TTabletInfo& tablet, TNodeId from, TNodeId to)
+    : Timestamp(timestamp)
+    , Tablet(tablet.GetFullTabletId())
+    , From(from)
+    , To(to)
+    , TabletType(tablet.GetTabletType())
+{
+    // The base priority is a uniform random draw, so keeping the highest priorities
+    // yields a random sample of moves; system tablets get +1 to be prioritised.
+    auto& generator = *TAppData::RandomProvider.Get();
+    const double randomPart = std::uniform_real_distribution{}(generator);
+    if (THive::IsSystemTablet(TabletType)) {
+        Priority = randomPart + 1;
+    } else {
+        Priority = randomPart;
+    }
+}
+
+// Renders this move as one HTML table row with three cells:
+// timestamp, tablet id, and "from -> to" node ids.
+TString THive::TTabletMoveInfo::ToHTML() const {
+    TStringBuilder str;
+    // The row must end with a closing </tr>; a second opening <tr> leaves the row unterminated.
+    str << "<tr><td>" << Timestamp << "</td><td>" << Tablet
+        << "</td><td>" << From << "&rarr;" << To << "</td></tr>";
+    return str;
+}
+
+// Orders move records by sampling priority only; all other fields are ignored.
+std::weak_ordering THive::TTabletMoveInfo::operator<=>(const TTabletMoveInfo& other) const {
+    const double mine = Priority;
+    const double theirs = other.Priority;
+    return std::weak_order(mine, theirs);
+}
+
+}
+}
diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make
index d4ac1125b9..c4da22d45f 100644
--- a/ydb/core/mind/hive/ya.make
+++ b/ydb/core/mind/hive/ya.make
@@ -38,6 +38,7 @@ SRCS(
storage_pool_info.h
tablet_info.cpp
tablet_info.h
+ tablet_move_info.cpp
tx__adopt_tablet.cpp
tx__block_storage_result.cpp
tx__configure_subdomain.cpp