summary refs log tree commit diff stats
diff options
context:
space:
mode:
author vporyadke <[email protected]> 2024-07-17 10:43:03 +0300
committer GitHub <[email protected]> 2024-07-17 09:43:03 +0200
commit d4679736891d872b74e2ff7e892cf569a17840d5 (patch)
tree 6f770d13d182e0a9ffa34e343cee585075650231
parent e7d4e27ed1213f0cad5540329aa8d82421b054a7 (diff)
observability for tablet starts (#6584)
-rw-r--r-- ydb/core/mind/hive/hive_impl.cpp 8
-rw-r--r-- ydb/core/mind/hive/hive_impl.h 1
-rw-r--r-- ydb/core/mind/hive/tablet_info.h 1
-rw-r--r-- ydb/core/mind/hive/tx__start_tablet.cpp 8
-rw-r--r-- ydb/core/mind/hive/tx__update_tablet_status.cpp 8
-rw-r--r-- ydb/core/protos/counters_hive.proto 16
6 files changed, 42 insertions, 0 deletions
diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp
index a44a8b15488..aea0ab969b7 100644
--- a/ydb/core/mind/hive/hive_impl.cpp
+++ b/ydb/core/mind/hive/hive_impl.cpp
@@ -1688,6 +1688,14 @@ void THive::UpdateCounterNodesConnected(i64 nodesConnectedDiff) {
}
}
+void THive::UpdateCounterTabletsStarting(i64 tabletsStartingDiff) { // shifts the COUNTER_TABLETS_STARTING simple counter by a signed delta
+    if (TabletCounters == nullptr) {
+        return; // no counters object to update, so the call is a no-op
+    }
+    auto& startingGauge = TabletCounters->Simple()[NHive::COUNTER_TABLETS_STARTING];
+    startingGauge.Set(startingGauge.Get() + tabletsStartingDiff); // read-modify-write: current value plus the delta
+}
+
void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
TabletMoveHistory.PushBack(moveInfo);
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);
diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h
index 53d712ce964..bc8ece7586b 100644
--- a/ydb/core/mind/hive/hive_impl.h
+++ b/ydb/core/mind/hive/hive_impl.h
@@ -649,6 +649,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
void UpdateCounterBootQueueSize(ui64 bootQueueSize);
void UpdateCounterEventQueueSize(i64 eventQueueSizeDiff);
void UpdateCounterNodesConnected(i64 nodesConnectedDiff);
+ void UpdateCounterTabletsStarting(i64 tabletsStartingDiff); // adds the signed diff to the TabletsStarting simple counter; does nothing when TabletCounters is null
void RecordTabletMove(const TTabletMoveInfo& info);
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
void ProcessBootQueue();
diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h
index 35920dd1748..adbc141fe18 100644
--- a/ydb/core/mind/hive/tablet_info.h
+++ b/ydb/core/mind/hive/tablet_info.h
@@ -162,6 +162,7 @@ public:
TInstant PostponedStart;
EBalancerPolicy BalancerPolicy;
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
+ TInstant BootTime; // stamped in TTxStartTablet::Execute whenever the tablet is found; read on StatusOk to compute the start duration. NOTE(review): nothing visible in this change resets it - confirm a repeated StatusOk cannot decrement TabletsStarting twice
TTabletInfo(ETabletRole role, THive& hive);
TTabletInfo(const TTabletInfo&) = delete;
diff --git a/ydb/core/mind/hive/tx__start_tablet.cpp b/ydb/core/mind/hive/tx__start_tablet.cpp
index 068f9915432..034fd6a124b 100644
--- a/ydb/core/mind/hive/tx__start_tablet.cpp
+++ b/ydb/core/mind/hive/tx__start_tablet.cpp
@@ -10,6 +10,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
ui64 Cookie;
bool External;
TSideEffects SideEffects;
+ bool Success; // reset to false at the start of Execute, set to true on the branches that issue TEvLocal::TEvBootTablet; Complete increments TabletsStarting only when it is true
public:
TTxStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external, THive *hive)
@@ -23,10 +24,12 @@ public:
TTxType GetTxType() const override { return NHive::TXTYPE_START_TABLET; }
bool Execute(TTransactionContext& txc, const TActorContext&) override {
+ Success = false;
SideEffects.Reset(Self->SelfId());
BLOG_D("THive::TTxStartTablet::Execute Tablet " << TabletId);
TTabletInfo* tablet = Self->FindTablet(TabletId);
if (tablet != nullptr) {
+ tablet->BootTime = TActivationContext::Now();
// finish fast-move operation
if (tablet->LastNodeId != 0 && tablet->LastNodeId != Local.NodeId()) {
TNodeInfo* lastNode = Self->FindNode(tablet->LastNodeId);
@@ -65,6 +68,7 @@ public:
new TEvLocal::TEvBootTablet(*leader.TabletStorageInfo, promotableFollowerId, leader.KnownGeneration),
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
Cookie);
+ Success = true;
return true;
} else {
BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << leader.ToString() << ") - wrong state or node");
@@ -79,6 +83,7 @@ public:
new TEvLocal::TEvBootTablet(*follower.LeaderTablet.TabletStorageInfo, follower.Id),
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
Cookie);
+ Success = true;
return true;
} else {
BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << follower.ToString() << ") - wrong state or node");
@@ -108,6 +113,9 @@ public:
void Complete(const TActorContext& ctx) override {
BLOG_D("THive::TTxStartTablet::Complete Tablet " << TabletId << " SideEffects: " << SideEffects);
SideEffects.Complete(ctx);
+ if (Success) { // Success is true only when Execute reached a branch that issues TEvLocal::TEvBootTablet
+ Self->UpdateCounterTabletsStarting(+1); // NOTE(review): the only matching -1 visible in this change is on StatusOk in tx__update_tablet_status.cpp - confirm boots that never report StatusOk do not leave this counter inflated
+ }
}
};
diff --git a/ydb/core/mind/hive/tx__update_tablet_status.cpp b/ydb/core/mind/hive/tx__update_tablet_status.cpp
index 621978d5689..c4efbbb9b24 100644
--- a/ydb/core/mind/hive/tx__update_tablet_status.cpp
+++ b/ydb/core/mind/hive/tx__update_tablet_status.cpp
@@ -80,6 +80,14 @@ public:
if (Status == TEvLocal::TEvTabletStatus::StatusOk) {
tablet->Statistics.AddRestartTimestamp(now.MilliSeconds());
tablet->ActualizeTabletStatistics(now);
+ if (tablet->BootTime != TInstant()) {
+ TDuration startTime = now - tablet->BootTime;
+ if (startTime > TDuration::Seconds(30)) {
+ BLOG_W("Tablet " << tablet->GetFullTabletId() << " was starting for " << startTime.Seconds() << " seconds");
+ }
+ Self->TabletCounters->Percentile()[NHive::COUNTER_TABLETS_START_TIME].IncrementFor(startTime.MilliSeconds());
+ Self->UpdateCounterTabletsStarting(-1);
+ }
TNodeInfo* node = Self->FindNode(Local.NodeId());
if (node == nullptr) {
// event from IC about disconnection of the node could overtake events from the node itself because of Pipe Server
diff --git a/ydb/core/protos/counters_hive.proto b/ydb/core/protos/counters_hive.proto
index 980f9cdc0ab..cde6e2cfa80 100644
--- a/ydb/core/protos/counters_hive.proto
+++ b/ydb/core/protos/counters_hive.proto
@@ -29,6 +29,7 @@ enum ESimpleCounters {
COUNTER_IMBALANCED_OBJECTS = 19 [(CounterOpts) = {Name: "ImbalancedObjects"}];
COUNTER_WORST_OBJECT_VARIANCE = 20 [(CounterOpts) = {Name: "WorstObjectVariance"}];
COUNTER_STORAGE_SCATTER = 21 [(CounterOpts) = {Name: "StorageScatter"}];
+ COUNTER_TABLETS_STARTING = 22 [(CounterOpts) = {Name: "TabletsStarting"}];
}
enum ECumulativeCounters {
@@ -75,6 +76,21 @@ enum EPercentileCounters {
Ranges: { Value: 95 Name: "95%" },
Ranges: { Value: 100 Name: "100%" },
}];
+
+ COUNTER_TABLETS_START_TIME = 2 [(CounterOpts) = { // fed with startTime.MilliSeconds() on StatusOk; Range values below are presumably bucket bounds in milliseconds - TODO confirm
+ Name: "TabletsStartTimeMs",
+ Ranges: { Value: 1 }
+ Ranges: { Value: 5 }
+ Ranges: { Value: 10 }
+ Ranges: { Value: 50 }
+ Ranges: { Value: 100 }
+ Ranges: { Value: 500 }
+ Ranges: { Value: 1000 }
+ Ranges: { Value: 5000 }
+ Ranges: { Value: 10000 }
+ Ranges: { Value: 30000 }
+ Ranges: { Value: 60000 }
+ }];
}
enum ETxTypes {