From d4679736891d872b74e2ff7e892cf569a17840d5 Mon Sep 17 00:00:00 2001 From: vporyadke Date: Wed, 17 Jul 2024 10:43:03 +0300 Subject: observability for tablet starts (#6584) --- ydb/core/mind/hive/hive_impl.cpp | 8 ++++++++ ydb/core/mind/hive/hive_impl.h | 1 + ydb/core/mind/hive/tablet_info.h | 1 + ydb/core/mind/hive/tx__start_tablet.cpp | 8 ++++++++ ydb/core/mind/hive/tx__update_tablet_status.cpp | 8 ++++++++ ydb/core/protos/counters_hive.proto | 16 ++++++++++++++++ 6 files changed, 42 insertions(+) diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index a44a8b15488..aea0ab969b7 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -1688,6 +1688,14 @@ void THive::UpdateCounterNodesConnected(i64 nodesConnectedDiff) { } } +void THive::UpdateCounterTabletsStarting(i64 tabletsStartingDiff) { + if (TabletCounters != nullptr) { + auto& counter = TabletCounters->Simple()[NHive::COUNTER_TABLETS_STARTING]; + auto newValue = counter.Get() + tabletsStartingDiff; + counter.Set(newValue); + } +} + void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) { TabletMoveHistory.PushBack(moveInfo); TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1); diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 53d712ce964..bc8ece7586b 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -649,6 +649,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId void UpdateCounterBootQueueSize(ui64 bootQueueSize); void UpdateCounterEventQueueSize(i64 eventQueueSizeDiff); void UpdateCounterNodesConnected(i64 nodesConnectedDiff); + void UpdateCounterTabletsStarting(i64 tabletsStartingDiff); void RecordTabletMove(const TTabletMoveInfo& info); bool DomainHasNodes(const TSubDomainKey &domainKey) const; void ProcessBootQueue(); diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index 35920dd1748..adbc141fe18 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -162,6 +162,7 @@ public: TInstant PostponedStart; EBalancerPolicy BalancerPolicy; TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node + TInstant BootTime; TTabletInfo(ETabletRole role, THive& hive); TTabletInfo(const TTabletInfo&) = delete; diff --git a/ydb/core/mind/hive/tx__start_tablet.cpp b/ydb/core/mind/hive/tx__start_tablet.cpp index 068f9915432..034fd6a124b 100644 --- a/ydb/core/mind/hive/tx__start_tablet.cpp +++ b/ydb/core/mind/hive/tx__start_tablet.cpp @@ -10,6 +10,7 @@ class TTxStartTablet : public TTransactionBase { ui64 Cookie; bool External; TSideEffects SideEffects; + bool Success; public: TTxStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external, THive *hive) @@ -23,10 +24,12 @@ public: TTxType GetTxType() const override { return NHive::TXTYPE_START_TABLET; } bool Execute(TTransactionContext& txc, const TActorContext&) override { + Success = false; SideEffects.Reset(Self->SelfId()); BLOG_D("THive::TTxStartTablet::Execute Tablet " << TabletId); TTabletInfo* tablet = Self->FindTablet(TabletId); if (tablet != nullptr) { + tablet->BootTime = TActivationContext::Now(); // finish fast-move operation if (tablet->LastNodeId != 0 && tablet->LastNodeId != Local.NodeId()) { TNodeInfo* lastNode = Self->FindNode(tablet->LastNodeId); @@ -65,6 +68,7 @@ public: new TEvLocal::TEvBootTablet(*leader.TabletStorageInfo, promotableFollowerId, leader.KnownGeneration), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession, Cookie); + Success = true; return true; } else { BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << leader.ToString() << ") - wrong state or node"); @@ -79,6 +83,7 @@ public: new TEvLocal::TEvBootTablet(*follower.LeaderTablet.TabletStorageInfo, follower.Id), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession, Cookie); + Success = true; return true; } else { BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << follower.ToString() << ") - wrong state or node"); @@ -108,6 +113,9 @@ public: void Complete(const TActorContext& ctx) override { BLOG_D("THive::TTxStartTablet::Complete Tablet " << TabletId << " SideEffects: " << SideEffects); SideEffects.Complete(ctx); + if (Success) { + Self->UpdateCounterTabletsStarting(+1); + } } }; diff --git a/ydb/core/mind/hive/tx__update_tablet_status.cpp b/ydb/core/mind/hive/tx__update_tablet_status.cpp index 621978d5689..c4efbbb9b24 100644 --- a/ydb/core/mind/hive/tx__update_tablet_status.cpp +++ b/ydb/core/mind/hive/tx__update_tablet_status.cpp @@ -80,6 +80,14 @@ public: if (Status == TEvLocal::TEvTabletStatus::StatusOk) { tablet->Statistics.AddRestartTimestamp(now.MilliSeconds()); tablet->ActualizeTabletStatistics(now); + if (tablet->BootTime != TInstant()) { + TDuration startTime = now - tablet->BootTime; + if (startTime > TDuration::Seconds(30)) { + BLOG_W("Tablet " << tablet->GetFullTabletId() << " was starting for " << startTime.Seconds() << " seconds"); + } + Self->TabletCounters->Percentile()[NHive::COUNTER_TABLETS_START_TIME].IncrementFor(startTime.MilliSeconds()); + Self->UpdateCounterTabletsStarting(-1); + } TNodeInfo* node = Self->FindNode(Local.NodeId()); if (node == nullptr) { // event from IC about disconnection of the node could overtake events from the node itself because of Pipe Server diff --git a/ydb/core/protos/counters_hive.proto b/ydb/core/protos/counters_hive.proto index 980f9cdc0ab..cde6e2cfa80 100644 --- a/ydb/core/protos/counters_hive.proto +++ b/ydb/core/protos/counters_hive.proto @@ -29,6 +29,7 @@ enum ESimpleCounters { COUNTER_IMBALANCED_OBJECTS = 19 [(CounterOpts) = {Name: "ImbalancedObjects"}]; COUNTER_WORST_OBJECT_VARIANCE = 20 [(CounterOpts) = {Name: "WorstObjectVariance"}]; COUNTER_STORAGE_SCATTER = 21 [(CounterOpts) = {Name: "StorageScatter"}]; + COUNTER_TABLETS_STARTING = 22 [(CounterOpts) = {Name: "TabletsStarting"}]; } enum ECumulativeCounters { @@ -75,6 +76,21 @@ enum EPercentileCounters { Ranges: { Value: 95 Name: "95%" }, Ranges: { Value: 100 Name: "100%" }, }]; + + COUNTER_TABLETS_START_TIME = 2 [(CounterOpts) = { + Name: "TabletsStartTimeMs", + Ranges: { Value: 1 } + Ranges: { Value: 5 } + Ranges: { Value: 10 } + Ranges: { Value: 50 } + Ranges: { Value: 100 } + Ranges: { Value: 500 } + Ranges: { Value: 1000 } + Ranges: { Value: 5000 } + Ranges: { Value: 10000 } + Ranges: { Value: 30000 } + Ranges: { Value: 60000 } + }]; } enum ETxTypes { -- cgit v1.3