diff options
| author | risenberg <[email protected]> | 2026-06-26 20:23:22 +0300 |
|---|---|---|
| committer | GitHub <[email protected]> | 2026-06-26 20:23:22 +0300 |
| commit | 77ebda6da406e5958aedfebf40cb4546fc321b87 (patch) | |
| tree | a0bd26d5a3385bb3afe4e09f028ba050abc3ec3b | |
| parent | 3ccdf241c5642a335f685c52b4582ff293f76451 (diff) | |
Handle TEvWatchNotifyUnavailable in Init state by restarting the shard. (#44341)
5 files changed, 84 insertions, 1 deletions
diff --git a/ydb/core/tx/columnshard/columnshard_impl.cpp b/ydb/core/tx/columnshard/columnshard_impl.cpp index 886bd5ac1c3..c7da8286a4a 100644 --- a/ydb/core/tx/columnshard/columnshard_impl.cpp +++ b/ydb/core/tx/columnshard/columnshard_impl.cpp @@ -1642,6 +1642,7 @@ void TColumnShard::Enqueue(STFUNC_SIG) { HFunc(TEvPrivate::TEvNormalizerResult, Handle); HFunc(TEvPrivate::TEvAskTabletDataAccessors, Handle); HFunc(TEvTxProxySchemeCache::TEvWatchNotifyUpdated, Handle); + HFunc(TEvTxProxySchemeCache::TEvWatchNotifyUnavailable, Handle); default: AFL_WARN(NKikimrServices::TX_COLUMNSHARD)("event", "unexpected event in enqueue"); return NTabletFlatExecutor::TTabletExecutedFlat::Enqueue(ev); diff --git a/ydb/core/tx/columnshard/columnshard_impl.h b/ydb/core/tx/columnshard/columnshard_impl.h index 206c75c3ca1..be5c7766fda 100644 --- a/ydb/core/tx/columnshard/columnshard_impl.h +++ b/ydb/core/tx/columnshard/columnshard_impl.h @@ -321,6 +321,7 @@ class TColumnShard: public TActor<TColumnShard>, public NTabletFlatExecutor::TTa void Handle(NColumnShard::TEvPrivate::TEvAskTabletDataAccessors::TPtr& ev, const TActorContext& ctx); void Handle(NColumnShard::TEvPrivate::TEvAskColumnData::TPtr& ev, const TActorContext& ctx); void Handle(TEvTxProxySchemeCache::TEvWatchNotifyUpdated::TPtr& ev, const TActorContext& ctx); + void Handle(TEvTxProxySchemeCache::TEvWatchNotifyUnavailable::TPtr& ev, const TActorContext& ctx); void Handle(TEvDataShard::TEvCancelBackup::TPtr& ev, const TActorContext& ctx); void Handle(TEvDataShard::TEvCancelRestore::TPtr& ev, const TActorContext& ctx); @@ -494,6 +495,7 @@ protected: HFunc(NColumnShard::TEvPrivate::TEvAskTabletDataAccessors, Handle); HFunc(NColumnShard::TEvPrivate::TEvAskColumnData, Handle); HFunc(TEvTxProxySchemeCache::TEvWatchNotifyUpdated, Handle); + HFunc(TEvTxProxySchemeCache::TEvWatchNotifyUnavailable, Handle); HFunc(TEvColumnShard::TEvOverloadUnsubscribe, Handle); HFunc(NLongTxService::TEvLongTxService::TEvLockStatus, Handle); HFunc(TEvDataShard::TEvCancelBackup, Handle); diff --git a/ydb/core/tx/columnshard/columnshard_subdomain_path_id.cpp b/ydb/core/tx/columnshard/columnshard_subdomain_path_id.cpp index 3455c9b099a..d42f54cdf6d 100644 --- a/ydb/core/tx/columnshard/columnshard_subdomain_path_id.cpp +++ b/ydb/core/tx/columnshard/columnshard_subdomain_path_id.cpp @@ -98,6 +98,16 @@ void TColumnShard::Handle(TEvTxProxySchemeCache::TEvWatchNotifyUpdated::TPtr& ev Execute(new TTxPersistSubDomainOutOfSpace(this, outOfSpace), ctx); } +void TColumnShard::Handle(TEvTxProxySchemeCache::TEvWatchNotifyUnavailable::TPtr& ev, const TActorContext& ctx) { + const auto* msg = ev->Get(); + AFL_CRIT(NKikimrServices::TX_COLUMNSHARD)("event", "scheme shard unavailable, will restart to try again")("path_id", msg->PathId); + // This event may arrive while the tablet is still in StateInit, with init transactions + // in flight. HandlePoison detaches the executor first (so those transactions stop + // calling back into this object) and then dies - unlike a bare Die(), which would + // leave the executor running against freed memory. + HandlePoison(ctx); +} + static constexpr TDuration MaxFindSubDomainPathIdDelay = TDuration::Minutes(10); void TSpaceWatcher::StartFindSubDomainPathId(bool delayFirstRequest) { diff --git a/ydb/core/tx/columnshard/hooks/testing/controller.cpp b/ydb/core/tx/columnshard/hooks/testing/controller.cpp index d3838ac0062..9a90ba1f13d 100644 --- a/ydb/core/tx/columnshard/hooks/testing/controller.cpp +++ b/ydb/core/tx/columnshard/hooks/testing/controller.cpp @@ -88,7 +88,9 @@ void TController::DoOnTabletInitCompleted(const ::NKikimr::NColumnShard::TColumn void TController::DoOnTabletStopped(const ::NKikimr::NColumnShard::TColumnShard& shard) { TGuard<TMutex> g(Mutex); - AFL_VERIFY(ShardActuals.erase(shard.TabletID())); + // A shard may stop before init completes (e.g. it dies on TEvWatchNotifyUnavailable + // while still in StateInit), in which case it was never added to ShardActuals. + ShardActuals.erase(shard.TabletID()); } bool TController::IsTrivialLinks() const { diff --git a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp index cde600868d6..57eae9352f7 100644 --- a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp +++ b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp @@ -1559,6 +1559,74 @@ void TestReadAggregate(const std::vector<NArrow::NTest::TTestColumn>& ydbSchema, } // namespace +Y_UNIT_TEST_SUITE(TColumnShardInit) { + // Regression test for the "Unhandled StateInit event" crash: while a columnshard is + // still booting (StateInit), the scheme cache may answer the subdomain watch with + // TEvWatchNotifyUnavailable (e.g. the subdomain can't be resolved during a cluster + // version change). Before the fix this event was unhandled and fell through to + // TTabletExecutedFlat::Enqueue, which had Y_DEBUG_ABORT. So it crashed in debug mode + // and ran with a possibly inconsistent state in release. + // The current fix is to die and restart in loop until a proper init happens. + // + // Note: TTabletExecutedFlat::Enqueue only Y_DEBUG_ABORTs, which is a no-op in release + // builds, so without the fix the unhandled event is silently dropped there and the + // shard boots to StateWork as if nothing happened. Asserting "the tablet eventually + // becomes active" is therefore not enough - it holds with or without the fix in release. + // What the fix actually changes is that the shard's user part *dies and is re-created*: + // the bootstrapper boots a brand new user actor with a different TActorId. We assert on + // that. (We can't count TEvRestored events: the test tablet framework delivers two per + // physical boot, to the same user actor, so the count is 2 even with no restart.) + Y_UNIT_TEST(SubDomainUnavailableDuringInit) { + TTestBasicRuntime runtime; + TTester::Setup(runtime); + auto csControllerGuard = NKikimr::NYDBTest::TControllers::RegisterCSControllerGuard<TDefaultTestsController>(); + auto controller = NKikimr::NYDBTest::TControllers::GetControllerAs<NKikimr::NYDBTest::NColumnShard::TController>(); + + const ui64 tabletId = TTestTxConfig::TxTablet0; + + // Inject the unavailable notification exactly once, the moment the shard's user + // actor first appears (it is in StateInit until TTxInit finishes), reproducing the + // production ordering where the reply lands before the switch to StateWork. Track + // the set of distinct user actors that boot for this tablet: a real restart of the + // user part shows up as a second, different TActorId. + bool injected = false; + THashSet<TActorId> userActors; + runtime.SetObserverFunc([&](TAutoPtr<IEventHandle>& ev) { + if (ev->GetTypeRewrite() == TEvTablet::TEvRestored::EventType) { + const auto* msg = ev->Get<TEvTablet::TEvRestored>(); + if (msg->TabletID == tabletId && !msg->Follower) { + userActors.insert(msg->UserTabletActor); + if (!injected) { + injected = true; + runtime.Send(new IEventHandle(msg->UserTabletActor, TActorId(), + new TEvTxProxySchemeCache::TEvWatchNotifyUnavailable(0, "/Root", TPathId(1, 1))), 0, true); + } + } + } + return TTestActorRuntime::EEventAction::PROCESS; + }); + + CreateTestBootstrapper(runtime, CreateTestTabletInfo(tabletId, TTabletTypes::ColumnShard), &CreateColumnShard); + + // With the fix the shard dies on the injected event and the bootstrapper restarts + // it; the restart is not re-injected (guarded by `injected`) and reaches StateWork. + while (!controller->IsActiveTablet(tabletId)) { + runtime.SimulateSleep(TDuration::Seconds(1)); + } + UNIT_ASSERT(injected); + + // The fix's contract: the shard's user part must have died on the injected event and + // been re-created by the bootstrapper, so a second distinct user actor exists. + // Without the fix the event is dropped and the original user actor reaches StateWork + // untouched, so there is only ever one - and this fails. + UNIT_ASSERT_GE_C(userActors.size(), 2, "shard did not restart after TEvWatchNotifyUnavailable (only one user actor booted)"); + + // The recovered shard is fully functional. + TActorId sender = runtime.AllocateEdgeActor(); + Y_UNUSED(SetupSchema(runtime, sender, 1, TestTableDescription{})); + } +} + Y_UNIT_TEST_SUITE(EvWrite) { Y_UNIT_TEST(WriteInTransaction) { using namespace NArrow; |
