diff options
author | Aleksei Borzenkov <snaury@ydb.tech> | 2025-03-14 12:53:47 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-03-14 12:53:47 +0300 |
commit | bba21cb0db2e074cac5ac4211b78c0852a2b67fe (patch) | |
tree | 9379fade33f154cddbc156b540f121cb54e0ae01 | |
parent | 1b4e25c3bd89af549e479fb6df968d7d29fcafd4 (diff) | |
download | ydb-bba21cb0db2e074cac5ac4211b78c0852a2b67fe.tar.gz |
Support processing unhandled exceptions in actors and tablets (#15468)
-rw-r--r-- | ydb/core/driver_lib/run/main.cpp | 4 | ||||
-rw-r--r-- | ydb/core/protos/feature_flags.proto | 1 | ||||
-rw-r--r-- | ydb/core/tablet_flat/flat_executor.cpp | 19 | ||||
-rw-r--r-- | ydb/core/tablet_flat/flat_executor.h | 5 | ||||
-rw-r--r-- | ydb/core/tablet_flat/flat_executor_ut.cpp | 38 | ||||
-rw-r--r-- | ydb/core/tablet_flat/tablet_flat_executed.cpp | 29 | ||||
-rw-r--r-- | ydb/core/tablet_flat/tablet_flat_executed.h | 7 | ||||
-rw-r--r-- | ydb/core/tablet_flat/tablet_flat_executor.h | 2 | ||||
-rw-r--r-- | ydb/core/tablet_flat/test/libs/exec/dummy.h | 6 | ||||
-rw-r--r-- | ydb/core/testlib/actors/test_runtime.cpp | 5 | ||||
-rw-r--r-- | ydb/library/actors/core/actor.cpp | 20 | ||||
-rw-r--r-- | ydb/library/actors/core/actor.h | 28 | ||||
-rw-r--r-- | ydb/library/actors/core/ut/actor_exception_ut.cpp | 105 | ||||
-rw-r--r-- | ydb/library/actors/core/ut/ya.make | 1 |
14 files changed, 249 insertions, 21 deletions
diff --git a/ydb/core/driver_lib/run/main.cpp b/ydb/core/driver_lib/run/main.cpp index 00be7c4568..ffb54aca93 100644 --- a/ydb/core/driver_lib/run/main.cpp +++ b/ydb/core/driver_lib/run/main.cpp @@ -178,6 +178,10 @@ std::terminate_handler defaultTerminateHandler; void KikimrTerminateHandler() { Cerr << "======= terminate() call stack ========\n"; FormatBackTrace(&Cerr); + if (auto backtrace = TBackTrace::FromCurrentException(); backtrace.size() > 0) { + Cerr << "======== exception call stack =========\n"; + backtrace.PrintTo(Cerr); + } Cerr << "=======================================\n"; auto oldHandler = defaultTerminateHandler; diff --git a/ydb/core/protos/feature_flags.proto b/ydb/core/protos/feature_flags.proto index b5912d922c..82ca987893 100644 --- a/ydb/core/protos/feature_flags.proto +++ b/ydb/core/protos/feature_flags.proto @@ -199,4 +199,5 @@ message TFeatureFlags { optional bool EnableShowCreate = 173 [default = false]; optional bool EnableChangefeedsExport = 174 [default = false]; optional bool EnableKafkaNativeBalancing = 175 [default = false]; + optional bool EnableTabletRestartOnUnhandledExceptions = 176 [default = true]; } diff --git a/ydb/core/tablet_flat/flat_executor.cpp b/ydb/core/tablet_flat/flat_executor.cpp index 29a85754f0..b62bd22174 100644 --- a/ydb/core/tablet_flat/flat_executor.cpp +++ b/ydb/core/tablet_flat/flat_executor.cpp @@ -139,6 +139,20 @@ TExecutor::~TExecutor() { } +bool TExecutor::OnUnhandledException(const std::exception& e) { + if (AppData()->FeatureFlags.GetEnableTabletRestartOnUnhandledExceptions()) { + if (auto log = Logger->Log(ELnLev::Crit)) { + log << "Tablet " << TabletId() << " unhandled exception " << TypeName(e) << ": " << e.what() + << '\n' << TBackTrace::FromCurrentException().PrintToString(); + } + Broken(); + return true; + } + + // Exception will propagate and cause the process to crash + return false; +} + ui64 TExecutor::Stamp() const noexcept { return CommitManager ? CommitManager->Stamp() : TTxStamp{ Generation0, Step0 }.Raw; @@ -168,6 +182,7 @@ void TExecutor::Registered(TActorSystem *sys, const TActorId&) GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_req_nodata", true); GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_scan_nodata", true); GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_boot_nodata", true); + GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_broken", true); } void TExecutor::PassAway() { @@ -195,6 +210,8 @@ void TExecutor::PassAway() { } void TExecutor::Broken() { + GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_broken", true)->Inc(); + if (BootLogic) BootLogic->Cancel(); @@ -888,7 +905,7 @@ void TExecutor::Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext & return TranscriptBootOpResult(res, ctx); } -void TExecutor::DetachTablet(const TActorContext &) { +void TExecutor::DetachTablet() { TabletCountersForgetTablet(Owner->TabletID(), Owner->TabletType(), Owner->Info()->TenantPathId, Stats->IsFollower(), SelfId()); return PassAway(); diff --git a/ydb/core/tablet_flat/flat_executor.h b/ydb/core/tablet_flat/flat_executor.h index ea562e30bb..fc0f3deaf7 100644 --- a/ydb/core/tablet_flat/flat_executor.h +++ b/ydb/core/tablet_flat/flat_executor.h @@ -312,6 +312,7 @@ struct TExecutorCaches { class TExecutor : public TActor<TExecutor> , public NFlatExecutorSetup::IExecutor + , public IActorExceptionHandler , private NTable::ICompactionBackend , private ILoadBlob { @@ -633,7 +634,7 @@ public: // IExecutor interface void Boot(TEvTablet::TEvBoot::TPtr &ev, const TActorContext &ctx) override; void Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext &ctx) override; - void DetachTablet(const TActorContext &ctx) override; + void DetachTablet() override; ui64 DoExecute(TAutoPtr<ITransaction> transaction, ETxMode mode); void Execute(TAutoPtr<ITransaction> transaction, const TActorContext &ctx) override; ui64 Enqueue(TAutoPtr<ITransaction> transaction) override; @@ -705,6 +706,8 @@ public: TExecutor(NFlatExecutorSetup::ITablet *owner, const TActorId& ownerActorId); ~TExecutor(); + bool OnUnhandledException(const std::exception&) override; + STFUNC(StateInit); STFUNC(StateBoot); STFUNC(StateWork); diff --git a/ydb/core/tablet_flat/flat_executor_ut.cpp b/ydb/core/tablet_flat/flat_executor_ut.cpp index 01af27dbe3..037e7a3059 100644 --- a/ydb/core/tablet_flat/flat_executor_ut.cpp +++ b/ydb/core/tablet_flat/flat_executor_ut.cpp @@ -523,7 +523,7 @@ class TTestFlatTablet : public TActor<TTestFlatTablet>, public TTabletExecutedFl void Handle(TEvents::TEvPoison::TPtr &, const TActorContext &ctx) { Become(&TThis::StateBroken); - Executor()->DetachTablet(ctx), Detach(ctx); /* see TDummy tablet */ + Executor()->DetachTablet(), Detach(ctx); /* see TDummy tablet */ ctx.Send(Sender, new TEvents::TEvGone); } @@ -7354,5 +7354,41 @@ Y_UNIT_TEST_SUITE(TFlatTableExecutor_LowPriorityTxs) { } } +Y_UNIT_TEST_SUITE(TFlatTableExecutor_Exceptions) { + struct TTxExecuteThrowException : public ITransaction { + bool Execute(TTransactionContext&, const TActorContext&) override { + throw std::runtime_error("test"); + } + + void Complete(const TActorContext&) override { + // not reached + } + }; + + Y_UNIT_TEST(TestTabletExecuteExceptionDirect) { + TMyEnvBase env; + env->GetAppData().FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(true); + + env.FireDummyTablet(); + + env.SendAsync(new NFake::TEvExecute([&](auto* x, const auto& ctx) { + x->Execute(new TTxExecuteThrowException, ctx); + })); + env.WaitForGone(); + } + + Y_UNIT_TEST(TestTabletExecuteExceptionEnqueue) { + TMyEnvBase env; + env->GetAppData().FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(true); + + env.FireDummyTablet(); + + env.SendAsync(new NFake::TEvExecute([&](auto* x, const auto&) { + x->Enqueue(new TTxExecuteThrowException); + })); + env.WaitForGone(); + } +} + } } diff --git a/ydb/core/tablet_flat/tablet_flat_executed.cpp b/ydb/core/tablet_flat/tablet_flat_executed.cpp index 4358b987d9..7e325c4274 100644 --- a/ydb/core/tablet_flat/tablet_flat_executed.cpp +++ b/ydb/core/tablet_flat/tablet_flat_executed.cpp @@ -2,6 +2,7 @@ #include "flat_executor.h" #include "flat_executor_counters.h" #include <ydb/core/base/appdata.h> +#include <ydb/core/base/counters.h> #include <library/cpp/monlib/service/pages/templates.h> namespace NKikimr { @@ -16,6 +17,28 @@ TTabletExecutedFlat::TTabletExecutedFlat(TTabletStorageInfo *info, const TActorI , StartTime0(TAppData::TimeProvider->Now()) {} +bool TTabletExecutedFlat::OnUnhandledException(const std::exception& e) { + if (AppData()->FeatureFlags.GetEnableTabletRestartOnUnhandledExceptions()) { + // Tablets have a weird inheritence where subclass is always an actor, + // but we don't know the exact type at compile time. This dynamic_cast + // is expected to always succeed. + if (auto* actor = dynamic_cast<IActor*>(this)) { + auto ctx = TActivationContext::ActorContextFor(actor->SelfId()); + LOG_CRIT_S(*TlsActivationContext, NKikimrServices::TABLET_EXECUTOR, + "Tablet " << TabletID() << " unhandled exception " << TypeName(e) << ": " << e.what() + << '\n' << TBackTrace::FromCurrentException().PrintToString()); + + GetServiceCounters(AppData(ctx)->Counters, "tablets")->GetCounter("alerts_broken", true)->Inc(); + + HandlePoison(ctx); + return true; + } + } + + // Exception will propagate and cause the process to crash + return false; +} + IExecutor* TTabletExecutedFlat::CreateExecutor(const TActorContext &ctx) { if (!Executor()) { IActor *executor = NFlatExecutorSetup::CreateExecutor(this, ctx.SelfID); @@ -123,9 +146,9 @@ void TTabletExecutedFlat::OnTabletStop(TEvTablet::TEvTabletStop::TPtr &ev, const ctx.Send(Tablet(), new TEvTablet::TEvTabletStopped()); } -void TTabletExecutedFlat::HandlePoison(const TActorContext &ctx) { +void TTabletExecutedFlat::HandlePoison(const TActorContext& ctx) { if (Executor0) { - Executor0->DetachTablet(ExecutorCtx(ctx)); + Executor0->DetachTablet(); Executor0 = nullptr; } @@ -142,7 +165,7 @@ void TTabletExecutedFlat::HandleTabletStop(TEvTablet::TEvTabletStop::TPtr &ev, c void TTabletExecutedFlat::HandleTabletDead(TEvTablet::TEvTabletDead::TPtr &ev, const TActorContext &ctx) { if (Executor0) { - Executor0->DetachTablet(ExecutorCtx(ctx)); + Executor0->DetachTablet(); Executor0 = nullptr; } diff --git a/ydb/core/tablet_flat/tablet_flat_executed.h b/ydb/core/tablet_flat/tablet_flat_executed.h index fb45ea880c..edc0d984bc 100644 --- a/ydb/core/tablet_flat/tablet_flat_executed.h +++ b/ydb/core/tablet_flat/tablet_flat_executed.h @@ -16,7 +16,10 @@ struct IMiniKQLFactory { virtual TAutoPtr<ITransaction> Make(TEvTablet::TEvLocalReadColumns::TPtr&) = 0; }; -class TTabletExecutedFlat : public NFlatExecutorSetup::ITablet { +class TTabletExecutedFlat + : public NFlatExecutorSetup::ITablet + , public IActorExceptionHandler +{ protected: using IExecutor = NFlatExecutorSetup::IExecutor; @@ -24,6 +27,8 @@ protected: IExecutor* Executor() const { return Executor0; } const TInstant StartTime() const { return StartTime0; } + bool OnUnhandledException(const std::exception&) override; + void Execute(TAutoPtr<ITransaction> transaction, const TActorContext &ctx); void Execute(TAutoPtr<ITransaction> transaction); ui64 Enqueue(TAutoPtr<ITransaction> transaction); diff --git a/ydb/core/tablet_flat/tablet_flat_executor.h b/ydb/core/tablet_flat/tablet_flat_executor.h index 7024004140..edac195ea2 100644 --- a/ydb/core/tablet_flat/tablet_flat_executor.h +++ b/ydb/core/tablet_flat/tablet_flat_executor.h @@ -550,7 +550,7 @@ namespace NFlatExecutorSetup { // tablet generation restoration complete, tablet could act as leader virtual void Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext &ctx) = 0; // die! - virtual void DetachTablet(const TActorContext &ctx) = 0; + virtual void DetachTablet() = 0; // tablet assigned as follower (or follower connection refreshed), must begin loading virtual void FollowerBoot(TEvTablet::TEvFBoot::TPtr &ev, const TActorContext &ctx) = 0; diff --git a/ydb/core/tablet_flat/test/libs/exec/dummy.h b/ydb/core/tablet_flat/test/libs/exec/dummy.h index 8df6036401..f018900a91 100644 --- a/ydb/core/tablet_flat/test/libs/exec/dummy.h +++ b/ydb/core/tablet_flat/test/libs/exec/dummy.h @@ -15,7 +15,7 @@ namespace NFake { virtual NFake::TEvExecute* OnFinished() = 0; }; - class TDummy : public ::NActors::IActorCallback, public TExecuted { + class TDummy : public TActor<TDummy>, public TExecuted { enum EState { Boot = 1, Work = 2, @@ -35,7 +35,7 @@ namespace NFake { TDummy(const TActorId &tablet, TInfo *info, const TActorId& owner, ui32 flags = 0 /* ORed EFlg enum */) - : ::NActors::IActorCallback(static_cast<TReceiveFunc>(&TDummy::Inbox), NKikimrServices::TActivity::FAKE_ENV_A) + : TActor(&TDummy::Inbox, NKikimrServices::TActivity::FAKE_ENV_A) , TTabletExecutedFlat(info, tablet, nullptr) , Owner(owner) , Flags(flags) @@ -75,7 +75,7 @@ namespace NFake { */ auto ctx(this->ActorContext()); - Executor()->DetachTablet(ctx), Detach(ctx); + Executor()->DetachTablet(), Detach(ctx); } } else if (State == EState::Boot) { TTabletExecutedFlat::StateInitImpl(eh, SelfId()); diff --git a/ydb/core/testlib/actors/test_runtime.cpp b/ydb/core/testlib/actors/test_runtime.cpp index 8dacce589e..4c526d4af8 100644 --- a/ydb/core/testlib/actors/test_runtime.cpp +++ b/ydb/core/testlib/actors/test_runtime.cpp @@ -123,6 +123,11 @@ namespace NActors { NKikimr::TAppData::TimeProvider = TimeProvider; } + // We want tests to fail on unhandled exceptions by default + if (!App0->FeatureFlags.HasEnableTabletRestartOnUnhandledExceptions()) { + App0->FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(false); + } + MonPorts.clear(); for (ui32 nodeIndex = 0; nodeIndex < NodeCount; ++nodeIndex) { ui32 nodeId = FirstNodeId + nodeIndex; diff --git a/ydb/library/actors/core/actor.cpp b/ydb/library/actors/core/actor.cpp index 68ec2bff0d..208acc74af 100644 --- a/ydb/library/actors/core/actor.cpp +++ b/ydb/library/actors/core/actor.cpp @@ -267,6 +267,26 @@ namespace NActors { return NHPTimer::GetSeconds(ElapsedTicks); } + void IActor::Receive(TAutoPtr<IEventHandle>& ev) { +#ifndef NDEBUG + if (ev->Flags & IEventHandle::FlagDebugTrackReceive) { + YaDebugBreak(); + } +#endif + ++HandledEvents; + LastReceiveTimestamp = TActivationContext::Monotonic(); + + try { + (this->*StateFunc_)(ev); + } catch(const std::exception& e) { + if (auto* handler = dynamic_cast<IActorExceptionHandler*>(this); + !handler || !handler->OnUnhandledException(e)) + { + throw; + } + } + } + void IActor::Registered(TActorSystem* sys, const TActorId& owner) { // fallback to legacy method, do not use it anymore if (auto eh = AfterRegister(SelfId(), owner)) { diff --git a/ydb/library/actors/core/actor.h b/ydb/library/actors/core/actor.h index 5ac6826af7..8153e31c12 100644 --- a/ydb/library/actors/core/actor.h +++ b/ydb/library/actors/core/actor.h @@ -347,6 +347,23 @@ namespace NActors { void DoActorInit() { LastUsageTimestamp = GetCycleCountFast(); } }; + /** + * Optional interface for actors with exception handling + */ + class IActorExceptionHandler { + protected: + ~IActorExceptionHandler() = default; + + public: + /** + * Called when actor's event handler throws an std::exception subclass + * + * The implementation is supposed to return true for handled exceptions + * and false to rethrow (which will likely result in a process crash). + */ + virtual bool OnUnhandledException(const std::exception&) = 0; + }; + class IActor : protected IActorOps , public TActorUsageImpl<ActorLibCollectUsageStats> @@ -547,16 +564,7 @@ namespace NActors { return SelfActorId; } - void Receive(TAutoPtr<IEventHandle>& ev) { -#ifndef NDEBUG - if (ev->Flags & IEventHandle::FlagDebugTrackReceive) { - YaDebugBreak(); - } -#endif - ++HandledEvents; - LastReceiveTimestamp = TActivationContext::Monotonic(); - (this->*StateFunc_)(ev); - } + void Receive(TAutoPtr<IEventHandle>& ev); TActorContext ActorContext() const { return TActivationContext::ActorContextFor(SelfId()); diff --git a/ydb/library/actors/core/ut/actor_exception_ut.cpp b/ydb/library/actors/core/ut/actor_exception_ut.cpp new file mode 100644 index 0000000000..1495530470 --- /dev/null +++ b/ydb/library/actors/core/ut/actor_exception_ut.cpp @@ -0,0 +1,105 @@ +#include "actor.h" +#include "actor_bootstrapped.h" +#include "events.h" +#include "actorsystem.h" +#include "executor_pool_basic.h" +#include "scheduler_basic.h" +#include "actor_bootstrapped.h" +#include "actor_benchmark_helper.h" + +#include <library/cpp/testing/unittest/registar.h> + +Y_UNIT_TEST_SUITE(ActorException) { + + using namespace NActors; + using namespace NActors::NTests; + using TActorBenchmark = ::NActors::NTests::TActorBenchmark<>; + + class TActorBootstrapExceptionActor + : public TActorBootstrapped<TActorBootstrapExceptionActor> + , public IActorExceptionHandler + { + public: + TActorBootstrapExceptionActor(TManualEvent& done) + : Done(done) + {} + + void Bootstrap() { + throw std::runtime_error("test"); + } + + private: + bool OnUnhandledException(const std::exception& e) override { + Y_ABORT_UNLESS(TypeName(e) == "std::runtime_error"); + Done.Signal(); + PassAway(); + return true; + } + + private: + TManualEvent& Done; + }; + + class TActorHandlerExceptionActor + : public TActor<TActorHandlerExceptionActor> + , public IActorExceptionHandler + { + public: + TActorHandlerExceptionActor(TManualEvent& done) + : TActor(&TThis::StateWork) + , Done(done) + {} + + private: + STFUNC(StateWork) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvents::TEvWakeup, Handle) + } + } + + void Handle(TEvents::TEvWakeup::TPtr&) { + throw std::runtime_error("test"); + } + + private: + bool OnUnhandledException(const std::exception& e) { + Y_ABORT_UNLESS(TypeName(e) == "std::runtime_error"); + Done.Signal(); + PassAway(); + return true; + } + + private: + TManualEvent& Done; + }; + + Y_UNIT_TEST(ActorBootstrapExceptionCaught) { + THolder<TActorSystemSetup> setup = TActorBenchmark::GetActorSystemSetup(); + TActorBenchmark::AddBasicPool(setup, 1, 1, false); + + TActorSystem actorSystem(setup); + actorSystem.Start(); + + TManualEvent doneEvent; + actorSystem.Register(new TActorBootstrapExceptionActor(doneEvent)); + doneEvent.WaitI(); + + actorSystem.Stop(); + } + + Y_UNIT_TEST(ActorHandlerExceptionCaught) { + THolder<TActorSystemSetup> setup = TActorBenchmark::GetActorSystemSetup(); + TActorBenchmark::AddBasicPool(setup, 1, 1, false); + + TActorSystem actorSystem(setup); + actorSystem.Start(); + + TManualEvent doneEvent; + auto actorId = actorSystem.Register(new TActorHandlerExceptionActor(doneEvent)); + actorSystem.Send(actorId, new TEvents::TEvWakeup); + doneEvent.WaitI(); + + actorSystem.Stop(); + } + +} // Y_UNIT_TEST_SUITE(ActorException) diff --git a/ydb/library/actors/core/ut/ya.make b/ydb/library/actors/core/ut/ya.make index c18a38cbfa..fa0a50b39e 100644 --- a/ydb/library/actors/core/ut/ya.make +++ b/ydb/library/actors/core/ut/ya.make @@ -21,6 +21,7 @@ PEERDIR( SRCS( actor_basic_ut.cpp actor_coroutine_ut.cpp + actor_exception_ut.cpp actor_shared_threads.cpp benchmark_ut.cpp actor_ut.cpp |