aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Aleksei Borzenkov <snaury@ydb.tech> 2025-03-14 12:53:47 +0300
committer GitHub <noreply@github.com> 2025-03-14 12:53:47 +0300
commit bba21cb0db2e074cac5ac4211b78c0852a2b67fe (patch)
tree 9379fade33f154cddbc156b540f121cb54e0ae01
parent 1b4e25c3bd89af549e479fb6df968d7d29fcafd4 (diff)
download ydb-bba21cb0db2e074cac5ac4211b78c0852a2b67fe.tar.gz
Support processing unhandled exceptions in actors and tablets (#15468)
-rw-r--r-- ydb/core/driver_lib/run/main.cpp 4
-rw-r--r-- ydb/core/protos/feature_flags.proto 1
-rw-r--r-- ydb/core/tablet_flat/flat_executor.cpp 19
-rw-r--r-- ydb/core/tablet_flat/flat_executor.h 5
-rw-r--r-- ydb/core/tablet_flat/flat_executor_ut.cpp 38
-rw-r--r-- ydb/core/tablet_flat/tablet_flat_executed.cpp 29
-rw-r--r-- ydb/core/tablet_flat/tablet_flat_executed.h 7
-rw-r--r-- ydb/core/tablet_flat/tablet_flat_executor.h 2
-rw-r--r-- ydb/core/tablet_flat/test/libs/exec/dummy.h 6
-rw-r--r-- ydb/core/testlib/actors/test_runtime.cpp 5
-rw-r--r-- ydb/library/actors/core/actor.cpp 20
-rw-r--r-- ydb/library/actors/core/actor.h 28
-rw-r--r-- ydb/library/actors/core/ut/actor_exception_ut.cpp 105
-rw-r--r-- ydb/library/actors/core/ut/ya.make 1
14 files changed, 249 insertions, 21 deletions
diff --git a/ydb/core/driver_lib/run/main.cpp b/ydb/core/driver_lib/run/main.cpp
index 00be7c4568..ffb54aca93 100644
--- a/ydb/core/driver_lib/run/main.cpp
+++ b/ydb/core/driver_lib/run/main.cpp
@@ -178,6 +178,10 @@ std::terminate_handler defaultTerminateHandler;
void KikimrTerminateHandler() {
Cerr << "======= terminate() call stack ========\n";
FormatBackTrace(&Cerr);
+ if (auto backtrace = TBackTrace::FromCurrentException(); backtrace.size() > 0) {
+ Cerr << "======== exception call stack =========\n";
+ backtrace.PrintTo(Cerr);
+ }
Cerr << "=======================================\n";
auto oldHandler = defaultTerminateHandler;
diff --git a/ydb/core/protos/feature_flags.proto b/ydb/core/protos/feature_flags.proto
index b5912d922c..82ca987893 100644
--- a/ydb/core/protos/feature_flags.proto
+++ b/ydb/core/protos/feature_flags.proto
@@ -199,4 +199,5 @@ message TFeatureFlags {
optional bool EnableShowCreate = 173 [default = false];
optional bool EnableChangefeedsExport = 174 [default = false];
optional bool EnableKafkaNativeBalancing = 175 [default = false];
+ optional bool EnableTabletRestartOnUnhandledExceptions = 176 [default = true];
}
diff --git a/ydb/core/tablet_flat/flat_executor.cpp b/ydb/core/tablet_flat/flat_executor.cpp
index 29a85754f0..b62bd22174 100644
--- a/ydb/core/tablet_flat/flat_executor.cpp
+++ b/ydb/core/tablet_flat/flat_executor.cpp
@@ -139,6 +139,20 @@ TExecutor::~TExecutor() {
}
+// Reacts to an std::exception that escaped one of the executor's handlers.
+//
+// When the EnableTabletRestartOnUnhandledExceptions feature flag is set, the
+// exception type, its what() text and the backtrace captured from the current
+// exception are logged at Crit level (if that level is enabled), Broken() is
+// called, and true is returned to report the exception as handled.
+// When the flag is not set, nothing is done and false is returned.
+bool TExecutor::OnUnhandledException(const std::exception& e) {
+ if (AppData()->FeatureFlags.GetEnableTabletRestartOnUnhandledExceptions()) {
+ // Logger->Log() yields a falsy value when Crit logging is unavailable,
+ // in which case the message is skipped but Broken() still runs.
+ if (auto log = Logger->Log(ELnLev::Crit)) {
+ log << "Tablet " << TabletId() << " unhandled exception " << TypeName(e) << ": " << e.what()
+ << '\n' << TBackTrace::FromCurrentException().PrintToString();
+ }
+ Broken();
+ return true;
+ }
+
+ // Exception will propagate and cause the process to crash
+ return false;
+}
+
ui64 TExecutor::Stamp() const noexcept
{
return CommitManager ? CommitManager->Stamp() : TTxStamp{ Generation0, Step0 }.Raw;
@@ -168,6 +182,7 @@ void TExecutor::Registered(TActorSystem *sys, const TActorId&)
GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_req_nodata", true);
GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_scan_nodata", true);
GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_boot_nodata", true);
+ GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_broken", true);
}
void TExecutor::PassAway() {
@@ -195,6 +210,8 @@ void TExecutor::PassAway() {
}
void TExecutor::Broken() {
+ GetServiceCounters(AppData()->Counters, "tablets")->GetCounter("alerts_broken", true)->Inc();
+
if (BootLogic)
BootLogic->Cancel();
@@ -888,7 +905,7 @@ void TExecutor::Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext &
return TranscriptBootOpResult(res, ctx);
}
-void TExecutor::DetachTablet(const TActorContext &) {
+void TExecutor::DetachTablet() {
TabletCountersForgetTablet(Owner->TabletID(), Owner->TabletType(),
Owner->Info()->TenantPathId, Stats->IsFollower(), SelfId());
return PassAway();
diff --git a/ydb/core/tablet_flat/flat_executor.h b/ydb/core/tablet_flat/flat_executor.h
index ea562e30bb..fc0f3deaf7 100644
--- a/ydb/core/tablet_flat/flat_executor.h
+++ b/ydb/core/tablet_flat/flat_executor.h
@@ -312,6 +312,7 @@ struct TExecutorCaches {
class TExecutor
: public TActor<TExecutor>
, public NFlatExecutorSetup::IExecutor
+ , public IActorExceptionHandler
, private NTable::ICompactionBackend
, private ILoadBlob
{
@@ -633,7 +634,7 @@ public:
// IExecutor interface
void Boot(TEvTablet::TEvBoot::TPtr &ev, const TActorContext &ctx) override;
void Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext &ctx) override;
- void DetachTablet(const TActorContext &ctx) override;
+ void DetachTablet() override;
ui64 DoExecute(TAutoPtr<ITransaction> transaction, ETxMode mode);
void Execute(TAutoPtr<ITransaction> transaction, const TActorContext &ctx) override;
ui64 Enqueue(TAutoPtr<ITransaction> transaction) override;
@@ -705,6 +706,8 @@ public:
TExecutor(NFlatExecutorSetup::ITablet *owner, const TActorId& ownerActorId);
~TExecutor();
+ bool OnUnhandledException(const std::exception&) override;
+
STFUNC(StateInit);
STFUNC(StateBoot);
STFUNC(StateWork);
diff --git a/ydb/core/tablet_flat/flat_executor_ut.cpp b/ydb/core/tablet_flat/flat_executor_ut.cpp
index 01af27dbe3..037e7a3059 100644
--- a/ydb/core/tablet_flat/flat_executor_ut.cpp
+++ b/ydb/core/tablet_flat/flat_executor_ut.cpp
@@ -523,7 +523,7 @@ class TTestFlatTablet : public TActor<TTestFlatTablet>, public TTabletExecutedFl
void Handle(TEvents::TEvPoison::TPtr &, const TActorContext &ctx) {
Become(&TThis::StateBroken);
- Executor()->DetachTablet(ctx), Detach(ctx); /* see TDummy tablet */
+ Executor()->DetachTablet(), Detach(ctx); /* see TDummy tablet */
ctx.Send(Sender, new TEvents::TEvGone);
}
@@ -7354,5 +7354,41 @@ Y_UNIT_TEST_SUITE(TFlatTableExecutor_LowPriorityTxs) {
}
}
+Y_UNIT_TEST_SUITE(TFlatTableExecutor_Exceptions) {
+ // Test transaction whose Execute() unconditionally throws
+ // std::runtime_error("test"); used to exercise unhandled-exception handling.
+ struct TTxExecuteThrowException : public ITransaction {
+ bool Execute(TTransactionContext&, const TActorContext&) override {
+ throw std::runtime_error("test");
+ }
+
+ void Complete(const TActorContext&) override {
+ // not reached
+ }
+ };
+
+ // Runs the throwing transaction through Execute() with the restart-on-
+ // exception feature flag explicitly enabled, then blocks in WaitForGone().
+ // NOTE(review): WaitForGone() presumably returns once the dummy tablet
+ // reports that it is gone — confirm against TMyEnvBase.
+ Y_UNIT_TEST(TestTabletExecuteExceptionDirect) {
+ TMyEnvBase env;
+ env->GetAppData().FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(true);
+
+ env.FireDummyTablet();
+
+ env.SendAsync(new NFake::TEvExecute([&](auto* x, const auto& ctx) {
+ x->Execute(new TTxExecuteThrowException, ctx);
+ }));
+ env.WaitForGone();
+ }
+
+ // Same scenario as the direct variant, but the throwing transaction is
+ // submitted through Enqueue() instead of Execute().
+ // NOTE(review): WaitForGone() presumably returns once the dummy tablet
+ // reports that it is gone — confirm against TMyEnvBase.
+ Y_UNIT_TEST(TestTabletExecuteExceptionEnqueue) {
+ TMyEnvBase env;
+ env->GetAppData().FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(true);
+
+ env.FireDummyTablet();
+
+ env.SendAsync(new NFake::TEvExecute([&](auto* x, const auto&) {
+ x->Enqueue(new TTxExecuteThrowException);
+ }));
+ env.WaitForGone();
+ }
+}
+
}
}
diff --git a/ydb/core/tablet_flat/tablet_flat_executed.cpp b/ydb/core/tablet_flat/tablet_flat_executed.cpp
index 4358b987d9..7e325c4274 100644
--- a/ydb/core/tablet_flat/tablet_flat_executed.cpp
+++ b/ydb/core/tablet_flat/tablet_flat_executed.cpp
@@ -2,6 +2,7 @@
#include "flat_executor.h"
#include "flat_executor_counters.h"
#include <ydb/core/base/appdata.h>
+#include <ydb/core/base/counters.h>
#include <library/cpp/monlib/service/pages/templates.h>
namespace NKikimr {
@@ -16,6 +17,28 @@ TTabletExecutedFlat::TTabletExecutedFlat(TTabletStorageInfo *info, const TActorI
, StartTime0(TAppData::TimeProvider->Now())
{}
+// Reacts to an std::exception that escaped one of the tablet's handlers.
+//
+// When the EnableTabletRestartOnUnhandledExceptions feature flag is set and
+// this object can be viewed as an IActor, the exception is logged at Crit
+// level with its backtrace, the "alerts_broken" counter in the "tablets"
+// service group is incremented, HandlePoison() is invoked, and true is
+// returned. In every other case false is returned.
+bool TTabletExecutedFlat::OnUnhandledException(const std::exception& e) {
+ if (AppData()->FeatureFlags.GetEnableTabletRestartOnUnhandledExceptions()) {
+ // Tablets have a weird inheritance where subclass is always an actor,
+ // but we don't know the exact type at compile time. This dynamic_cast
+ // is expected to always succeed.
+ if (auto* actor = dynamic_cast<IActor*>(this)) {
+ // HandlePoison() needs an actor context, so one is built from the
+ // actor's own id.
+ auto ctx = TActivationContext::ActorContextFor(actor->SelfId());
+ LOG_CRIT_S(*TlsActivationContext, NKikimrServices::TABLET_EXECUTOR,
+ "Tablet " << TabletID() << " unhandled exception " << TypeName(e) << ": " << e.what()
+ << '\n' << TBackTrace::FromCurrentException().PrintToString());
+
+ GetServiceCounters(AppData(ctx)->Counters, "tablets")->GetCounter("alerts_broken", true)->Inc();
+
+ HandlePoison(ctx);
+ return true;
+ }
+ }
+
+ // Exception will propagate and cause the process to crash
+ return false;
+}
+
IExecutor* TTabletExecutedFlat::CreateExecutor(const TActorContext &ctx) {
if (!Executor()) {
IActor *executor = NFlatExecutorSetup::CreateExecutor(this, ctx.SelfID);
@@ -123,9 +146,9 @@ void TTabletExecutedFlat::OnTabletStop(TEvTablet::TEvTabletStop::TPtr &ev, const
ctx.Send(Tablet(), new TEvTablet::TEvTabletStopped());
}
-void TTabletExecutedFlat::HandlePoison(const TActorContext &ctx) {
+void TTabletExecutedFlat::HandlePoison(const TActorContext& ctx) {
if (Executor0) {
- Executor0->DetachTablet(ExecutorCtx(ctx));
+ Executor0->DetachTablet();
Executor0 = nullptr;
}
@@ -142,7 +165,7 @@ void TTabletExecutedFlat::HandleTabletStop(TEvTablet::TEvTabletStop::TPtr &ev, c
void TTabletExecutedFlat::HandleTabletDead(TEvTablet::TEvTabletDead::TPtr &ev, const TActorContext &ctx) {
if (Executor0) {
- Executor0->DetachTablet(ExecutorCtx(ctx));
+ Executor0->DetachTablet();
Executor0 = nullptr;
}
diff --git a/ydb/core/tablet_flat/tablet_flat_executed.h b/ydb/core/tablet_flat/tablet_flat_executed.h
index fb45ea880c..edc0d984bc 100644
--- a/ydb/core/tablet_flat/tablet_flat_executed.h
+++ b/ydb/core/tablet_flat/tablet_flat_executed.h
@@ -16,7 +16,10 @@ struct IMiniKQLFactory {
virtual TAutoPtr<ITransaction> Make(TEvTablet::TEvLocalReadColumns::TPtr&) = 0;
};
-class TTabletExecutedFlat : public NFlatExecutorSetup::ITablet {
+class TTabletExecutedFlat
+ : public NFlatExecutorSetup::ITablet
+ , public IActorExceptionHandler
+{
protected:
using IExecutor = NFlatExecutorSetup::IExecutor;
@@ -24,6 +27,8 @@ protected:
IExecutor* Executor() const { return Executor0; }
const TInstant StartTime() const { return StartTime0; }
+ bool OnUnhandledException(const std::exception&) override;
+
void Execute(TAutoPtr<ITransaction> transaction, const TActorContext &ctx);
void Execute(TAutoPtr<ITransaction> transaction);
ui64 Enqueue(TAutoPtr<ITransaction> transaction);
diff --git a/ydb/core/tablet_flat/tablet_flat_executor.h b/ydb/core/tablet_flat/tablet_flat_executor.h
index 7024004140..edac195ea2 100644
--- a/ydb/core/tablet_flat/tablet_flat_executor.h
+++ b/ydb/core/tablet_flat/tablet_flat_executor.h
@@ -550,7 +550,7 @@ namespace NFlatExecutorSetup {
// tablet generation restoration complete, tablet could act as leader
virtual void Restored(TEvTablet::TEvRestored::TPtr &ev, const TActorContext &ctx) = 0;
// die!
- virtual void DetachTablet(const TActorContext &ctx) = 0;
+ virtual void DetachTablet() = 0;
// tablet assigned as follower (or follower connection refreshed), must begin loading
virtual void FollowerBoot(TEvTablet::TEvFBoot::TPtr &ev, const TActorContext &ctx) = 0;
diff --git a/ydb/core/tablet_flat/test/libs/exec/dummy.h b/ydb/core/tablet_flat/test/libs/exec/dummy.h
index 8df6036401..f018900a91 100644
--- a/ydb/core/tablet_flat/test/libs/exec/dummy.h
+++ b/ydb/core/tablet_flat/test/libs/exec/dummy.h
@@ -15,7 +15,7 @@ namespace NFake {
virtual NFake::TEvExecute* OnFinished() = 0;
};
- class TDummy : public ::NActors::IActorCallback, public TExecuted {
+ class TDummy : public TActor<TDummy>, public TExecuted {
enum EState {
Boot = 1,
Work = 2,
@@ -35,7 +35,7 @@ namespace NFake {
TDummy(const TActorId &tablet, TInfo *info, const TActorId& owner,
ui32 flags = 0 /* ORed EFlg enum */)
- : ::NActors::IActorCallback(static_cast<TReceiveFunc>(&TDummy::Inbox), NKikimrServices::TActivity::FAKE_ENV_A)
+ : TActor(&TDummy::Inbox, NKikimrServices::TActivity::FAKE_ENV_A)
, TTabletExecutedFlat(info, tablet, nullptr)
, Owner(owner)
, Flags(flags)
@@ -75,7 +75,7 @@ namespace NFake {
*/
auto ctx(this->ActorContext());
- Executor()->DetachTablet(ctx), Detach(ctx);
+ Executor()->DetachTablet(), Detach(ctx);
}
} else if (State == EState::Boot) {
TTabletExecutedFlat::StateInitImpl(eh, SelfId());
diff --git a/ydb/core/testlib/actors/test_runtime.cpp b/ydb/core/testlib/actors/test_runtime.cpp
index 8dacce589e..4c526d4af8 100644
--- a/ydb/core/testlib/actors/test_runtime.cpp
+++ b/ydb/core/testlib/actors/test_runtime.cpp
@@ -123,6 +123,11 @@ namespace NActors {
NKikimr::TAppData::TimeProvider = TimeProvider;
}
+ // We want tests to fail on unhandled exceptions by default
+ if (!App0->FeatureFlags.HasEnableTabletRestartOnUnhandledExceptions()) {
+ App0->FeatureFlags.SetEnableTabletRestartOnUnhandledExceptions(false);
+ }
+
MonPorts.clear();
for (ui32 nodeIndex = 0; nodeIndex < NodeCount; ++nodeIndex) {
ui32 nodeId = FirstNodeId + nodeIndex;
diff --git a/ydb/library/actors/core/actor.cpp b/ydb/library/actors/core/actor.cpp
index 68ec2bff0d..208acc74af 100644
--- a/ydb/library/actors/core/actor.cpp
+++ b/ydb/library/actors/core/actor.cpp
@@ -267,6 +267,26 @@ namespace NActors {
return NHPTimer::GetSeconds(ElapsedTicks);
}
+ // Dispatches an event to the actor's current state function.
+ //
+ // Increments HandledEvents and stores the current monotonic time in
+ // LastReceiveTimestamp before the call. If the state function throws an
+ // exception derived from std::exception, and the actor also implements
+ // IActorExceptionHandler, the handler is asked to process it; the
+ // exception is rethrown when there is no handler or the handler returns
+ // false. Exceptions not derived from std::exception are not intercepted.
+ void IActor::Receive(TAutoPtr<IEventHandle>& ev) {
+#ifndef NDEBUG
+ // Debug builds only: break into the debugger for events flagged for
+ // receive tracking.
+ if (ev->Flags & IEventHandle::FlagDebugTrackReceive) {
+ YaDebugBreak();
+ }
+#endif
+ ++HandledEvents;
+ LastReceiveTimestamp = TActivationContext::Monotonic();
+
+ try {
+ (this->*StateFunc_)(ev);
+ } catch(const std::exception& e) {
+ // The handler interface is optional, so it is looked up at run time
+ // and only on the exception path.
+ if (auto* handler = dynamic_cast<IActorExceptionHandler*>(this);
+ !handler || !handler->OnUnhandledException(e))
+ {
+ throw;
+ }
+ }
+ }
+
void IActor::Registered(TActorSystem* sys, const TActorId& owner) {
// fallback to legacy method, do not use it anymore
if (auto eh = AfterRegister(SelfId(), owner)) {
diff --git a/ydb/library/actors/core/actor.h b/ydb/library/actors/core/actor.h
index 5ac6826af7..8153e31c12 100644
--- a/ydb/library/actors/core/actor.h
+++ b/ydb/library/actors/core/actor.h
@@ -347,6 +347,23 @@ namespace NActors {
void DoActorInit() { LastUsageTimestamp = GetCycleCountFast(); }
};
+ /**
+ * Optional interface for actors with exception handling
+ *
+ * An actor opts in by additionally deriving from this class and
+ * implementing OnUnhandledException().
+ */
+ class IActorExceptionHandler {
+ protected:
+ // Protected and non-virtual: instances cannot be destroyed through a
+ // pointer to this interface.
+ ~IActorExceptionHandler() = default;
+
+ public:
+ /**
+ * Called when actor's event handler throws an std::exception subclass
+ *
+ * The implementation is supposed to return true for handled exceptions
+ * and false to rethrow (which will likely result in a process crash).
+ */
+ virtual bool OnUnhandledException(const std::exception&) = 0;
+ };
+
class IActor
: protected IActorOps
, public TActorUsageImpl<ActorLibCollectUsageStats>
@@ -547,16 +564,7 @@ namespace NActors {
return SelfActorId;
}
- void Receive(TAutoPtr<IEventHandle>& ev) {
-#ifndef NDEBUG
- if (ev->Flags & IEventHandle::FlagDebugTrackReceive) {
- YaDebugBreak();
- }
-#endif
- ++HandledEvents;
- LastReceiveTimestamp = TActivationContext::Monotonic();
- (this->*StateFunc_)(ev);
- }
+ void Receive(TAutoPtr<IEventHandle>& ev);
TActorContext ActorContext() const {
return TActivationContext::ActorContextFor(SelfId());
diff --git a/ydb/library/actors/core/ut/actor_exception_ut.cpp b/ydb/library/actors/core/ut/actor_exception_ut.cpp
new file mode 100644
index 0000000000..1495530470
--- /dev/null
+++ b/ydb/library/actors/core/ut/actor_exception_ut.cpp
@@ -0,0 +1,105 @@
+#include "actor.h"
+#include "actor_bootstrapped.h"
+#include "events.h"
+#include "actorsystem.h"
+#include "executor_pool_basic.h"
+#include "scheduler_basic.h"
+#include "actor_bootstrapped.h"
+#include "actor_benchmark_helper.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+Y_UNIT_TEST_SUITE(ActorException) {
+
+ using namespace NActors;
+ using namespace NActors::NTests;
+ using TActorBenchmark = ::NActors::NTests::TActorBenchmark<>;
+
+ // Test actor whose Bootstrap() throws std::runtime_error("test").
+ // Its exception handler checks the exception type, signals the supplied
+ // event, calls PassAway() and reports the exception as handled.
+ class TActorBootstrapExceptionActor
+ : public TActorBootstrapped<TActorBootstrapExceptionActor>
+ , public IActorExceptionHandler
+ {
+ public:
+ // `done` is stored by reference, so it must outlive the actor.
+ TActorBootstrapExceptionActor(TManualEvent& done)
+ : Done(done)
+ {}
+
+ void Bootstrap() {
+ throw std::runtime_error("test");
+ }
+
+ private:
+ bool OnUnhandledException(const std::exception& e) override {
+ // Aborts the process if the dynamic type is not std::runtime_error.
+ Y_ABORT_UNLESS(TypeName(e) == "std::runtime_error");
+ Done.Signal();
+ PassAway();
+ return true;
+ }
+
+ private:
+ TManualEvent& Done;
+ };
+
+ // Test actor whose TEvWakeup handler throws std::runtime_error("test").
+ // Its exception handler checks the exception type, signals the supplied
+ // event, calls PassAway() and reports the exception as handled.
+ class TActorHandlerExceptionActor
+ : public TActor<TActorHandlerExceptionActor>
+ , public IActorExceptionHandler
+ {
+ public:
+ // `done` is stored by reference, so it must outlive the actor.
+ TActorHandlerExceptionActor(TManualEvent& done)
+ : TActor(&TThis::StateWork)
+ , Done(done)
+ {}
+
+ private:
+ STFUNC(StateWork) {
+ switch (ev->GetTypeRewrite()) {
+ hFunc(TEvents::TEvWakeup, Handle)
+ }
+ }
+
+ void Handle(TEvents::TEvWakeup::TPtr&) {
+ throw std::runtime_error("test");
+ }
+
+ private:
+ // Declared `override` so the compiler verifies that this really
+ // overrides IActorExceptionHandler::OnUnhandledException, consistent
+ // with TActorBootstrapExceptionActor.
+ bool OnUnhandledException(const std::exception& e) override {
+ // Aborts the process if the dynamic type is not std::runtime_error.
+ Y_ABORT_UNLESS(TypeName(e) == "std::runtime_error");
+ Done.Signal();
+ PassAway();
+ return true;
+ }
+
+ private:
+ TManualEvent& Done;
+ };
+
+ // Starts an actor system, registers an actor whose Bootstrap() throws,
+ // and blocks until the actor's exception handler signals doneEvent.
+ Y_UNIT_TEST(ActorBootstrapExceptionCaught) {
+ THolder<TActorSystemSetup> setup = TActorBenchmark::GetActorSystemSetup();
+ TActorBenchmark::AddBasicPool(setup, 1, 1, false);
+
+ TActorSystem actorSystem(setup);
+ actorSystem.Start();
+
+ TManualEvent doneEvent;
+ actorSystem.Register(new TActorBootstrapExceptionActor(doneEvent));
+ // Returns only after OnUnhandledException() has signalled the event.
+ doneEvent.WaitI();
+
+ actorSystem.Stop();
+ }
+
+ // Starts an actor system, registers an actor whose TEvWakeup handler
+ // throws, sends it a TEvWakeup, and blocks until the actor's exception
+ // handler signals doneEvent.
+ Y_UNIT_TEST(ActorHandlerExceptionCaught) {
+ THolder<TActorSystemSetup> setup = TActorBenchmark::GetActorSystemSetup();
+ TActorBenchmark::AddBasicPool(setup, 1, 1, false);
+
+ TActorSystem actorSystem(setup);
+ actorSystem.Start();
+
+ TManualEvent doneEvent;
+ auto actorId = actorSystem.Register(new TActorHandlerExceptionActor(doneEvent));
+ actorSystem.Send(actorId, new TEvents::TEvWakeup);
+ // Returns only after OnUnhandledException() has signalled the event.
+ doneEvent.WaitI();
+
+ actorSystem.Stop();
+ }
+
+} // Y_UNIT_TEST_SUITE(ActorException)
diff --git a/ydb/library/actors/core/ut/ya.make b/ydb/library/actors/core/ut/ya.make
index c18a38cbfa..fa0a50b39e 100644
--- a/ydb/library/actors/core/ut/ya.make
+++ b/ydb/library/actors/core/ut/ya.make
@@ -21,6 +21,7 @@ PEERDIR(
SRCS(
actor_basic_ut.cpp
actor_coroutine_ut.cpp
+ actor_exception_ut.cpp
actor_shared_threads.cpp
benchmark_ut.cpp
actor_ut.cpp