diff options
author | ijon <ijon@yandex-team.com> | 2023-03-28 13:52:15 +0300 |
---|---|---|
committer | ijon <ijon@yandex-team.com> | 2023-03-28 13:52:15 +0300 |
commit | 097f18f3c789135a1ccd2f0a81f9cfa18d85d5a6 (patch) | |
tree | 7c61404ec166990073a892075360f3c2f3520d10 | |
parent | b33e63d71ecbc1123dbf86cdfccc8e55d210843d (diff) | |
download | ydb-097f18f3c789135a1ccd2f0a81f9cfa18d85d5a6.tar.gz |
schemeshard: add out-of-operation-scope event handling
Schemeshard should serve datashard a reply to TEvDataShard::TEvSchemaChanged
even if that event come at the very peculiar moment when responsible suboperation
goes from active to done in-between TSchemeShard::Handle() and
TTxOperationReply<>::Execute().
Not doing so leaves datashard in some transitional state which will prevent
datashard from being deleted until next schemeshard or datashard restart.
-rw-r--r-- | ydb/core/tx/schemeshard/schemeshard__operation.cpp | 68 |
1 files changed, 64 insertions, 4 deletions
diff --git a/ydb/core/tx/schemeshard/schemeshard__operation.cpp b/ydb/core/tx/schemeshard/schemeshard__operation.cpp index 41761d4653..30ba56710d 100644 --- a/ydb/core/tx/schemeshard/schemeshard__operation.cpp +++ b/ydb/core/tx/schemeshard/schemeshard__operation.cpp @@ -338,6 +338,52 @@ struct TSchemeShard::TTxOperationProgress: public NTabletFlatExecutor::TTransact }; +//NOTE: There is certain time frame between initial event handling at TSchemeShard::Handle(*) +// and actual event processing at TTxOperationReply*::Execute() method. +// While TSchemeShard::Handle() tries to determine suboperation/operation part it should +// route event processing to, TTxOperationReply*::Execute() checks if those operation +// and suboperation are still exist and active. +// And it is perfectly ok if (sub)operation will manage to change their state within that +// time frame between TSchemeShard::Handle(*) and TTxOperationReply*::Execute(). +// And it is generally ok if in that situation TTxOperationReply*::Execute() will skip +// to do anything with an incoming event. +// +// Generally, but not in all cases. +// +// There could be communication patterns that require other party to receive reaction +// from schemeshard (ack or reply) unconditionally, no matter what, possibly out-of-scope +// of any (sub)operation that initiated communication in the first place. +// +// Right now there is the case with DataShard's TEvDataShard::TEvSchemaChanged event. +// See below. +// +template <class TEvType> +void OutOfScopeEventHandler(const typename TEvType::TPtr&, TOperationContext&) { + // Do nothing by default +} + +// DataShard should receive reply on any single TEvDataShard::TEvSchemaChanged it will send +// to schemeshard even if that particular DataShard goes offline and back online right at +// perfect peculiar moments during (sub)operations. +// Not serving reply to a TEvDataShard::TEvSchemaChanged will leave Datashard +// in some transitional state which, for example, will prevent it from being stopped +// and deleted until schemeshard or datashard restart. +// +template <> +void OutOfScopeEventHandler<TEvDataShard::TEvSchemaChanged>(const TEvDataShard::TEvSchemaChanged::TPtr& ev, TOperationContext& context) { + const auto txId = ev->Get()->Record.GetTxId(); + LOG_DEBUG_S(context.Ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, + "TTxOperationReply<TEvDataShard::TEvSchemaChanged> execute" + << ", at schemeshard: " << context.SS->TabletID() + << ", send out-of-scope reply, for txId " << txId + ); + const TActorId ackTo = ev->Get()->GetSource(); + + auto event = MakeHolder<TEvDataShard::TEvSchemaChangedResult>(txId); + context.OnComplete.Send(ackTo, event.Release()); +} + + template <class TEvType> struct TSchemeShard::TTxOperationReply {}; @@ -363,20 +409,29 @@ struct TSchemeShard::TTxOperationReply {}; \ bool Execute(NTabletFlatExecutor::TTransactionContext& txc, const TActorContext& ctx) override { \ LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, \ - "TTxOperationReply<" #TEvType "> execute " \ + "TTxOperationReply<" #TEvType "> execute" \ << ", operationId: " << OperationId \ << ", at schemeshard: " << Self->TabletID() \ << ", message: " << ISubOperationState::DebugReply(EvReply)); \ if (!Self->Operations.contains(OperationId.GetTxId())) { \ + LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, \ + "TTxOperationReply<" #TEvType "> execute" \ + << ", operationId: " << OperationId \ + << ", at schemeshard: " << Self->TabletID() \ + << ", operation unknown"); \ + TOperationContext context{Self, txc, ctx, OnComplete, MemChanges, DbChanges}; \ + OutOfScopeEventHandler<TEvType>(EvReply, context); \ return true; \ } \ TOperation::TPtr operation = Self->Operations.at(OperationId.GetTxId()); \ if (operation->DoneParts.contains(OperationId.GetSubTxId())) { \ LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, \ - "TTxOperationReply<" #TEvType "> execute " \ - << ", operation already done" \ + "TTxOperationReply<" #TEvType "> execute" \ << ", operationId: " << OperationId \ - << ", at schemeshard: " << Self->TabletID()); \ + << ", at schemeshard: " << Self->TabletID() \ + << ", operation part is already done"); \ + TOperationContext context{Self, txc, ctx, OnComplete, MemChanges, DbChanges}; \ + OutOfScopeEventHandler<TEvType>(EvReply, context); \ return true; \ } \ ISubOperation::TPtr part = operation->Parts.at(ui64(OperationId.GetSubTxId())); \ @@ -387,7 +442,12 @@ struct TSchemeShard::TTxOperationReply {}; DbChanges.Apply(Self, txc, ctx); \ return true; \ } \ + \ void Complete(const TActorContext& ctx) override { \ + LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "TTxOperationReply<" #TEvType "> complete" \ + << ", operationId: " << OperationId \ + << ", at schemeshard: " << Self->TabletID() \ + ); \ OnComplete.ApplyOnComplete(Self, ctx); \ } \ }; |