aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorijon <ijon@yandex-team.com>2023-03-28 13:52:15 +0300
committerijon <ijon@yandex-team.com>2023-03-28 13:52:15 +0300
commit097f18f3c789135a1ccd2f0a81f9cfa18d85d5a6 (patch)
tree7c61404ec166990073a892075360f3c2f3520d10
parentb33e63d71ecbc1123dbf86cdfccc8e55d210843d (diff)
downloadydb-097f18f3c789135a1ccd2f0a81f9cfa18d85d5a6.tar.gz
schemeshard: add out-of-operation-scope event handling
Schemeshard should serve datashard a reply to TEvDataShard::TEvSchemaChanged even if that event come at the very peculiar moment when responsible suboperation goes from active to done in-between TSchemeShard::Handle() and TTxOperationReply<>::Execute(). Not doing so leaves datashard in some transitional state which will prevent datashard from being deleted until next schemeshard or datashard restart.
-rw-r--r--ydb/core/tx/schemeshard/schemeshard__operation.cpp68
1 files changed, 64 insertions, 4 deletions
diff --git a/ydb/core/tx/schemeshard/schemeshard__operation.cpp b/ydb/core/tx/schemeshard/schemeshard__operation.cpp
index 41761d4653..30ba56710d 100644
--- a/ydb/core/tx/schemeshard/schemeshard__operation.cpp
+++ b/ydb/core/tx/schemeshard/schemeshard__operation.cpp
@@ -338,6 +338,52 @@ struct TSchemeShard::TTxOperationProgress: public NTabletFlatExecutor::TTransact
};
+//NOTE: There is certain time frame between initial event handling at TSchemeShard::Handle(*)
+// and actual event processing at TTxOperationReply*::Execute() method.
+// While TSchemeShard::Handle() tries to determine suboperation/operation part it should
+// route event processing to, TTxOperationReply*::Execute() checks if those operation
+// and suboperation are still exist and active.
+// And it is perfectly ok if (sub)operation will manage to change their state within that
+// time frame between TSchemeShard::Handle(*) and TTxOperationReply*::Execute().
+// And it is generally ok if in that situation TTxOperationReply*::Execute() will skip
+// to do anything with an incoming event.
+//
+// Generally, but not in all cases.
+//
+// There could be communication patterns that require other party to receive reaction
+// from schemeshard (ack or reply) unconditionally, no matter what, possibly out-of-scope
+// of any (sub)operation that initiated communication in the first place.
+//
+// Right now there is the case with DataShard's TEvDataShard::TEvSchemaChanged event.
+// See below.
+//
+template <class TEvType>
+void OutOfScopeEventHandler(const typename TEvType::TPtr&, TOperationContext&) {
+ // Do nothing by default
+}
+
+// DataShard should receive reply on any single TEvDataShard::TEvSchemaChanged it will send
+// to schemeshard even if that particular DataShard goes offline and back online right at
+// perfect peculiar moments during (sub)operations.
+// Not serving reply to a TEvDataShard::TEvSchemaChanged will leave Datashard
+// in some transitional state which, for example, will prevent it from being stopped
+// and deleted until schemeshard or datashard restart.
+//
+template <>
+void OutOfScopeEventHandler<TEvDataShard::TEvSchemaChanged>(const TEvDataShard::TEvSchemaChanged::TPtr& ev, TOperationContext& context) {
+ const auto txId = ev->Get()->Record.GetTxId();
+ LOG_DEBUG_S(context.Ctx, NKikimrServices::FLAT_TX_SCHEMESHARD,
+ "TTxOperationReply<TEvDataShard::TEvSchemaChanged> execute"
+ << ", at schemeshard: " << context.SS->TabletID()
+ << ", send out-of-scope reply, for txId " << txId
+ );
+ const TActorId ackTo = ev->Get()->GetSource();
+
+ auto event = MakeHolder<TEvDataShard::TEvSchemaChangedResult>(txId);
+ context.OnComplete.Send(ackTo, event.Release());
+}
+
+
template <class TEvType>
struct TSchemeShard::TTxOperationReply {};
@@ -363,20 +409,29 @@ struct TSchemeShard::TTxOperationReply {};
\
bool Execute(NTabletFlatExecutor::TTransactionContext& txc, const TActorContext& ctx) override { \
LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, \
- "TTxOperationReply<" #TEvType "> execute " \
+ "TTxOperationReply<" #TEvType "> execute" \
<< ", operationId: " << OperationId \
<< ", at schemeshard: " << Self->TabletID() \
<< ", message: " << ISubOperationState::DebugReply(EvReply)); \
if (!Self->Operations.contains(OperationId.GetTxId())) { \
+ LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, \
+ "TTxOperationReply<" #TEvType "> execute" \
+ << ", operationId: " << OperationId \
+ << ", at schemeshard: " << Self->TabletID() \
+ << ", operation unknown"); \
+ TOperationContext context{Self, txc, ctx, OnComplete, MemChanges, DbChanges}; \
+ OutOfScopeEventHandler<TEvType>(EvReply, context); \
return true; \
} \
TOperation::TPtr operation = Self->Operations.at(OperationId.GetTxId()); \
if (operation->DoneParts.contains(OperationId.GetSubTxId())) { \
LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, \
- "TTxOperationReply<" #TEvType "> execute " \
- << ", operation already done" \
+ "TTxOperationReply<" #TEvType "> execute" \
<< ", operationId: " << OperationId \
- << ", at schemeshard: " << Self->TabletID()); \
+ << ", at schemeshard: " << Self->TabletID() \
+ << ", operation part is already done"); \
+ TOperationContext context{Self, txc, ctx, OnComplete, MemChanges, DbChanges}; \
+ OutOfScopeEventHandler<TEvType>(EvReply, context); \
return true; \
} \
ISubOperation::TPtr part = operation->Parts.at(ui64(OperationId.GetSubTxId())); \
@@ -387,7 +442,12 @@ struct TSchemeShard::TTxOperationReply {};
DbChanges.Apply(Self, txc, ctx); \
return true; \
} \
+ \
void Complete(const TActorContext& ctx) override { \
+ LOG_DEBUG_S(ctx, NKikimrServices::FLAT_TX_SCHEMESHARD, "TTxOperationReply<" #TEvType "> complete" \
+ << ", operationId: " << OperationId \
+ << ", at schemeshard: " << Self->TabletID() \
+ ); \
OnComplete.ApplyOnComplete(Self, ctx); \
} \
};