diff options
author | snaury <snaury@ydb.tech> | 2023-10-23 14:04:49 +0300 |
---|---|---|
committer | snaury <snaury@ydb.tech> | 2023-10-23 14:52:10 +0300 |
commit | 530d89dcbb88f1e44e0a4adb8490fd95adb60d93 (patch) | |
tree | 03d9824a438f385e034d2a4d9cc16c39313a11de | |
parent | 4b35f7607d9807f22019bb41b0cd7e3ed484ee8b (diff) | |
download | ydb-530d89dcbb88f1e44e0a4adb8490fd95adb60d93.tar.gz |
Add shard/node/state info to read iterator error messages KIKIMR-19789
-rw-r--r-- | ydb/core/tx/datashard/datashard__read_iterator.cpp | 136 | ||||
-rw-r--r-- | ydb/core/tx/datashard/datashard_ut_read_iterator.cpp | 7 |
2 files changed, 97 insertions, 46 deletions
diff --git a/ydb/core/tx/datashard/datashard__read_iterator.cpp b/ydb/core/tx/datashard/datashard__read_iterator.cpp index bde44b41683..67017d540d3 100644 --- a/ydb/core/tx/datashard/datashard__read_iterator.cpp +++ b/ydb/core/tx/datashard/datashard__read_iterator.cpp @@ -1176,14 +1176,16 @@ public: Result->Record, Ydb::StatusIds::OVERLOADED, TStringBuilder() << "Shard in state " << DatashardStateName(Self->State) - << ", tablet id: " << Self->TabletID()); + << ", tablet id: " << Self->TabletID() + << ", node# " << ctx.SelfID.NodeId()); return EExecutionStatus::DelayComplete; } else { SetStatusError( Result->Record, Ydb::StatusIds::SCHEME_ERROR, TStringBuilder() << "Shard in state " << DatashardStateName(Self->State) - << ", will be deleted soon, tablet id: " << Self->TabletID()); + << ", will be deleted soon, tablet id: " << Self->TabletID() + << ", node# " << ctx.SelfID.NodeId()); return EExecutionStatus::DelayComplete; } } @@ -1195,7 +1197,8 @@ public: Result->Record, Ydb::StatusIds::OVERLOADED, TStringBuilder() << "Shard in state " << DatashardStateName(Self->State) - << ", tablet id: " << Self->TabletID()); + << ", tablet id: " << Self->TabletID() + << ", node# " << ctx.SelfID.NodeId()); return EExecutionStatus::DelayComplete; } case TShardState::Uninitialized: @@ -1206,7 +1209,8 @@ public: Result->Record, Ydb::StatusIds::INTERNAL_ERROR, TStringBuilder() << "Wrong shard state: " << DatashardStateName(Self->State) - << ", tablet id: " << Self->TabletID()); + << ", tablet id: " << Self->TabletID() + << ", node# " << ctx.SelfID.NodeId()); return EExecutionStatus::DelayComplete; } @@ -1221,7 +1225,8 @@ public: SetStatusError( Result->Record, Ydb::StatusIds::NOT_FOUND, - TStringBuilder() << "Unknown table id: " << tableId); + TStringBuilder() << "Unknown table id: " << tableId + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return EExecutionStatus::DelayComplete; } auto& userTableInfo = it->second; @@ -1251,7 +1256,8 @@ public: TStringBuilder() << "Table id " << tableId << " lost snapshot at " << state.ReadVersion << " shard " << Self->TabletID() << " with lowWatermark " << Self->GetSnapshotManager().GetLowWatermark() - << (Self->IsFollower() ? " RO replica" : "")); + << (Self->IsFollower() ? " RO replica" : "") + << " (node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return EExecutionStatus::DelayComplete; } } @@ -1262,7 +1268,8 @@ public: Result->Record, Ydb::StatusIds::SCHEME_ERROR, TStringBuilder() << "Schema changed, current " << userTableInfo->GetTableSchemaVersion() - << ", requested table schemaversion " << state.SchemaVersion); + << ", requested table schemaversion " << state.SchemaVersion + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return EExecutionStatus::DelayComplete; } } @@ -1294,7 +1301,8 @@ public: SetStatusError( Result->Record, Ydb::StatusIds::ABORTED, - TStringBuilder() << "Transaction was already committed or aborted"); + TStringBuilder() << "Transaction was already committed or aborted" + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return EExecutionStatus::DelayComplete; } } @@ -1454,7 +1462,8 @@ public: Result->Record, Ydb::StatusIds::SCHEME_ERROR, TStringBuilder() << "Wrong schemaversion " << record.GetTableId().GetSchemaVersion() - << " requested, table schemaversion " << state.SchemaVersion); + << " requested, table schemaversion " << state.SchemaVersion + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return; } } @@ -1468,7 +1477,8 @@ public: Result->Record, Ydb::StatusIds::NOT_FOUND, TStringBuilder() << "Failed to get scheme for table local id: " - << state.PathId.LocalPathId); + << state.PathId.LocalPathId + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return; } TableInfo = TShortTableInfo(state.PathId.LocalPathId, *schema); @@ -1511,7 +1521,8 @@ public: SetStatusError( Result->Record, Ydb::StatusIds::SCHEME_ERROR, - TStringBuilder() << "Unknown column: " << col); + TStringBuilder() << "Unknown column: " << col + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return; } @@ -1552,7 +1563,9 @@ public: LOG_DEBUG_S(ctx, NKikimrServices::TX_DATASHARD, Self->TabletID() << " read iterator# " << readId << " TReadOperation::Execute() finished without Result, aborting"); Result = MakeEvReadResult(ctx.SelfID.NodeId()); - SetStatusError(Result->Record, Ydb::StatusIds::ABORTED, "Iterator aborted"); + SetStatusError(Result->Record, Ydb::StatusIds::ABORTED, TStringBuilder() + << "Iterator aborted" + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); Result->Record.SetReadId(readId.ReadId); Self->SendImmediateReadResult(Sender, Result.release(), 0, state.SessionId); Self->DeleteReadIterator(it); @@ -1560,7 +1573,9 @@ public: } if (!Result->Record.HasStatus() && Reader && Reader->HadInconsistentResult()) { - SetStatusError(Result->Record, Ydb::StatusIds::ABORTED, "Read conflict with concurrent transaction"); + SetStatusError(Result->Record, Ydb::StatusIds::ABORTED, TStringBuilder() + << "Read conflict with concurrent transaction" + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); } // error happened and status set @@ -1651,7 +1666,8 @@ private: SetStatusError( Result->Record, Ydb::StatusIds::NOT_FOUND, - TStringBuilder() << "Unknown table id: " << state.PathId.LocalPathId); + TStringBuilder() << "Unknown table id: " << state.PathId.LocalPathId + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return true; } auto userTableInfo = it->second; @@ -1662,7 +1678,8 @@ private: Result->Record, Ydb::StatusIds::SCHEME_ERROR, TStringBuilder() << "Schema changed, current " << currentSchemaVersion - << ", requested table schemaversion " << state.SchemaVersion); + << ", requested table schemaversion " << state.SchemaVersion + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return true; } @@ -1674,7 +1691,8 @@ private: Result->Record, Ydb::StatusIds::NOT_FOUND, TStringBuilder() << "Failed to get scheme for table local id: " - << state.PathId.LocalPathId); + << state.PathId.LocalPathId + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return true; } TableInfo = TShortTableInfo(state.PathId.LocalPathId, *schema); @@ -1704,7 +1722,8 @@ private: TStringBuilder() << "Table id " << tableId << " lost snapshot at " << state.ReadVersion << " shard " << Self->TabletID() << " with lowWatermark " << Self->GetSnapshotManager().GetLowWatermark() - << (Self->IsFollower() ? " RO replica" : "")); + << (Self->IsFollower() ? " RO replica" : "") + << " (node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); return true; } } @@ -1923,7 +1942,8 @@ public: } ReplyError( Ydb::StatusIds::INTERNAL_ERROR, - TStringBuilder() << "Failed to sync follower: " << errMessage, + TStringBuilder() << "Failed to sync follower: " << errMessage + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -1948,7 +1968,8 @@ public: Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Requesting ownerId: " << state.PathId.OwnerId << ", tableId: " << state.PathId.LocalPathId - << ", from wrong owner: " << Self->GetPathOwnerId(), + << ", from shard with owner: " << Self->GetPathOwnerId() + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -1958,7 +1979,8 @@ public: if (it == Self->TableInfos.end()) { ReplyError( Ydb::StatusIds::NOT_FOUND, - TStringBuilder() << "Unknown table id: " << tableId, + TStringBuilder() << "Unknown table id: " << tableId + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -1967,7 +1989,8 @@ public: if (userTableInfo->IsBackup) { ReplyError( Ydb::StatusIds::BAD_REQUEST, - "Can't read from a backup table", + TStringBuilder() << "Can't read from a backup table" + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -1975,7 +1998,8 @@ public: if (!Self->IsMvccEnabled()) { ReplyError( Ydb::StatusIds::UNSUPPORTED, - "Cannot use read iterators without mvcc", + TStringBuilder() << "Cannot use read iterators without mvcc" + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -2046,7 +2070,8 @@ public: TStringBuilder() << "Table id " << tableId << " has no snapshot at " << state.ReadVersion << " shard " << Self->TabletID() << " with lowWatermark " << Self->GetSnapshotManager().GetLowWatermark() - << (Self->IsFollower() ? " RO replica" : ""), + << (Self->IsFollower() ? " RO replica" : "") + << " (node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -2057,21 +2082,24 @@ public: if (Self->IsFollower()) { ReplyError( Ydb::StatusIds::UNSUPPORTED, - "Followers don't support system table reads", + TStringBuilder() << "Followers don't support system table reads" + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } if (!state.IsHeadRead) { ReplyError( Ydb::StatusIds::BAD_REQUEST, - TStringBuilder() << "Cannot read system table using snapshot " << state.ReadVersion, + TStringBuilder() << "Cannot read system table using snapshot " << state.ReadVersion + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } if (record.GetTableId().GetTableId() >= TDataShard::Schema::MinLocalTid) { ReplyError( Ydb::StatusIds::BAD_REQUEST, - "Cannot read from user tables using system tables", + TStringBuilder() << "Cannot read from user tables using system tables" + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -2079,7 +2107,8 @@ public: ReplyError( Ydb::StatusIds::UNSUPPORTED, TStringBuilder() << "Unsupported result format " - << (int)record.GetResultFormat() << " when reading from system tables", + << (int)record.GetResultFormat() << " when reading from system tables" + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -2088,7 +2117,8 @@ public: Ydb::StatusIds::BAD_REQUEST, TStringBuilder() << "Cannot request system table at shard " << record.GetTableId().GetOwnerId() << ", localTid: " << record.GetTableId().GetTableId() - << ", with schema: " << record.GetTableId().GetSchemaVersion(), + << ", with schema: " << record.GetTableId().GetSchemaVersion() + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")", ctx.SelfID.NodeId()); return true; } @@ -2260,7 +2290,8 @@ public: SetStatusError( Result->Record, Ydb::StatusIds::NOT_FOUND, - TStringBuilder() << "Unknown table id: " << state.PathId.LocalPathId); + TStringBuilder() << "Unknown table id: " << state.PathId.LocalPathId + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); SendResult(ctx); return true; } @@ -2272,7 +2303,8 @@ public: Result->Record, Ydb::StatusIds::SCHEME_ERROR, TStringBuilder() << "Schema changed, current " << currentSchemaVersion - << ", requested table schemaversion " << state.SchemaVersion); + << ", requested table schemaversion " << state.SchemaVersion + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); SendResult(ctx); return true; } @@ -2285,7 +2317,8 @@ public: Result->Record, Ydb::StatusIds::NOT_FOUND, TStringBuilder() << "Failed to get scheme for table local id: " - << state.PathId.LocalPathId); + << state.PathId.LocalPathId + << " (shard# " << Self->TabletID() << " node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); SendResult(ctx); return true; } @@ -2316,7 +2349,8 @@ public: TStringBuilder() << "Table id " << tableId << " lost snapshot at " << state.ReadVersion << " shard " << Self->TabletID() << " with lowWatermark " << Self->GetSnapshotManager().GetLowWatermark() - << (Self->IsFollower() ? " RO replica" : "")); + << (Self->IsFollower() ? " RO replica" : "") + << " (node# " << ctx.SelfID.NodeId() << " state# " << DatashardStateName(Self->State) << ")"); SendResult(ctx); return true; } @@ -2515,7 +2549,8 @@ void TDataShard::Handle(TEvDataShard::TEvRead::TPtr& ev, const TActorContext& ct const auto& record = request->Record; if (Y_UNLIKELY(!record.HasReadId())) { auto result = MakeEvReadResult(ctx.SelfID.NodeId()); - SetStatusError(result->Record, Ydb::StatusIds::BAD_REQUEST, "Missing ReadId"); + SetStatusError(result->Record, Ydb::StatusIds::BAD_REQUEST, + TStringBuilder() << "Missing ReadId at shard " << TabletID()); ctx.Send(ev->Sender, result.release()); return; } @@ -2541,14 +2576,15 @@ void TDataShard::Handle(TEvDataShard::TEvRead::TPtr& ev, const TActorContext& ct if (Y_UNLIKELY(Pipeline.HasWaitingReadIterator(readId) || ReadIterators.contains(readId))) { replyWithError( Ydb::StatusIds::ALREADY_EXISTS, - TStringBuilder() << "Request " << readId.ReadId << " already executing"); + TStringBuilder() << "Request " << readId.ReadId << " already executing at shard " << TabletID()); return; } if (!IsStateNewReadAllowed()) { replyWithError( Ydb::StatusIds::OVERLOADED, - TStringBuilder() << "Shard " << TabletID() << " is splitting/merging"); + TStringBuilder() << "Shard " << TabletID() << " is splitting/merging" + << " (node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"); return; } @@ -2571,7 +2607,8 @@ void TDataShard::Handle(TEvDataShard::TEvRead::TPtr& ev, const TActorContext& ct if (Pipeline.HasDrop()) { replyWithError( Ydb::StatusIds::INTERNAL_ERROR, - TStringBuilder() << "Request " << readId.ReadId << " rejected, because pipeline is in process of drop"); + TStringBuilder() << "Request " << readId.ReadId << " rejected, because pipeline is in process of drop" + << " (shard# " << TabletID() << " node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"); return; } @@ -2581,22 +2618,29 @@ void TDataShard::Handle(TEvDataShard::TEvRead::TPtr& ev, const TActorContext& ct if (totalInFly > GetMaxTxInFly()) { replyWithError( Ydb::StatusIds::OVERLOADED, - TStringBuilder() << "Request " << readId.ReadId << " rejected, MaxTxInFly was exceeded"); + TStringBuilder() << "Request " << readId.ReadId << " rejected, MaxTxInFly was exceeded" + << " (shard# " << TabletID() << " node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"); return; } if (!request->Keys.empty() && !request->Ranges.empty()) { - replyWithError(Ydb::StatusIds::BAD_REQUEST, "Both keys and ranges are forbidden"); + replyWithError(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() + << "Both keys and ranges are forbidden" + << " (shard# " << TabletID() << " node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"); return; } if (request->Keys.empty() && request->Ranges.empty()) { - replyWithError(Ydb::StatusIds::BAD_REQUEST, "Neither keys nor ranges"); + replyWithError(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() + << "Neither keys nor ranges specified" + << " (shard# " << TabletID() << " node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"); return; } if (record.HasProgram()) { - replyWithError(Ydb::StatusIds::BAD_REQUEST, "PushDown is not supported"); + replyWithError(Ydb::StatusIds::BAD_REQUEST, TStringBuilder() + << "PushDown is not supported" + << " (shard# " << TabletID() << " node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"); return; } @@ -2607,8 +2651,9 @@ void TDataShard::Handle(TEvDataShard::TEvRead::TPtr& ev, const TActorContext& ct readVersion.TxId = record.GetSnapshot().GetTxId(); if (readVersion.Step == Max<ui64>()) { replyWithError( - Ydb::StatusIds::UNSUPPORTED, - "invalid snapshot value specified"); + Ydb::StatusIds::UNSUPPORTED, TStringBuilder() + << "invalid snapshot value specified" + << " (shard# " << TabletID() << " node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"); return; } isHeadRead = false; @@ -2676,7 +2721,9 @@ void TDataShard::Handle(TEvDataShard::TEvReadAck::TPtr& ev, const TActorContext& LOG_DEBUG_S(ctx, NKikimrServices::TX_DATASHARD, TabletID() << " ReadAck: " << record); auto result = MakeEvReadResult(ctx.SelfID.NodeId()); - SetStatusError(result->Record, Ydb::StatusIds::BAD_REQUEST, "Missing mandatory fields in TEvReadAck"); + SetStatusError(result->Record, Ydb::StatusIds::BAD_REQUEST, TStringBuilder() + << "Missing mandatory fields in TEvReadAck" + << " (shard# " << TabletID() << " node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"); if (record.HasReadId()) result->Record.SetReadId(record.GetReadId()); ctx.Send(ev->Sender, result.release()); @@ -2706,7 +2753,8 @@ void TDataShard::Handle(TEvDataShard::TEvReadAck::TPtr& ev, const TActorContext& // We received ACK on message we hadn't sent yet if (state.SeqNo < record.GetSeqNo()) { auto issueStr = TStringBuilder() << TabletID() << " ReadAck from future: " << record.GetSeqNo() - << ", current seqNo# " << state.SeqNo; + << ", current seqNo# " << state.SeqNo + << " (shard# " << TabletID() << " node# " << SelfId().NodeId() << " state# " << DatashardStateName(State) << ")"; LOG_DEBUG_S(ctx, NKikimrServices::TX_DATASHARD, issueStr); auto result = MakeEvReadResult(ctx.SelfID.NodeId()); diff --git a/ydb/core/tx/datashard/datashard_ut_read_iterator.cpp b/ydb/core/tx/datashard/datashard_ut_read_iterator.cpp index f30afbaccc5..afb48e5f84e 100644 --- a/ydb/core/tx/datashard/datashard_ut_read_iterator.cpp +++ b/ydb/core/tx/datashard/datashard_ut_read_iterator.cpp @@ -2470,8 +2470,11 @@ Y_UNIT_TEST_SUITE(DataShardReadIterator) { waitFor([&]{ return serverConnectedCount != 0; }, "intercepted EvServerConnected"); if (!connectedFromDifferentNode) { ++node; + sender = runtime.AllocateEdgeActor(node); + serverConnectedCount = 0; table.ClientId = runtime.ConnectToPipe(table.TabletId, sender, node, GetPipeConfigWithRetries()); UNIT_ASSERT(table.ClientId); + waitFor([&]{ return serverConnectedCount != 0; }, "intercepted EvServerConnected"); } UNIT_ASSERT(connectedFromDifferentNode); @@ -2491,10 +2494,10 @@ Y_UNIT_TEST_SUITE(DataShardReadIterator) { runtime.DisconnectNodes(0, 1); table.ClientId = runtime.ConnectToPipe(table.TabletId, sender, node, GetPipeConfigWithRetries()); - exhaustedCount = helper.GetSimpleCounter("table-1", "DataShard/ReadIteratorsExhaustedCount", 0); + exhaustedCount = helper.GetSimpleCounter("table-1", "DataShard/ReadIteratorsExhaustedCount", node); while (exhaustedCount != 0) { SimulateSleep(helper.Server, TDuration::Seconds(1)); - exhaustedCount = helper.GetSimpleCounter("table-1", "DataShard/ReadIteratorsExhaustedCount", 0); + exhaustedCount = helper.GetSimpleCounter("table-1", "DataShard/ReadIteratorsExhaustedCount", node); } iteratorsCount = helper.GetSimpleCounter("table-1", "DataShard/ReadIteratorsCount", node); |