diff options
author | Semyon Danilov <senya@ydb.tech> | 2024-09-24 14:40:41 +0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-09-24 14:40:41 +0400 |
commit | 4b0a8a279c8209c05e9a0c69fc721b5bc0241f56 (patch) | |
tree | 6aa8ff9f40bf6757ff11ccb16b7f06e0be54d12f | |
parent | 22f751a793700553687f12ecdf9f62193f338c4d (diff) | |
download | ydb-4b0a8a279c8209c05e9a0c69fc721b5bc0241f56.tar.gz |
Add PDisk error reason to VDisk's PDisk error state (#9302)
4 files changed, 71 insertions, 42 deletions
diff --git a/ydb/core/blobstorage/vdisk/common/vdisk_context.h b/ydb/core/blobstorage/vdisk/common/vdisk_context.h index 24faf02ce4..d0b463c38f 100644 --- a/ydb/core/blobstorage/vdisk/common/vdisk_context.h +++ b/ydb/core/blobstorage/vdisk/common/vdisk_context.h @@ -80,8 +80,6 @@ namespace NKikimr { TOutOfSpaceState OutOfSpaceState; // Global stat about huge heap fragmentation THugeHeapFragmentation HugeHeapFragmentation; - // Tracks PDisk errors - TPDiskErrorState PDiskErrorState; friend class TDskSpaceTrackerActor; NMonGroup::TCostGroup CostMonGroup; @@ -89,10 +87,6 @@ namespace NKikimr { public: TLogger Logger; - TPDiskErrorState::EState GetPDiskErrorState() const { - return PDiskErrorState.GetState(); - } - public: TVDiskContext( const TActorId &vdiskActorId, @@ -137,14 +131,11 @@ namespace NKikimr { case NKikimrProto::CORRUPTED: case NKikimrProto::OUT_OF_SPACE: { // Device is out of order - PDiskErrorState.Set(ev.Status, ev.StatusFlags); - auto newState = PDiskErrorState.GetState(); LOG_ERROR(actorSystemOrCtx, NKikimrServices::BS_VDISK_OTHER, VDISKP(VDiskLogPrefix, - "CheckPDiskResponse: Recoverable error from PDisk: %s newState# %s", - FormatMessage(ev.Status, ev.ErrorReason, ev.StatusFlags, message).data(), - TPDiskErrorState::StateToString(newState))); - actorSystemOrCtx.Send(VDiskActorId, new TEvPDiskErrorStateChange(newState)); + "CheckPDiskResponse: Recoverable error from PDisk: %s", + FormatMessage(ev.Status, ev.ErrorReason, ev.StatusFlags, message).data())); + actorSystemOrCtx.Send(VDiskActorId, new TEvPDiskErrorStateChange(ev.Status, ev.StatusFlags, ev.ErrorReason)); return false; } default: diff --git a/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h b/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h index d4474b21f7..86c83cc03e 100644 --- a/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h +++ b/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h @@ -1,8 +1,10 @@ #pragma once + #include "defs.h" -#include <ydb/core/base/blobstorage.h> #include "vdisk_config.h" +#include <ydb/core/base/blobstorage.h> + namespace NKikimr { //////////////////////////////////////////////////////////////////////////// @@ -31,43 +33,52 @@ namespace NKikimr { } TPDiskErrorState() { - SetPrivate(Good); + SetPrivate(Good, ""); } EState GetState() const { - return static_cast<EState>(AtomicGet(State)); + return State; + } + + const TString& GetErrorReason() const { + return ErrorReason; } // We call this function when PDisk returned ERROR and we pass pdiskFlags to set the correct state - EState Set(NKikimrProto::EReplyStatus status, ui32 pdiskFlags) { + EState Set(NKikimrProto::EReplyStatus status, ui32 pdiskFlags, const TString& errorReason) { switch (status) { case NKikimrProto::CORRUPTED: - return SetPrivate(NoWrites); + return SetPrivate(NoWrites, errorReason); case NKikimrProto::OUT_OF_SPACE: // check flags additionally Y_ABORT_UNLESS(pdiskFlags & NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation); - return SetPrivate(WriteOnlyLog); + return SetPrivate(WriteOnlyLog, errorReason); default: Y_ABORT("Unexpected state# %s", NKikimrProto::EReplyStatus_Name(status).data()); } } - private: - TAtomic State = 0; - - EState SetPrivate(EState state) { - // make sure bad state increments (not decrements), use CAS for that - while (true) { - EState curState = GetState(); - if (state > curState) { - // if state is worse than curState: - TAtomicBase newState = static_cast<TAtomicBase>(state); - bool done = AtomicCas(&State, newState, curState); - if (done) - return state; - } else - return curState; + TString ToString() const { + TStringStream ss; + ss << "State# " << StateToString(State); + if (!ErrorReason.empty()) { + ss << ", PDiskError# " << ErrorReason; } + return ss.Str(); + } + + private: + EState State = EState::Unspecified; + + TString ErrorReason; + + EState SetPrivate(EState state, const TString& errorReason) { + if (state > State) { + State = state; + ErrorReason = errorReason; + return state; + } else + return State; } }; @@ -80,10 +91,14 @@ namespace NKikimr { struct TEvPDiskErrorStateChange : public TEventLocal<TEvPDiskErrorStateChange, TEvBlobStorage::EvPDiskErrorStateChange> { - const TPDiskErrorState::EState State; + const NKikimrProto::EReplyStatus Status; + const ui32 PDiskFlags; + const TString ErrorReason; - TEvPDiskErrorStateChange(TPDiskErrorState::EState state) - : State(state) + TEvPDiskErrorStateChange(NKikimrProto::EReplyStatus status, ui32 pdiskFlags, const TString &errorReason) + : Status(status) + , PDiskFlags(pdiskFlags) + , ErrorReason(errorReason) {} }; diff --git a/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp b/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp index 3b6103cfa3..f3eb58f0bf 100644 --- a/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp +++ b/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp @@ -16,20 +16,39 @@ namespace NKikimr { Y_UNIT_TEST(Basic) { TPDiskErrorState state; UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good); + UNIT_ASSERT(state.GetErrorReason().Empty()); - state.Set(NKikimrProto::CORRUPTED, 0); + state.Set(NKikimrProto::CORRUPTED, 0, ""); UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites); + UNIT_ASSERT(state.GetErrorReason().Empty()); } Y_UNIT_TEST(Basic2) { TPDiskErrorState state; UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good); + UNIT_ASSERT(state.GetErrorReason().Empty()); - state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation); + state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation, ""); UNIT_ASSERT(state.GetState() == TPDiskErrorState::WriteOnlyLog); + UNIT_ASSERT(state.GetErrorReason().Empty()); - state.Set(NKikimrProto::CORRUPTED, 0); + state.Set(NKikimrProto::CORRUPTED, 0, ""); UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites); + UNIT_ASSERT(state.GetErrorReason().Empty()); + } + + Y_UNIT_TEST(BasicErrorReason) { + TPDiskErrorState state; + UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good); + UNIT_ASSERT(state.GetErrorReason().Empty()); + + state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation, "Foo"); + UNIT_ASSERT(state.GetState() == TPDiskErrorState::WriteOnlyLog); + UNIT_ASSERT(state.GetErrorReason() == "Foo"); + + state.Set(NKikimrProto::CORRUPTED, 0, "Bar"); + UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites); + UNIT_ASSERT(state.GetErrorReason() == "Bar"); } } diff --git a/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp b/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp index 1ef0e41a60..f360860dc4 100644 --- a/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp +++ b/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp @@ -636,6 +636,7 @@ namespace NKikimr { friend class TIntQueueClass; TVDiskContextPtr VCtx; + TPDiskErrorState PDiskErrorState; TIntrusivePtr<TVDiskConfig> Config; TIntrusivePtr<TBlobStorageGroupInfo> GInfo; std::shared_ptr<TBlobStorageGroupInfo::TTopology> Top; @@ -934,7 +935,7 @@ namespace NKikimr { TABLED() {str << "Error Details";} TABLED() { str << "PDisk reported error: " - << TPDiskErrorState::StateToString(VCtx->GetPDiskErrorState()); + << PDiskErrorState.ToString(); } } } else if (VCtx->LocalRecoveryErrorStr) { @@ -1663,12 +1664,15 @@ namespace NKikimr { } void Handle(TEvPDiskErrorStateChange::TPtr &ev, const TActorContext &ctx) { + auto errorStateChange = ev->Get(); + + PDiskErrorState.Set(errorStateChange->Status, errorStateChange->PDiskFlags, errorStateChange->ErrorReason); + LOG_ERROR_S(ctx, NKikimrServices::BS_SKELETON, VCtx->VDiskLogPrefix << "SkeletonFront: got TEvPDiskErrorStateChange;" - << " state# " << TPDiskErrorState::StateToString(ev->Get()->State) + << PDiskErrorState.ToString() << " Marker# BSVSF03"); - // switch skeleton state to PDiskError SkeletonFrontGroup->ResetCounters(); VDiskMonGroup.VDiskState(NKikimrWhiteboard::EVDiskState::PDiskError); |