about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Semyon Danilov <senya@ydb.tech> 2024-09-24 14:40:41 +0400
committer GitHub <noreply@github.com> 2024-09-24 14:40:41 +0400
commit 4b0a8a279c8209c05e9a0c69fc721b5bc0241f56 (patch)
tree 6aa8ff9f40bf6757ff11ccb16b7f06e0be54d12f
parent 22f751a793700553687f12ecdf9f62193f338c4d (diff)
download ydb-4b0a8a279c8209c05e9a0c69fc721b5bc0241f56.tar.gz
Add PDisk error reason to VDisk's PDisk error state (#9302)
-rw-r--r-- ydb/core/blobstorage/vdisk/common/vdisk_context.h 15
-rw-r--r-- ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h 63
-rw-r--r-- ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp 25
-rw-r--r-- ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp 10
4 files changed, 71 insertions, 42 deletions
diff --git a/ydb/core/blobstorage/vdisk/common/vdisk_context.h b/ydb/core/blobstorage/vdisk/common/vdisk_context.h
index 24faf02ce4..d0b463c38f 100644
--- a/ydb/core/blobstorage/vdisk/common/vdisk_context.h
+++ b/ydb/core/blobstorage/vdisk/common/vdisk_context.h
@@ -80,8 +80,6 @@ namespace NKikimr {
TOutOfSpaceState OutOfSpaceState;
// Global stat about huge heap fragmentation
THugeHeapFragmentation HugeHeapFragmentation;
- // Tracks PDisk errors
- TPDiskErrorState PDiskErrorState;
friend class TDskSpaceTrackerActor;
NMonGroup::TCostGroup CostMonGroup;
@@ -89,10 +87,6 @@ namespace NKikimr {
public:
TLogger Logger;
- TPDiskErrorState::EState GetPDiskErrorState() const {
- return PDiskErrorState.GetState();
- }
-
public:
TVDiskContext(
const TActorId &vdiskActorId,
@@ -137,14 +131,11 @@ namespace NKikimr {
case NKikimrProto::CORRUPTED:
case NKikimrProto::OUT_OF_SPACE: {
// Device is out of order
- PDiskErrorState.Set(ev.Status, ev.StatusFlags);
- auto newState = PDiskErrorState.GetState();
LOG_ERROR(actorSystemOrCtx, NKikimrServices::BS_VDISK_OTHER,
VDISKP(VDiskLogPrefix,
- "CheckPDiskResponse: Recoverable error from PDisk: %s newState# %s",
- FormatMessage(ev.Status, ev.ErrorReason, ev.StatusFlags, message).data(),
- TPDiskErrorState::StateToString(newState)));
- actorSystemOrCtx.Send(VDiskActorId, new TEvPDiskErrorStateChange(newState));
+ "CheckPDiskResponse: Recoverable error from PDisk: %s",
+ FormatMessage(ev.Status, ev.ErrorReason, ev.StatusFlags, message).data()));
+ actorSystemOrCtx.Send(VDiskActorId, new TEvPDiskErrorStateChange(ev.Status, ev.StatusFlags, ev.ErrorReason));
return false;
}
default:
diff --git a/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h b/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h
index d4474b21f7..86c83cc03e 100644
--- a/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h
+++ b/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error.h
@@ -1,8 +1,10 @@
#pragma once
+
#include "defs.h"
-#include <ydb/core/base/blobstorage.h>
#include "vdisk_config.h"
+#include <ydb/core/base/blobstorage.h>
+
namespace NKikimr {
////////////////////////////////////////////////////////////////////////////
@@ -31,43 +33,52 @@ namespace NKikimr {
}
TPDiskErrorState() {
- SetPrivate(Good);
+ SetPrivate(Good, "");
}
EState GetState() const {
- return static_cast<EState>(AtomicGet(State));
+ return State;
+ }
+
+ const TString& GetErrorReason() const {
+ return ErrorReason;
}
// We call this function when PDisk returned ERROR and we pass pdiskFlags to set the correct state
- EState Set(NKikimrProto::EReplyStatus status, ui32 pdiskFlags) {
+ EState Set(NKikimrProto::EReplyStatus status, ui32 pdiskFlags, const TString& errorReason) {
switch (status) {
case NKikimrProto::CORRUPTED:
- return SetPrivate(NoWrites);
+ return SetPrivate(NoWrites, errorReason);
case NKikimrProto::OUT_OF_SPACE:
// check flags additionally
Y_ABORT_UNLESS(pdiskFlags & NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation);
- return SetPrivate(WriteOnlyLog);
+ return SetPrivate(WriteOnlyLog, errorReason);
default:
Y_ABORT("Unexpected state# %s", NKikimrProto::EReplyStatus_Name(status).data());
}
}
- private:
- TAtomic State = 0;
-
- EState SetPrivate(EState state) {
- // make sure bad state increments (not decrements), use CAS for that
- while (true) {
- EState curState = GetState();
- if (state > curState) {
- // if state is worse than curState:
- TAtomicBase newState = static_cast<TAtomicBase>(state);
- bool done = AtomicCas(&State, newState, curState);
- if (done)
- return state;
- } else
- return curState;
+ TString ToString() const {
+ TStringStream ss;
+ ss << "State# " << StateToString(State);
+ if (!ErrorReason.empty()) {
+ ss << ", PDiskError# " << ErrorReason;
}
+ return ss.Str();
+ }
+
+ private:
+ EState State = EState::Unspecified;
+
+ TString ErrorReason;
+
+ EState SetPrivate(EState state, const TString& errorReason) {
+ if (state > State) {
+ State = state;
+ ErrorReason = errorReason;
+ return state;
+ } else
+ return State;
}
};
@@ -80,10 +91,14 @@ namespace NKikimr {
struct TEvPDiskErrorStateChange :
public TEventLocal<TEvPDiskErrorStateChange, TEvBlobStorage::EvPDiskErrorStateChange>
{
- const TPDiskErrorState::EState State;
+ const NKikimrProto::EReplyStatus Status;
+ const ui32 PDiskFlags;
+ const TString ErrorReason;
- TEvPDiskErrorStateChange(TPDiskErrorState::EState state)
- : State(state)
+ TEvPDiskErrorStateChange(NKikimrProto::EReplyStatus status, ui32 pdiskFlags, const TString &errorReason)
+ : Status(status)
+ , PDiskFlags(pdiskFlags)
+ , ErrorReason(errorReason)
{}
};
diff --git a/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp b/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp
index 3b6103cfa3..f3eb58f0bf 100644
--- a/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp
+++ b/ydb/core/blobstorage/vdisk/common/vdisk_pdisk_error_ut.cpp
@@ -16,20 +16,39 @@ namespace NKikimr {
Y_UNIT_TEST(Basic) {
TPDiskErrorState state;
UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good);
+ UNIT_ASSERT(state.GetErrorReason().Empty());
- state.Set(NKikimrProto::CORRUPTED, 0);
+ state.Set(NKikimrProto::CORRUPTED, 0, "");
UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites);
+ UNIT_ASSERT(state.GetErrorReason().Empty());
}
Y_UNIT_TEST(Basic2) {
TPDiskErrorState state;
UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good);
+ UNIT_ASSERT(state.GetErrorReason().Empty());
- state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation);
+ state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation, "");
UNIT_ASSERT(state.GetState() == TPDiskErrorState::WriteOnlyLog);
+ UNIT_ASSERT(state.GetErrorReason().Empty());
- state.Set(NKikimrProto::CORRUPTED, 0);
+ state.Set(NKikimrProto::CORRUPTED, 0, "");
UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites);
+ UNIT_ASSERT(state.GetErrorReason().Empty());
+ }
+
+ Y_UNIT_TEST(BasicErrorReason) {
+ TPDiskErrorState state;
+ UNIT_ASSERT(state.GetState() == TPDiskErrorState::Good);
+ UNIT_ASSERT(state.GetErrorReason().Empty());
+
+ state.Set(NKikimrProto::OUT_OF_SPACE, NKikimrBlobStorage::StatusNotEnoughDiskSpaceForOperation, "Foo");
+ UNIT_ASSERT(state.GetState() == TPDiskErrorState::WriteOnlyLog);
+ UNIT_ASSERT(state.GetErrorReason() == "Foo");
+
+ state.Set(NKikimrProto::CORRUPTED, 0, "Bar");
+ UNIT_ASSERT(state.GetState() == TPDiskErrorState::NoWrites);
+ UNIT_ASSERT(state.GetErrorReason() == "Bar");
}
}
diff --git a/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp b/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp
index 1ef0e41a60..f360860dc4 100644
--- a/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp
+++ b/ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp
@@ -636,6 +636,7 @@ namespace NKikimr {
friend class TIntQueueClass;
TVDiskContextPtr VCtx;
+ TPDiskErrorState PDiskErrorState;
TIntrusivePtr<TVDiskConfig> Config;
TIntrusivePtr<TBlobStorageGroupInfo> GInfo;
std::shared_ptr<TBlobStorageGroupInfo::TTopology> Top;
@@ -934,7 +935,7 @@ namespace NKikimr {
TABLED() {str << "Error Details";}
TABLED() {
str << "PDisk reported error: "
- << TPDiskErrorState::StateToString(VCtx->GetPDiskErrorState());
+ << PDiskErrorState.ToString();
}
}
} else if (VCtx->LocalRecoveryErrorStr) {
@@ -1663,12 +1664,15 @@ namespace NKikimr {
}
void Handle(TEvPDiskErrorStateChange::TPtr &ev, const TActorContext &ctx) {
+ auto errorStateChange = ev->Get();
+
+ PDiskErrorState.Set(errorStateChange->Status, errorStateChange->PDiskFlags, errorStateChange->ErrorReason);
+
LOG_ERROR_S(ctx, NKikimrServices::BS_SKELETON, VCtx->VDiskLogPrefix
<< "SkeletonFront: got TEvPDiskErrorStateChange;"
- << " state# " << TPDiskErrorState::StateToString(ev->Get()->State)
+ << PDiskErrorState.ToString()
<< " Marker# BSVSF03");
-
// switch skeleton state to PDiskError
SkeletonFrontGroup->ResetCounters();
VDiskMonGroup.VDiskState(NKikimrWhiteboard::EVDiskState::PDiskError);