about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: ilnaz <ilnaz@ydb.tech> 2023-10-09 01:14:09 +0300
committer: ilnaz <ilnaz@ydb.tech> 2023-10-09 01:33:25 +0300
commit: 20b31868a936eceda7d5271d1b604ad8bdb5388c (patch)
tree: fbaf81edcbc1d2a41e96133c097552a0078bc8ed
parent: d23e12022dc17381a20537ca8f31d90c5f4763ad (diff)
download: ydb-20b31868a936eceda7d5271d1b604ad8bdb5388c.tar.gz
Return verbose reason KIKIMR-19573
-rw-r--r-- ydb/core/cms/cms.cpp 2
-rw-r--r-- ydb/core/cms/erasure_checkers.cpp 124
-rw-r--r-- ydb/core/cms/erasure_checkers.h 29
3 files changed, 88 insertions, 67 deletions
diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp
index daa59c2df0f..a7e657813f0 100644
--- a/ydb/core/cms/cms.cpp
+++ b/ydb/core/cms/cms.cpp
@@ -663,7 +663,7 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts,
switch (opts.AvailabilityMode) {
case MODE_MAX_AVAILABILITY:
- if (!counters->CheckForMaxAvailability(error, defaultDeadline, opts.PartialPermissionAllowed)) {
+ if (!counters->CheckForMaxAvailability(ClusterInfo, error, defaultDeadline, opts.PartialPermissionAllowed)) {
return false;
}
break;
diff --git a/ydb/core/cms/erasure_checkers.cpp b/ydb/core/cms/erasure_checkers.cpp
index b98748c7932..008a7bec354 100644
--- a/ydb/core/cms/erasure_checkers.cpp
+++ b/ydb/core/cms/erasure_checkers.cpp
@@ -9,8 +9,9 @@ bool TErasureCounterBase::IsDown(const TVDiskInfo &vdisk, TClusterInfoPtr info,
// Check we received info for PDisk.
if (!pdisk.NodeId) {
- ++Down;
- error.Reason = TStringBuilder() << "Missing info for " << pdisk.ItemName();
+ Down.insert(vdisk.VDiskId);
+ error.Code = TStatus::DISALLOW_TEMP;
+ error.Reason = TStringBuilder() << "Missing info for " << pdisk.PrettyItemName();
return false;
}
@@ -27,9 +28,9 @@ bool TErasureCounterBase::IsLocked(const TVDiskInfo &vdisk, TClusterInfoPtr info
// Check we received info for VDisk.
if (!vdisk.NodeId || !vdisk.PDiskId) {
- ++Down;
+ Down.insert(vdisk.VDiskId);
error.Code = TStatus::DISALLOW_TEMP;
- error.Reason = TStringBuilder() << "Missing info for " << vdisk.ItemName();
+ error.Reason = TStringBuilder() << "Missing info for " << vdisk.PrettyItemName();
return false;
}
@@ -42,8 +43,27 @@ bool TErasureCounterBase::GroupAlreadyHasLockedDisks() const {
return HasAlreadyLockedDisks;
}
-bool TErasureCounterBase::CheckForMaxAvailability(TErrorInfo &error, TInstant &defaultDeadline, bool allowPartial) const {
- if (Locked + Down > 1) {
+static TString DumpVDisksInfo(const TSet<TVDiskID>& vdisks, TClusterInfoPtr info) {
+ if (vdisks.empty()) {
+ return "<empty>";
+ }
+
+ TStringBuilder dump;
+
+ bool comma = false;
+ for (const auto& vdisk : vdisks) {
+ if (comma) {
+ dump << ", ";
+ }
+ dump << info->VDisk(vdisk).PrettyItemName();
+ comma = true;
+ }
+
+ return dump;
+}
+
+bool TErasureCounterBase::CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo &error, TInstant &defaultDeadline, bool allowPartial) const {
+ if (Locked.size() + Down.size() > 1) {
if (HasAlreadyLockedDisks && !allowPartial) {
error.Code = TStatus::DISALLOW;
error.Reason = "The request is incorrect: too many disks from the one group. "
@@ -53,14 +73,17 @@ bool TErasureCounterBase::CheckForMaxAvailability(TErrorInfo &error, TInstant &d
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
- << ". " << "Too many locked and down vdisks: " << Locked + Down;
+ << ": too many unavailable vdisks"
+ << ". Locked: " << DumpVDisksInfo(Locked, info)
+ << ". Down: " << DumpVDisksInfo(Down, info);
error.Deadline = defaultDeadline;
return false;
}
+
return true;
}
-void TDefaultErasureCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime,
+bool TErasureCounterBase::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime,
TDuration duration, TErrorInfo &error)
{
Y_VERIFY_DEBUG(vdisk.VDiskId != VDisk.VDiskId);
@@ -68,22 +91,36 @@ void TDefaultErasureCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr
// Check locks.
TErrorInfo err;
if (IsLocked(vdisk, info, retryTime, duration, err)) {
- ++Locked;
+ Locked.insert(vdisk.VDiskId);
error.Code = err.Code;
- error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
- << ". " << err.Reason;
+ error.Reason = TStringBuilder() << "Issue in affected group " << GroupId << ": " << err.Reason;
error.Deadline = Max(error.Deadline, err.Deadline);
- return;
+ return true;
}
// Check if disk is down.
if (IsDown(vdisk, info, retryTime, err)) {
- ++Down;
+ Down.insert(vdisk.VDiskId);
error.Code = err.Code;
- error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
- << ". " << err.Reason;
+ error.Reason = TStringBuilder() << "Issue in affected group " << GroupId << ": " << err.Reason;
error.Deadline = Max(error.Deadline, err.Deadline);
+ return true;
}
+
+ return false;
+}
+
+void TErasureCounterBase::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) {
+ for (const auto &vdId : info->BSGroup(GroupId).VDisks) {
+ if (vdId != VDisk.VDiskId)
+ CountVDisk(info->VDisk(vdId), info, retryTime, duration, error);
+ }
+
+ if (Locked && error.Code == TStatus::DISALLOW) {
+ HasAlreadyLockedDisks = true;
+ }
+
+ Locked.insert(VDisk.VDiskId);
}
bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo &error,
@@ -97,19 +134,23 @@ bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErr
return false;
}
- if (Down + Locked > info->BSGroup(GroupId).Erasure.ParityParts()) {
+ if (Down.size() + Locked.size() > info->BSGroup(GroupId).Erasure.ParityParts()) {
if (HasAlreadyLockedDisks && !allowPartial) {
error.Code = TStatus::DISALLOW;
error.Reason = "The request is incorrect: too many disks from the one group. "
"Fix the request or set PartialPermissionAllowed to true";
return false;
}
+
error.Code = TStatus::DISALLOW_TEMP;
- error.Reason = TStringBuilder() << "Cannot lock disk " << VDisk.PrettyItemName()
- << ". Too many locked nodes for group " << GroupId;
+ error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
+ << ": too many unavailable vdisks"
+ << ". Locked: " << DumpVDisksInfo(Locked, info)
+ << ". Down: " << DumpVDisksInfo(Down, info);
error.Deadline = defaultDeadline;
return false;
}
+
return true;
}
@@ -146,60 +187,37 @@ bool TMirror3dcCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInf
if (DataCenterDisabledNodes.size() > 2) {
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
- << ". Too many data centers have unavailable vdisks: "
- << DataCenterDisabledNodes.size();
+ << ": too many unavailable vdisks"
+ << ". Number of data centers with unavailable vdisks: " << DataCenterDisabledNodes.size()
+ << ". Locked: " << DumpVDisksInfo(Locked, info)
+ << ". Down: " << DumpVDisksInfo(Down, info);
error.Deadline = defaultDeadline;
return false;
}
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
- << ". Data centers have too many unavailable vdisks";
+ << ": too many unavailable vdisks"
+ << ". Locked: " << DumpVDisksInfo(Locked, info)
+ << ". Down: " << DumpVDisksInfo(Down, info);
error.Deadline = defaultDeadline;
return false;
}
-void TMirror3dcCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime,
+bool TMirror3dcCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime,
TDuration duration, TErrorInfo &error)
{
- Y_VERIFY_DEBUG(vdisk.VDiskId != VDisk.VDiskId);
-
- // Check locks.
- TErrorInfo err;
- if (IsLocked(vdisk, info, retryTime, duration, err)
- || IsDown(vdisk, info, retryTime, err)) {
- error.Code = err.Code;
- error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
- << ". " << err.Reason;
- error.Deadline = Max(error.Deadline, err.Deadline);
- ++Locked;
+ const bool disabled = TErasureCounterBase::CountVDisk(vdisk, info, retryTime, duration, error);
+ if (disabled) {
++DataCenterDisabledNodes[vdisk.VDiskId.FailRealm];
}
+ return disabled;
}
void TMirror3dcCounter::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) {
- for (const auto &vdId : info->BSGroup(GroupId).VDisks) {
- if (vdId != VDisk.VDiskId)
- CountVDisk(info->VDisk(vdId), info, retryTime, duration, error);
- }
- ++Locked;
+ TErasureCounterBase::CountGroupState(info, retryTime, duration, error);
++DataCenterDisabledNodes[VDisk.VDiskId.FailRealm];
-
- if (Locked && error.Code == TStatus::DISALLOW) {
- HasAlreadyLockedDisks = true;
- }
-}
-
-void TDefaultErasureCounter::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) {
- for (const auto &vdId : info->BSGroup(GroupId).VDisks) {
- if (vdId != VDisk.VDiskId)
- CountVDisk(info->VDisk(vdId), info, retryTime, duration, error);
- }
- if (Locked && error.Code == TStatus::DISALLOW) {
- HasAlreadyLockedDisks = true;
- }
- ++Locked;
}
TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo &vdisk, ui32 groupId) {
diff --git a/ydb/core/cms/erasure_checkers.h b/ydb/core/cms/erasure_checkers.h
index 0c0de0be766..c02fd1da8bf 100644
--- a/ydb/core/cms/erasure_checkers.h
+++ b/ydb/core/cms/erasure_checkers.h
@@ -6,25 +6,29 @@
#include <ydb/core/erasure/erasure.h>
#include <ydb/core/protos/cms.pb.h>
+#include <util/generic/set.h>
+
namespace NKikimr::NCms {
using namespace NKikimrCms;
class IErasureCounter {
+protected:
+ virtual bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0;
+
public:
virtual ~IErasureCounter() = default;
virtual bool GroupAlreadyHasLockedDisks() const = 0;
- virtual bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;
+ virtual bool CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;
virtual bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;
virtual void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0;
- virtual void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0;
};
class TErasureCounterBase: public IErasureCounter {
protected:
- ui32 Down;
- ui32 Locked;
+ TSet<TVDiskID> Down;
+ TSet<TVDiskID> Locked;
const TVDiskInfo& VDisk;
const ui32 GroupId;
bool HasAlreadyLockedDisks;
@@ -32,19 +36,19 @@ protected:
protected:
bool IsDown(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TErrorInfo& error);
bool IsLocked(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TDuration& duration, TErrorInfo& error);
+ bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override;
public:
TErasureCounterBase(const TVDiskInfo& vdisk, ui32 groupId)
- : Down(0)
- , Locked(0)
- , VDisk(vdisk)
+ : VDisk(vdisk)
, GroupId(groupId)
, HasAlreadyLockedDisks(false)
{
}
bool GroupAlreadyHasLockedDisks() const final;
- bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const final;
+ bool CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const final;
+ void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;
};
class TDefaultErasureCounter: public TErasureCounterBase {
@@ -54,24 +58,23 @@ public:
{
}
- void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;
bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const override;
- void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override;
};
class TMirror3dcCounter: public TErasureCounterBase {
-private:
THashMap<ui8, ui32> DataCenterDisabledNodes;
+protected:
+ bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override;
+
public:
TMirror3dcCounter(const TVDiskInfo& vdisk, ui32 groupId)
: TErasureCounterBase(vdisk, groupId)
{
}
- void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;
bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const override;
- void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override;
+ void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;
};
TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo& vdisk, ui32 groupId);