aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authort1mursadykov <t1mursadykov@ydb.tech>2023-02-08 14:03:30 +0300
committert1mursadykov <t1mursadykov@ydb.tech>2023-02-08 14:03:30 +0300
commit63721d79509d1f5bcffac38a85c0dfc4cb084893 (patch)
treed35aae04aa9f7084f56c4f6ac2eb7b4f1950f242
parent6bf6928eda1dd0bcabb3aac0000b72aedd29d44e (diff)
downloadydb-63721d79509d1f5bcffac38a85c0dfc4cb084893.tar.gz
Fix behavior for requests with multiple disks from the same group
-rw-r--r--ydb/core/cms/cms.cpp18
-rw-r--r--ydb/core/cms/cms_impl.h2
-rw-r--r--ydb/core/cms/cms_ut.cpp77
-rw-r--r--ydb/core/cms/erasure_checkers.cpp63
-rw-r--r--ydb/core/cms/erasure_checkers.h18
5 files changed, 153 insertions, 25 deletions
diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp
index 29a46d9dbc..ad24eeb844 100644
--- a/ydb/core/cms/cms.cpp
+++ b/ydb/core/cms/cms.cpp
@@ -208,6 +208,7 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
TActionOptions opts(permissionDuration);
opts.TenantPolicy = request.GetTenantPolicy();
opts.AvailabilityMode = request.GetAvailabilityMode();
+ opts.PartialPermissionAllowed = allowPartial;
TErrorInfo error;
@@ -797,24 +798,25 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts,
auto counters = CreateErasureCounter(ClusterInfo->BSGroup(groupId).Erasure.GetErasure(), vdisk, groupId);
counters->CountGroupState(ClusterInfo, State->Config.DefaultRetryTime, duration, error);
- if (counters->GroupAlreadyHasLockedDisks(error)) {
- return false;
- }
-
switch (opts.AvailabilityMode) {
case MODE_MAX_AVAILABILITY:
- if (!counters->CheckForMaxAvailability(error, defaultDeadline)) {
- Y_VERIFY(error.Code == TStatus::DISALLOW_TEMP);
+ if (!counters->CheckForMaxAvailability(error, defaultDeadline, opts.PartialPermissionAllowed)) {
return false;
}
break;
case MODE_KEEP_AVAILABLE:
- if (!counters->CheckForKeepAvailability(ClusterInfo, error, defaultDeadline)) {
- Y_VERIFY(error.Code == TStatus::DISALLOW_TEMP);
+ if (!counters->CheckForKeepAvailability(ClusterInfo, error, defaultDeadline, opts.PartialPermissionAllowed)) {
return false;
}
break;
case MODE_FORCE_RESTART:
+ if ( counters->GroupAlreadyHasLockedDisks() && opts.PartialPermissionAllowed) {
+ error.Code = TStatus::DISALLOW_TEMP;
+ error.Reason = "You cannot get two or more disks from the same group at the same time"
+ " without specifying the PartialPermissionAllowed parameter";
+ error.Deadline = defaultDeadline;
+ return false;
+ }
// Any number of down disks is OK for this mode.
break;
default:
diff --git a/ydb/core/cms/cms_impl.h b/ydb/core/cms/cms_impl.h
index ff14ba60fe..c5fc9c1684 100644
--- a/ydb/core/cms/cms_impl.h
+++ b/ydb/core/cms/cms_impl.h
@@ -100,11 +100,13 @@ private:
TDuration PermissionDuration;
NKikimrCms::ETenantPolicy TenantPolicy;
NKikimrCms::EAvailabilityMode AvailabilityMode;
+ bool PartialPermissionAllowed;
TActionOptions(TDuration dur)
: PermissionDuration(dur)
, TenantPolicy(NKikimrCms::DEFAULT)
, AvailabilityMode(NKikimrCms::MODE_MAX_AVAILABILITY)
+ , PartialPermissionAllowed(false)
{}
};
diff --git a/ydb/core/cms/cms_ut.cpp b/ydb/core/cms/cms_ut.cpp
index b7a44c1021..cf5ab70dff 100644
--- a/ydb/core/cms/cms_ut.cpp
+++ b/ydb/core/cms/cms_ut.cpp
@@ -1367,6 +1367,83 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW,
MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"));
}
+
+ Y_UNIT_TEST(TestTwoOrMoreDisksFromGroupAtTheSameRequestBlock42) {
+ TCmsTestEnv env(8);
+
+ // It is impossible to get two or more permissions for one group in one request
+ env.CheckPermissionRequest("user", true, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW_PARTIAL,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
+ env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW_PARTIAL,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
+ env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
+
+ // It's ok to get two permissions for one group if PartialPermissionAllowed is set to false
+ env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
+ env.CheckPermissionRequest("user", false, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
+
+ // Icorrect requests
+ env.CheckPermissionRequest("user", false, true, false, true, MODE_MAX_AVAILABILITY, TStatus::DISALLOW,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
+ env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::DISALLOW,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"));
+ }
+
+ Y_UNIT_TEST(TestTwoOrMoreDisksFromGroupAtTheSameRequestMirror3dc) {
+ TTestEnvOpts options(9);
+ options.UseMirror3dcErasure = true;
+ options.VDisks = 9;
+ options.DataCenterCount = 3;
+
+ TCmsTestEnv env(options);
+
+ // It is impossible to get two or more permissions for one group in one request
+ env.CheckPermissionRequest("user", true, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW_PARTIAL,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"));
+ env.CheckPermissionRequest("user", true, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW_PARTIAL,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(6), 60000000, "storage"));
+ env.CheckPermissionRequest("user", true, true, false, true, MODE_MAX_AVAILABILITY, TStatus::ALLOW_PARTIAL,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"));
+
+ // It's ok to get two permissions for one group if PartialPermissionAllowed is set to false
+ env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::ALLOW,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"));
+ env.CheckPermissionRequest("user", false, true, false, true, MODE_FORCE_RESTART, TStatus::ALLOW,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"));
+
+ // Incorrect requests
+ env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::DISALLOW,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(6), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(7), 60000000, "storage"));
+ env.CheckPermissionRequest("user", false, true, false, true, MODE_MAX_AVAILABILITY, TStatus::DISALLOW,
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(3), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(4), 60000000, "storage"),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage"));
+ }
}
} // NCmsTest
} // NKikimr
diff --git a/ydb/core/cms/erasure_checkers.cpp b/ydb/core/cms/erasure_checkers.cpp
index 253cdfc0f2..e3ad883620 100644
--- a/ydb/core/cms/erasure_checkers.cpp
+++ b/ydb/core/cms/erasure_checkers.cpp
@@ -45,23 +45,26 @@ bool TErasureCounterBase::IsLocked(const TVDiskInfo& vdisk,
|| vdisk.IsLocked(error, retryTime, TActivationContext::Now(), duration);
}
-bool TErasureCounterBase::GroupAlreadyHasLockedDisks(TErrorInfo& error) const
+bool TErasureCounterBase::GroupAlreadyHasLockedDisks() const
{
- if (Locked && error.Code == TStatus::DISALLOW) {
- error.Reason = "Group already has locked disks";
- return true;
- }
- return false;
+ return HasAlreadyLockedDisks;
}
bool TErasureCounterBase::CheckForMaxAvailability(TErrorInfo& error,
- TInstant& defaultDeadline) const
+ TInstant& defaultDeadline,
+ bool allowPartial) const
{
if (Locked + Down > 1) {
+ if (HasAlreadyLockedDisks && !allowPartial) {
+ error.Code = TStatus::DISALLOW;
+ error.Reason = "The request is incorrect: too many disks from the one group. "
+ "Fix the request or set PartialPermissionAllowed to true";
+ return false;
+ }
+
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
<< ". " << "Too many locked and down vdisks: " << Locked + Down;
-
error.Deadline = defaultDeadline;
return false;
}
@@ -99,10 +102,27 @@ void TDefaultErasureCounter::CountVDisk(const TVDiskInfo& vdisk,
bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info,
TErrorInfo& error,
- TInstant& defaultDeadline) const
+ TInstant& defaultDeadline,
+ bool allowPartial) const
{
+ if (HasAlreadyLockedDisks && allowPartial) {
+ error.Code = TStatus::DISALLOW_TEMP;
+ error.Reason = "You cannot get two or more disks from the same group at the same time"
+ " without specifying the PartialPermissionAllowed parameter";
+ error.Deadline = defaultDeadline;
+ return false;
+ }
+
if (Down + Locked > info->BSGroup(GroupId).Erasure.ParityParts()) {
+ if (HasAlreadyLockedDisks && !allowPartial) {
+ error.Code = TStatus::DISALLOW;
+ error.Reason = "The request is incorrect: too many disks from the one group. "
+ "Fix the request or set PartialPermissionAllowed to true";
+ return false;
+ }
error.Code = TStatus::DISALLOW_TEMP;
+ error.Reason = TStringBuilder() << "Cannot lock disk " << VDisk.PrettyItemName()
+ << ". Too many locked nodes for group " << GroupId;
error.Deadline = defaultDeadline;
return false;
}
@@ -111,10 +131,19 @@ bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info,
bool TMirror3dcCounter::CheckForKeepAvailability(TClusterInfoPtr info,
TErrorInfo& error,
- TInstant& defaultDeadline) const
+ TInstant& defaultDeadline,
+ bool allowPartial) const
{
Y_UNUSED(info);
+ if (HasAlreadyLockedDisks && allowPartial) {
+ error.Code = TStatus::DISALLOW_TEMP;
+ error.Reason = "You cannot get two or more disks from the same group at the same time"
+ " without specifying the PartialPermissionAllowed parameter";
+ error.Deadline = defaultDeadline;
+ return false;
+ }
+
if (DataCenterDisabledNodes.size() <= 1)
return true;
@@ -125,6 +154,13 @@ bool TMirror3dcCounter::CheckForKeepAvailability(TClusterInfoPtr info,
return true;
}
+ if (HasAlreadyLockedDisks && !allowPartial) {
+ error.Code = TStatus::DISALLOW;
+ error.Reason = "The request is incorrect: too many disks from the one group. "
+ "Fix the request or set PartialPermissionAllowed to true";
+ return false;
+ }
+
if (DataCenterDisabledNodes.size() > 2) {
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Issue in affected group " << GroupId
@@ -174,6 +210,10 @@ void TMirror3dcCounter::CountGroupState(TClusterInfoPtr info,
}
++Locked;
++DataCenterDisabledNodes[VDisk.VDiskId.FailRealm];
+
+ if (Locked && error.Code == TStatus::DISALLOW) {
+ HasAlreadyLockedDisks = true;
+ }
}
void TDefaultErasureCounter::CountGroupState(TClusterInfoPtr info,
@@ -185,6 +225,9 @@ void TDefaultErasureCounter::CountGroupState(TClusterInfoPtr info,
if (vdId != VDisk.VDiskId)
CountVDisk(info->VDisk(vdId), info, retryTime, duration, error);
}
+ if (Locked && error.Code == TStatus::DISALLOW) {
+ HasAlreadyLockedDisks = true;
+ }
++Locked;
}
diff --git a/ydb/core/cms/erasure_checkers.h b/ydb/core/cms/erasure_checkers.h
index 51d9ca9553..a496ae59ff 100644
--- a/ydb/core/cms/erasure_checkers.h
+++ b/ydb/core/cms/erasure_checkers.h
@@ -15,9 +15,9 @@ class IErasureCounter {
public:
virtual ~IErasureCounter() = default;
- virtual bool GroupAlreadyHasLockedDisks(TErrorInfo& error) const = 0;
- virtual bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline) const = 0;
- virtual bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline) const = 0;
+ virtual bool GroupAlreadyHasLockedDisks() const = 0;
+ virtual bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;
+ virtual bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;
virtual void CountGroupState(TClusterInfoPtr info,
TDuration retryTime,
TDuration duration,
@@ -35,6 +35,7 @@ protected:
ui32 Locked;
const TVDiskInfo& VDisk;
const ui32 GroupId;
+ bool HasAlreadyLockedDisks;
protected:
bool IsDown(const TVDiskInfo& vdisk,
@@ -53,11 +54,12 @@ public:
, Locked(0)
, VDisk(vdisk)
, GroupId(groupId)
+ , HasAlreadyLockedDisks(false)
{
}
- bool GroupAlreadyHasLockedDisks(TErrorInfo& error) const final;
- bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline) const final;
+ bool GroupAlreadyHasLockedDisks() const final;
+ bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const final;
};
class TDefaultErasureCounter: public TErasureCounterBase {
@@ -71,7 +73,8 @@ public:
TDuration duration, TErrorInfo &error) override;
bool CheckForKeepAvailability(TClusterInfoPtr info,
TErrorInfo& error,
- TInstant& defaultDeadline) const override;
+ TInstant& defaultDeadline,
+ bool allowPartial) const override;
void CountVDisk(const TVDiskInfo& vdisk,
TClusterInfoPtr info,
TDuration retryTime,
@@ -93,7 +96,8 @@ public:
TDuration duration, TErrorInfo &error) override;
bool CheckForKeepAvailability(TClusterInfoPtr info,
TErrorInfo& error,
- TInstant& defaultDeadline) const override;
+ TInstant& defaultDeadline,
+ bool allowPartial) const override;
void CountVDisk(const TVDiskInfo& vdisk,
TClusterInfoPtr info,
TDuration retryTime,