aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorilnaz <ilnaz@ydb.tech>2023-08-29 11:32:40 +0300
committerilnaz <ilnaz@ydb.tech>2023-08-29 13:08:26 +0300
commit1a5ec169fd76344b798f5d5e614ac71c072feed7 (patch)
tree0e5d9279ff1894387b460d04b47d464c34872c22
parent5afb0c6e17e595eca20f2c58c0634a9be4ec03cf (diff)
downloadydb-1a5ec169fd76344b798f5d5e614ac71c072feed7.tar.gz
Return correct errors KIKIMR-19022
-rw-r--r--ydb/core/cms/cms.cpp12
-rw-r--r--ydb/core/cms/node_checkers.cpp97
-rw-r--r--ydb/core/cms/node_checkers.h104
3 files changed, 90 insertions, 123 deletions
diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp
index 4832c83c94a..d591303b89a 100644
--- a/ydb/core/cms/cms.cpp
+++ b/ydb/core/cms/cms.cpp
@@ -548,9 +548,8 @@ bool TCms::CheckSysTabletsNode(const TActionOptions &opts,
}
for (auto &tabletType : ClusterInfo->NodeToTabletTypes[node.NodeId]) {
- if (!ClusterInfo->SysNodesCheckers[tabletType]->TryToLockNode(node.NodeId, opts.AvailabilityMode)) {
+ if (!ClusterInfo->SysNodesCheckers[tabletType]->TryToLockNode(node.NodeId, opts.AvailabilityMode, error.Reason)) {
error.Code = TStatus::DISALLOW_TEMP;
- error.Reason = ClusterInfo->SysNodesCheckers[tabletType]->ReadableReason(node.NodeId, opts.AvailabilityMode);
error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime;
return false;
}
@@ -567,23 +566,18 @@ bool TCms::TryToLockNode(const TAction& action,
TDuration duration = TDuration::MicroSeconds(action.GetDuration());
duration += opts.PermissionDuration;
- if (!ClusterInfo->ClusterNodes->TryToLockNode(node.NodeId, opts.AvailabilityMode))
- {
+ if (!ClusterInfo->ClusterNodes->TryToLockNode(node.NodeId, opts.AvailabilityMode, error.Reason)) {
error.Code = TStatus::DISALLOW_TEMP;
- error.Reason = ClusterInfo->ClusterNodes->ReadableReason(node.NodeId, opts.AvailabilityMode);
error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime;
-
return false;
}
if (node.Tenant
&& opts.TenantPolicy != NONE
- && !ClusterInfo->TenantNodesChecker[node.Tenant]->TryToLockNode(node.NodeId, opts.AvailabilityMode))
+ && !ClusterInfo->TenantNodesChecker[node.Tenant]->TryToLockNode(node.NodeId, opts.AvailabilityMode, error.Reason))
{
error.Code = TStatus::DISALLOW_TEMP;
- error.Reason = ClusterInfo->TenantNodesChecker[node.Tenant]->ReadableReason(node.NodeId, opts.AvailabilityMode);
error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime;
-
return false;
}
diff --git a/ydb/core/cms/node_checkers.cpp b/ydb/core/cms/node_checkers.cpp
index 8a857b0a63f..79eceebb3ea 100644
--- a/ydb/core/cms/node_checkers.cpp
+++ b/ydb/core/cms/node_checkers.cpp
@@ -1,8 +1,6 @@
#include "node_checkers.h"
-#include <ydb/core/protos/cms.pb.h>
-
-#include <util/string/cast.h>
+#include <library/cpp/actors/core/log.h>
namespace NKikimr::NCms {
@@ -81,29 +79,31 @@ void TNodesCounterBase::UnlockNode(ui32 nodeId) {
}
}
-bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const {
+bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const {
Y_VERIFY(NodeToState.contains(nodeId));
auto nodeState = NodeToState.at(nodeId);
bool isForceRestart = mode == NKikimrCms::MODE_FORCE_RESTART;
NCH_LOG_D("Checking Node: "
- << nodeId << ", with state: " << ToString(nodeState)
+ << nodeId << ", with state: " << nodeState
<< ", with limit: " << DisabledNodesLimit
<< ", with ratio limit: " << DisabledNodesRatioLimit
<< ", locked nodes: " << LockedNodesCount
<< ", down nodes: " << DownNodesCount);
- // Allow to maintain down/unavailable node
- if (nodeState == NODE_STATE_DOWN) {
- return true;
- }
-
- if (nodeState == NODE_STATE_RESTART ||
- nodeState == NODE_STATE_LOCKED ||
- nodeState == NODE_STATE_UNSPECIFIED) {
-
- return false;
+ switch (nodeState) {
+ case NODE_STATE_UP:
+ break;
+ case NODE_STATE_UNSPECIFIED:
+ case NODE_STATE_LOCKED:
+ case NODE_STATE_RESTART:
+ reason = TStringBuilder() << ReasonPrefix(nodeId)
+ << ": node state: '" << nodeState << "'";
+ return false;
+ case NODE_STATE_DOWN:
+ // Allow to maintain down/unavailable node
+ return true;
}
// Always allow at least one node
@@ -115,56 +115,85 @@ bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabili
return true;
}
- if (DisabledNodesLimit > 0 &&
- (LockedNodesCount + DownNodesCount + 1 > DisabledNodesLimit)) {
+ const auto disabledNodes = LockedNodesCount + DownNodesCount + 1;
+
+ if (DisabledNodesLimit > 0 && disabledNodes > DisabledNodesLimit) {
+ reason = TStringBuilder() << ReasonPrefix(nodeId)
+ << ": too many unavailable nodes."
+ << " Locked: " << LockedNodesCount
+ << ", down: " << DownNodesCount
+ << ", limit: " << DisabledNodesLimit;
return false;
}
- if (DisabledNodesRatioLimit > 0 &&
- ((LockedNodesCount + DownNodesCount + 1) * 100 >
- (NodeToState.size() * DisabledNodesRatioLimit))) {
+ if (DisabledNodesRatioLimit > 0 && (disabledNodes * 100 > NodeToState.size() * DisabledNodesRatioLimit)) {
+ reason = TStringBuilder() << ReasonPrefix(nodeId)
+ << ": too many unavailable nodes."
+ << " Locked: " << LockedNodesCount
+ << ", down: " << DownNodesCount
+ << ", total: " << NodeToState.size()
+ << ", limit: " << DisabledNodesRatioLimit << "%";
return false;
}
return true;
}
-bool TSysTabletsNodesCounter::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const {
+bool TSysTabletsNodesCounter::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const {
Y_VERIFY(NodeToState.contains(nodeId));
auto nodeState = NodeToState.at(nodeId);
NCH_LOG_D("Checking limits for sys tablet: " << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType)
<< ", on node: " << nodeId
- << ", with state: " << ToString(nodeState)
+ << ", with state: " << nodeState
<< ", locked nodes: " << LockedNodesCount
<< ", down nodes: " << DownNodesCount);
- if (nodeState == NODE_STATE_RESTART ||
- nodeState == NODE_STATE_LOCKED ||
- nodeState == NODE_STATE_UNSPECIFIED) {
+ switch (nodeState) {
+ case NODE_STATE_UNSPECIFIED:
+ case NODE_STATE_LOCKED:
+ case NODE_STATE_RESTART:
+ reason = TStringBuilder() << "Cannot lock node '" << nodeId << "'"
+ << ": node state: '" << nodeState << "'";
+ return false;
+ default:
+ break;
+ }
- return false;
+ const auto tabletNodes = NodeToState.size();
+ if (tabletNodes < 1) {
+ return true;
}
- ui32 tabletNodes = NodeToState.size();
+ const auto disabledNodes = LockedNodesCount + DownNodesCount + 1;
+ ui32 limit = 0;
+
switch (mode) {
+ case NKikimrCms::MODE_FORCE_RESTART:
+ return true;
case NKikimrCms::MODE_MAX_AVAILABILITY:
- if (tabletNodes > 1 && (DownNodesCount + LockedNodesCount + 1) * 2 > tabletNodes){
- return false;
+ limit = NodeToState.size() / 2;
+ if (disabledNodes * 2 <= tabletNodes) {
+ return true;
}
break;
case NKikimrCms::MODE_KEEP_AVAILABLE:
- if (tabletNodes > 1 && (DownNodesCount + LockedNodesCount + 1) > tabletNodes - 1) {
- return false;
+ limit = NodeToState.size() - 1;
+ if (disabledNodes < tabletNodes) {
+ return true;
}
break;
- case NKikimrCms::MODE_FORCE_RESTART:
- break;
default:
Y_FAIL("Unknown availability mode");
}
- return true;
+ reason = TStringBuilder() << "Cannot lock node '" << nodeId << "'"
+ << ": tablet '" << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType) << "'"
+ << " has too many unavailable nodes."
+ << " Locked: " << LockedNodesCount
+ << ", down: " << DownNodesCount
+ << ", limit: " << limit;
+ return false;
}
} // namespace NKikimr::NCms
diff --git a/ydb/core/cms/node_checkers.h b/ydb/core/cms/node_checkers.h
index 72362c84dba..0b0b8563013 100644
--- a/ydb/core/cms/node_checkers.h
+++ b/ydb/core/cms/node_checkers.h
@@ -2,22 +2,11 @@
#include "defs.h"
-#include <ydb/core/blobstorage/base/blobstorage_vdiskid.h>
-#include <ydb/core/erasure/erasure.h>
#include <ydb/core/protos/cms.pb.h>
#include <ydb/core/protos/config.pb.h>
-#include <library/cpp/actors/core/log.h>
-
#include <util/generic/hash.h>
-#include <util/generic/hash_set.h>
-#include <util/system/compiler.h>
-#include <util/system/yassert.h>
-
-#include <bitset>
-#include <sstream>
-#include <algorithm>
-#include <string>
+#include <util/string/builder.h>
namespace NKikimr::NCms {
@@ -48,9 +37,7 @@ public:
virtual void LockNode(ui32 nodeId) = 0;
virtual void UnlockNode(ui32 nodeId) = 0;
- virtual bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const = 0;
-
- virtual std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const = 0;
+ virtual bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const = 0;
};
/**
@@ -66,9 +53,8 @@ public:
TNodesCounterBase()
: LockedNodesCount(0)
, DownNodesCount(0)
- {}
-
- virtual ~TNodesCounterBase() = default;
+ {
+ }
void AddNode(ui32 nodeId) override;
void UpdateNode(ui32 nodeId, NKikimrCms::EState) override;
@@ -89,71 +75,51 @@ protected:
ui32 DisabledNodesLimit;
ui32 DisabledNodesRatioLimit;
+ virtual TString ReasonPrefix(ui32 nodeId) const = 0;
+
public:
- TNodesLimitsCounterBase(ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit)
+ explicit TNodesLimitsCounterBase(ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit)
: DisabledNodesLimit(disabledNodesLimit)
, DisabledNodesRatioLimit(disabledNodesRatioLimit)
{
}
- virtual ~TNodesLimitsCounterBase() = default;
-
void ApplyLimits(ui32 nodesLimit, ui32 ratioLimit) {
DisabledNodesLimit = nodesLimit;
DisabledNodesRatioLimit = ratioLimit;
}
- bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final;
+ bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const override final;
};
class TTenantLimitsCounter : public TNodesLimitsCounterBase {
private:
- const std::string TenantName;
+ const TString TenantName;
+
+protected:
+ TString ReasonPrefix(ui32 nodeId) const override final {
+ return TStringBuilder() << "Cannot lock node '" << nodeId << "' of tenant '" << TenantName << "'";
+ }
public:
- TTenantLimitsCounter(const std::string &tenantName, ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit)
+ explicit TTenantLimitsCounter(const TString& tenantName, ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit)
: TNodesLimitsCounterBase(disabledNodesLimit, disabledNodesRatioLimit)
, TenantName(tenantName)
{
}
-
- std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final {
- Y_UNUSED(mode);
-
- std::stringstream reason;
- reason << "Cannot lock node: " << nodeId
- << ". Too many locked nodes for tenant " << TenantName
- << "; locked: " << LockedNodesCount
- << "; down: " << DownNodesCount
- << "; total: " << NodeToState.size()
- << "; limit: " << DisabledNodesLimit
- << "; ratio limit: " << DisabledNodesRatioLimit << "%";
-
- return reason.str();
- }
};
class TClusterLimitsCounter : public TNodesLimitsCounterBase {
+protected:
+ TString ReasonPrefix(ui32 nodeId) const override final {
+ return TStringBuilder() << "Cannot lock node '" << nodeId << "'";
+ }
+
public:
- TClusterLimitsCounter(ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit)
+ explicit TClusterLimitsCounter(ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit)
: TNodesLimitsCounterBase(disabledNodesLimit, disabledNodesRatioLimit)
{
}
-
- std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final {
- Y_UNUSED(mode);
-
- std::stringstream reason;
- reason << "Cannot lock node: " << nodeId
- <<". Too many locked nodes in cluster"
- << "; locked: " << LockedNodesCount
- << "; down: " << DownNodesCount
- << "; total: " << NodeToState.size()
- << "; limit: " << DisabledNodesLimit
- << "; ratio limit: " << DisabledNodesRatioLimit << "%";
-
- return reason.str();
- }
};
/**
@@ -169,32 +135,10 @@ private:
public:
explicit TSysTabletsNodesCounter(NKikimrConfig::TBootstrap::ETabletType tabletType)
: TabletType(tabletType)
- {}
-
- bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final;
-
- std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final {
- std::stringstream reason;
-
- if (mode == NKikimrCms::MODE_FORCE_RESTART) {
- return reason.str();
- }
-
- reason << "Cannot lock node: " << nodeId
- << ". Tablet "
- << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType)
- << " has too many unavailable nodes. Locked: " << LockedNodesCount
- << ". Down: " << DownNodesCount;
- if (mode == NKikimrCms::MODE_MAX_AVAILABILITY) {
- reason << ". Limit: " << NodeToState.size() / 2 << " (50%)";
- }
-
- if (mode == NKikimrCms::MODE_KEEP_AVAILABLE) {
- reason << ". Limit: " << NodeToState.size() - 1;
- }
-
- return reason.str();
+ {
}
+
+ bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const override final;
};
} // namespace NKikimr::NCms