diff options
author | ilnaz <ilnaz@ydb.tech> | 2023-08-29 11:32:40 +0300 |
---|---|---|
committer | ilnaz <ilnaz@ydb.tech> | 2023-08-29 13:08:26 +0300 |
commit | 1a5ec169fd76344b798f5d5e614ac71c072feed7 (patch) | |
tree | 0e5d9279ff1894387b460d04b47d464c34872c22 | |
parent | 5afb0c6e17e595eca20f2c58c0634a9be4ec03cf (diff) | |
download | ydb-1a5ec169fd76344b798f5d5e614ac71c072feed7.tar.gz |
Return correct errors KIKIMR-19022
-rw-r--r-- | ydb/core/cms/cms.cpp | 12 | ||||
-rw-r--r-- | ydb/core/cms/node_checkers.cpp | 97 | ||||
-rw-r--r-- | ydb/core/cms/node_checkers.h | 104 |
3 files changed, 90 insertions, 123 deletions
diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index 4832c83c94a..d591303b89a 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -548,9 +548,8 @@ bool TCms::CheckSysTabletsNode(const TActionOptions &opts, } for (auto &tabletType : ClusterInfo->NodeToTabletTypes[node.NodeId]) { - if (!ClusterInfo->SysNodesCheckers[tabletType]->TryToLockNode(node.NodeId, opts.AvailabilityMode)) { + if (!ClusterInfo->SysNodesCheckers[tabletType]->TryToLockNode(node.NodeId, opts.AvailabilityMode, error.Reason)) { error.Code = TStatus::DISALLOW_TEMP; - error.Reason = ClusterInfo->SysNodesCheckers[tabletType]->ReadableReason(node.NodeId, opts.AvailabilityMode); error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime; return false; } @@ -567,23 +566,18 @@ bool TCms::TryToLockNode(const TAction& action, TDuration duration = TDuration::MicroSeconds(action.GetDuration()); duration += opts.PermissionDuration; - if (!ClusterInfo->ClusterNodes->TryToLockNode(node.NodeId, opts.AvailabilityMode)) - { + if (!ClusterInfo->ClusterNodes->TryToLockNode(node.NodeId, opts.AvailabilityMode, error.Reason)) { error.Code = TStatus::DISALLOW_TEMP; - error.Reason = ClusterInfo->ClusterNodes->ReadableReason(node.NodeId, opts.AvailabilityMode); error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime; - return false; } if (node.Tenant && opts.TenantPolicy != NONE - && !ClusterInfo->TenantNodesChecker[node.Tenant]->TryToLockNode(node.NodeId, opts.AvailabilityMode)) + && !ClusterInfo->TenantNodesChecker[node.Tenant]->TryToLockNode(node.NodeId, opts.AvailabilityMode, error.Reason)) { error.Code = TStatus::DISALLOW_TEMP; - error.Reason = ClusterInfo->TenantNodesChecker[node.Tenant]->ReadableReason(node.NodeId, opts.AvailabilityMode); error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime; - return false; } diff --git a/ydb/core/cms/node_checkers.cpp b/ydb/core/cms/node_checkers.cpp index 8a857b0a63f..79eceebb3ea 100644 --- a/ydb/core/cms/node_checkers.cpp +++ b/ydb/core/cms/node_checkers.cpp @@ -1,8 +1,6 @@ #include "node_checkers.h" -#include <ydb/core/protos/cms.pb.h> - -#include <util/string/cast.h> +#include <library/cpp/actors/core/log.h> namespace NKikimr::NCms { @@ -81,29 +79,31 @@ void TNodesCounterBase::UnlockNode(ui32 nodeId) { } } -bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const { +bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const { Y_VERIFY(NodeToState.contains(nodeId)); auto nodeState = NodeToState.at(nodeId); bool isForceRestart = mode == NKikimrCms::MODE_FORCE_RESTART; NCH_LOG_D("Checking Node: " - << nodeId << ", with state: " << ToString(nodeState) + << nodeId << ", with state: " << nodeState << ", with limit: " << DisabledNodesLimit << ", with ratio limit: " << DisabledNodesRatioLimit << ", locked nodes: " << LockedNodesCount << ", down nodes: " << DownNodesCount); - // Allow to maintain down/unavailable node - if (nodeState == NODE_STATE_DOWN) { - return true; - } - - if (nodeState == NODE_STATE_RESTART || - nodeState == NODE_STATE_LOCKED || - nodeState == NODE_STATE_UNSPECIFIED) { - - return false; + switch (nodeState) { + case NODE_STATE_UP: + break; + case NODE_STATE_UNSPECIFIED: + case NODE_STATE_LOCKED: + case NODE_STATE_RESTART: + reason = TStringBuilder() << ReasonPrefix(nodeId) + << ": node state: '" << nodeState << "'"; + return false; + case NODE_STATE_DOWN: + // Allow to maintain down/unavailable node + return true; } // Always allow at least one node @@ -115,56 +115,85 @@ bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabili return true; } - if (DisabledNodesLimit > 0 && - (LockedNodesCount + DownNodesCount + 1 > DisabledNodesLimit)) { + const auto disabledNodes = LockedNodesCount + DownNodesCount + 1; + + if (DisabledNodesLimit > 0 && disabledNodes > DisabledNodesLimit) { + reason = TStringBuilder() << ReasonPrefix(nodeId) + << ": too many unavailable nodes." + << " Locked: " << LockedNodesCount + << ", down: " << DownNodesCount + << ", limit: " << DisabledNodesLimit; return false; } - if (DisabledNodesRatioLimit > 0 && - ((LockedNodesCount + DownNodesCount + 1) * 100 > - (NodeToState.size() * DisabledNodesRatioLimit))) { + if (DisabledNodesRatioLimit > 0 && (disabledNodes * 100 > NodeToState.size() * DisabledNodesRatioLimit)) { + reason = TStringBuilder() << ReasonPrefix(nodeId) + << ": too many unavailable nodes." + << " Locked: " << LockedNodesCount + << ", down: " << DownNodesCount + << ", total: " << NodeToState.size() + << ", limit: " << DisabledNodesRatioLimit << "%"; return false; } return true; } -bool TSysTabletsNodesCounter::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const { +bool TSysTabletsNodesCounter::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const { Y_VERIFY(NodeToState.contains(nodeId)); auto nodeState = NodeToState.at(nodeId); NCH_LOG_D("Checking limits for sys tablet: " << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType) << ", on node: " << nodeId - << ", with state: " << ToString(nodeState) + << ", with state: " << nodeState << ", locked nodes: " << LockedNodesCount << ", down nodes: " << DownNodesCount); - if (nodeState == NODE_STATE_RESTART || - nodeState == NODE_STATE_LOCKED || - nodeState == NODE_STATE_UNSPECIFIED) { + switch (nodeState) { + case NODE_STATE_UNSPECIFIED: + case NODE_STATE_LOCKED: + case NODE_STATE_RESTART: + reason = TStringBuilder() << "Cannot lock node '" << nodeId << "'" + << ": node state: '" << nodeState << "'"; + return false; + default: + break; + } - return false; + const auto tabletNodes = NodeToState.size(); + if (tabletNodes < 1) { + return true; } - ui32 tabletNodes = NodeToState.size(); + const auto disabledNodes = LockedNodesCount + DownNodesCount + 1; + ui32 limit = 0; + switch (mode) { + case NKikimrCms::MODE_FORCE_RESTART: + return true; case NKikimrCms::MODE_MAX_AVAILABILITY: - if (tabletNodes > 1 && (DownNodesCount + LockedNodesCount + 1) * 2 > tabletNodes){ - return false; + limit = NodeToState.size() / 2; + if (disabledNodes * 2 <= tabletNodes) { + return true; } break; case NKikimrCms::MODE_KEEP_AVAILABLE: - if (tabletNodes > 1 && (DownNodesCount + LockedNodesCount + 1) > tabletNodes - 1) { - return false; + limit = NodeToState.size() - 1; + if (disabledNodes < tabletNodes) { + return true; } break; - case NKikimrCms::MODE_FORCE_RESTART: - break; default: Y_FAIL("Unknown availability mode"); } - return true; + reason = TStringBuilder() << "Cannot lock node '" << nodeId << "'" + << ": tablet '" << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType) << "'" + << " has too many unavailable nodes." + << " Locked: " << LockedNodesCount + << ", down: " << DownNodesCount + << ", limit: " << limit; + return false; } } // namespace NKikimr::NCms diff --git a/ydb/core/cms/node_checkers.h b/ydb/core/cms/node_checkers.h index 72362c84dba..0b0b8563013 100644 --- a/ydb/core/cms/node_checkers.h +++ b/ydb/core/cms/node_checkers.h @@ -2,22 +2,11 @@ #include "defs.h" -#include <ydb/core/blobstorage/base/blobstorage_vdiskid.h> -#include <ydb/core/erasure/erasure.h> #include <ydb/core/protos/cms.pb.h> #include <ydb/core/protos/config.pb.h> -#include <library/cpp/actors/core/log.h> - #include <util/generic/hash.h> -#include <util/generic/hash_set.h> -#include <util/system/compiler.h> -#include <util/system/yassert.h> - -#include <bitset> -#include <sstream> -#include <algorithm> -#include <string> +#include <util/string/builder.h> namespace NKikimr::NCms { @@ -48,9 +37,7 @@ public: virtual void LockNode(ui32 nodeId) = 0; virtual void UnlockNode(ui32 nodeId) = 0; - virtual bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const = 0; - - virtual std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const = 0; + virtual bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const = 0; }; /** @@ -66,9 +53,8 @@ public: TNodesCounterBase() : LockedNodesCount(0) , DownNodesCount(0) - {} - - virtual ~TNodesCounterBase() = default; + { + } void AddNode(ui32 nodeId) override; void UpdateNode(ui32 nodeId, NKikimrCms::EState) override; @@ -89,71 +75,51 @@ protected: ui32 DisabledNodesLimit; ui32 DisabledNodesRatioLimit; + virtual TString ReasonPrefix(ui32 nodeId) const = 0; + public: - TNodesLimitsCounterBase(ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit) + explicit TNodesLimitsCounterBase(ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit) : DisabledNodesLimit(disabledNodesLimit) , DisabledNodesRatioLimit(disabledNodesRatioLimit) { } - virtual ~TNodesLimitsCounterBase() = default; - void ApplyLimits(ui32 nodesLimit, ui32 ratioLimit) { DisabledNodesLimit = nodesLimit; DisabledNodesRatioLimit = ratioLimit; } - bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final; + bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const override final; }; class TTenantLimitsCounter : public TNodesLimitsCounterBase { private: - const std::string TenantName; + const TString TenantName; + +protected: + TString ReasonPrefix(ui32 nodeId) const override final { + return TStringBuilder() << "Cannot lock node '" << nodeId << "' of tenant '" << TenantName << "'"; + } public: - TTenantLimitsCounter(const std::string &tenantName, ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit) + explicit TTenantLimitsCounter(const TString& tenantName, ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit) : TNodesLimitsCounterBase(disabledNodesLimit, disabledNodesRatioLimit) , TenantName(tenantName) { } - - std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final { - Y_UNUSED(mode); - - std::stringstream reason; - reason << "Cannot lock node: " << nodeId - << ". Too many locked nodes for tenant " << TenantName - << "; locked: " << LockedNodesCount - << "; down: " << DownNodesCount - << "; total: " << NodeToState.size() - << "; limit: " << DisabledNodesLimit - << "; ratio limit: " << DisabledNodesRatioLimit << "%"; - - return reason.str(); - } }; class TClusterLimitsCounter : public TNodesLimitsCounterBase { +protected: + TString ReasonPrefix(ui32 nodeId) const override final { + return TStringBuilder() << "Cannot lock node '" << nodeId << "'"; + } + public: - TClusterLimitsCounter(ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit) + explicit TClusterLimitsCounter(ui32 disabledNodesLimit, ui32 disabledNodesRatioLimit) : TNodesLimitsCounterBase(disabledNodesLimit, disabledNodesRatioLimit) { } - - std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final { - Y_UNUSED(mode); - - std::stringstream reason; - reason << "Cannot lock node: " << nodeId - <<". Too many locked nodes in cluster" - << "; locked: " << LockedNodesCount - << "; down: " << DownNodesCount - << "; total: " << NodeToState.size() - << "; limit: " << DisabledNodesLimit - << "; ratio limit: " << DisabledNodesRatioLimit << "%"; - - return reason.str(); - } }; /** @@ -169,32 +135,10 @@ private: public: explicit TSysTabletsNodesCounter(NKikimrConfig::TBootstrap::ETabletType tabletType) : TabletType(tabletType) - {} - - bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final; - - std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final { - std::stringstream reason; - - if (mode == NKikimrCms::MODE_FORCE_RESTART) { - return reason.str(); - } - - reason << "Cannot lock node: " << nodeId - << ". Tablet " - << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType) - << " has too many unavailable nodes. Locked: " << LockedNodesCount - << ". Down: " << DownNodesCount; - if (mode == NKikimrCms::MODE_MAX_AVAILABILITY) { - reason << ". Limit: " << NodeToState.size() / 2 << " (50%)"; - } - - if (mode == NKikimrCms::MODE_KEEP_AVAILABLE) { - reason << ". Limit: " << NodeToState.size() - 1; - } - - return reason.str(); + { } + + bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, TString& reason) const override final; }; } // namespace NKikimr::NCms |