diff options
| author | ilnaz <[email protected]> | 2023-10-09 01:14:09 +0300 | 
|---|---|---|
| committer | ilnaz <[email protected]> | 2023-10-09 01:33:25 +0300 | 
| commit | 20b31868a936eceda7d5271d1b604ad8bdb5388c (patch) | |
| tree | fbaf81edcbc1d2a41e96133c097552a0078bc8ed | |
| parent | d23e12022dc17381a20537ca8f31d90c5f4763ad (diff) | |
Return verbose reason KIKIMR-19573
| -rw-r--r-- | ydb/core/cms/cms.cpp | 2 | ||||
| -rw-r--r-- | ydb/core/cms/erasure_checkers.cpp | 124 | ||||
| -rw-r--r-- | ydb/core/cms/erasure_checkers.h | 29 | 
3 files changed, 88 insertions, 67 deletions
| diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index daa59c2df0f..a7e657813f0 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -663,7 +663,7 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts,          switch (opts.AvailabilityMode) {          case MODE_MAX_AVAILABILITY: -            if (!counters->CheckForMaxAvailability(error, defaultDeadline, opts.PartialPermissionAllowed)) { +            if (!counters->CheckForMaxAvailability(ClusterInfo, error, defaultDeadline, opts.PartialPermissionAllowed)) {                  return false;              }              break; diff --git a/ydb/core/cms/erasure_checkers.cpp b/ydb/core/cms/erasure_checkers.cpp index b98748c7932..008a7bec354 100644 --- a/ydb/core/cms/erasure_checkers.cpp +++ b/ydb/core/cms/erasure_checkers.cpp @@ -9,8 +9,9 @@ bool TErasureCounterBase::IsDown(const TVDiskInfo &vdisk, TClusterInfoPtr info,      // Check we received info for PDisk.      if (!pdisk.NodeId) { -        ++Down; -        error.Reason = TStringBuilder() << "Missing info for " << pdisk.ItemName(); +        Down.insert(vdisk.VDiskId); +        error.Code = TStatus::DISALLOW_TEMP; +        error.Reason = TStringBuilder() << "Missing info for " << pdisk.PrettyItemName();          return false;      } @@ -27,9 +28,9 @@ bool TErasureCounterBase::IsLocked(const TVDiskInfo &vdisk, TClusterInfoPtr info      // Check we received info for VDisk.      if (!vdisk.NodeId || !vdisk.PDiskId) { -        ++Down; +        Down.insert(vdisk.VDiskId);          error.Code = TStatus::DISALLOW_TEMP; -        error.Reason = TStringBuilder() << "Missing info for " << vdisk.ItemName(); +        error.Reason = TStringBuilder() << "Missing info for " << vdisk.PrettyItemName();          return false;      } @@ -42,8 +43,27 @@ bool TErasureCounterBase::GroupAlreadyHasLockedDisks() const {      return HasAlreadyLockedDisks;  } -bool TErasureCounterBase::CheckForMaxAvailability(TErrorInfo &error, TInstant &defaultDeadline, bool allowPartial) const { -    if (Locked + Down > 1) { +static TString DumpVDisksInfo(const TSet<TVDiskID>& vdisks, TClusterInfoPtr info) { +    if (vdisks.empty()) { +        return "<empty>"; +    } + +    TStringBuilder dump; + +    bool comma = false; +    for (const auto& vdisk : vdisks) { +        if (comma) { +            dump << ", "; +        } +        dump << info->VDisk(vdisk).PrettyItemName(); +        comma = true; +    } + +    return dump; +} + +bool TErasureCounterBase::CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo &error, TInstant &defaultDeadline, bool allowPartial) const { +    if (Locked.size() + Down.size() > 1) {          if (HasAlreadyLockedDisks && !allowPartial) {              error.Code = TStatus::DISALLOW;              error.Reason = "The request is incorrect: too many disks from the one group. " @@ -53,14 +73,17 @@ bool TErasureCounterBase::CheckForMaxAvailability(TErrorInfo &error, TInstant &d          error.Code = TStatus::DISALLOW_TEMP;          error.Reason = TStringBuilder() << "Issue in affected group " << GroupId -                                        << ". " << "Too many locked and down vdisks: " << Locked + Down; +            << ": too many unavailable vdisks" +            << ". Locked: " << DumpVDisksInfo(Locked, info) +            << ". Down: " << DumpVDisksInfo(Down, info);          error.Deadline = defaultDeadline;          return false;      } +      return true;  } -void TDefaultErasureCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime, +bool TErasureCounterBase::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime,          TDuration duration, TErrorInfo &error)  {      Y_VERIFY_DEBUG(vdisk.VDiskId != VDisk.VDiskId); @@ -68,22 +91,36 @@ void TDefaultErasureCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr      // Check locks.      TErrorInfo err;      if (IsLocked(vdisk, info, retryTime, duration, err)) { -        ++Locked; +        Locked.insert(vdisk.VDiskId);          error.Code = err.Code; -        error.Reason = TStringBuilder() << "Issue in affected group " << GroupId -                                        << ". " << err.Reason; +        error.Reason = TStringBuilder() << "Issue in affected group " << GroupId << ": " << err.Reason;          error.Deadline = Max(error.Deadline, err.Deadline); -        return; +        return true;      }      // Check if disk is down.      if (IsDown(vdisk, info, retryTime, err)) { -        ++Down; +        Down.insert(vdisk.VDiskId);          error.Code = err.Code; -        error.Reason = TStringBuilder() << "Issue in affected group " << GroupId -                                        << ". " << err.Reason; +        error.Reason = TStringBuilder() << "Issue in affected group " << GroupId << ": " << err.Reason;          error.Deadline = Max(error.Deadline, err.Deadline); +        return true;      } + +    return false; +} + +void TErasureCounterBase::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) { +    for (const auto &vdId : info->BSGroup(GroupId).VDisks) { +        if (vdId != VDisk.VDiskId) +            CountVDisk(info->VDisk(vdId), info, retryTime, duration, error); +    } + +    if (Locked && error.Code == TStatus::DISALLOW) { +        HasAlreadyLockedDisks = true; +    } + +    Locked.insert(VDisk.VDiskId);  }  bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo &error, @@ -97,19 +134,23 @@ bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErr          return false;      } -    if (Down + Locked > info->BSGroup(GroupId).Erasure.ParityParts()) { +    if (Down.size() + Locked.size() > info->BSGroup(GroupId).Erasure.ParityParts()) {          if (HasAlreadyLockedDisks && !allowPartial) {              error.Code = TStatus::DISALLOW;              error.Reason = "The request is incorrect: too many disks from the one group. "                             "Fix the request or set PartialPermissionAllowed to true";              return false;          } +          error.Code = TStatus::DISALLOW_TEMP; -        error.Reason = TStringBuilder() << "Cannot lock disk " << VDisk.PrettyItemName() -                                        << ". Too many locked nodes for group " << GroupId; +        error.Reason = TStringBuilder() << "Issue in affected group " << GroupId +            << ": too many unavailable vdisks" +            << ". Locked: " << DumpVDisksInfo(Locked, info) +            << ". Down: " << DumpVDisksInfo(Down, info);          error.Deadline = defaultDeadline;          return false;      } +      return true;  } @@ -146,60 +187,37 @@ bool TMirror3dcCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInf      if (DataCenterDisabledNodes.size() > 2) {          error.Code = TStatus::DISALLOW_TEMP;          error.Reason = TStringBuilder() << "Issue in affected group " << GroupId -                                        << ". Too many data centers have unavailable vdisks: " -                                        << DataCenterDisabledNodes.size(); +            << ": too many unavailable vdisks" +            << ". Number of data centers with unavailable vdisks: " << DataCenterDisabledNodes.size() +            << ". Locked: " << DumpVDisksInfo(Locked, info) +            << ". Down: " << DumpVDisksInfo(Down, info);          error.Deadline = defaultDeadline;          return false;      }      error.Code = TStatus::DISALLOW_TEMP;      error.Reason = TStringBuilder() << "Issue in affected group " << GroupId -                                    << ". Data centers have too many unavailable vdisks"; +        << ": too many unavailable vdisks" +        << ". Locked: " << DumpVDisksInfo(Locked, info) +        << ". Down: " << DumpVDisksInfo(Down, info);      error.Deadline = defaultDeadline;      return false;  } -void TMirror3dcCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime, +bool TMirror3dcCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime,          TDuration duration, TErrorInfo &error)  { -    Y_VERIFY_DEBUG(vdisk.VDiskId != VDisk.VDiskId); - -    // Check locks. -    TErrorInfo err; -    if (IsLocked(vdisk, info, retryTime, duration, err) -        || IsDown(vdisk, info, retryTime, err)) { -        error.Code = err.Code; -        error.Reason = TStringBuilder() << "Issue in affected group " << GroupId -                                        << ". " << err.Reason; -        error.Deadline = Max(error.Deadline, err.Deadline); -        ++Locked; +    const bool disabled = TErasureCounterBase::CountVDisk(vdisk, info, retryTime, duration, error); +    if (disabled) {          ++DataCenterDisabledNodes[vdisk.VDiskId.FailRealm];      } +    return disabled;  }  void TMirror3dcCounter::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) { -    for (const auto &vdId : info->BSGroup(GroupId).VDisks) { -        if (vdId != VDisk.VDiskId) -            CountVDisk(info->VDisk(vdId), info, retryTime, duration, error); -    } -    ++Locked; +    TErasureCounterBase::CountGroupState(info, retryTime, duration, error);      ++DataCenterDisabledNodes[VDisk.VDiskId.FailRealm]; - -    if (Locked && error.Code == TStatus::DISALLOW) { -        HasAlreadyLockedDisks = true; -    } -} - -void TDefaultErasureCounter::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) { -    for (const auto &vdId : info->BSGroup(GroupId).VDisks) { -        if (vdId != VDisk.VDiskId) -            CountVDisk(info->VDisk(vdId), info, retryTime, duration, error); -    } -    if (Locked && error.Code == TStatus::DISALLOW) { -        HasAlreadyLockedDisks = true; -    } -    ++Locked;  }  TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo &vdisk, ui32 groupId) { diff --git a/ydb/core/cms/erasure_checkers.h b/ydb/core/cms/erasure_checkers.h index 0c0de0be766..c02fd1da8bf 100644 --- a/ydb/core/cms/erasure_checkers.h +++ b/ydb/core/cms/erasure_checkers.h @@ -6,25 +6,29 @@  #include <ydb/core/erasure/erasure.h>  #include <ydb/core/protos/cms.pb.h> +#include <util/generic/set.h> +  namespace NKikimr::NCms {  using namespace NKikimrCms;  class IErasureCounter { +protected: +    virtual bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0; +  public:      virtual ~IErasureCounter() = default;      virtual bool GroupAlreadyHasLockedDisks() const = 0; -    virtual bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0; +    virtual bool CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;      virtual bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0;      virtual void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0; -    virtual void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0;  };  class TErasureCounterBase: public IErasureCounter {  protected: -    ui32 Down; -    ui32 Locked; +    TSet<TVDiskID> Down; +    TSet<TVDiskID> Locked;      const TVDiskInfo& VDisk;      const ui32 GroupId;      bool HasAlreadyLockedDisks; @@ -32,19 +36,19 @@ protected:  protected:      bool IsDown(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TErrorInfo& error);      bool IsLocked(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TDuration& duration, TErrorInfo& error); +    bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override;  public:      TErasureCounterBase(const TVDiskInfo& vdisk, ui32 groupId) -        : Down(0) -        , Locked(0) -        , VDisk(vdisk) +        : VDisk(vdisk)          , GroupId(groupId)          , HasAlreadyLockedDisks(false)      {      }      bool GroupAlreadyHasLockedDisks() const final; -    bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const final; +    bool CheckForMaxAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const final; +    void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;  };  class TDefaultErasureCounter: public TErasureCounterBase { @@ -54,24 +58,23 @@ public:      {      } -    void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;      bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const override; -    void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override;  };  class TMirror3dcCounter: public TErasureCounterBase { -private:      THashMap<ui8, ui32> DataCenterDisabledNodes; +protected: +    bool CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override; +  public:      TMirror3dcCounter(const TVDiskInfo& vdisk, ui32 groupId)          : TErasureCounterBase(vdisk, groupId)      {      } -    void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;      bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const override; -    void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override; +    void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override;  };  TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo& vdisk, ui32 groupId); | 
