author | alexvru <alexvru@ydb.tech> | 2023-05-22 16:12:47 +0300
---|---|---
committer | alexvru <alexvru@ydb.tech> | 2023-05-22 16:12:47 +0300
commit | cfaf3952685b3a7dc43dceafe49f2449391fc54d (patch) |
tree | d271bc59843a279eefc8a188ae52e304c3d614bb |
parent | c73082e61b645e3117d59a1e14e5d3e5159012bd (diff) |
download | ydb-cfaf3952685b3a7dc43dceafe49f2449391fc54d.tar.gz |
SelfHeal w/degraded
-rw-r--r-- | ydb/core/mind/bscontroller/impl.h | 1
-rw-r--r-- | ydb/core/mind/bscontroller/self_heal.cpp | 146
2 files changed, 92 insertions, 55 deletions
diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h
index 917cd87f7eb..590df0efcc9 100644
--- a/ydb/core/mind/bscontroller/impl.h
+++ b/ydb/core/mind/bscontroller/impl.h
@@ -453,7 +453,6 @@ public:

        bool BadInTermsOfSelfHeal() const {
            return Status == NKikimrBlobStorage::EDriveStatus::FAULTY
-                || Status == NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED
                || Status == NKikimrBlobStorage::EDriveStatus::INACTIVE;
        }

diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index aa1f187ad6c..1a36f2e0cb8 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -38,20 +38,21 @@ namespace NKikimr::NBsController {
        const TGroupId GroupId;
        const TEvControllerUpdateSelfHealInfo::TGroupContent Group;
        const std::optional<TVDiskID> VDiskToReplace;
-        TBlobStorageGroupInfo::TTopology Topology;
-        THolder<TBlobStorageGroupInfo::TGroupVDisks> FailedGroupDisks;
+        std::shared_ptr<TBlobStorageGroupInfo::TTopology> Topology;
+        TBlobStorageGroupInfo::TGroupVDisks FailedGroupDisks;
        THashSet<TVDiskID> PendingVDisks;
        THashMap<TActorId, TVDiskID> ActorToDiskMap;
        THashMap<TNodeId, TVector<TVDiskID>> NodeToDiskMap;

    public:
        TReassignerActor(TActorId controllerId, TGroupId groupId, TEvControllerUpdateSelfHealInfo::TGroupContent group,
-                std::optional<TVDiskID> vdiskToReplace)
+                std::optional<TVDiskID> vdiskToReplace, std::shared_ptr<TBlobStorageGroupInfo::TTopology> topology)
            : ControllerId(controllerId)
            , GroupId(groupId)
            , Group(std::move(group))
            , VDiskToReplace(vdiskToReplace)
-            , Topology(Group.Type)
+            , Topology(std::move(topology))
+            , FailedGroupDisks(Topology.get())
        {}

        void Bootstrap(const TActorId& parent) {
@@ -60,37 +61,9 @@ namespace NKikimr::NBsController {

            STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH01, "Reassigner starting", (GroupId, GroupId));

-            // create the topology
-            for (const auto& [vdiskId, vdisk] : Group.VDisks) {
-                Y_VERIFY(vdiskId.GroupID == GroupId);
-                Y_VERIFY(vdiskId.GroupGeneration == Group.Generation);
-
-                // allocate new fail realm (if needed)
-                if (Topology.FailRealms.size() == vdiskId.FailRealm) {
-                    Topology.FailRealms.emplace_back();
-                }
-                Y_VERIFY(vdiskId.FailRealm == Topology.FailRealms.size() - 1);
-                auto& realm = Topology.FailRealms.back();
-
-                // allocate new fail domain (if needed)
-                if (realm.FailDomains.size() == vdiskId.FailDomain) {
-                    realm.FailDomains.emplace_back();
-                }
-                Y_VERIFY(vdiskId.FailDomain == realm.FailDomains.size() - 1);
-                auto& domain = realm.FailDomains.back();
-
-                // allocate new VDisk id
-                Y_VERIFY(vdiskId.VDisk == domain.VDisks.size());
-                domain.VDisks.emplace_back();
-            }
-
-            // fill in topology structures
-            Topology.FinalizeConstruction();
-            FailedGroupDisks = MakeHolder<TBlobStorageGroupInfo::TGroupVDisks>(&Topology);
-
            for (const auto& [vdiskId, vdisk] : Group.VDisks) {
                if (VDiskToReplace && vdiskId == *VDiskToReplace) {
-                    *FailedGroupDisks |= {&Topology, vdiskId};
+                    FailedGroupDisks |= {Topology.get(), vdiskId};
                    continue; // skip disk we are going to replcate -- it will be wiped out anyway
                }
@@ -114,7 +87,7 @@ namespace NKikimr::NBsController {
                    (VDiskId, vdiskId), (DiskIsOk, diskIsOk));
            if (PendingVDisks.erase(vdiskId)) {
                if (!diskIsOk) {
-                    *FailedGroupDisks |= {&Topology, vdiskId};
+                    FailedGroupDisks |= {Topology.get(), vdiskId};
                }
                if (!PendingVDisks) {
                    ProcessResult();
@@ -157,13 +130,13 @@ namespace NKikimr::NBsController {
        }

        void ProcessResult() {
-            auto& checker = Topology.GetQuorumChecker();
-            if (!checker.CheckFailModelForGroup(*FailedGroupDisks)) {
+            auto& checker = Topology->GetQuorumChecker();
+            if (!checker.CheckFailModelForGroup(FailedGroupDisks)) {
                STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH06, "Reassigner ProcessResult quorum checker failed", (GroupId, GroupId));
                return Finish(false, "Reassigner ProcessResult quorum checker failed"); // this change will render group unusable
            }

-            if (!VDiskToReplace && *FailedGroupDisks) {
+            if (!VDiskToReplace && FailedGroupDisks) {
                STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH10, "Cannot sanitize group with non-operational disks", (GroupId, GroupId));
                return Finish(false, "Cannot sanitize group with non-operational disks");
            }
@@ -255,6 +228,7 @@ namespace NKikimr::NBsController {
            TDuration RetryTimeout = MinRetryTimeout;
            TInstant NextRetryTimestamp = TInstant::Zero();
            THashMap<TVDiskID, TVDiskStatusTracker> VDiskStatus;
+            std::shared_ptr<TBlobStorageGroupInfo::TTopology> Topology;

            bool LayoutValid = true;
            TString LayoutError;
@@ -271,6 +245,9 @@ namespace NKikimr::NBsController {
        bool GroupLayoutSanitizerEnabled = false;
        THostRecordMap HostRecords;

+        using TTopologyDescr = std::tuple<TBlobStorageGroupType::EErasureSpecies, ui32, ui32, ui32>;
+        THashMap<TTopologyDescr, std::shared_ptr<TBlobStorageGroupInfo::TTopology>> Topologies;
+
        static constexpr TDuration SelfHealWakeupPeriod = TDuration::Seconds(10);
        static constexpr uint32_t GroupLayoutSanitizerOperationLogSize = 128;
@@ -297,8 +274,13 @@ namespace NKikimr::NBsController {
                    UpdateLayoutInformationForAllGroups();
                }
            }
+            bool groupsDeleted = false;
            for (const auto& [groupId, data] : ev->Get()->GroupsToUpdate) {
                if (data) {
+                    if (!data->VDisks) {
+                        continue; // virtual-only group
+                    }
+
                    const auto [it, inserted] = Groups.try_emplace(groupId, groupId);
                    auto& g = it->second;
                    bool hasFaultyDisks = false;
@@ -309,9 +291,16 @@ namespace NKikimr::NBsController {
                        UpdateGroupLayoutInformation(g);
                    }

+                    ui32 numFailRealms = 0;
+                    ui32 numFailDomainsPerFailRealm = 0;
+                    ui32 numVDisksPerFailDomain = 0;
+
                    for (const auto& [vdiskId, vdisk] : g.Content.VDisks) {
                        g.VDiskStatus[vdiskId].Update(vdisk.VDiskStatus, now);
                        hasFaultyDisks |= vdisk.Faulty;
+                        numFailRealms = Max<ui32>(numFailRealms, 1 + vdiskId.FailRealm);
+                        numFailDomainsPerFailRealm = Max<ui32>(numFailDomainsPerFailRealm, 1 + vdiskId.FailDomain);
+                        numVDisksPerFailDomain = Max<ui32>(numVDisksPerFailDomain, 1 + vdiskId.VDisk);
                    }
                    for (auto it = g.VDiskStatus.begin(); it != g.VDiskStatus.end(); ) {
                        if (g.Content.VDisks.count(it->first)) {
@@ -325,6 +314,16 @@ namespace NKikimr::NBsController {
                    } else {
                        GroupsWithFaultyDisks.Remove(&g);
                    }
+
+                    Y_VERIFY(numFailRealms && numFailDomainsPerFailRealm && numVDisksPerFailDomain);
+                    TTopologyDescr descr(g.Content.Type.GetErasure(), numFailRealms, numFailDomainsPerFailRealm,
+                        numVDisksPerFailDomain);
+                    auto& topology = Topologies[descr];
+                    if (!topology) {
+                        topology = std::make_shared<TBlobStorageGroupInfo::TTopology>(std::get<0>(descr),
+                            std::get<1>(descr), std::get<2>(descr), std::get<3>(descr), true);
+                    }
+                    g.Topology = topology;
                } else {
                    // find the group to delete
                    const auto it = Groups.find(groupId);
@@ -341,6 +340,17 @@ namespace NKikimr::NBsController {

                    // remove the group
                    Groups.erase(it);
+
+                    groupsDeleted = true;
+                }
+            }
+            if (groupsDeleted) {
+                for (auto it = Topologies.begin(); it != Topologies.end(); ) {
+                    if (it->second.use_count() == 1) {
+                        Topologies.erase(it++);
+                    } else {
+                        ++it;
+                    }
                }
            }
            for (const auto& [vdiskId, status, onlyPhantomsRemain] : ev->Get()->VDiskStatusUpdate) {
@@ -367,8 +377,9 @@ namespace NKikimr::NBsController {
                }

                // check if it is possible to move anything out
-                if (const auto v = FindVDiskToReplace(group.VDiskStatus, group.Content, now)) {
-                    group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v));
+                if (const auto v = FindVDiskToReplace(group.VDiskStatus, group.Content, now, group.Topology.get())) {
+                    group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content,
+                        *v, group.Topology));
                } else {
                    ++counter; // this group can't be reassigned right now
                }
@@ -396,7 +407,8 @@ namespace NKikimr::NBsController {
                } else {
                    ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG(GroupLayoutSanitizerOperationLog, "Start sanitizing GroupId# " << group.GroupId
                            << " GroupGeneration# " << group.Content.Generation);
-                    group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, std::nullopt));
+                    group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content,
+                        std::nullopt, group.Topology));
                }
            }
        }
@@ -448,27 +460,53 @@ namespace NKikimr::NBsController {
        }

        std::optional<TVDiskID> FindVDiskToReplace(const THashMap<TVDiskID, TVDiskStatusTracker>& tracker,
-                const TEvControllerUpdateSelfHealInfo::TGroupContent& content, TInstant now) {
-            auto status = [&](const TVDiskID& id) {
-                try {
-                    return tracker.at(id).GetStatus(now);
-                } catch (const std::out_of_range&) {
-                    Y_FAIL();
+                const TEvControllerUpdateSelfHealInfo::TGroupContent& content, TInstant now,
+                TBlobStorageGroupInfo::TTopology *topology) {
+            // main idea of selfhealing is step-by-step healing of bad group; we can allow healing of group with more
+            // than one disk missing, but we should not move next faulty disk until previous one is replicated, at least
+            // partially (meaning only phantoms left)
+
+            // so, first we check that we have no replicating or starting disk in the group; but we allow one
+            // semi-replicated disk to prevent selfheal blocking
+            TBlobStorageGroupInfo::TGroupVDisks failedByReadiness(topology);
+            TBlobStorageGroupInfo::TGroupVDisks failedByBadness(topology);
+            ui32 numReplicatingWithPhantomsOnly = 0;
+            for (const auto& [vdiskId, vdisk] : content.VDisks) {
+                switch (vdisk.VDiskStatus) {
+                    case NKikimrBlobStorage::EVDiskStatus::REPLICATING:
+                        if (vdisk.OnlyPhantomsRemain && !numReplicatingWithPhantomsOnly) {
+                            ++numReplicatingWithPhantomsOnly;
+                            break;
+                        }
+                        [[fallthrough]];
+                    case NKikimrBlobStorage::EVDiskStatus::INIT_PENDING:
+                        return std::nullopt; // don't touch group with replicating disks

+                    default:
+                        break;
                }
-            };

-            ui32 numBadDisks = 0;
-            for (const auto& [vdiskId, vdisk] : content.VDisks) {
-                if (status(vdiskId) != NKikimrBlobStorage::EVDiskStatus::READY || vdisk.Bad) {
-                    ++numBadDisks;
+                auto it = tracker.find(vdiskId);
+                Y_VERIFY(it != tracker.end());
+                if (it->second.GetStatus(now) != NKikimrBlobStorage::EVDiskStatus::READY) {
+                    failedByReadiness |= {topology, vdiskId};
+                }
+                if (vdisk.Bad) {
+                    failedByBadness |= {topology, vdiskId};
                }
            }
-            if (numBadDisks > 1) {
-                return std::nullopt; // do not touch groups with -2 disks or worse
-            }
+
+            const auto& checker = topology->GetQuorumChecker();
+            const auto failed = failedByReadiness | failedByBadness; // assume disks marked as Bad may become non-ready any moment now

            for (const auto& [vdiskId, vdisk] : content.VDisks) {
                if (vdisk.Faulty) {
+                    const auto newFailed = failed | TBlobStorageGroupInfo::TGroupVDisks(topology, vdiskId);
+                    if (!checker.CheckFailModelForGroup(newFailed)) {
+                        continue; // healing this disk would break the group
+                    } else if (checker.IsDegraded(failed) < checker.IsDegraded(newFailed)) {
+                        continue; // this group will become degraded when applying self-heal logic, skip disk
+                    }
                    return vdiskId;
                }
            }
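What the commit does, in short: the self-heal actor now builds and caches one topology per group shape, hands a shared topology to each TReassignerActor, and lets the quorum checker decide which faulty disk is safe to heal, instead of bailing out as soon as more than one disk is unhealthy.

The sketch below models that last decision with simplified stand-in types. ToyQuorumChecker, DiskState and FindDiskToReplace are illustrative assumptions that collapse fail realms and fail domains into a flat failure count; they are not the real TBlobStorageGroupInfo::TQuorumChecker API.

```cpp
// Simplified model of the "heal only if the group stays within its fail model
// and does not become more degraded" rule added in this commit.
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct DiskState {
    uint32_t Id;
    bool Ready;   // VDisk reported READY
    bool Bad;     // drive may drop out at any moment
    bool Faulty;  // marked for reassignment (FAULTY/INACTIVE)
};

// Toy checker for a block-4-2-like group: at most 2 simultaneous failures are
// tolerated, and 2 failures already count as degraded.
struct ToyQuorumChecker {
    bool CheckFailModelForGroup(uint32_t failedCount) const { return failedCount <= 2; }
    bool IsDegraded(uint32_t failedCount) const { return failedCount >= 2; }
};

std::optional<uint32_t> FindDiskToReplace(const std::vector<DiskState>& disks) {
    ToyQuorumChecker checker;

    // Disks that are already effectively failed: not READY, or Bad (assumed to
    // become non-ready soon), analogous to failedByReadiness | failedByBadness.
    uint32_t failed = 0;
    for (const auto& d : disks) {
        if (!d.Ready || d.Bad) {
            ++failed;
        }
    }

    // Pick the first faulty disk whose removal keeps the fail model satisfied
    // and does not newly degrade the group.
    for (const auto& d : disks) {
        if (!d.Faulty) {
            continue;
        }
        const uint32_t newFailed = (!d.Ready || d.Bad) ? failed : failed + 1;
        if (!checker.CheckFailModelForGroup(newFailed)) {
            continue; // healing this disk would break the group
        }
        if (!checker.IsDegraded(failed) && checker.IsDegraded(newFailed)) {
            continue; // healing would newly degrade the group
        }
        return d.Id;
    }
    return std::nullopt;
}

int main() {
    const std::vector<DiskState> disks = {
        {0, false, false, false}, // already failed, not a heal candidate
        {1, true,  false, true},  // faulty: healing it would mean 3 failures -> skipped
        {2, true,  true,  true},  // bad and faulty: already counted as failed -> safe to heal
        {3, true,  false, false},
    };
    if (const auto id = FindDiskToReplace(disks)) {
        std::cout << "reassign disk " << *id << "\n"; // prints "reassign disk 2"
    } else {
        std::cout << "nothing safe to reassign\n";
    }
}
```

In the real patch the flat count is replaced by TGroupVDisks sets (failedByReadiness | failedByBadness) and the erasure-specific quorum checker, so the decision respects fail realms and fail domains rather than a plain disk tally.

The other half of the patch stops TReassignerActor from rebuilding a topology out of the group content: topologies are cached in the self-heal actor keyed by (erasure, fail realms, fail domains per realm, vdisks per domain), and entries are pruned once no group references them. Below is a minimal model of such a cache, again with placeholder types; TopologyCache and Topology here are hypothetical, not the real TBlobStorageGroupInfo::TTopology.

```cpp
#include <cstdint>
#include <map>
#include <memory>
#include <tuple>

// Placeholder for the real group topology: the real one also carries the
// fail-realm/fail-domain layout and owns the quorum checker.
struct Topology {
    int Erasure;
    uint32_t FailRealms;
    uint32_t FailDomainsPerRealm;
    uint32_t VDisksPerDomain;
};

// Groups with the same shape share one topology object, mirroring TTopologyDescr.
using TopologyKey = std::tuple<int, uint32_t, uint32_t, uint32_t>;

class TopologyCache {
public:
    std::shared_ptr<Topology> Get(int erasure, uint32_t realms, uint32_t domains, uint32_t vdisks) {
        auto& slot = Cache_[TopologyKey(erasure, realms, domains, vdisks)];
        if (!slot) {
            slot = std::make_shared<Topology>(Topology{erasure, realms, domains, vdisks});
        }
        return slot;
    }

    // Called after groups are deleted: drop entries referenced only by the cache itself.
    void Prune() {
        for (auto it = Cache_.begin(); it != Cache_.end(); ) {
            if (it->second.use_count() == 1) {
                it = Cache_.erase(it);
            } else {
                ++it;
            }
        }
    }

private:
    std::map<TopologyKey, std::shared_ptr<Topology>> Cache_;
};
```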