diff options
author | serg-belyakov <[email protected]> | 2022-12-27 12:45:42 +0300 |
---|---|---|
committer | serg-belyakov <[email protected]> | 2022-12-27 12:45:42 +0300 |
commit | bbda7c5295e60cbe2620908dc2b1d49385ef0352 (patch) | |
tree | 8a92bf93dce4f5eb253c1c8e6fb5e3325b977812 | |
parent | 726a249dea3cbcaa4b1bf07754b1bb5ee1a6626a (diff) |
Add SanitizeGroup function to TGroupMapper,
Determine failRealm->pRealm mapping
Add sanitize function to GroupMapper
-rw-r--r-- | ydb/core/mind/bscontroller/bsc.cpp | 1 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/cmds_storage_pool.cpp | 5 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/config.h | 2 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/config_cmd.cpp | 2 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/config_fit_groups.cpp | 54 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/group_geometry_info.h | 39 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/group_mapper.cpp | 469 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/group_mapper.h | 30 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/group_mapper_ut.cpp | 257 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/monitoring.cpp | 15 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/self_heal.cpp | 69 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/ut_selfheal/env.h | 2 | ||||
-rw-r--r-- | ydb/core/protos/blobstorage_config.proto | 5 |
13 files changed, 804 insertions, 146 deletions
diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp index c44f3cc71f0..1f1d9f1e1fa 100644 --- a/ydb/core/mind/bscontroller/bsc.cpp +++ b/ydb/core/mind/bscontroller/bsc.cpp @@ -349,6 +349,7 @@ ui32 TBlobStorageController::GetEventPriority(IEventHandle *ev) { case NKikimrBlobStorage::TConfigRequest::TCommand::kAllocateVirtualGroup: case NKikimrBlobStorage::TConfigRequest::TCommand::kDecommitGroups: case NKikimrBlobStorage::TConfigRequest::TCommand::kWipeVDisk: + case NKikimrBlobStorage::TConfigRequest::TCommand::kSanitizeGroup: return 2; // read-write commands go with higher priority as they are needed to keep cluster intact case NKikimrBlobStorage::TConfigRequest::TCommand::kReadHostConfig: diff --git a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp index ca931601bad..d590081069f 100644 --- a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp +++ b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp @@ -584,4 +584,9 @@ namespace NKikimr::NBsController { group->CalculateGroupStatus(); } + void TBlobStorageController::TConfigState::ExecuteStep(const NKikimrBlobStorage::TSanitizeGroup& cmd, NKikimrBlobStorage::TConfigResponse::TStatus& /*status*/) { + ui32 groupId = cmd.GetGroupId(); + SanitizingRequests.emplace(groupId); + } + } // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/config.h b/ydb/core/mind/bscontroller/config.h index ca7e5f44d77..0c46340d168 100644 --- a/ydb/core/mind/bscontroller/config.h +++ b/ydb/core/mind/bscontroller/config.h @@ -79,6 +79,7 @@ namespace NKikimr { // volatile reconfiguration state THashMap<TVSlotId, TPDiskId> ExplicitReconfigureMap; std::set<TVSlotId> SuppressDonorMode; + std::unordered_set<ui32> SanitizingRequests; // just-created vslots, which are not yet committed to the storage TSet<TVSlotId> UncommittedVSlots; @@ -273,6 +274,7 @@ namespace NKikimr { void ExecuteStep(const NKikimrBlobStorage::TAllocateVirtualGroup& cmd, TStatus& status); void ExecuteStep(const NKikimrBlobStorage::TDecommitGroups& cmd, TStatus& status); void ExecuteStep(const NKikimrBlobStorage::TWipeVDisk& cmd, TStatus& status); + void ExecuteStep(const NKikimrBlobStorage::TSanitizeGroup& cmd, TStatus& status); }; } // NBsController diff --git a/ydb/core/mind/bscontroller/config_cmd.cpp b/ydb/core/mind/bscontroller/config_cmd.cpp index 96084e3b842..d53fc9400c3 100644 --- a/ydb/core/mind/bscontroller/config_cmd.cpp +++ b/ydb/core/mind/bscontroller/config_cmd.cpp @@ -218,6 +218,7 @@ namespace NKikimr::NBsController { MAP_TIMING(DropDonorDisk, DROP_DONOR_DISK) MAP_TIMING(ReassignGroupDisk, REASSIGN_GROUP_DISK) MAP_TIMING(WipeVDisk, REASSIGN_GROUP_DISK) + MAP_TIMING(SanitizeGroup, REASSIGN_GROUP_DISK) default: break; @@ -314,6 +315,7 @@ namespace NKikimr::NBsController { HANDLE_COMMAND(AllocateVirtualGroup) HANDLE_COMMAND(DecommitGroups) HANDLE_COMMAND(WipeVDisk) + HANDLE_COMMAND(SanitizeGroup) case NKikimrBlobStorage::TConfigRequest::TCommand::kAddMigrationPlan: case NKikimrBlobStorage::TConfigRequest::TCommand::kDeleteMigrationPlan: diff --git a/ydb/core/mind/bscontroller/config_fit_groups.cpp b/ydb/core/mind/bscontroller/config_fit_groups.cpp index 45d3443b8e5..802a20554b5 100644 --- a/ydb/core/mind/bscontroller/config_fit_groups.cpp +++ b/ydb/core/mind/bscontroller/config_fit_groups.cpp @@ -172,6 +172,7 @@ namespace NKikimr { TStackVec<std::pair<TVSlotId, bool>, 32> replaceQueue; THashMap<TVDiskIdShort, TPDiskId> replacedDisks; i64 requiredSpace = Min<i64>(); + bool sanitizingRequest = (State.SanitizingRequests.find(groupId) != State.SanitizingRequests.end()); //////////////////////////////////////////////////////////////////////////////////////////////////////// // scan through all VSlots and find matching PDisks @@ -225,6 +226,11 @@ namespace NKikimr { } } + if (sanitizingRequest) { + // resize group definition + getGroup(); + } + if (group) { TGroupInfo *groupInfo = State.Groups.FindForUpdate(groupId); @@ -249,7 +255,26 @@ namespace NKikimr { } } } - AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, AllowUnusableDisks); + if ((replacedDisks.empty() && sanitizingRequest) || (replacedDisks.size() == 1)) { + auto result = SanitizeGroup(groupId, group, std::move(forbid), requiredSpace, AllowUnusableDisks); + + if (replacedDisks.empty()) { + // update information about replaced disks + for (const TVSlotInfo *vslot : groupInfo->VDisksInGroup) { + if (vslot->GetShortVDiskId() == result.first) { + auto it = preservedSlots.find(vslot->GetVDiskId()); + Y_VERIFY(it != preservedSlots.end()); + preservedSlots.erase(it); + replacedSlots.emplace(result.first, vslot->VSlotId); + replaceQueue.emplace_back(vslot->VSlotId, State.SuppressDonorMode.count(vslot->VSlotId)); + replacedDisks.emplace(result.first, vslot->VSlotId.ComprisingPDiskId()); + break; + } + } + } + } else { + AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, AllowUnusableDisks); + } if (!IgnoreVSlotQuotaCheck) { adjustSpaceAvailable = true; for (const auto& [pos, vslotId] : replacedSlots) { @@ -381,6 +406,33 @@ namespace NKikimr { } } + std::pair<TVDiskIdShort, TPDiskId> SanitizeGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group, + TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace, bool addExistingDisks) { + if (!Mapper) { + Mapper.emplace(Geometry, StoragePool.RandomizeGroupMapping); + PopulateGroupMapper(); + } + TStackVec<TPDiskId, 32> removeQ; + if (addExistingDisks) { + for (const auto& realm : group) { + for (const auto& domain : realm) { + for (const TPDiskId id : domain) { + if (id != TPDiskId()) { + if (auto *info = State.PDisks.Find(id); info && RegisterPDisk(id, *info, false)) { + removeQ.push_back(id); + } + } + } + } + } + } + auto res = Geometry.SanitizeGroup(*Mapper, groupId, group, std::move(forbid), requiredSpace); + for (const TPDiskId pdiskId : removeQ) { + Mapper->UnregisterPDisk(pdiskId); + } + return res; + } + void PopulateGroupMapper() { const TBoxId boxId = std::get<0>(StoragePoolId); diff --git a/ydb/core/mind/bscontroller/group_geometry_info.h b/ydb/core/mind/bscontroller/group_geometry_info.h index a74698f1f84..748f9802fc7 100644 --- a/ydb/core/mind/bscontroller/group_geometry_info.h +++ b/ydb/core/mind/bscontroller/group_geometry_info.h @@ -80,6 +80,23 @@ namespace NKikimr::NBsController { throw TExFitGroupError() << "failed to allocate group: " << error; } + // returns pair of previous VDisk and PDisk id's + std::pair<TVDiskIdShort, TPDiskId> SanitizeGroup(TGroupMapper &mapper, TGroupId groupId, TGroupMapper::TGroupDefinition &group, + TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace) const { + TString error; + auto misplacedVDisks = mapper.FindMisplacedVDisks(group); + for (const bool requireOperational : {true, false}) { + for (const auto& replacedDisk : misplacedVDisks.Disks) { + TPDiskId pdiskId = group[replacedDisk.FailRealm][replacedDisk.FailDomain][replacedDisk.VDisk]; + if (mapper.TargetMisplacedVDisk(groupId, group, replacedDisk, forbid, requiredSpace, + requireOperational, error)) { + return {replacedDisk, pdiskId}; + } + } + } + throw TExFitGroupError() << "failed to sanitize group: " << error; + } + bool ResizeGroup(TGroupMapper::TGroupDefinition& group) const { if (!group) { group.resize(NumFailRealms); @@ -114,6 +131,28 @@ namespace NKikimr::NBsController { return true; } + bool CheckGroupSize(const TGroupMapper::TGroupDefinition& group) const { + if (!group) { + return false; + } + + if (group.size() != NumFailRealms) { + return false; + } + for (const auto& realm : group) { + if (realm.size() != NumFailDomainsPerFailRealm) { + return false; + } + for (const auto& domain : realm) { + if (domain.size() != NumVDisksPerFailDomain) { + return false; + } + } + } + + return true; + } + TBlobStorageGroupType GetType() const { return Type; } diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp index 5619534fb3a..3f4dd8f2158 100644 --- a/ydb/core/mind/bscontroller/group_mapper.cpp +++ b/ydb/core/mind/bscontroller/group_mapper.cpp @@ -6,6 +6,8 @@ namespace NKikimr::NBsController { using namespace NLayoutChecker; + struct TAllocator; + class TGroupMapper::TImpl : TNonCopyable { struct TPDiskInfo : TPDiskRecord { TPDiskLayoutPosition Position; @@ -58,7 +60,12 @@ namespace NKikimr::NBsController { using TGroup = std::vector<TPDiskInfo*>; - struct TAllocator { + // PDomain/PRealm - TPDiskLayoutPosition, Fail Domain/Fail Realm - VDiskId + + using TPDomainCandidatesRange = std::pair<std::vector<ui32>::const_iterator, std::vector<ui32>::const_iterator>; + using TPDiskCandidatesRange = std::pair<std::vector<TPDiskInfo*>::const_iterator, std::vector<TPDiskInfo*>::const_iterator>; + + struct TDiskManager { TImpl& Self; const TBlobStorageGroupInfo::TTopology Topology; THashSet<TPDiskId> OldGroupContent; // set of all existing disks in the group, inclusing ones which are replaced @@ -69,7 +76,7 @@ namespace NKikimr::NBsController { TGroupLayout GroupLayout; std::optional<TScore> WorstScore; - TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational, + TDiskManager(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational, TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks) : Self(self) , Topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), geom.GetNumVDisksPerFailDomain(), true) @@ -227,6 +234,65 @@ namespace NKikimr::NBsController { } } + bool DiskIsBetter(const TPDiskInfo& pretender, const TPDiskInfo& king) const { + if (pretender.NumSlots != king.NumSlots) { + return pretender.NumSlots < king.NumSlots; + } else if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) { + return true; + } else { + if (pretender.NumDomainMatchingDisks != king.NumDomainMatchingDisks) { + return pretender.NumDomainMatchingDisks > king.NumDomainMatchingDisks; + } + return pretender.PDiskId < king.PDiskId; + } + } + + bool GivesLocalityBoost(const TPDiskInfo& pretender, const TPDiskInfo& king) const { + const ui32 a = GetLocalityFactor(pretender); + const ui32 b = GetLocalityFactor(king); + return Self.Randomize ? a < b : a > b; + } + + bool BetterQuotaMatch(const TPDiskInfo& pretender, const TPDiskInfo& king) const { + return pretender.SpaceAvailable < king.SpaceAvailable; + } + + void AddUsedDisk(const TPDiskInfo& pdisk) { + for (ui32 groupId : pdisk.Groups) { + ++LocalityFactor[groupId]; + } + } + + void RemoveUsedDisk(const TPDiskInfo& pdisk) { + for (ui32 groupId : pdisk.Groups) { + if (!--LocalityFactor[groupId]) { + LocalityFactor.erase(groupId); + } + } + } + + unsigned GetLocalityFactor(const TPDiskInfo& pdisk) const { + unsigned res = 0; + for (ui32 groupId : pdisk.Groups) { + res += GetLocalityFactor(groupId); + } + return res; + } + + unsigned GetLocalityFactor(ui32 groupId) const { + const auto it = LocalityFactor.find(groupId); + return it != LocalityFactor.end() ? it->second : 0; + } + }; + + struct TAllocator : public TDiskManager { + + TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational, + TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks) + : TDiskManager(self, geom, requiredSpace, requireOperational, forbiddenDisks, replacedDisks) + { + } + bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group) { // determine PDisks that fit our requirements (including score) auto v = SetupMatchingDisks(maxScore); @@ -467,55 +533,270 @@ namespace NKikimr::NBsController { cb(pdisk); } } + }; - bool DiskIsBetter(TPDiskInfo& pretender, TPDiskInfo& king) const { - if (pretender.NumSlots != king.NumSlots) { - return pretender.NumSlots < king.NumSlots; - } else if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) { - return true; - } else { - if (pretender.NumDomainMatchingDisks != king.NumDomainMatchingDisks) { - return pretender.NumDomainMatchingDisks > king.NumDomainMatchingDisks; + struct TSanitizer : public TDiskManager { + ui32 DesiredRealmGroup; + std::vector<ui32> RealmNavigator; + // failRealm -> pRealm + std::unordered_map<ui32, std::vector<ui32>> DomainCandidates; + // pRealm -> {pDomain1, pDomain2, ... }, sorted by number of slots in pDomains + std::unordered_map<ui32, std::unordered_map<ui32, std::vector<TPDiskInfo*>>> DiskCandidates; + // {pRealm, pDomain} -> {pdisk1, pdisk2, ... }, sorted by DiskIsBetter() relation + std::unordered_map<ui32, std::unordered_set<ui32>> BannedDomains; + // pRealm -> {pDomain1, pDomain2, ... } + // Cannot be a candidate, this domains are already placed correctly + + TSanitizer(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational, + TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks) + : TDiskManager(self, geom, requiredSpace, requireOperational, forbiddenDisks, replacedDisks) + { + } + + bool SetupNavigation(const TGroup& group) { + TPDiskByPosition matchingDisks = SetupMatchingDisks(::Max<ui32>()); + const ui32 totalFailRealmsNum = Topology.GetTotalFailRealmsNum(); + const ui32 numFailDomainsPerFailRealm = Topology.GetNumFailDomainsPerFailRealm(); + const ui32 numDisksPerFailRealm = numFailDomainsPerFailRealm * Topology.GetNumVDisksPerFailDomain(); + RealmNavigator.assign(totalFailRealmsNum, ::Max<ui32>()); + + std::map<ui32, ui32> realmGroups; + + // {failRealm, pRealm} -> #number of pdisks from ${pRealm} in ${failRealm} + std::vector<std::unordered_map<ui32, ui32>> disksInPRealmByFailRealm(totalFailRealmsNum); + + // pRealm -> #number of pdisks from ${pRealm} in ${group} + std::unordered_map<ui32, ui32> disksInPRealm; + std::set<ui32> realmCandidates; + + // the list of potentailly free pDomains in pRealm, which include free domains and + // domains, currently occupied by group's pdisks + std::unordered_map<ui32, std::unordered_set<ui32>> pDomainsInPRealm; + + for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) { + if (group[orderNumber]) { + const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber); + const ui32 pRealmGroup = group[orderNumber]->Position.RealmGroup.Index(); + const ui32 pRealm = group[orderNumber]->Position.Realm.Index(); + const ui32 pDomain = group[orderNumber]->Position.Domain.Index(); + realmGroups[pRealmGroup]++; + disksInPRealmByFailRealm[vdisk.FailRealm][pRealm]++; + disksInPRealm[pRealm]++; + pDomainsInPRealm[pRealm].insert(pDomain); } - return pretender.PDiskId < king.PDiskId; } - } - bool GivesLocalityBoost(TPDiskInfo& pretender, TPDiskInfo& king) const { - const ui32 a = GetLocalityFactor(pretender); - const ui32 b = GetLocalityFactor(king); - return Self.Randomize ? a < b : a > b; - } + DesiredRealmGroup = 0; + ui32 bestRealmGroupSize = 0; + for (auto it = realmGroups.begin(); it != realmGroups.end(); ++it) { + if (it->second > bestRealmGroupSize) { + bestRealmGroupSize = it->second; + DesiredRealmGroup = it->first; + } + } - bool BetterQuotaMatch(TPDiskInfo& pretender, TPDiskInfo& king) const { - return pretender.SpaceAvailable < king.SpaceAvailable; + for (const auto& [position, pdisk] : matchingDisks) { + if (position.RealmGroup.Index() == DesiredRealmGroup) { + pDomainsInPRealm[position.Realm.Index()].insert(position.Domain.Index()); + } + } + + for (auto& [pRealmIdx, pRealm] : pDomainsInPRealm) { + if (pRealm.size() >= numFailDomainsPerFailRealm) { + realmCandidates.insert(pRealmIdx); + } + } + + + std::vector<std::pair<ui32, ui32>> realmFilling(totalFailRealmsNum); + for (ui32 failRealm = 0; failRealm < totalFailRealmsNum; ++failRealm) { + ui32 maxFilling = 0; + for (const auto& [pRealm, filling] : disksInPRealmByFailRealm[failRealm]) { + maxFilling = std::max(maxFilling, filling); + } + realmFilling[failRealm] = { numFailDomainsPerFailRealm - maxFilling, failRealm }; + } + std::sort(realmFilling.begin(), realmFilling.end()); + + for (const auto& [_, failRealm] : realmFilling) { + ui32 bestRealm = ::Max<ui32>(); + ui32 movesRequired = ::Max<ui32>(); + for (auto it = realmCandidates.begin(); it != realmCandidates.end(); ++it) { + ui32 pRealm = *it; + ui32 correctAlready = disksInPRealmByFailRealm[failRealm][pRealm]; + ui32 toMoveIn = numDisksPerFailRealm - correctAlready; + ui32 toMoveOut = disksInPRealm[pRealm] - correctAlready; + ui32 freeDomains = pDomainsInPRealm[pRealm].size(); + ui32 newMovesRequired = toMoveIn; + if (toMoveOut + freeDomains < toMoveIn) { + continue; // not enough free domains to place all the disks + } + if (newMovesRequired < movesRequired || (newMovesRequired == movesRequired && + freeDomains > pDomainsInPRealm[bestRealm].size())) { + bestRealm = pRealm; + movesRequired = newMovesRequired; + } + } + if (bestRealm == ::Max<ui32>()) { + return false; + } + RealmNavigator[failRealm] = bestRealm; + realmCandidates.erase(realmCandidates.find(bestRealm)); + } + + UpdateGroup(group); + return true; } - void AddUsedDisk(const TPDiskInfo& pdisk) { - for (ui32 groupId : pdisk.Groups) { - ++LocalityFactor[groupId]; + void UpdateGroup(const TGroup& group) { + BannedDomains.clear(); + for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) { + if (group[orderNumber]) { + const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber); + const ui32 pRealm = group[orderNumber]->Position.Realm.Index(); + const ui32 pDomain = group[orderNumber]->Position.Domain.Index(); + if (pRealm == RealmNavigator[vdisk.FailRealm]) { + BannedDomains[pRealm].insert(pDomain); + } + } } } - void RemoveUsedDisk(const TPDiskInfo& pdisk) { - for (ui32 groupId : pdisk.Groups) { - if (!--LocalityFactor[groupId]) { - LocalityFactor.erase(groupId); + void SetupCandidates(ui32 maxScore) { + TPDiskByPosition matchingDisks = SetupMatchingDisks(maxScore); + DomainCandidates.clear(); + DiskCandidates.clear(); + + std::unordered_map<ui32, std::unordered_map<ui32, ui32>> slotsInPDomain; + // {pRealm, pDomain} -> #summary number of slots in ${pDomain, pRealm} + + for (const auto& [position, pdisk] : matchingDisks) { + if (position.RealmGroup.Index() == DesiredRealmGroup) { + ui32 pRealm = position.Realm.Index(); + ui32 pDomain = position.Domain.Index(); + + if (BannedDomains[pRealm].count(pDomain) == 0) { + DomainCandidates[pRealm].push_back(pDomain); + DiskCandidates[pRealm][pDomain].push_back(pdisk); + } + + slotsInPDomain[pRealm][pDomain] += pdisk->NumSlots; + } + } + for (auto it = DomainCandidates.begin(); it != DomainCandidates.end(); ++it) { + const ui32 pRealmIdx = it->first; + // sort domains in realm by the number of free disks + const auto& pRealmInfo = slotsInPDomain[pRealmIdx]; + auto realm = it->second; + std::sort(realm.begin(), realm.end(), [&pRealmInfo](const ui32& left, const ui32& right) { + return pRealmInfo.at(left) > pRealmInfo.at(right); + }); + it->second = realm; + + auto& diskCandidatesInRealm = DiskCandidates[pRealmIdx]; + for (auto jt = diskCandidatesInRealm.begin(); jt != diskCandidatesInRealm.end(); ++jt) { + auto domain = jt->second; + // sort disks in domain by DiskIsBetter metric + // DiskIsBetter() is not suitable for std::sort, better ordering required + // std::sort(domain.begin(), domain.end(), [this](const TPDiskInfo* left, const TPDiskInfo* right) { + // return this->DiskIsBetter(*left, *right); + // }); + + for (ui32 i = 0; i < domain.size(); ++i) { + if (DiskIsBetter(*domain[0], *domain[i])) { + std::swap(domain[0], domain[i]); + } + } + jt->second = domain; } } } - unsigned GetLocalityFactor(const TPDiskInfo& pdisk) const { - unsigned res = 0; - for (ui32 groupId : pdisk.Groups) { - res += GetLocalityFactor(groupId); + // if optional is empty, then all disks in group are placed correctly + std::pair<TMisplacedVDisks::EFailLevel, std::vector<ui32>> FindMisplacedVDisks(const TGroup& group) { + using EFailLevel = TMisplacedVDisks::EFailLevel; + std::unordered_map<ui32, std::unordered_set<ui32>> usedPDomains; // pRealm -> { pDomain1, pDomain2, ... } + std::set<TPDiskId> usedPDisks; + // {pRealm, pDomain} -> { pdisk1, pdisk2, ... } + + EFailLevel failLevel = EFailLevel::ALL_OK; + std::vector<ui32> misplacedVDisks; + std::unordered_map<ui32, std::unordered_map<ui32, ui32>> domainInterlace; + std::map<TPDiskId, ui32> diskInterlace; + + for (ui32 orderNum = 0; orderNum < group.size(); ++orderNum) { + if (group[orderNum]) { + ui32 pRealm = group[orderNum]->Position.Realm.Index(); + ui32 pDomain = group[orderNum]->Position.Domain.Index(); + TPDiskId pdisk = group[orderNum]->PDiskId; + domainInterlace[pRealm][pDomain]++; + diskInterlace[pdisk]++; + } } - return res; + + for (ui32 orderNum = 0; orderNum < group.size(); ++orderNum) { + if (group[orderNum]) { + const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNum); + ui32 pRealm = group[orderNum]->Position.Realm.Index(); + ui32 pDomain = group[orderNum]->Position.Domain.Index(); + TPDiskId pdisk = group[orderNum]->PDiskId; + ui32 desiredPRealm = RealmNavigator[vdisk.FailRealm]; + if (desiredPRealm != pRealm && (ui32)failLevel <= (ui32)EFailLevel::REALM_FAIL) { + if ((ui32)failLevel < (ui32)EFailLevel::REALM_FAIL) { + misplacedVDisks.clear(); + } + failLevel = EFailLevel::REALM_FAIL; + misplacedVDisks.push_back(orderNum); + } else if (domainInterlace[pRealm][pDomain] > 1 && (ui32)failLevel <= (ui32)EFailLevel::DOMAIN_FAIL) { + if ((ui32)failLevel < (ui32)EFailLevel::DOMAIN_FAIL) { + misplacedVDisks.clear(); + } + failLevel = EFailLevel::DOMAIN_FAIL; + misplacedVDisks.push_back(orderNum); + } else if (diskInterlace[pdisk] > 1 && (ui32)failLevel <= (ui32)EFailLevel::DISK_FAIL) { + failLevel = EFailLevel::DISK_FAIL; + misplacedVDisks.push_back(orderNum); + } + } else { + if (failLevel == EFailLevel::EMPTY_SLOT) { + misplacedVDisks.clear(); + failLevel = EFailLevel::INCORRECT_LAYOUT; + } else if ((ui32)failLevel < (ui32)EFailLevel::EMPTY_SLOT) { + misplacedVDisks = {orderNum}; + failLevel = EFailLevel::EMPTY_SLOT; + } + } + } + + return {failLevel, misplacedVDisks}; } - unsigned GetLocalityFactor(ui32 groupId) const { - const auto it = LocalityFactor.find(groupId); - return it != LocalityFactor.end() ? it->second : 0; + std::optional<TPDiskId> TargetMisplacedVDisk(ui32 maxScore, const TGroup& group, const TVDiskIdShort& vdisk) { + for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) { + if (!group[orderNumber] && orderNumber != Topology.GetOrderNumber(vdisk)) { + return std::nullopt; + } + } + + UpdateGroup(group); + SetupCandidates(maxScore); + + ui32 failRealm = vdisk.FailRealm; + ui32 pRealm = RealmNavigator[failRealm]; + + const auto& domainCandidates = DomainCandidates[pRealm]; + TPDomainCandidatesRange pDomainRange = { domainCandidates.begin(), domainCandidates.end() }; + + for (; pDomainRange.first != pDomainRange.second;) { + ui32 pDomain = *pDomainRange.first++; + const auto& diskCandidates = DiskCandidates[pRealm][pDomain]; + + if (!diskCandidates.empty()) { + return (*diskCandidates.begin())->PDiskId; + } + } + + return std::nullopt; } }; @@ -564,7 +845,7 @@ namespace NKikimr::NBsController { it->second.SpaceAvailable += increment; } - TString FormatPDisks(const TAllocator& allocator) const { + TString FormatPDisks(const TDiskManager& diskManager) const { TStringStream s; s << "PDisks# "; @@ -584,11 +865,11 @@ namespace NKikimr::NBsController { s << std::exchange(space, " ") << pdisk->PDiskId; - if (allocator.OldGroupContent.contains(pdisk->PDiskId)) { + if (diskManager.OldGroupContent.contains(pdisk->PDiskId)) { s << "*"; } const char *minus = "-"; - if (allocator.ForbiddenDisks.contains(pdisk->PDiskId)) { + if (diskManager.ForbiddenDisks.contains(pdisk->PDiskId)) { s << std::exchange(minus, "") << "f"; } if (!pdisk->Usable) { @@ -600,13 +881,13 @@ namespace NKikimr::NBsController { if (pdisk->NumSlots >= pdisk->MaxSlots) { s << std::exchange(minus, "") << "s[" << pdisk->NumSlots << "/" << pdisk->MaxSlots << "]"; } - if (pdisk->SpaceAvailable < allocator.RequiredSpace) { + if (pdisk->SpaceAvailable < diskManager.RequiredSpace) { s << std::exchange(minus, "") << "v"; } if (!pdisk->Operational) { s << std::exchange(minus, "") << "o"; } - if (allocator.DiskIsUsable(*pdisk)) { + if (diskManager.DiskIsUsable(*pdisk)) { s << "+"; } @@ -701,6 +982,108 @@ namespace NKikimr::NBsController { return false; } } + + TMisplacedVDisks FindMisplacedVDisks(const TGroupDefinition& groupDefinition) { + using EFailLevel = TMisplacedVDisks::EFailLevel; + // create group of required size, if it is not created yet + if (!Geom.CheckGroupSize(groupDefinition)) { + return TMisplacedVDisks(EFailLevel::INCORRECT_LAYOUT, {}, "Incorrect group"); + } + + TSanitizer sanitizer(*this, Geom, 0, false, {}, {}); + TString error; + TGroup group = sanitizer.ProcessExistingGroup(groupDefinition, error); + if (group.empty()) { + return TMisplacedVDisks(EFailLevel::INCORRECT_LAYOUT, {}, error); + } + if (!sanitizer.SetupNavigation(group)) { + return TMisplacedVDisks(EFailLevel::INCORRECT_LAYOUT, {}, "Cannot map failRealms to pRealms"); + } + + sanitizer.SetupCandidates(::Max<ui32>()); + auto [failLevel, misplacedVDiskNums] = sanitizer.FindMisplacedVDisks(group); + std::vector<TVDiskIdShort> misplacedVDisks; + for (ui32 orderNum : misplacedVDiskNums) { + misplacedVDisks.push_back(sanitizer.Topology.GetVDiskId(orderNum)); + } + return TMisplacedVDisks(failLevel, misplacedVDisks); + } + + std::optional<TPDiskId> TargetMisplacedVDisk(ui32 groupId, TGroupDefinition& groupDefinition, TVDiskIdShort vdisk, + TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { + if (Dirty) { + std::sort(PDiskByPosition.begin(), PDiskByPosition.end()); + Dirty = false; + } + + // create group of required size, if it is not created yet + if (!Geom.CheckGroupSize(groupDefinition)) { + error = "Incorrect group"; + return std::nullopt; + } + + TSanitizer sanitizer(*this, Geom, requiredSpace, requireOperational, std::move(forbid), {}); + TGroup group = sanitizer.ProcessExistingGroup(groupDefinition, error); + if (group.empty()) { + error = "Empty group"; + return std::nullopt; + } + if (!sanitizer.SetupNavigation(group)) { + error = "Cannot map failRealms to pRealms"; + return std::nullopt; + } + + // calculate score table + std::vector<ui32> scores; + for (const auto& [pdiskId, pdisk] : PDisks) { + if (sanitizer.DiskIsUsable(pdisk)) { + scores.push_back(pdisk.GetPickerScore()); + } + } + std::sort(scores.begin(), scores.end()); + scores.erase(std::unique(scores.begin(), scores.end()), scores.end()); + + // bisect scores to find optimal working one + sanitizer.SetupCandidates(::Max<ui32>()); + + std::optional<TPDiskId> result; + + ui32 begin = 0, end = scores.size(); + while (begin < end) { + const ui32 mid = begin + (end - begin) / 2; + std::optional<TPDiskId> target; + if ((target = sanitizer.TargetMisplacedVDisk(scores[mid], group, vdisk))) { + result = target; + end = mid; + } else { + begin = mid + 1; + } + } + + if (result) { + ui32 orderNum = sanitizer.Topology.GetOrderNumber(vdisk); + if (group[orderNum]) { + TPDiskId pdiskId = group[orderNum]->PDiskId; + const auto it = PDisks.find(pdiskId); + Y_VERIFY(it != PDisks.end()); + TPDiskInfo& pdisk = it->second; + --pdisk.NumSlots; + pdisk.EraseGroup(groupId); + } + { + const auto it = PDisks.find(*result); + Y_VERIFY(it != PDisks.end()); + TPDiskInfo& pdisk = it->second; + ++pdisk.NumSlots; + pdisk.InsertGroup(groupId); + groupDefinition[vdisk.FailRealm][vdisk.FailDomain][vdisk.VDisk] = *result; + } + return result; + } + + error = "Cannot replace vdisk"; + return std::nullopt; + } }; TGroupMapper::TGroupMapper(TGroupGeometryInfo geom, bool randomize) @@ -726,4 +1109,12 @@ namespace NKikimr::NBsController { return Impl->AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, requireOperational, error); } + TGroupMapper::TMisplacedVDisks TGroupMapper::FindMisplacedVDisks(const TGroupDefinition& group) { + return Impl->FindMisplacedVDisks(group); + } + + std::optional<TPDiskId> TGroupMapper::TargetMisplacedVDisk(ui32 groupId, TGroupMapper::TGroupDefinition& group, + TVDiskIdShort vdisk, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { + return Impl->TargetMisplacedVDisk(groupId, group, vdisk, std::move(forbid), requiredSpace, requireOperational, error); + } } // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/group_mapper.h b/ydb/core/mind/bscontroller/group_mapper.h index 991a636bf38..99ffcd433bb 100644 --- a/ydb/core/mind/bscontroller/group_mapper.h +++ b/ydb/core/mind/bscontroller/group_mapper.h @@ -78,6 +78,36 @@ namespace NKikimr { // prefix+infix part gives us distinct fail realms we can use while generating groups. bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error); + + struct TMisplacedVDisks { + enum EFailLevel : ui32 { + ALL_OK, + DISK_FAIL, + DOMAIN_FAIL, + REALM_FAIL, + EMPTY_SLOT, + INCORRECT_LAYOUT, + }; + + TMisplacedVDisks(EFailLevel failLevel, std::vector<TVDiskIdShort> disks, TString errorReason = "") + : FailLevel(failLevel) + , Disks(std::move(disks)) + , ErrorReason(errorReason) + {} + + EFailLevel FailLevel; + std::vector<TVDiskIdShort> Disks; + TString ErrorReason; + + operator bool() const { + return FailLevel != EFailLevel::INCORRECT_LAYOUT; + } + }; + + TMisplacedVDisks FindMisplacedVDisks(const TGroupDefinition& group); + + std::optional<TPDiskId> TargetMisplacedVDisk(ui32 groupId, TGroupDefinition& group, TVDiskIdShort vdisk, + TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error); }; } // NBsController diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp index 1be38782962..513283d8a2b 100644 --- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp +++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp @@ -248,6 +248,83 @@ public: return group.Group; } + enum class ESanitizeResult { + SUCCESS, + FAIL, + ALREADY, + }; + + using TSanitizeGroupResult = std::pair<ESanitizeResult, TGroupMapper::TGroupDefinition>; + TSanitizeGroupResult SanitizeGroup(TGroupMapper& mapper, ui32 groupId, const TSet<TPDiskId>& unusableDisks, + bool makeThemForbidden = false, bool requireOperational = false, bool allowError = false, + std::pair<TVDiskIdShort, TPDiskId>* movedDisk = nullptr) { + TGroupRecord& group = Groups.at(groupId); + + TGroupMapper::TForbiddenPDisks forbid(unusableDisks.begin(), unusableDisks.end()); + if (!makeThemForbidden) { + forbid.clear(); + } + + Ctest << "groupId# " << groupId << " sanitizing group# " << FormatGroup(group.Group) << Endl; + for (ui32 i = 0; i < group.Group.size(); ++i) { + for (ui32 j = 0; j < group.Group[i].size(); ++j) { + for (ui32 k = 0; k < group.Group[i][j].size(); ++k) { + auto& pdisk = group.Group[i][j][k]; + --PDisks.at(pdisk).NumSlots; + } + } + } + + TGroupMapper::TMisplacedVDisks result = mapper.FindMisplacedVDisks(group.Group); + if (result) { + Ctest << "error# " << result.ErrorReason << Endl; + if (allowError) { + for (auto& realm : group.Group) { + for (auto& domain : realm) { + for (auto& pdisk : domain) { + ++PDisks.at(pdisk).NumSlots; + } + } + } + return {ESanitizeResult::FAIL, {}}; + } + } + + ESanitizeResult status = ESanitizeResult::ALREADY; + TString error; + + if (!result.Disks.empty()) { + status = ESanitizeResult::FAIL; + for (auto vdisk : result.Disks) { + auto target = mapper.TargetMisplacedVDisk(groupId, group.Group, vdisk, + std::move(forbid), 0, requireOperational, error); + if (target) { + status = ESanitizeResult::SUCCESS; + if (movedDisk) { + *movedDisk = {vdisk, *target}; + } + break; + } + } + } + + if (status == ESanitizeResult::FAIL) { + Ctest << "Sanitation failed! Last error reason: " << error << Endl; + } + + group.PDisks.clear(); + for (const auto& realm : group.Group) { + for (const auto& domain : realm) { + for (const auto& pdisk : domain) { + group.PDisks.push_back(pdisk); + ++PDisks.at(pdisk).NumSlots; + } + } + } + + return {status, group.Group}; + } + void SetGroup(ui32 groupId, const TGroupMapper::TGroupDefinition& group) { auto& g = Groups[groupId]; for (const TPDiskId& pdiskId : g.PDisks) { @@ -449,6 +526,69 @@ public: } } } + + bool CheckGroupPlacement(const TGroupMapper::TGroupDefinition& group, TGroupGeometryInfo geom, TString& error) { + NLayoutChecker::TDomainMapper domainMapper; + if (group.size() != geom.GetNumFailRealms()) { + error = "Wrong fail realms number"; + return false; + } + + for (ui32 failRealm = 0; failRealm < geom.GetNumFailRealms(); ++failRealm) { + if (group[failRealm].size() != geom.GetNumFailDomainsPerFailRealm()) { + error = TStringBuilder() << "Wrong fail domains number in failRealm# " << failRealm; + return false; + } + for (ui32 failDomain = 0; failDomain < geom.GetNumFailDomainsPerFailRealm(); ++failDomain) { + if (group[failRealm][failDomain].size() != geom.GetNumVDisksPerFailDomain()) { + error = TStringBuilder() << "Wrong vdisks number in failRealm# " << failRealm << ", failDomain# " << failDomain; + return false; + } + } + } + + std::unordered_set<ui32> usedPRealms; + for (ui32 failRealm = 0; failRealm < geom.GetNumFailRealms(); ++failRealm) { + const NLayoutChecker::TPDiskLayoutPosition pdisk0(domainMapper, PDisks.at(group[failRealm][0][0]).GetLocation(), group[failRealm][0][0], geom); + ui32 pRealm = pdisk0.Realm.Index(); + if (usedPRealms.count(pRealm)) { + error = "same pRealm in different fail realms detected"; + return false; + } + usedPRealms.insert(pRealm); + std::unordered_set<ui32> usedPDomains; + for (ui32 failDomain = 0; failDomain < geom.GetNumFailDomainsPerFailRealm(); ++failDomain) { + const NLayoutChecker::TPDiskLayoutPosition pdisk1(domainMapper, PDisks.at(group[failRealm][failDomain][0]).GetLocation(), + group[failRealm][failDomain][0], geom); + ui32 pDomain = pdisk1.Domain.Index(); + if (usedPDomains.count(pDomain)) { + error = "same pDomain in different fail domains detected"; + return false; + } + usedPDomains.insert(pDomain); + std::set<TPDiskId> usedPDisks; + for (ui32 vdisk = 0; vdisk < geom.GetNumVDisksPerFailDomain(); ++vdisk) { + auto pdiskId = group[failRealm][failDomain][vdisk]; + auto pdisk = NLayoutChecker::TPDiskLayoutPosition(domainMapper, PDisks.at(pdiskId).GetLocation(), pdiskId, geom); + if (pdisk.Realm.Index() != pRealm) { + error = TStringBuilder() << "different pRealms within one failRealm, vdisk# " << failRealm << ":" << failDomain << + ":" << vdisk << ", expected pRealm " << pRealm << ", got " << pdisk.Realm.Index(); + return false; + } + if (pdisk.Domain.Index() != pDomain) { + error = "different pDomains within one failDomain"; + return false; + } + if (usedPDisks.count(pdiskId)) { + error = "same PDisk in different VDisks"; + return false; + } + } + } + } + + return true; + } }; Y_UNIT_TEST_SUITE(TGroupMapperTest) { @@ -894,33 +1034,18 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { Y_UNIT_TEST(SanitizeGroupTest3dc) { const ui32 numDataCenters = 3; const ui32 numRacks = 5; - TTestContext context(numDataCenters, 1, numRacks, 1, 1); - TGroupMapper::TGroupDefinition group; + const ui32 numDisks = 3; + TTestContext context(numDataCenters, 1, numRacks, 1, numDisks); + TGroupMapper::TGroupDefinition groupDef; ui32 groupId; { TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); context.PopulateGroupMapper(mapper, 1); - groupId = context.AllocateGroup(mapper, group); + groupId = context.AllocateGroup(mapper, groupDef); Ctest << "group after allocation:" << Endl; - context.DumpGroup(group); + context.DumpGroup(groupDef); } - auto checkLayout = [&](const auto& group) { - TGroupGeometryInfo geom = TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc); - THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout; - for (ui32 i = 0; i < group.size(); ++i) { - for (ui32 j = 0; j < group[i].size(); ++j) { - for (ui32 k = 0; k < group[i][j].size(); ++k) { - layout.emplace(TVDiskIdShort(i, j, k), std::make_pair(context.GetLocation(group[i][j][k]), - group[i][j][k])); - } - } - } - return CheckGroupLayout(geom, layout); - }; - - UNIT_ASSERT(checkLayout(group)); - for (ui32 n = 0; n < 1000; ++n) { Ctest << Endl << "iteration# " << n << Endl; @@ -929,51 +1054,61 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { context.ImportLayout(layout); Ctest << "group after layout shuffling:" << Endl; - context.DumpGroup(group); - - struct TQueueItem { - TGroupMapper::TGroupDefinition Group; - TString Path; - TSet<TGroupMapper::TGroupDefinition> Seen; - TSet<TVDiskIdShort> VDiskItems; - TSet<TPDiskId> PDiskItems; - }; - std::deque<TQueueItem> queue; - for (queue.push_back({.Group = group}); !queue.empty(); ) { - TQueueItem item = std::move(queue.front()); - queue.pop_front(); - const auto [it, inserted] = item.Seen.insert(item.Group); + context.DumpGroup(groupDef); + + ui32 sanitationStep = 0; + + TGroupMapper::TGroupDefinition group = groupDef; + TString path = ""; + TSet<TGroupMapper::TGroupDefinition> seen; + TSet<TVDiskIdShort> vdiskItems; + TSet<TPDiskId> pdiskItems; + + while (true) { + const auto [it, inserted] = seen.insert(group); UNIT_ASSERT(inserted); - UNIT_ASSERT(item.Seen.size() <= 9); - Ctest << "processing path# " << item.Path << Endl; - - auto candidates = checkLayout(item.Group); - if (!candidates) { - for (const TVDiskIdShort& vdiskId : candidates.Candidates) { - TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); - context.SetGroup(groupId, item.Group); - context.PopulateGroupMapper(mapper, 2); - const TPDiskId& pdiskId = item.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk]; - auto temp = context.ReallocateGroup(mapper, groupId, {pdiskId}, false, false, false); - TString path = TStringBuilder() << item.Path << "/" << (int)vdiskId.FailRealm << ":" - << (int)vdiskId.FailDomain << ":" << (int)vdiskId.VDisk << "@" << pdiskId; - Ctest << "path# " << path << Endl; - context.DumpGroup(temp); - - auto vdiskItems = item.VDiskItems; -// const auto [it1, inserted1] = vdiskItems.insert(vdiskId); -// UNIT_ASSERT_C(inserted1, "Duplicate group cell# " << vdiskId); - - auto pdiskItems = item.PDiskItems; -// const auto [it2, inserted2] = pdiskItems.insert(pdiskId); -// UNIT_ASSERT_C(inserted2, "Duplicate origin PDisk# " << pdiskId); - - queue.push_front({.Group = std::move(temp), .Path = std::move(path), .Seen = item.Seen, - .VDiskItems = std::move(vdiskItems), .PDiskItems = std::move(pdiskItems)}); - } + UNIT_ASSERT(seen.size() <= 9); + Ctest << "processing path# " << path << Endl; + + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + + context.SetGroup(groupId, group); + context.PopulateGroupMapper(mapper, 2); + + std::pair<TVDiskIdShort, TPDiskId> movedDisk; + auto [res, tempGroup] = context.SanitizeGroup(mapper, groupId, {}, false, false, false, &movedDisk); + Ctest << "Sanititaion step# " << sanitationStep++ << ", sanitizer "; + switch (res) { + case TTestContext::ESanitizeResult::FAIL: + Ctest << "FAIL" << Endl; + UNIT_FAIL("Sanitizing failed"); + break; + case TTestContext::ESanitizeResult::ALREADY: + Ctest << "ALREADY" << Endl; + break; + case TTestContext::ESanitizeResult::SUCCESS: + Ctest << "SUCCESS" << Endl; + break; + } + + path = TStringBuilder() << path << "/" << (int)movedDisk.first.FailRealm << ":" + << (int)movedDisk.first.FailDomain << ":" << (int)movedDisk.first.VDisk << "@" << movedDisk.second; + Ctest << "path# " << path << Endl; + context.DumpGroup(tempGroup); + if (res == TTestContext::ESanitizeResult::ALREADY) { + TString error; + UNIT_ASSERT_C(context.CheckGroupPlacement(group, TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc), error), error); + break; } Ctest << Endl; + group = tempGroup; + + const auto [it1, inserted1] = vdiskItems.insert(movedDisk.first); + UNIT_ASSERT_C(inserted1, "Duplicate group cell# " << movedDisk.first); + + const auto [it2, inserted2] = pdiskItems.insert(movedDisk.second); + UNIT_ASSERT_C(inserted2, "Duplicate origin PDisk# " << movedDisk.second); } } } diff --git a/ydb/core/mind/bscontroller/monitoring.cpp b/ydb/core/mind/bscontroller/monitoring.cpp index 8864a53dc4d..60645b190a5 100644 --- a/ydb/core/mind/bscontroller/monitoring.cpp +++ b/ydb/core/mind/bscontroller/monitoring.cpp @@ -445,6 +445,21 @@ public: break; } + case NKikimrBlobStorage::TConfigRequest::TCommand::kSanitizeGroup: { + const auto& cmd = q.GetSanitizeGroup(); + + const ui32 groupId = cmd.GetGroupId(); + TStringStream msg; + msg << "Group sanitizing request" + << "<br/>GroupId# " << groupId; + + Events.emplace_front(timestamp, msg.Str(), std::move(reassign)); + auto& j = Events.front().Json; + j["Event"] = "SanitizeGroup"; + j["GroupId"] = ToString(groupId); + break; + } + case NKikimrBlobStorage::TConfigRequest::TCommand::kUpdateDriveStatus: { const auto& cmd = q.GetUpdateDriveStatus(); diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp index 4e7178363fe..13c67af6eb0 100644 --- a/ydb/core/mind/bscontroller/self_heal.cpp +++ b/ydb/core/mind/bscontroller/self_heal.cpp @@ -26,7 +26,7 @@ namespace NKikimr::NBsController { TActorId SelfHealId; // filled on bootstrap const TGroupId GroupId; const TEvControllerUpdateSelfHealInfo::TGroupContent Group; - const TVDiskID VDiskToReplace; + const std::optional<TVDiskID> VDiskToReplace; TBlobStorageGroupInfo::TTopology Topology; THolder<TBlobStorageGroupInfo::TGroupVDisks> FailedGroupDisks; THashSet<TVDiskID> PendingVDisks; @@ -35,7 +35,7 @@ namespace NKikimr::NBsController { public: TReassignerActor(TActorId controllerId, TGroupId groupId, TEvControllerUpdateSelfHealInfo::TGroupContent group, - TVDiskID vdiskToReplace) + std::optional<TVDiskID> vdiskToReplace) : ControllerId(controllerId) , GroupId(groupId) , Group(std::move(group)) @@ -78,7 +78,7 @@ namespace NKikimr::NBsController { FailedGroupDisks = MakeHolder<TBlobStorageGroupInfo::TGroupVDisks>(&Topology); for (const auto& [vdiskId, vdisk] : Group.VDisks) { - if (vdiskId == VDiskToReplace) { + if (VDiskToReplace && vdiskId == *VDiskToReplace) { *FailedGroupDisks |= {&Topology, vdiskId}; continue; // skip disk we are going to replcate -- it will be wiped out anyway } @@ -158,12 +158,17 @@ namespace NKikimr::NBsController { auto *request = record.MutableRequest(); request->SetIgnoreGroupReserve(true); request->SetSettleOnlyOnOperationalDisks(true); - auto *cmd = request->AddCommand()->MutableReassignGroupDisk(); - cmd->SetGroupId(VDiskToReplace.GroupID); - cmd->SetGroupGeneration(VDiskToReplace.GroupGeneration); - cmd->SetFailRealmIdx(VDiskToReplace.FailRealm); - cmd->SetFailDomainIdx(VDiskToReplace.FailDomain); - cmd->SetVDiskIdx(VDiskToReplace.VDisk); + if (VDiskToReplace) { + auto *cmd = request->AddCommand()->MutableReassignGroupDisk(); + cmd->SetGroupId(VDiskToReplace->GroupID); + cmd->SetGroupGeneration(VDiskToReplace->GroupGeneration); + cmd->SetFailRealmIdx(VDiskToReplace->FailRealm); + cmd->SetFailDomainIdx(VDiskToReplace->FailDomain); + cmd->SetVDiskIdx(VDiskToReplace->VDisk); + } else { + auto *cmd = request->AddCommand()->MutableSanitizeGroup(); + cmd->SetGroupId(GroupId); + } Send(ControllerId, ev.Release()); } @@ -341,12 +346,8 @@ namespace NKikimr::NBsController { Y_VERIFY(!group.LayoutValid); if (group.ReassignerActorId || now < group.NextRetryTimestamp) { // nothing to do - } else if (const auto v = FindVDiskToReplaceByLayout(group, now)) { - group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v)); - } else if (group.LayoutValid) { - GroupsWithInvalidLayout.Remove(&group); } else { - ++counter; + group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, std::nullopt)); } } } @@ -401,40 +402,20 @@ namespace NKikimr::NBsController { CheckGroups(); } } - - std::optional<TVDiskID> FindVDiskToReplaceByLayout(TGroupRecord& group, TInstant now) { - THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout; - for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { - Y_VERIFY(HostRecords); + + using TVDiskInfo = TEvControllerUpdateSelfHealInfo::TGroupContent::TVDiskInfo; + TGroupMapper::TGroupDefinition MakeGroupDefinition(const TMap<TVDiskID, TVDiskInfo>& vdisks, + const TGroupGeometryInfo& geom) { + TGroupMapper::TGroupDefinition groupDefinition; + geom.ResizeGroup(groupDefinition); + + for (const auto& [vdiskId, vdisk] : vdisks) { if (!vdisk.Decommitted) { - layout.emplace(vdiskId, std::make_pair(HostRecords->GetLocation(vdisk.Location.NodeId), - vdisk.Location.ComprisingPDiskId())); + groupDefinition[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk] = vdisk.Location.ComprisingPDiskId(); } } - const TLayoutCheckResult checkResult = CheckGroupLayout(*group.Content.Geometry, layout); - if (checkResult) { // group is valid - group.LayoutValid = true; - return std::nullopt; - } - THashSet<TVDiskIdShort> badDisks; - for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { - const auto it = group.VDiskStatus.find(vdiskId); - if (it == group.VDiskStatus.end() || it->second.GetStatus(now) != NKikimrBlobStorage::EVDiskStatus::READY || vdisk.Bad) { - badDisks.insert(vdiskId); - } - } - if (badDisks.empty()) { - return TVDiskID(group.GroupId, group.Content.Generation, checkResult.Candidates.front()); - } else if (badDisks.size() == 1) { - for (const auto& vdiskId : checkResult.Candidates) { - if (badDisks.contains(vdiskId)) { - return TVDiskID(group.GroupId, group.Content.Generation, vdiskId); - } - } - } - - return std::nullopt; + return std::move(groupDefinition); } void HandleWakeup() { diff --git a/ydb/core/mind/bscontroller/ut_selfheal/env.h b/ydb/core/mind/bscontroller/ut_selfheal/env.h index 90643f56b2b..197ab4b4f76 100644 --- a/ydb/core/mind/bscontroller/ut_selfheal/env.h +++ b/ydb/core/mind/bscontroller/ut_selfheal/env.h @@ -214,4 +214,4 @@ struct TEnvironmentSetup { WaitForEdgeActorEvent<TEvents::TEvWakeup>(edge); } -};
\ No newline at end of file +}; diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto index 20f8315654b..f901fb24269 100644 --- a/ydb/core/protos/blobstorage_config.proto +++ b/ydb/core/protos/blobstorage_config.proto @@ -271,6 +271,10 @@ message TReassignGroupDisk { bool SuppressDonorMode = 7; // when set, donor mode is not used even if it is enabled through BSC } +message TSanitizeGroup { + uint32 GroupId = 1; +} + enum EClusterFitAlgorithm { QUADRATIC = 0; HUNGARIAN = 1; @@ -515,6 +519,7 @@ message TConfigRequest { // commands intended for internal use TReassignGroupDisk ReassignGroupDisk = 19; + TSanitizeGroup SanitizeGroup = 42; } } |