author      Alexander Rutkovsky <alexander.rutkovsky@gmail.com>    2022-04-28 13:57:34 +0300
committer   Alexander Rutkovsky <alexander.rutkovsky@gmail.com>    2022-04-28 13:57:34 +0300
commit      ed1b327b748bfc62f10eb935200de8a2087762f5 (patch)
tree        6c004eee2430da3afad7c7a0bc8d9d9d7114811e
parent      00311b881dc0a494fbe5cc94593aa99a0c2f51ac (diff)
download    ydb-ed1b327b748bfc62f10eb935200de8a2087762f5.tar.gz
Fix some problems and improve code KIKIMR-14580
ref:cdc978539243b7ddc1e6357c69ea78e650f00034
-rw-r--r--  ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp    9
-rw-r--r--  ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h      7
-rw-r--r--  ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp     2
-rw-r--r--  ydb/core/mind/bscontroller/bsc.cpp                           2
-rw-r--r--  ydb/core/mind/bscontroller/group_geometry_info.h             4
-rw-r--r--  ydb/core/mind/bscontroller/group_layout_checker.cpp         12
-rw-r--r--  ydb/core/mind/bscontroller/group_layout_checker.h           47
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper.cpp                365
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper.h                   13
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper_ut.cpp               6
-rw-r--r--  ydb/core/mind/bscontroller/impl.h                            9
-rw-r--r--  ydb/core/mind/bscontroller/self_heal.cpp                     8
12 files changed, 256 insertions, 228 deletions
diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
index 054d5be6653..7a7cb76ed6d 100644
--- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
+++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
@@ -299,7 +299,7 @@ TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype)
     {}
 
 TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms,
-        ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain)
+        ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain, bool finalize)
     : GType(gtype)
 {
     FailRealms = {numFailRealms, {
@@ -307,6 +307,9 @@ TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype, ui32 nu
         {numVDisksPerFailDomain, TVDiskInfo{}}
     }}
     }};
+    if (finalize) {
+        FinalizeConstruction();
+    }
 }
 
 TBlobStorageGroupInfo::TTopology::~TTopology() = default;
@@ -400,10 +403,6 @@ ui32 TBlobStorageGroupInfo::TTopology::GetOrderNumber(const TVDiskIdShort &vdisk
     return FailRealms[vdisk.FailRealm].FailDomains[vdisk.FailDomain].VDisks[vdisk.VDisk].OrderNumber;
 }
 
-ui32 TBlobStorageGroupInfo::TTopology::GetNumVDisksPerFailDomain() const {
-    return FailRealms[0].FailDomains[0].VDisks.size();
-}
-
 void TBlobStorageGroupInfo::TTopology::PickSubgroup(ui32 hash, TBlobStorageGroupInfo::TOrderNums &orderNums) const {
     return BlobMapper->PickSubgroup(hash, orderNums);
 }
diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
index e2d1445be4e..2b89538bc8b 100644
--- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
+++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
@@ -191,7 +191,8 @@ public:
         TVector<TVDiskIdShort> VDiskIdForOrderNumber;
 
         TTopology(TBlobStorageGroupType gtype);
-        TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain);
+        TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain,
+            bool finalize = false);
         TTopology(const TTopology&) = delete;
         TTopology &operator =(const TTopology&) = delete;
         TTopology(TTopology&&) = default;
@@ -221,7 +222,9 @@ public:
         // get the total number of VDisks in the blobstorage group
         ui32 GetTotalVDisksNum() const { return TotalVDisks; }
         // get number of VDisks per fail domain
-        ui32 GetNumVDisksPerFailDomain() const;
+        ui32 GetNumVDisksPerFailDomain() const { return FailRealms[0].FailDomains[0].VDisks.size(); }
+        // get number of fail domains per fail realm
+        ui32 GetNumFailDomainsPerFailRealm() const { return FailRealms[0].FailDomains.size(); }
         // get quorum checker
         const IQuorumChecker& GetQuorumChecker() const { return *QuorumChecker; }
diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
index 9e17730c3d6..f74e2d48413 100644
--- a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
+++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
@@ -98,6 +98,6 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
         env.Sim(TDuration::Minutes(15));
         auto corrected = getGroupsWithIncorrectLayout();
         Cerr << "bad groups after shuffling and fixing# " << FormatList(corrected) << Endl;
-//      UNIT_ASSERT(corrected.empty());
+        UNIT_ASSERT(corrected.empty());
     }
 }
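Much of this commit replaces hand-rolled (failRealm, failDomain, vdisk) index bookkeeping with TTopology's flat order numbers. The sketch below is a minimal, self-contained illustration of that bidirectional mapping; TMiniTopology and its members are invented for the example and are not the real TTopology API. The real TTopology precomputes these values (GetOrderNumber above reads a stored OrderNumber field), which is presumably why the constructor gained the finalize flag for standalone instances created elsewhere in this commit.

    #include <cassert>
    #include <cstdint>

    // Illustrative re-implementation of the order-number mapping that
    // TTopology provides; not the actual YDB code.
    struct TMiniTopology {
        uint32_t NumFailRealms, NumFailDomainsPerFailRealm, NumVDisksPerFailDomain;

        // (realm, domain, vdisk) -> flat order number, innermost index fastest
        uint32_t GetOrderNumber(uint32_t realm, uint32_t domain, uint32_t vdisk) const {
            return (realm * NumFailDomainsPerFailRealm + domain) * NumVDisksPerFailDomain + vdisk;
        }

        // inverse mapping: flat order number -> (realm, domain, vdisk)
        void GetVDiskId(uint32_t orderNumber, uint32_t& realm, uint32_t& domain, uint32_t& vdisk) const {
            vdisk = orderNumber % NumVDisksPerFailDomain;
            orderNumber /= NumVDisksPerFailDomain;
            domain = orderNumber % NumFailDomainsPerFailRealm;
            realm = orderNumber / NumFailDomainsPerFailRealm;
        }
    };

    int main() {
        TMiniTopology topo{3, 3, 1}; // e.g. mirror-3-dc-like: 3 realms x 3 domains x 1 vdisk
        uint32_t r, d, v;
        topo.GetVDiskId(topo.GetOrderNumber(2, 1, 0), r, d, v);
        assert(r == 2 && d == 1 && v == 0); // the round trip is the identity
    }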
diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp
index bb44c7fbb04..bd9f83d86d6 100644
--- a/ydb/core/mind/bscontroller/bsc.cpp
+++ b/ydb/core/mind/bscontroller/bsc.cpp
@@ -152,7 +152,7 @@ void TBlobStorageController::Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev) {
     const bool initial = !HostRecords;
     HostRecords = std::make_shared<THostRecordMap::element_type>(ev->Get());
     Schedule(TDuration::Minutes(5), new TEvPrivate::TEvHostRecordsTimeToLiveExceeded);
-    TActivationContext::Send(ev->Forward(SelfHealId));
+    Send(SelfHealId, new TEvPrivate::TEvUpdateHostRecords(HostRecords));
     if (initial) {
         Execute(CreateTxInitScheme());
     }
diff --git a/ydb/core/mind/bscontroller/group_geometry_info.h b/ydb/core/mind/bscontroller/group_geometry_info.h
index 5d37a0dfd2c..a74698f1f84 100644
--- a/ydb/core/mind/bscontroller/group_geometry_info.h
+++ b/ydb/core/mind/bscontroller/group_geometry_info.h
@@ -114,6 +114,10 @@ namespace NKikimr::NBsController {
             return true;
         }
 
+        TBlobStorageGroupType GetType() const {
+            return Type;
+        }
+
         TBlobStorageGroupType::EErasureSpecies GetErasure() const {
             return Type.GetErasure();
         }
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp
index cf947e16741..8ab76e3e4f5 100644
--- a/ydb/core/mind/bscontroller/group_layout_checker.cpp
+++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp
@@ -10,26 +10,25 @@ namespace NKikimr::NBsController {
             return {};
         }
 
-        TGroupLayout group(geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm());
+        TBlobStorageGroupInfo::TTopology topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(),
+            geom.GetNumVDisksPerFailDomain(), true);
+        TGroupLayout group(topology);
         TDomainMapper mapper;
         THashMap<TVDiskIdShort, TPDiskLayoutPosition> map;
 
         for (const auto& [vdiskId, p] : layout) {
             const auto& [location, pdiskId] = p;
             TPDiskLayoutPosition pos(mapper, location, pdiskId, geom);
-            group.AddDisk(pos, vdiskId.FailRealm, vdiskId.FailDomain);
+            group.AddDisk(pos, topology.GetOrderNumber(vdiskId));
             map.emplace(vdiskId, pos);
         }
 
         std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard;
         for (const auto& [vdiskId, pos] : map) {
-            scoreboard.emplace_back(group.GetCandidateScore(pos, vdiskId.FailRealm, vdiskId.FailDomain), vdiskId);
+            scoreboard.emplace_back(group.GetCandidateScore(pos, topology.GetOrderNumber(vdiskId)), vdiskId);
         }
 
         auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; };
         std::sort(scoreboard.begin(), scoreboard.end(), comp1);
-        for (const auto& [score, vdiskId] : scoreboard) {
-            Cerr << vdiskId << "@" << map[vdiskId].ToString() << " -> " << score.ToString() << Endl;
-        }
 
         auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); };
         std::sort(scoreboard.begin(), scoreboard.end(), comp);
@@ -37,7 +36,6 @@ namespace NKikimr::NBsController {
         const auto reference = scoreboard.back().first;
         if (!reference.SameAs({})) { // not perfectly correct layout
             for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) {
-                Cerr << "candidate# " << scoreboard.back().second << Endl;
                 res.Candidates.push_back(scoreboard.back().second);
             }
         }
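In the group_layout_checker.cpp change above the scoreboard is sorted twice: first by TVDiskIdShort, then by score. A plausible reading (an assumption on intent, not stated in the commit) is that the first pass canonicalizes the unspecified THashMap iteration order so the second, score-based sort breaks ties reproducibly from run to run. A tiny standalone illustration with invented names:

    #include <algorithm>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Sketch of the presumed determinism trick: hash-map iteration order is
    // unspecified, so candidates are first sorted by key to canonicalize the
    // sequence before the (non-stable) sort by score.
    int main() {
        std::unordered_map<std::string, int> scores{{"b", 1}, {"a", 1}, {"c", 0}};
        std::vector<std::pair<int, std::string>> scoreboard;
        for (const auto& [id, score] : scores) {
            scoreboard.emplace_back(score, id);
        }
        // pass 1: canonical order by id
        std::sort(scoreboard.begin(), scoreboard.end(),
                  [](const auto& x, const auto& y) { return x.second < y.second; });
        // pass 2: order by score; equal scores now tie-break identically every run
        std::sort(scoreboard.begin(), scoreboard.end(),
                  [](const auto& x, const auto& y) { return x.first < y.first; });
    }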
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h
index 2319d83515c..407f0b7c7f7 100644
--- a/ydb/core/mind/bscontroller/group_layout_checker.h
+++ b/ydb/core/mind/bscontroller/group_layout_checker.h
@@ -155,7 +155,7 @@ namespace NKikimr::NBsController {
         };
 
         struct TGroupLayout {
-            const ui32 NumFailDomainsPerFailRealm;
+            const TBlobStorageGroupInfo::TTopology& Topology;
 
             ui32 NumDisks = 0;
             THashMap<TEntityId, ui32> NumDisksPerRealmGroup;
@@ -168,45 +168,54 @@ namespace NKikimr::NBsController {
             TStackVec<THashMap<TEntityId, ui32>, 32> NumDisksPerDomain;
             THashMap<TEntityId, ui32> NumDisksPerDomainTotal;
 
-            TGroupLayout(ui32 numFailRealms, ui32 numFailDomainsPerFailRealm)
-                : NumFailDomainsPerFailRealm(numFailDomainsPerFailRealm)
-                , NumDisksInRealm(numFailRealms)
-                , NumDisksPerRealm(numFailRealms)
-                , NumDisksInDomain(numFailRealms * numFailDomainsPerFailRealm)
-                , NumDisksPerDomain(numFailRealms * numFailDomainsPerFailRealm)
+            TGroupLayout(const TBlobStorageGroupInfo::TTopology& topology)
+                : Topology(topology)
+                , NumDisksInRealm(Topology.GetTotalFailRealmsNum())
+                , NumDisksPerRealm(Topology.GetTotalFailRealmsNum())
+                , NumDisksInDomain(Topology.GetTotalFailDomainsNum())
+                , NumDisksPerDomain(Topology.GetTotalFailDomainsNum())
             {}
 
-            void UpdateDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx, ui32 value) {
-                domainIdx += realmIdx * NumFailDomainsPerFailRealm;
+            void UpdateDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber, ui32 value) {
                 NumDisks += value;
                 NumDisksPerRealmGroup[pos.RealmGroup] += value;
-                NumDisksInRealm[realmIdx] += value;
-                NumDisksPerRealm[realmIdx][pos.Realm] += value;
+                const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+                NumDisksInRealm[vdisk.FailRealm] += value;
+                NumDisksPerRealm[vdisk.FailRealm][pos.Realm] += value;
                 NumDisksPerRealmTotal[pos.Realm] += value;
+                const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk);
                 NumDisksInDomain[domainIdx] += value;
                 NumDisksPerDomain[domainIdx][pos.Domain] += value;
                 NumDisksPerDomainTotal[pos.Domain] += value;
             }
 
-            void AddDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
-                UpdateDisk(pos, realmIdx, domainIdx, 1);
+            void AddDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+                UpdateDisk(pos, orderNumber, 1);
             }
 
-            void RemoveDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
-                UpdateDisk(pos, realmIdx, domainIdx, Max<ui32>());
+            void RemoveDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+                UpdateDisk(pos, orderNumber, Max<ui32>());
             }
 
-            TScore GetCandidateScore(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
-                domainIdx += realmIdx * NumFailDomainsPerFailRealm;
+            TScore GetCandidateScore(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+                const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+                const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk);
                 return {
-                    .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - NumDisksPerRealm[realmIdx][pos.Realm],
+                    .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm],
                     .DomainInterlace = NumDisksPerDomainTotal[pos.Domain] - NumDisksPerDomain[domainIdx][pos.Domain],
                     .RealmGroupScatter = NumDisks - NumDisksPerRealmGroup[pos.RealmGroup],
-                    .RealmScatter = NumDisksInRealm[realmIdx] - NumDisksPerRealm[realmIdx][pos.Realm],
+                    .RealmScatter = NumDisksInRealm[vdisk.FailRealm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm],
                     .DomainScatter = NumDisksInDomain[domainIdx] - NumDisksPerDomain[domainIdx][pos.Domain],
                 };
             }
+
+            TScore GetExcludedDiskScore(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+                RemoveDisk(pos, orderNumber);
+                const TScore score = GetCandidateScore(pos, orderNumber);
+                AddDisk(pos, orderNumber);
+                return score;
+            }
         };
 
     } // NLayoutChecker
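Note that TGroupLayout::RemoveDisk above funnels through UpdateDisk with a delta of Max<ui32>(), relying on the fact that unsigned arithmetic is modular: adding 2^32 - 1 is equivalent to subtracting 1. A two-line demonstration in plain C++ (standard behavior, not YDB-specific code):

    #include <cassert>
    #include <cstdint>

    int main() {
        // RemoveDisk passes Max<ui32>() as the delta: unsigned addition wraps
        // modulo 2^32, so adding 0xFFFFFFFF is the same as subtracting 1.
        uint32_t counter = 5;
        counter += UINT32_MAX;
        assert(counter == 4);
    }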
diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp
index 967aeafaa15..5619534fb3a 100644
--- a/ydb/core/mind/bscontroller/group_mapper.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper.cpp
@@ -60,94 +60,69 @@ namespace NKikimr::NBsController {
         struct TAllocator {
             TImpl& Self;
-            const ui32 NumFailRealms;
-            const ui32 NumFailDomainsPerFailRealm;
-            const ui32 NumFailDomainsTotal;
-            const ui32 NumVDisksPerFailDomain;
-            const ui32 GroupSize;
-            TStackVec<ui8, 32> RealmIdx;
-            TStackVec<ui8, 32> DomainIdx;
-            TStackVec<ui8, 32> DomainThroughIdx;
-            TStackVec<ui8, 32> VDiskIdx;
+            const TBlobStorageGroupInfo::TTopology Topology;
             THashSet<TPDiskId> OldGroupContent; // set of all existing disks in the group, including ones which are replaced
             const i64 RequiredSpace;
             const bool RequireOperational;
             TForbiddenPDisks ForbiddenDisks;
             THashMap<ui32, unsigned> LocalityFactor;
             TGroupLayout GroupLayout;
-            std::optional<TScore> BestScore;
+            std::optional<TScore> WorstScore;
 
             TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
                     TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks)
                 : Self(self)
-                , NumFailRealms(geom.GetNumFailRealms())
-                , NumFailDomainsPerFailRealm(geom.GetNumFailDomainsPerFailRealm())
-                , NumFailDomainsTotal(NumFailRealms * NumFailDomainsPerFailRealm)
-                , NumVDisksPerFailDomain(geom.GetNumVDisksPerFailDomain())
-                , GroupSize(NumFailDomainsTotal * NumVDisksPerFailDomain)
-                , RealmIdx(GroupSize)
-                , DomainIdx(GroupSize)
-                , DomainThroughIdx(GroupSize)
-                , VDiskIdx(GroupSize)
+                , Topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), geom.GetNumVDisksPerFailDomain(), true)
                 , RequiredSpace(requiredSpace)
                 , RequireOperational(requireOperational)
                 , ForbiddenDisks(std::move(forbiddenDisks))
-                , GroupLayout(NumFailRealms, NumFailDomainsPerFailRealm)
+                , GroupLayout(Topology)
             {
                 for (const auto& [vdiskId, pdiskId] : replacedDisks) {
                     OldGroupContent.insert(pdiskId);
                 }
-                for (ui32 index = 0, domainThroughIdx = 0, realmIdx = 0; realmIdx < NumFailRealms; ++realmIdx) {
-                    for (ui32 domainIdx = 0; domainIdx < NumFailDomainsPerFailRealm; ++domainIdx, ++domainThroughIdx) {
-                        for (ui32 vdiskIdx = 0; vdiskIdx < NumVDisksPerFailDomain; ++vdiskIdx, ++index) {
-                            RealmIdx[index] = realmIdx;
-                            DomainIdx[index] = domainIdx;
-                            DomainThroughIdx[index] = domainThroughIdx;
-                            VDiskIdx[index] = vdiskIdx;
-                        }
-                    }
-                }
             }
PDiskId# " << pdiskId}; } + TPDiskInfo& pdisk = it->second; + res[orderNumber] = &pdisk; - ++index; + const auto [_, inserted] = OldGroupContent.insert(pdiskId); + if (!inserted) { + throw TExError{TStringBuilder() << "group contains duplicate PDiskId# " << pdiskId}; + } + + if (!pdisk.Decommitted) { + AddUsedDisk(pdisk); + GroupLayout.AddDisk(pdisk.Position, orderNumber); + } } - } + }); + } catch (const TExError& e) { + error = e.error; + return {}; } return res; } void Decompose(const TGroup& in, TGroupDefinition& out) { - for (ui32 i = 0; i < GroupSize; ++i) { - out[RealmIdx[i]][DomainIdx[i]][VDiskIdx[i]] = in[i]->PDiskId; + for (ui32 i = 0; i < in.size(); ++i) { + const TVDiskIdShort vdisk = Topology.GetVDiskId(i); + out[vdisk.FailRealm][vdisk.FailDomain][vdisk.VDisk] = in[i]->PDiskId; } } @@ -238,8 +213,8 @@ namespace NKikimr::NBsController { undo.Log(index, pdisk); group[index] = pdisk; AddUsedDisk(*pdisk); - GroupLayout.AddDisk(pdisk->Position, RealmIdx[index], DomainIdx[index]); - BestScore.reset(); // invalidate score + GroupLayout.AddDisk(pdisk->Position, index); + WorstScore.reset(); // invalidate score } void Revert(TUndoLog& undo, TGroup& group, size_t until) { @@ -247,112 +222,118 @@ namespace NKikimr::NBsController { const auto& item = undo.Items.back(); group[item.Index] = nullptr; RemoveUsedDisk(*item.PDisk); - GroupLayout.RemoveDisk(item.PDisk->Position, RealmIdx[item.Index], DomainIdx[item.Index]); - BestScore.reset(); // invalidate score + GroupLayout.RemoveDisk(item.PDisk->Position, item.Index); + WorstScore.reset(); // invalidate score } } bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group) { - // Determine PDisks that fit our requirements (including score). - auto set = SetupMatchingDisks(maxScore); - - // Determine what we have to fill in -- full group, some realms, domains, or just some cells. - bool emptyGroup = true; - - TDynBitMap emptyRealms; - emptyRealms.Set(0, NumFailRealms); - - TDynBitMap emptyDomains; - emptyDomains.Set(0, NumFailDomainsTotal); - - TDynBitMap emptyDisks; - emptyDisks.Set(0, GroupSize); - - for (ui32 i = 0; i < GroupSize; ++i) { - if (group[i]) { - emptyGroup = false; - emptyRealms[RealmIdx[i]] = false; - emptyDomains[DomainThroughIdx[i]] = false; - emptyDisks[i] = false; + // determine PDisks that fit our requirements (including score) + auto v = SetupMatchingDisks(maxScore); + + // find which entities we need to allocate -- whole group, some realms, maybe some domains within specific realms? + bool isEmptyGroup = true; + std::vector<bool> isEmptyRealm(Topology.GetTotalFailRealmsNum(), true); + std::vector<bool> isEmptyDomain(Topology.GetTotalFailDomainsNum(), true); + for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) { + if (group[orderNumber]) { + const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber); + isEmptyGroup = false; + isEmptyRealm[vdisk.FailRealm] = false; + const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk); + isEmptyDomain[domainIdx] = false; } } - // Allocate new full group and exit if it is absolutely empty. 
                 auto allocate = [&](auto what, ui32 index) {
-                    TDiskRange fullRange(set.begin(), set.end());
                     TDynBitMap forbiddenEntities;
                     forbiddenEntities.Reserve(Self.DomainMapper.GetIdCount());
-                    if (!AllocateWholeEntity(what, group, undo, index, fullRange, forbiddenEntities)) {
+                    if (!AllocateWholeEntity(what, group, undo, index, {v.begin(), v.end()}, forbiddenEntities)) {
                         Revert(undo, group, 0);
                         return false;
                     }
                     return true;
                 };
 
-                if (emptyGroup) {
+                if (isEmptyGroup) {
                     return allocate(TAllocateWholeGroup(), 0);
                 }
 
-                // Fill in missing fail realms.
-                for (ui32 i = emptyRealms.FirstNonZeroBit(); i != emptyRealms.Size(); i = emptyRealms.NextNonZeroBit(i)) {
-                    if (!allocate(TAllocateWholeRealm(), i)) {
-                        return false;
-                    }
-
-                    // remove excessive domains and disk from the set
-                    emptyDomains.Reset(i * NumFailDomainsPerFailRealm, (i + 1) * NumFailDomainsPerFailRealm);
-                    emptyDisks.Reset(i * NumFailDomainsPerFailRealm * NumVDisksPerFailDomain,
-                        (i + 1) * NumFailDomainsPerFailRealm * NumVDisksPerFailDomain);
-                }
-
-                // Fill in missing fail domains in some partially filled realms.
-                for (ui32 i = emptyDomains.FirstNonZeroBit(); i != emptyDomains.Size(); i = emptyDomains.NextNonZeroBit(i)) {
-                    if (!allocate(TAllocateWholeDomain(), i)) {
-                        return false;
-                    }
-
-                    // remove excessive disks
-                    emptyDisks.Reset(i * NumVDisksPerFailDomain, (i + 1) * NumVDisksPerFailDomain);
-                }
+                const ui32 numFailDomainsPerFailRealm = Topology.GetNumFailDomainsPerFailRealm();
+                const ui32 numVDisksPerFailDomain = Topology.GetNumVDisksPerFailDomain();
+                ui32 domainOrderNumber = 0;
+                ui32 orderNumber = 0;
+
+                // scan all fail realms and allocate missing realms or their parts
+                for (ui32 failRealmIdx = 0; failRealmIdx < isEmptyRealm.size(); ++failRealmIdx) {
+                    if (isEmptyRealm[failRealmIdx]) {
+                        // we have an empty realm -- we have to allocate it fully
+                        if (!allocate(TAllocateWholeRealm(), failRealmIdx)) {
+                            return false;
+                        }
+                        // skip to next realm
+                        domainOrderNumber += numFailDomainsPerFailRealm;
+                        orderNumber += numVDisksPerFailDomain * numFailDomainsPerFailRealm;
+                        continue;
+                    }
+
+                    // scan through domains of this realm, find unallocated ones
+                    for (ui32 failDomainIdx = 0; failDomainIdx < numFailDomainsPerFailRealm; ++failDomainIdx, ++domainOrderNumber) {
+                        if (isEmptyDomain[domainOrderNumber]) {
+                            // try to allocate full domain
+                            if (!allocate(TAllocateWholeDomain(), domainOrderNumber)) {
+                                return false;
+                            }
+                            // skip to next domain
+                            orderNumber += numVDisksPerFailDomain;
+                            continue;
+                        }
 
-                // Fill in missing disk cells.
-                for (ui32 i = emptyDisks.FirstNonZeroBit(); i != emptyDisks.Size(); i = emptyDisks.NextNonZeroBit(i)) {
-                    if (!allocate(TAllocateDisk(), i)) {
-                        return false;
+                        // scan individual disks of the domain and fill gaps
+                        for (ui32 vdiskIdx = 0; vdiskIdx < numVDisksPerFailDomain; ++vdiskIdx, ++orderNumber) {
+                            if (!group[orderNumber] && !allocate(TAllocateDisk(), orderNumber)) {
+                                return false;
+                            }
+                        }
                     }
                 }
 
+                Y_VERIFY(domainOrderNumber == Topology.GetTotalFailDomainsNum());
+                Y_VERIFY(orderNumber == Topology.GetTotalVDisksNum());
+
                 return true;
             }
 
+            using TAllocateResult = TPDiskLayoutPosition*;
+
             struct TAllocateDisk {};
 
             struct TAllocateWholeDomain {
-                static constexpr auto EntityCount = &TAllocator::NumVDisksPerFailDomain;
-                static constexpr auto PositionItem = &TPDiskLayoutPosition::Domain;
+                static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetNumVDisksPerFailDomain;
                 using TNestedEntity = TAllocateDisk;
 
-                static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+                static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+                    scope = x.Domain;
                     return {x, x};
                 }
             };
 
             struct TAllocateWholeRealm {
-                static constexpr auto EntityCount = &TAllocator::NumFailDomainsPerFailRealm;
-                static constexpr auto PositionItem = &TPDiskLayoutPosition::Realm;
+                static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetNumFailDomainsPerFailRealm;
                 using TNestedEntity = TAllocateWholeDomain;
 
-                static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+                static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+                    scope = x.Realm;
                     return {{x.RealmGroup, x.Realm, TEntityId::Min()}, {x.RealmGroup, x.Realm, TEntityId::Max()}};
                 }
             };
 
             struct TAllocateWholeGroup {
-                static constexpr auto EntityCount = &TAllocator::NumFailRealms;
-                static constexpr auto PositionItem = &TPDiskLayoutPosition::RealmGroup;
+                static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetTotalFailRealmsNum;
                 using TNestedEntity = TAllocateWholeRealm;
 
-                static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+                static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+                    scope = x.RealmGroup;
                    return {{x.RealmGroup, TEntityId::Min(), TEntityId::Min()}, {x.RealmGroup, TEntityId::Max(), TEntityId::Max()}};
                 }
             };
@@ -360,113 +341,125 @@ namespace NKikimr::NBsController {
             using TDiskRange = std::pair<TPDiskByPosition::const_iterator, TPDiskByPosition::const_iterator>;
 
             template<typename T>
-            TPDiskLayoutPosition *AllocateWholeEntity(T, TGroup& group, TUndoLog& undo, ui32 parentEntityIndex,
-                    TDiskRange range, TDynBitMap& forbiddenEntities) {
-                const TDiskRange originalRange(range);
+            TAllocateResult AllocateWholeEntity(T, TGroup& group, TUndoLog& undo, ui32 parentEntityIndex, TDiskRange range,
+                    TDynBitMap& forbiddenEntities) {
+                // number of enclosed child entities within this one
+                const ui32 entityCount = (Topology.*T::GetEntityCount)();
+                Y_VERIFY(entityCount);
+                parentEntityIndex *= entityCount;
+                // remember current undo stack size
                 const size_t undoPosition = undo.GetPosition();
-                TPDiskLayoutPosition *prefix = nullptr;
-                TEntityId currentEntityId = TEntityId::Max();
-                for (ui32 index = 0, num = this->*T::EntityCount; index < num; ) {
-                    // allocate nested entity
-                    prefix = AllocateWholeEntity(typename T::TNestedEntity(), group, undo,
-                        parentEntityIndex * num + index, range, forbiddenEntities);
-                    if (prefix) {
-                        if (!index) {
-                            currentEntityId = prefix->*T::PositionItem;
-                            auto [min, max] = T::MakeRange(*prefix);
-                            range.first = std::lower_bound(range.first, range.second, min, TComparePDiskByPosition());
-                            range.second = std::upper_bound(range.first, range.second, max, TComparePDiskByPosition());
+
+                for (;;) {
+                    auto [from, to] = range;
+                    TPDiskLayoutPosition *prefix;
+                    TEntityId scope;
+
+                    for (ui32 index = 0;; ++index) {
+                        // allocate nested entity
+                        prefix = AllocateWholeEntity(typename T::TNestedEntity(), group, undo, parentEntityIndex + index,
+                            {from, to}, forbiddenEntities);
+
+                        if (prefix) {
+                            if (!index) {
+                                // reduce range to specific realm/domain entity
+                                auto [min, max] = T::MakeRange(*prefix, scope);
+                                from = std::lower_bound(from, to, min, TComparePDiskByPosition());
+                                to = std::upper_bound(from, to, max, TComparePDiskByPosition());
+                            }
+                            if (index + 1 == entityCount) {
+                                // disable filled entity from further selection if it was really allocated
+                                forbiddenEntities.Set(scope.Index());
+                                return prefix;
+                            }
+                        } else if (index) {
+                            // disable just checked entity (to prevent its selection again)
+                            forbiddenEntities.Set(scope.Index());
+                            // try another entity at this level
+                            Revert(undo, group, undoPosition);
+                            // break the loop and retry
+                            break;
+                        } else {
+                            // no chance to allocate new entity, exit
+                            return {};
                         }
-                        ++index;
-                    } else if (index) {
-                        // disable just checked entity (to prevent its selection again)
-                        Y_VERIFY(currentEntityId != TEntityId::Max());
-                        forbiddenEntities.Set(currentEntityId.Index());
-                        // try another entity at this level
-                        Revert(undo, group, undoPosition);
-                        // revert original wide range and start from the beginning
-                        range = originalRange;
-                        index = 0;
-                        currentEntityId = TEntityId::Max();
-                    } else {
-                        // no chance to allocate new entity, exit
-                        return nullptr;
                     }
                 }
-                // disable filled entity from further selection
-                Y_VERIFY(prefix && currentEntityId != TEntityId::Max());
-                forbiddenEntities.Set(currentEntityId.Index());
-                return prefix;
             }
 
-            TPDiskLayoutPosition *AllocateWholeEntity(TAllocateDisk, TGroup& group, TUndoLog& undo, ui32 index,
-                    TDiskRange range, TDynBitMap& forbiddenEntities) {
-                TPDiskInfo *pdisk = nullptr;
+            TAllocateResult AllocateWholeEntity(TAllocateDisk, TGroup& group, TUndoLog& undo, ui32 index, TDiskRange range,
+                    TDynBitMap& forbiddenEntities) {
+                TPDiskInfo *pdisk = group[index];
+                Y_VERIFY(!pdisk);
                 auto process = [this, &pdisk](TPDiskInfo *candidate) {
                     if (!pdisk || DiskIsBetter(*candidate, *pdisk)) {
                         pdisk = candidate;
                     }
                 };
-                FindMatchingDiskBasedOnScore(process, group, RealmIdx[index], DomainIdx[index],
-                    range, forbiddenEntities);
+                FindMatchingDiskBasedOnScore(process, group, index, range, forbiddenEntities);
                 if (pdisk) {
                     AddDiskViaUndoLog(undo, group, index, pdisk);
                     pdisk->Matching = false;
                     return &pdisk->Position;
                 } else {
-                    return nullptr;
+                    return {};
                 }
             }
-            TScore CalculateBestScoreWithCache(const TGroup& group) {
-                if (!BestScore) {
+            TScore CalculateWorstScoreWithCache(const TGroup& group) {
+                if (!WorstScore) {
                     // find the worst disk from a position of layout correctness and use it as a milestone for other
                     // disks -- they can't be misplaced worse
-                    TScore bestScore;
-                    for (ui32 i = 0; i < GroupSize; ++i) {
+                    TScore worstScore;
+                    for (ui32 i = 0; i < Topology.GetTotalVDisksNum(); ++i) {
                         if (TPDiskInfo *pdisk = group[i]; pdisk && !pdisk->Decommitted) {
-                            TScore score = GroupLayout.GetCandidateScore(pdisk->Position, RealmIdx[i],
-                                DomainIdx[i]);
-                            if (bestScore.BetterThan(score)) {
-                                bestScore = score;
+                            // calculate score for this pdisk, removing it from the set first -- to prevent counting itself
+                            const TScore score = GroupLayout.GetExcludedDiskScore(pdisk->Position, i);
+                            if (worstScore.BetterThan(score)) {
+                                worstScore = score;
                             }
                         }
                     }
-                    BestScore = bestScore;
+                    WorstScore = worstScore;
                 }
-                return *BestScore;
+                return *WorstScore;
             }
 
             template<typename TCallback>
-            void FindMatchingDiskBasedOnScore(TCallback&& cb, const TGroup& group, ui32 failRealmIdx, ui32 failDomainIdx,
-                    TDiskRange range, TDynBitMap& forbiddenEntities) {
-                TScore bestScore = CalculateBestScoreWithCache(group);
+            void FindMatchingDiskBasedOnScore(
+                    TCallback&& cb,                  // callback to be invoked for every matching candidate
+                    const TGroup& group,             // group with peer disks
+                    ui32 orderNumber,                // order number of disk being allocated
+                    TDiskRange range,                // range of PDisk candidates to scan
+                    TDynBitMap& forbiddenEntities) { // a set of forbidden TEntityId's prevented from allocation
+                // first, find the best score for current group layout -- we can't make failure model inconsistency
+                // any worse than it already is
+                TScore bestScore = CalculateWorstScoreWithCache(group);
 
                 std::vector<TPDiskInfo*> candidates;
 
+                // scan the candidate range
                 while (range.first != range.second) {
                     const auto& [position, pdisk] = *range.first++;
 
+                    // skip inappropriate disks, whole realm groups, realms and domains
                     if (!pdisk->Matching) {
-                        continue;
+                        // just do nothing, skip this candidate disk
                     } else if (forbiddenEntities[position.RealmGroup.Index()]) {
                         range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealmGroup - 1);
-                        continue;
                     } else if (forbiddenEntities[position.Realm.Index()]) {
                         range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealm - 1);
-                        continue;
                     } else if (forbiddenEntities[position.Domain.Index()]) {
                         range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextDomain - 1);
-                        continue;
-                    }
-
-                    TScore score = GroupLayout.GetCandidateScore(position, failRealmIdx, failDomainIdx);
-                    if (score.BetterThan(bestScore)) {
-                        candidates.clear();
-                        candidates.push_back(pdisk);
-                        bestScore = score;
-                    } else if (score.SameAs(bestScore)) {
-                        candidates.push_back(pdisk);
+                    } else {
+                        const TScore score = GroupLayout.GetCandidateScore(position, orderNumber);
+                        if (score.BetterThan(bestScore)) {
+                            candidates.clear();
+                            bestScore = score;
+                        }
+                        if (score.SameAs(bestScore)) {
+                            candidates.push_back(pdisk);
+                        }
                     }
                 }
@@ -692,7 +685,7 @@ namespace NKikimr::NBsController {
                     pdisk.EraseGroup(groupId);
                 }
                 ui32 numZero = 0;
-                for (ui32 i = 0; i < allocator.GroupSize; ++i) {
+                for (ui32 i = 0; i < allocator.Topology.GetTotalVDisksNum(); ++i) {
                     if (!group[i]) {
                         ++numZero;
                         TPDiskInfo *pdisk = result->at(i);
@@ -700,7 +693,7 @@ namespace NKikimr::NBsController {
                         pdisk->InsertGroup(groupId);
                     }
                 }
-                Y_VERIFY(numZero == allocator.GroupSize || numZero == replacedDisks.size());
+                Y_VERIFY(numZero == allocator.Topology.GetTotalVDisksNum() || numZero == replacedDisks.size());
                 allocator.Decompose(*result, groupDefinition);
                 return true;
             } else {
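The rename from BestScore to WorstScore, together with the new GetExcludedDiskScore, makes the scoring contract explicit: the worst-placed existing disk sets a ceiling a candidate may not exceed, and a disk already present in the group must be subtracted from the counters before being scored, or it is compared against a layout that already contains itself. A minimal illustration of the subtract-measure-restore step with an invented counter map (not the real TGroupLayout):

    #include <cassert>
    #include <map>

    // Sketch of the self-exclusion fix: when scoring a disk that is already
    // part of the group, first remove it from the statistics, measure, then
    // put it back -- the shape of GetExcludedDiskScore above.
    int main() {
        std::map<int, int> disksPerDomain{{7, 3}}; // domain 7 holds 3 disks
        const int domain = 7;

        // naive score: the disk counts itself among the 3 residents
        int naive = disksPerDomain[domain];

        // excluded score: remove, measure, re-add
        --disksPerDomain[domain];
        int excluded = disksPerDomain[domain];
        ++disksPerDomain[domain];

        assert(naive == 3 && excluded == 2);
    }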
diff --git a/ydb/core/mind/bscontroller/group_mapper.h b/ydb/core/mind/bscontroller/group_mapper.h
index a58e49ab0d1..991a636bf38 100644
--- a/ydb/core/mind/bscontroller/group_mapper.h
+++ b/ydb/core/mind/bscontroller/group_mapper.h
@@ -18,6 +18,19 @@ namespace NKikimr {
         using TGroupDefinition = TVector<TVector<TVector<TPDiskId>>>; // Realm/Domain/Disk
         using TForbiddenPDisks = std::unordered_set<TPDiskId, THash<TPDiskId>>;
 
+        template<typename T>
+        static void Traverse(const TGroupDefinition& group, T&& callback) {
+            for (ui32 failRealmIdx = 0; failRealmIdx != group.size(); ++failRealmIdx) {
+                const auto& realm = group[failRealmIdx];
+                for (ui32 failDomainIdx = 0; failDomainIdx != realm.size(); ++failDomainIdx) {
+                    const auto& domain = realm[failDomainIdx];
+                    for (ui32 vdiskIdx = 0; vdiskIdx != domain.size(); ++vdiskIdx) {
+                        callback(TVDiskIdShort(failRealmIdx, failDomainIdx, vdiskIdx), domain[vdiskIdx]);
+                    }
+                }
+            }
+        }
+
         struct TPDiskRecord {
             const TPDiskId PDiskId;
             const TNodeLocation Location;
diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
index 839d75e27e0..1be38782962 100644
--- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
@@ -945,7 +945,7 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
             const auto [it, inserted] = item.Seen.insert(item.Group);
             UNIT_ASSERT(inserted);
             UNIT_ASSERT(item.Seen.size() <= 9);
-            Cerr << "processing path# " << item.Path << Endl;
+            Ctest << "processing path# " << item.Path << Endl;
 
             auto candidates = checkLayout(item.Group);
             if (!candidates) {
@@ -957,7 +957,7 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
                     auto temp = context.ReallocateGroup(mapper, groupId, {pdiskId}, false, false, false);
                     TString path = TStringBuilder() << item.Path << "/" << (int)vdiskId.FailRealm << ":"
                         << (int)vdiskId.FailDomain << ":" << (int)vdiskId.VDisk << "@" << pdiskId;
-                    Cerr << "path# " << path << Endl;
+                    Ctest << "path# " << path << Endl;
                     context.DumpGroup(temp);
 
                     auto vdiskItems = item.VDiskItems;
@@ -973,7 +973,7 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
                 }
             }
 
-            Cerr << Endl;
+            Ctest << Endl;
         }
     }
 }
diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h
index b3db9d483b7..e69265827bb 100644
--- a/ydb/core/mind/bscontroller/impl.h
+++ b/ydb/core/mind/bscontroller/impl.h
@@ -1377,6 +1377,7 @@ private:
         EvVSlotReadyUpdate,
         EvVSlotNotReadyHistogramUpdate,
         EvProcessIncomingEvent,
+        EvUpdateHostRecords,
     };
 
     struct TEvUpdateSystemViews : public TEventLocal<TEvUpdateSystemViews, EvUpdateSystemViews> {};
@@ -1390,6 +1391,14 @@ private:
     struct TEvScrub : TEventLocal<TEvScrub, EvScrub> {};
     struct TEvVSlotReadyUpdate : TEventLocal<TEvVSlotReadyUpdate, EvVSlotReadyUpdate> {};
     struct TEvVSlotNotReadyHistogramUpdate : TEventLocal<TEvVSlotNotReadyHistogramUpdate, EvVSlotNotReadyHistogramUpdate> {};
+
+    struct TEvUpdateHostRecords : TEventLocal<TEvUpdateHostRecords, EvUpdateHostRecords> {
+        THostRecordMap HostRecords;
+
+        TEvUpdateHostRecords(THostRecordMap hostRecords)
+            : HostRecords(std::move(hostRecords))
+        {}
+    };
 };
 
 static constexpr TDuration UpdateSystemViewsPeriod = TDuration::Seconds(5);
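The Traverse helper added to group_mapper.h flattens the realm/domain/disk nesting into one callback, and ProcessExistingGroup pairs it with a local exception type to abort early, since the callback cannot return a failure code. A self-contained sketch of both pieces using hypothetical stand-in types (TPDiskId, TVDiskIdShort and TGroupDefinition below are simplified, not the YDB originals):

    #include <cstdint>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins, only to show the shape of the Traverse helper
    // and the TExError early-exit pattern that ProcessExistingGroup uses.
    struct TPDiskId { uint32_t NodeId = 0, PDiskId = 0; };
    struct TVDiskIdShort { uint32_t FailRealm, FailDomain, VDisk; };
    using TGroupDefinition = std::vector<std::vector<std::vector<TPDiskId>>>;

    template<typename T>
    static void Traverse(const TGroupDefinition& group, T&& callback) {
        for (uint32_t r = 0; r != group.size(); ++r) {
            for (uint32_t d = 0; d != group[r].size(); ++d) {
                for (uint32_t v = 0; v != group[r][d].size(); ++v) {
                    callback(TVDiskIdShort{r, d, v}, group[r][d][v]);
                }
            }
        }
    }

    int main() {
        TGroupDefinition group(2, {{TPDiskId{}}, {TPDiskId{}}}); // 2 realms x 2 domains x 1 disk
        struct TExError { std::string Error; };                  // local exception, as in the commit
        uint32_t seen = 0;
        try {
            Traverse(group, [&](TVDiskIdShort, TPDiskId) {
                if (++seen == 3) {
                    throw TExError{"stop early"}; // the lambda cannot return false, so it throws
                }
            });
        } catch (const TExError&) {
            // traversal aborted after the third disk, mirroring the error paths above
        }
        return seen == 3 ? 0 : 1;
    }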
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index 86ea82a627e..24c6045625e 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -244,7 +244,7 @@ namespace NKikimr::NBsController {
         TIntrusiveList<TGroupRecord, TWithInvalidLayout> GroupsWithInvalidLayout;
         std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups;
         bool GroupLayoutSanitizer = false;
-        std::optional<THostRecordMapImpl> HostRecords;
+        THostRecordMap HostRecords;
 
     public:
         TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups)
@@ -579,8 +579,8 @@ namespace NKikimr::NBsController {
             }
         }
 
-        void Handle(TEvInterconnect::TEvNodesInfo::TPtr ev) {
-            HostRecords.emplace(ev->Get());
+        void Handle(TEvPrivate::TEvUpdateHostRecords::TPtr ev) {
+            HostRecords = std::move(ev->Get()->HostRecords);
         }
 
         STRICT_STFUNC(StateFunc, {
@@ -589,7 +589,7 @@ namespace NKikimr::NBsController {
             hFunc(NMon::TEvRemoteHttpInfo, Handle);
             hFunc(TEvReassignerDone, Handle);
             cFunc(TEvents::TSystem::Wakeup, HandleWakeup);
-            hFunc(TEvInterconnect::TEvNodesInfo, Handle);
+            hFunc(TEvPrivate::TEvUpdateHostRecords, Handle);
         })
     };
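Taken together, the bsc.cpp, impl.h and self_heal.cpp changes replace forwarding the raw TEvInterconnect::TEvNodesInfo with a private TEvUpdateHostRecords event carrying the controller's already-built THostRecordMap, so the node list is parsed once and the resulting map is shared by pointer. A miniature of the idea in plain C++ (the event and map types below are simplified stand-ins, not the actor-framework originals):

    #include <cstdint>
    #include <map>
    #include <memory>
    #include <string>
    #include <utility>

    // Hypothetical miniature of the change: instead of every consumer
    // re-parsing node info, the producer builds the map once and ships the
    // shared_ptr in an event.
    using THostRecordMap = std::shared_ptr<const std::map<uint32_t, std::string>>;

    struct TEvUpdateHostRecords {
        THostRecordMap HostRecords;
        explicit TEvUpdateHostRecords(THostRecordMap hostRecords)
            : HostRecords(std::move(hostRecords))
        {}
    };

    struct TSelfHeal {
        THostRecordMap HostRecords; // replaces std::optional<THostRecordMapImpl>
        void Handle(TEvUpdateHostRecords& ev) {
            HostRecords = std::move(ev.HostRecords); // cheap pointer swap, no re-parse
        }
    };

    int main() {
        auto map = std::make_shared<const std::map<uint32_t, std::string>>(
            std::map<uint32_t, std::string>{{1, "host-1"}});
        TEvUpdateHostRecords ev(map); // controller side: Send(SelfHealId, ...)
        TSelfHeal selfHeal;
        selfHeal.Handle(ev);          // self-heal side: keep the shared map
        return selfHeal.HostRecords->size() == 1 ? 0 : 1;
    }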