author    Alexander Rutkovsky <alexander.rutkovsky@gmail.com>  2022-04-28 13:57:34 +0300
committer Alexander Rutkovsky <alexander.rutkovsky@gmail.com>  2022-04-28 13:57:34 +0300
commit    ed1b327b748bfc62f10eb935200de8a2087762f5 (patch)
tree      6c004eee2430da3afad7c7a0bc8d9d9d7114811e
parent    00311b881dc0a494fbe5cc94593aa99a0c2f51ac (diff)
download  ydb-ed1b327b748bfc62f10eb935200de8a2087762f5.tar.gz
Fix some problems and improve code (KIKIMR-14580)
ref:cdc978539243b7ddc1e6357c69ea78e650f00034
-rw-r--r--  ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp     9
-rw-r--r--  ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h       7
-rw-r--r--  ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp      2
-rw-r--r--  ydb/core/mind/bscontroller/bsc.cpp                           2
-rw-r--r--  ydb/core/mind/bscontroller/group_geometry_info.h             4
-rw-r--r--  ydb/core/mind/bscontroller/group_layout_checker.cpp         12
-rw-r--r--  ydb/core/mind/bscontroller/group_layout_checker.h           47
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper.cpp                365
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper.h                   13
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper_ut.cpp               6
-rw-r--r--  ydb/core/mind/bscontroller/impl.h                            9
-rw-r--r--  ydb/core/mind/bscontroller/self_heal.cpp                     8
12 files changed, 256 insertions, 228 deletions
diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
index 054d5be6653..7a7cb76ed6d 100644
--- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
+++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
@@ -299,7 +299,7 @@ TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype)
{}
TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms,
- ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain)
+ ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain, bool finalize)
: GType(gtype)
{
FailRealms = {numFailRealms, {
@@ -307,6 +307,9 @@ TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype, ui32 nu
{numVDisksPerFailDomain, TVDiskInfo{}}
}}
}};
+ if (finalize) {
+ FinalizeConstruction();
+ }
}
TBlobStorageGroupInfo::TTopology::~TTopology() = default;
@@ -400,10 +403,6 @@ ui32 TBlobStorageGroupInfo::TTopology::GetOrderNumber(const TVDiskIdShort &vdisk
return FailRealms[vdisk.FailRealm].FailDomains[vdisk.FailDomain].VDisks[vdisk.VDisk].OrderNumber;
}
-ui32 TBlobStorageGroupInfo::TTopology::GetNumVDisksPerFailDomain() const {
- return FailRealms[0].FailDomains[0].VDisks.size();
-}
-
void TBlobStorageGroupInfo::TTopology::PickSubgroup(ui32 hash, TBlobStorageGroupInfo::TOrderNums &orderNums) const {
return BlobMapper->PickSubgroup(hash, orderNums);
}
diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
index e2d1445be4e..2b89538bc8b 100644
--- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
+++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
@@ -191,7 +191,8 @@ public:
TVector<TVDiskIdShort> VDiskIdForOrderNumber;
TTopology(TBlobStorageGroupType gtype);
- TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain);
+ TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain,
+ bool finalize = false);
TTopology(const TTopology&) = delete;
TTopology &operator =(const TTopology&) = delete;
TTopology(TTopology&&) = default;
@@ -221,7 +222,9 @@ public:
// get the total number of VDisks in the blobstorage group
ui32 GetTotalVDisksNum() const { return TotalVDisks; }
// get number of VDisks per fail domain
- ui32 GetNumVDisksPerFailDomain() const;
+ ui32 GetNumVDisksPerFailDomain() const { return FailRealms[0].FailDomains[0].VDisks.size(); }
+ // get number of fail domains per fail realm
+ ui32 GetNumFailDomainsPerFailRealm() const { return FailRealms[0].FailDomains.size(); }
// get quorum checker
const IQuorumChecker& GetQuorumChecker() const { return *QuorumChecker; }
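
The new finalize flag lets callers obtain a fully usable topology in one step instead of calling FinalizeConstruction() separately. A minimal usage sketch, not part of this patch; the mirror-3-dc-like geometry and variable names are illustrative:

    // Build a finalized 3x3x1 topology and query its dimensions.
    TBlobStorageGroupInfo::TTopology topology(
        TBlobStorageGroupType(TBlobStorageGroupType::ErasureMirror3dc),
        3,     // numFailRealms
        3,     // numFailDomainsPerFailRealm
        1,     // numVDisksPerFailDomain
        true); // finalize -- runs FinalizeConstruction(), so order numbers work immediately
    const ui32 perRealm  = topology.GetNumFailDomainsPerFailRealm(); // 3
    const ui32 perDomain = topology.GetNumVDisksPerFailDomain();     // 1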
diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
index 9e17730c3d6..f74e2d48413 100644
--- a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
+++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
@@ -98,6 +98,6 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
env.Sim(TDuration::Minutes(15));
auto corrected = getGroupsWithIncorrectLayout();
Cerr << "bad groups after shuffling and fixing# " << FormatList(corrected) << Endl;
-// UNIT_ASSERT(corrected.empty());
+ UNIT_ASSERT(corrected.empty());
}
}
diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp
index bb44c7fbb04..bd9f83d86d6 100644
--- a/ydb/core/mind/bscontroller/bsc.cpp
+++ b/ydb/core/mind/bscontroller/bsc.cpp
@@ -152,7 +152,7 @@ void TBlobStorageController::Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev) {
const bool initial = !HostRecords;
HostRecords = std::make_shared<THostRecordMap::element_type>(ev->Get());
Schedule(TDuration::Minutes(5), new TEvPrivate::TEvHostRecordsTimeToLiveExceeded);
- TActivationContext::Send(ev->Forward(SelfHealId));
+ Send(SelfHealId, new TEvPrivate::TEvUpdateHostRecords(HostRecords));
if (initial) {
Execute(CreateTxInitScheme());
}
diff --git a/ydb/core/mind/bscontroller/group_geometry_info.h b/ydb/core/mind/bscontroller/group_geometry_info.h
index 5d37a0dfd2c..a74698f1f84 100644
--- a/ydb/core/mind/bscontroller/group_geometry_info.h
+++ b/ydb/core/mind/bscontroller/group_geometry_info.h
@@ -114,6 +114,10 @@ namespace NKikimr::NBsController {
return true;
}
+ TBlobStorageGroupType GetType() const {
+ return Type;
+ }
+
TBlobStorageGroupType::EErasureSpecies GetErasure() const {
return Type.GetErasure();
}
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp
index cf947e16741..8ab76e3e4f5 100644
--- a/ydb/core/mind/bscontroller/group_layout_checker.cpp
+++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp
@@ -10,26 +10,25 @@ namespace NKikimr::NBsController {
return {};
}
- TGroupLayout group(geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm());
+ TBlobStorageGroupInfo::TTopology topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(),
+ geom.GetNumVDisksPerFailDomain(), true);
+ TGroupLayout group(topology);
TDomainMapper mapper;
THashMap<TVDiskIdShort, TPDiskLayoutPosition> map;
for (const auto& [vdiskId, p] : layout) {
const auto& [location, pdiskId] = p;
TPDiskLayoutPosition pos(mapper, location, pdiskId, geom);
- group.AddDisk(pos, vdiskId.FailRealm, vdiskId.FailDomain);
+ group.AddDisk(pos, topology.GetOrderNumber(vdiskId));
map.emplace(vdiskId, pos);
}
std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard;
for (const auto& [vdiskId, pos] : map) {
- scoreboard.emplace_back(group.GetCandidateScore(pos, vdiskId.FailRealm, vdiskId.FailDomain), vdiskId);
+ scoreboard.emplace_back(group.GetCandidateScore(pos, topology.GetOrderNumber(vdiskId)), vdiskId);
}
auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; };
std::sort(scoreboard.begin(), scoreboard.end(), comp1);
- for (const auto& [score, vdiskId] : scoreboard) {
- Cerr << vdiskId << "@" << map[vdiskId].ToString() << " -> " << score.ToString() << Endl;
- }
auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); };
std::sort(scoreboard.begin(), scoreboard.end(), comp);
@@ -37,7 +36,6 @@ namespace NKikimr::NBsController {
const auto reference = scoreboard.back().first;
if (!reference.SameAs({})) { // not perfectly correct layout
for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) {
- Cerr << "candidate# " << scoreboard.back().second << Endl;
res.Candidates.push_back(scoreboard.back().second);
}
}
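
Moving from (FailRealm, FailDomain) pairs to flat order numbers relies on the topology's slot layout. A sketch of the assumed mapping, consistent with GetOrderNumber and GetFailDomainOrderNumber used above; the helper name is hypothetical:

    // Assumed realm-major order-number layout:
    //   orderNumber = (failRealm * domainsPerRealm + failDomain) * vdisksPerDomain + vdisk
    ui32 OrderNumberOf(const TVDiskIdShort& v, ui32 domainsPerRealm, ui32 vdisksPerDomain) {
        return (v.FailRealm * domainsPerRealm + v.FailDomain) * vdisksPerDomain + v.VDisk;
    }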
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h
index 2319d83515c..407f0b7c7f7 100644
--- a/ydb/core/mind/bscontroller/group_layout_checker.h
+++ b/ydb/core/mind/bscontroller/group_layout_checker.h
@@ -155,7 +155,7 @@ namespace NKikimr::NBsController {
};
struct TGroupLayout {
- const ui32 NumFailDomainsPerFailRealm;
+ const TBlobStorageGroupInfo::TTopology& Topology;
ui32 NumDisks = 0;
THashMap<TEntityId, ui32> NumDisksPerRealmGroup;
@@ -168,45 +168,54 @@ namespace NKikimr::NBsController {
TStackVec<THashMap<TEntityId, ui32>, 32> NumDisksPerDomain;
THashMap<TEntityId, ui32> NumDisksPerDomainTotal;
- TGroupLayout(ui32 numFailRealms, ui32 numFailDomainsPerFailRealm)
- : NumFailDomainsPerFailRealm(numFailDomainsPerFailRealm)
- , NumDisksInRealm(numFailRealms)
- , NumDisksPerRealm(numFailRealms)
- , NumDisksInDomain(numFailRealms * numFailDomainsPerFailRealm)
- , NumDisksPerDomain(numFailRealms * numFailDomainsPerFailRealm)
+ TGroupLayout(const TBlobStorageGroupInfo::TTopology& topology)
+ : Topology(topology)
+ , NumDisksInRealm(Topology.GetTotalFailRealmsNum())
+ , NumDisksPerRealm(Topology.GetTotalFailRealmsNum())
+ , NumDisksInDomain(Topology.GetTotalFailDomainsNum())
+ , NumDisksPerDomain(Topology.GetTotalFailDomainsNum())
{}
- void UpdateDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx, ui32 value) {
- domainIdx += realmIdx * NumFailDomainsPerFailRealm;
+ void UpdateDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber, ui32 value) {
NumDisks += value;
NumDisksPerRealmGroup[pos.RealmGroup] += value;
- NumDisksInRealm[realmIdx] += value;
- NumDisksPerRealm[realmIdx][pos.Realm] += value;
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+ NumDisksInRealm[vdisk.FailRealm] += value;
+ NumDisksPerRealm[vdisk.FailRealm][pos.Realm] += value;
NumDisksPerRealmTotal[pos.Realm] += value;
+ const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk);
NumDisksInDomain[domainIdx] += value;
NumDisksPerDomain[domainIdx][pos.Domain] += value;
NumDisksPerDomainTotal[pos.Domain] += value;
}
- void AddDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
- UpdateDisk(pos, realmIdx, domainIdx, 1);
+ void AddDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+ UpdateDisk(pos, orderNumber, 1);
}
- void RemoveDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
- UpdateDisk(pos, realmIdx, domainIdx, Max<ui32>());
+ void RemoveDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+ UpdateDisk(pos, orderNumber, Max<ui32>());
}
- TScore GetCandidateScore(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
- domainIdx += realmIdx * NumFailDomainsPerFailRealm;
+ TScore GetCandidateScore(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+ const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk);
return {
- .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - NumDisksPerRealm[realmIdx][pos.Realm],
+ .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm],
.DomainInterlace = NumDisksPerDomainTotal[pos.Domain] - NumDisksPerDomain[domainIdx][pos.Domain],
.RealmGroupScatter = NumDisks - NumDisksPerRealmGroup[pos.RealmGroup],
- .RealmScatter = NumDisksInRealm[realmIdx] - NumDisksPerRealm[realmIdx][pos.Realm],
+ .RealmScatter = NumDisksInRealm[vdisk.FailRealm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm],
.DomainScatter = NumDisksInDomain[domainIdx] - NumDisksPerDomain[domainIdx][pos.Domain],
};
}
+
+ TScore GetExcludedDiskScore(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+ RemoveDisk(pos, orderNumber);
+ const TScore score = GetCandidateScore(pos, orderNumber);
+ AddDisk(pos, orderNumber);
+ return score;
+ }
};
} // NLayoutChecker
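
A hedged usage sketch of the reworked TGroupLayout interface (topology construction as in blobstorage_groupinfo.h above; gtype, pos, vdiskId and orderNumber are placeholders). Note that RemoveDisk passes Max<ui32>() to UpdateDisk, relying on unsigned wraparound to decrement the counters, and that GetExcludedDiskScore scores a slot as if its own disk were absent, so a disk never counts against itself:

    TBlobStorageGroupInfo::TTopology topology(gtype, 3, 3, 1, true);
    NLayoutChecker::TGroupLayout layout(topology);
    layout.AddDisk(pos, topology.GetOrderNumber(vdiskId));     // account for an occupied slot
    TScore a = layout.GetCandidateScore(pos, orderNumber);     // score a prospective placement
    TScore b = layout.GetExcludedDiskScore(pos, orderNumber);  // remove, score, re-add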
diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp
index 967aeafaa15..5619534fb3a 100644
--- a/ydb/core/mind/bscontroller/group_mapper.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper.cpp
@@ -60,94 +60,69 @@ namespace NKikimr::NBsController {
struct TAllocator {
TImpl& Self;
- const ui32 NumFailRealms;
- const ui32 NumFailDomainsPerFailRealm;
- const ui32 NumFailDomainsTotal;
- const ui32 NumVDisksPerFailDomain;
- const ui32 GroupSize;
- TStackVec<ui8, 32> RealmIdx;
- TStackVec<ui8, 32> DomainIdx;
- TStackVec<ui8, 32> DomainThroughIdx;
- TStackVec<ui8, 32> VDiskIdx;
+ const TBlobStorageGroupInfo::TTopology Topology;
THashSet<TPDiskId> OldGroupContent; // set of all existing disks in the group, including ones which are replaced
const i64 RequiredSpace;
const bool RequireOperational;
TForbiddenPDisks ForbiddenDisks;
THashMap<ui32, unsigned> LocalityFactor;
TGroupLayout GroupLayout;
- std::optional<TScore> BestScore;
+ std::optional<TScore> WorstScore;
TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks)
: Self(self)
- , NumFailRealms(geom.GetNumFailRealms())
- , NumFailDomainsPerFailRealm(geom.GetNumFailDomainsPerFailRealm())
- , NumFailDomainsTotal(NumFailRealms * NumFailDomainsPerFailRealm)
- , NumVDisksPerFailDomain(geom.GetNumVDisksPerFailDomain())
- , GroupSize(NumFailDomainsTotal * NumVDisksPerFailDomain)
- , RealmIdx(GroupSize)
- , DomainIdx(GroupSize)
- , DomainThroughIdx(GroupSize)
- , VDiskIdx(GroupSize)
+ , Topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), geom.GetNumVDisksPerFailDomain(), true)
, RequiredSpace(requiredSpace)
, RequireOperational(requireOperational)
, ForbiddenDisks(std::move(forbiddenDisks))
- , GroupLayout(NumFailRealms, NumFailDomainsPerFailRealm)
+ , GroupLayout(Topology)
{
for (const auto& [vdiskId, pdiskId] : replacedDisks) {
OldGroupContent.insert(pdiskId);
}
- for (ui32 index = 0, domainThroughIdx = 0, realmIdx = 0; realmIdx < NumFailRealms; ++realmIdx) {
- for (ui32 domainIdx = 0; domainIdx < NumFailDomainsPerFailRealm; ++domainIdx, ++domainThroughIdx) {
- for (ui32 vdiskIdx = 0; vdiskIdx < NumVDisksPerFailDomain; ++vdiskIdx, ++index) {
- RealmIdx[index] = realmIdx;
- DomainIdx[index] = domainIdx;
- DomainThroughIdx[index] = domainThroughIdx;
- VDiskIdx[index] = vdiskIdx;
- }
- }
- }
}
TGroup ProcessExistingGroup(const TGroupDefinition& group, TString& error) {
- TGroup res(GroupSize);
-
- ui32 index = 0;
- for (const auto& realm : group) {
- for (const auto& domain : realm) {
- for (const auto& pdiskId : domain) {
- if (pdiskId != TPDiskId()) {
- const auto it = Self.PDisks.find(pdiskId);
- if (it == Self.PDisks.end()) {
- error = TStringBuilder() << "existing group contains missing PDiskId# " << pdiskId;
- return {};
- }
- TPDiskInfo& pdisk = it->second;
- res[index] = &pdisk;
-
- const auto [_, inserted] = OldGroupContent.insert(pdiskId);
- if (!inserted) {
- error = TStringBuilder() << "group contains duplicate PDiskId# " << pdiskId;
- return {};
- }
-
- if (!pdisk.Decommitted) {
- AddUsedDisk(pdisk);
- GroupLayout.AddDisk(pdisk.Position, RealmIdx[index], DomainIdx[index]);
- }
+ TGroup res(Topology.GetTotalVDisksNum());
+
+ struct TExError { TString error; };
+
+ try {
+ Traverse(group, [&](TVDiskIdShort vdisk, TPDiskId pdiskId) {
+ if (pdiskId != TPDiskId()) {
+ const ui32 orderNumber = Topology.GetOrderNumber(vdisk);
+
+ const auto it = Self.PDisks.find(pdiskId);
+ if (it == Self.PDisks.end()) {
+ throw TExError{TStringBuilder() << "existing group contains missing PDiskId# " << pdiskId};
}
+ TPDiskInfo& pdisk = it->second;
+ res[orderNumber] = &pdisk;
- ++index;
+ const auto [_, inserted] = OldGroupContent.insert(pdiskId);
+ if (!inserted) {
+ throw TExError{TStringBuilder() << "group contains duplicate PDiskId# " << pdiskId};
+ }
+
+ if (!pdisk.Decommitted) {
+ AddUsedDisk(pdisk);
+ GroupLayout.AddDisk(pdisk.Position, orderNumber);
+ }
}
- }
+ });
+ } catch (const TExError& e) {
+ error = e.error;
+ return {};
}
return res;
}
void Decompose(const TGroup& in, TGroupDefinition& out) {
- for (ui32 i = 0; i < GroupSize; ++i) {
- out[RealmIdx[i]][DomainIdx[i]][VDiskIdx[i]] = in[i]->PDiskId;
+ for (ui32 i = 0; i < in.size(); ++i) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(i);
+ out[vdisk.FailRealm][vdisk.FailDomain][vdisk.VDisk] = in[i]->PDiskId;
}
}
@@ -238,8 +213,8 @@ namespace NKikimr::NBsController {
undo.Log(index, pdisk);
group[index] = pdisk;
AddUsedDisk(*pdisk);
- GroupLayout.AddDisk(pdisk->Position, RealmIdx[index], DomainIdx[index]);
- BestScore.reset(); // invalidate score
+ GroupLayout.AddDisk(pdisk->Position, index);
+ WorstScore.reset(); // invalidate score
}
void Revert(TUndoLog& undo, TGroup& group, size_t until) {
@@ -247,112 +222,118 @@ namespace NKikimr::NBsController {
const auto& item = undo.Items.back();
group[item.Index] = nullptr;
RemoveUsedDisk(*item.PDisk);
- GroupLayout.RemoveDisk(item.PDisk->Position, RealmIdx[item.Index], DomainIdx[item.Index]);
- BestScore.reset(); // invalidate score
+ GroupLayout.RemoveDisk(item.PDisk->Position, item.Index);
+ WorstScore.reset(); // invalidate score
}
}
bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group) {
- // Determine PDisks that fit our requirements (including score).
- auto set = SetupMatchingDisks(maxScore);
-
- // Determine what we have to fill in -- full group, some realms, domains, or just some cells.
- bool emptyGroup = true;
-
- TDynBitMap emptyRealms;
- emptyRealms.Set(0, NumFailRealms);
-
- TDynBitMap emptyDomains;
- emptyDomains.Set(0, NumFailDomainsTotal);
-
- TDynBitMap emptyDisks;
- emptyDisks.Set(0, GroupSize);
-
- for (ui32 i = 0; i < GroupSize; ++i) {
- if (group[i]) {
- emptyGroup = false;
- emptyRealms[RealmIdx[i]] = false;
- emptyDomains[DomainThroughIdx[i]] = false;
- emptyDisks[i] = false;
+ // determine PDisks that fit our requirements (including score)
+ auto v = SetupMatchingDisks(maxScore);
+
+ // find which entities we need to allocate -- whole group, some realms, maybe some domains within specific realms?
+ bool isEmptyGroup = true;
+ std::vector<bool> isEmptyRealm(Topology.GetTotalFailRealmsNum(), true);
+ std::vector<bool> isEmptyDomain(Topology.GetTotalFailDomainsNum(), true);
+ for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) {
+ if (group[orderNumber]) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+ isEmptyGroup = false;
+ isEmptyRealm[vdisk.FailRealm] = false;
+ const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk);
+ isEmptyDomain[domainIdx] = false;
}
}
- // Allocate new full group and exit if it is absolutely empty.
auto allocate = [&](auto what, ui32 index) {
- TDiskRange fullRange(set.begin(), set.end());
TDynBitMap forbiddenEntities;
forbiddenEntities.Reserve(Self.DomainMapper.GetIdCount());
- if (!AllocateWholeEntity(what, group, undo, index, fullRange, forbiddenEntities)) {
+ if (!AllocateWholeEntity(what, group, undo, index, {v.begin(), v.end()}, forbiddenEntities)) {
Revert(undo, group, 0);
return false;
}
return true;
};
- if (emptyGroup) {
+ if (isEmptyGroup) {
return allocate(TAllocateWholeGroup(), 0);
}
- // Fill in missing fail realms.
- for (ui32 i = emptyRealms.FirstNonZeroBit(); i != emptyRealms.Size(); i = emptyRealms.NextNonZeroBit(i)) {
- if (!allocate(TAllocateWholeRealm(), i)) {
- return false;
- }
+ const ui32 numFailDomainsPerFailRealm = Topology.GetNumFailDomainsPerFailRealm();
+ const ui32 numVDisksPerFailDomain = Topology.GetNumVDisksPerFailDomain();
+ ui32 domainOrderNumber = 0;
+ ui32 orderNumber = 0;
- // remove excessive domains and disk from the set
- emptyDomains.Reset(i * NumFailDomainsPerFailRealm, (i + 1) * NumFailDomainsPerFailRealm);
- emptyDisks.Reset(i * NumFailDomainsPerFailRealm * NumVDisksPerFailDomain,
- (i + 1) * NumFailDomainsPerFailRealm * NumVDisksPerFailDomain);
- }
-
- // Fill in missing fail domains in some partially filled realms.
- for (ui32 i = emptyDomains.FirstNonZeroBit(); i != emptyDomains.Size(); i = emptyDomains.NextNonZeroBit(i)) {
- if (!allocate(TAllocateWholeDomain(), i)) {
- return false;
+ // scan all fail realms and allocate missing realms or their parts
+ for (ui32 failRealmIdx = 0; failRealmIdx < isEmptyRealm.size(); ++failRealmIdx) {
+ if (isEmptyRealm[failRealmIdx]) {
+ // we have an empty realm -- we have to allocate it fully
+ if (!allocate(TAllocateWholeRealm(), failRealmIdx)) {
+ return false;
+ }
+ // skip to next realm
+ domainOrderNumber += numFailDomainsPerFailRealm;
+ orderNumber += numVDisksPerFailDomain * numFailDomainsPerFailRealm;
+ continue;
}
- // remove excessive disks
- emptyDisks.Reset(i * NumVDisksPerFailDomain, (i + 1) * NumVDisksPerFailDomain);
- }
+ // scan through domains of this realm, find unallocated ones
+ for (ui32 failDomainIdx = 0; failDomainIdx < numFailDomainsPerFailRealm; ++failDomainIdx, ++domainOrderNumber) {
+ if (isEmptyDomain[domainOrderNumber]) {
+ // try to allocate full domain
+ if (!allocate(TAllocateWholeDomain(), domainOrderNumber)) {
+ return false;
+ }
+ // skip to next domain
+ orderNumber += numVDisksPerFailDomain;
+ continue;
+ }
- // Fill in missing disk cells.
- for (ui32 i = emptyDisks.FirstNonZeroBit(); i != emptyDisks.Size(); i = emptyDisks.NextNonZeroBit(i)) {
- if (!allocate(TAllocateDisk(), i)) {
- return false;
+ // scan individual disks of the domain and fill gaps
+ for (ui32 vdiskIdx = 0; vdiskIdx < numVDisksPerFailDomain; ++vdiskIdx, ++orderNumber) {
+ if (!group[orderNumber] && !allocate(TAllocateDisk(), orderNumber)) {
+ return false;
+ }
+ }
}
}
+ Y_VERIFY(domainOrderNumber == Topology.GetTotalFailDomainsNum());
+ Y_VERIFY(orderNumber == Topology.GetTotalVDisksNum());
+
return true;
}
+ using TAllocateResult = TPDiskLayoutPosition*;
+
struct TAllocateDisk {};
struct TAllocateWholeDomain {
- static constexpr auto EntityCount = &TAllocator::NumVDisksPerFailDomain;
- static constexpr auto PositionItem = &TPDiskLayoutPosition::Domain;
+ static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetNumVDisksPerFailDomain;
using TNestedEntity = TAllocateDisk;
- static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+ scope = x.Domain;
return {x, x};
}
};
struct TAllocateWholeRealm {
- static constexpr auto EntityCount = &TAllocator::NumFailDomainsPerFailRealm;
- static constexpr auto PositionItem = &TPDiskLayoutPosition::Realm;
+ static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetNumFailDomainsPerFailRealm;
using TNestedEntity = TAllocateWholeDomain;
- static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+ scope = x.Realm;
return {{x.RealmGroup, x.Realm, TEntityId::Min()}, {x.RealmGroup, x.Realm, TEntityId::Max()}};
}
};
struct TAllocateWholeGroup {
- static constexpr auto EntityCount = &TAllocator::NumFailRealms;
- static constexpr auto PositionItem = &TPDiskLayoutPosition::RealmGroup;
+ static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetTotalFailRealmsNum;
using TNestedEntity = TAllocateWholeRealm;
- static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+ scope = x.RealmGroup;
return {{x.RealmGroup, TEntityId::Min(), TEntityId::Min()}, {x.RealmGroup, TEntityId::Max(), TEntityId::Max()}};
}
};
@@ -360,113 +341,125 @@ namespace NKikimr::NBsController {
using TDiskRange = std::pair<TPDiskByPosition::const_iterator, TPDiskByPosition::const_iterator>;
template<typename T>
- TPDiskLayoutPosition *AllocateWholeEntity(T, TGroup& group, TUndoLog& undo, ui32 parentEntityIndex,
- TDiskRange range, TDynBitMap& forbiddenEntities) {
- const TDiskRange originalRange(range);
+ TAllocateResult AllocateWholeEntity(T, TGroup& group, TUndoLog& undo, ui32 parentEntityIndex, TDiskRange range,
+ TDynBitMap& forbiddenEntities) {
+ // number of enclosed child entities within this one
+ const ui32 entityCount = (Topology.*T::GetEntityCount)();
+ Y_VERIFY(entityCount);
+ parentEntityIndex *= entityCount;
+ // remember current undo stack size
const size_t undoPosition = undo.GetPosition();
- TPDiskLayoutPosition *prefix = nullptr;
- TEntityId currentEntityId = TEntityId::Max();
- for (ui32 index = 0, num = this->*T::EntityCount; index < num; ) {
- // allocate nested entity
- prefix = AllocateWholeEntity(typename T::TNestedEntity(), group, undo,
- parentEntityIndex * num + index, range, forbiddenEntities);
- if (prefix) {
- if (!index) {
- currentEntityId = prefix->*T::PositionItem;
- auto [min, max] = T::MakeRange(*prefix);
- range.first = std::lower_bound(range.first, range.second, min, TComparePDiskByPosition());
- range.second = std::upper_bound(range.first, range.second, max, TComparePDiskByPosition());
+
+ for (;;) {
+ auto [from, to] = range;
+ TPDiskLayoutPosition *prefix;
+ TEntityId scope;
+
+ for (ui32 index = 0;; ++index) {
+ // allocate nested entity
+ prefix = AllocateWholeEntity(typename T::TNestedEntity(), group, undo, parentEntityIndex + index,
+ {from, to}, forbiddenEntities);
+
+ if (prefix) {
+ if (!index) {
+ // reduce range to specific realm/domain entity
+ auto [min, max] = T::MakeRange(*prefix, scope);
+ from = std::lower_bound(from, to, min, TComparePDiskByPosition());
+ to = std::upper_bound(from, to, max, TComparePDiskByPosition());
+ }
+ if (index + 1 == entityCount) {
+ // disable filled entity from further selection if it was really allocated
+ forbiddenEntities.Set(scope.Index());
+ return prefix;
+ }
+ } else if (index) {
+ // disable just checked entity (to prevent its selection again)
+ forbiddenEntities.Set(scope.Index());
+ // try another entity at this level
+ Revert(undo, group, undoPosition);
+ // break the loop and retry
+ break;
+ } else {
+ // no chance to allocate new entity, exit
+ return {};
}
- ++index;
- } else if (index) {
- // disable just checked entity (to prevent its selection again)
- Y_VERIFY(currentEntityId != TEntityId::Max());
- forbiddenEntities.Set(currentEntityId.Index());
- // try another entity at this level
- Revert(undo, group, undoPosition);
- // revert original wide range and start from the beginning
- range = originalRange;
- index = 0;
- currentEntityId = TEntityId::Max();
- } else {
- // no chance to allocate new entity, exit
- return nullptr;
}
}
- // disable filled entity from further selection
- Y_VERIFY(prefix && currentEntityId != TEntityId::Max());
- forbiddenEntities.Set(currentEntityId.Index());
- return prefix;
}
- TPDiskLayoutPosition *AllocateWholeEntity(TAllocateDisk, TGroup& group, TUndoLog& undo, ui32 index,
- TDiskRange range, TDynBitMap& forbiddenEntities) {
- TPDiskInfo *pdisk = nullptr;
+ TAllocateResult AllocateWholeEntity(TAllocateDisk, TGroup& group, TUndoLog& undo, ui32 index, TDiskRange range,
+ TDynBitMap& forbiddenEntities) {
+ TPDiskInfo *pdisk = group[index];
+ Y_VERIFY(!pdisk);
auto process = [this, &pdisk](TPDiskInfo *candidate) {
if (!pdisk || DiskIsBetter(*candidate, *pdisk)) {
pdisk = candidate;
}
};
- FindMatchingDiskBasedOnScore(process, group, RealmIdx[index], DomainIdx[index],
- range, forbiddenEntities);
+ FindMatchingDiskBasedOnScore(process, group, index, range, forbiddenEntities);
if (pdisk) {
AddDiskViaUndoLog(undo, group, index, pdisk);
pdisk->Matching = false;
return &pdisk->Position;
} else {
- return nullptr;
+ return {};
}
}
- TScore CalculateBestScoreWithCache(const TGroup& group) {
- if (!BestScore) {
+ TScore CalculateWorstScoreWithCache(const TGroup& group) {
+ if (!WorstScore) {
// find the worst disk from a position of layout correctness and use it as a milestone for other
// disks -- they can't be misplaced worse
- TScore bestScore;
- for (ui32 i = 0; i < GroupSize; ++i) {
+ TScore worstScore;
+ for (ui32 i = 0; i < Topology.GetTotalVDisksNum(); ++i) {
if (TPDiskInfo *pdisk = group[i]; pdisk && !pdisk->Decommitted) {
- TScore score = GroupLayout.GetCandidateScore(pdisk->Position, RealmIdx[i],
- DomainIdx[i]);
- if (bestScore.BetterThan(score)) {
- bestScore = score;
+ // calculate score for this pdisk, removing it from the set first -- to prevent counting itself
+ const TScore score = GroupLayout.GetExcludedDiskScore(pdisk->Position, i);
+ if (worstScore.BetterThan(score)) {
+ worstScore = score;
}
}
}
- BestScore = bestScore;
+ WorstScore = worstScore;
}
- return *BestScore;
+ return *WorstScore;
}
template<typename TCallback>
- void FindMatchingDiskBasedOnScore(TCallback&& cb, const TGroup& group, ui32 failRealmIdx, ui32 failDomainIdx,
- TDiskRange range, TDynBitMap& forbiddenEntities) {
- TScore bestScore = CalculateBestScoreWithCache(group);
+ void FindMatchingDiskBasedOnScore(
+ TCallback&& cb, // callback to be invoked for every matching candidate
+ const TGroup& group, // group with peer disks
+ ui32 orderNumber, // order number of disk being allocated
+ TDiskRange range, // range of PDisk candidates to scan
+ TDynBitMap& forbiddenEntities) { // a set of forbidden TEntityId's prevented from allocation
+ // first, find the best score for the current group layout -- we can't make the failure model
+ // inconsistency any worse than it already is
+ TScore bestScore = CalculateWorstScoreWithCache(group);
std::vector<TPDiskInfo*> candidates;
+ // scan the candidate range
while (range.first != range.second) {
const auto& [position, pdisk] = *range.first++;
+ // skip inappropriate disks, whole realm groups, realms and domains
if (!pdisk->Matching) {
- continue;
+ // just do nothing, skip this candidate disk
} else if (forbiddenEntities[position.RealmGroup.Index()]) {
range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealmGroup - 1);
- continue;
} else if (forbiddenEntities[position.Realm.Index()]) {
range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealm - 1);
- continue;
} else if (forbiddenEntities[position.Domain.Index()]) {
range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextDomain - 1);
- continue;
- }
-
- TScore score = GroupLayout.GetCandidateScore(position, failRealmIdx, failDomainIdx);
- if (score.BetterThan(bestScore)) {
- candidates.clear();
- candidates.push_back(pdisk);
- bestScore = score;
- } else if (score.SameAs(bestScore)) {
- candidates.push_back(pdisk);
+ } else {
+ const TScore score = GroupLayout.GetCandidateScore(position, orderNumber);
+ if (score.BetterThan(bestScore)) {
+ candidates.clear();
+ bestScore = score;
+ }
+ if (score.SameAs(bestScore)) {
+ candidates.push_back(pdisk);
+ }
}
}
@@ -692,7 +685,7 @@ namespace NKikimr::NBsController {
pdisk.EraseGroup(groupId);
}
ui32 numZero = 0;
- for (ui32 i = 0; i < allocator.GroupSize; ++i) {
+ for (ui32 i = 0; i < allocator.Topology.GetTotalVDisksNum(); ++i) {
if (!group[i]) {
++numZero;
TPDiskInfo *pdisk = result->at(i);
@@ -700,7 +693,7 @@ namespace NKikimr::NBsController {
pdisk->InsertGroup(groupId);
}
}
- Y_VERIFY(numZero == allocator.GroupSize || numZero == replacedDisks.size());
+ Y_VERIFY(numZero == allocator.Topology.GetTotalVDisksNum() || numZero == replacedDisks.size());
allocator.Decompose(*result, groupDefinition);
return true;
} else {
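
The BestScore to WorstScore rename reflects what the cache actually holds: the score of the worst-placed existing disk, used as a baseline that new candidates must not fall below. A condensed sketch of the selection rule implemented above; candidateRange is a placeholder for the scanned PDisk range:

    TScore bestScore = CalculateWorstScoreWithCache(group);  // baseline: worst existing placement
    for (TPDiskInfo *candidate : candidateRange) {
        const TScore score = GroupLayout.GetCandidateScore(candidate->Position, orderNumber);
        if (score.BetterThan(bestScore)) {  // strictly better -- restart the tie set
            candidates.clear();
            bestScore = score;
        }
        if (score.SameAs(bestScore)) {      // ties accumulate
            candidates.push_back(candidate);
        }
    }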
diff --git a/ydb/core/mind/bscontroller/group_mapper.h b/ydb/core/mind/bscontroller/group_mapper.h
index a58e49ab0d1..991a636bf38 100644
--- a/ydb/core/mind/bscontroller/group_mapper.h
+++ b/ydb/core/mind/bscontroller/group_mapper.h
@@ -18,6 +18,19 @@ namespace NKikimr {
using TGroupDefinition = TVector<TVector<TVector<TPDiskId>>>; // Realm/Domain/Disk
using TForbiddenPDisks = std::unordered_set<TPDiskId, THash<TPDiskId>>;
+ template<typename T>
+ static void Traverse(const TGroupDefinition& group, T&& callback) {
+ for (ui32 failRealmIdx = 0; failRealmIdx != group.size(); ++failRealmIdx) {
+ const auto& realm = group[failRealmIdx];
+ for (ui32 failDomainIdx = 0; failDomainIdx != realm.size(); ++failDomainIdx) {
+ const auto& domain = realm[failDomainIdx];
+ for (ui32 vdiskIdx = 0; vdiskIdx != domain.size(); ++vdiskIdx) {
+ callback(TVDiskIdShort(failRealmIdx, failDomainIdx, vdiskIdx), domain[vdiskIdx]);
+ }
+ }
+ }
+ }
+
struct TPDiskRecord {
const TPDiskId PDiskId;
const TNodeLocation Location;
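
A small usage sketch for the new Traverse helper (hypothetical, not part of this patch): counting unallocated slots in a group definition.

    ui32 numEmpty = 0;
    Traverse(group, [&](TVDiskIdShort /*vdisk*/, TPDiskId pdiskId) {
        if (pdiskId == TPDiskId()) { // default-constructed id marks an empty slot
            ++numEmpty;
        }
    });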
diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
index 839d75e27e0..1be38782962 100644
--- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
@@ -945,7 +945,7 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
const auto [it, inserted] = item.Seen.insert(item.Group);
UNIT_ASSERT(inserted);
UNIT_ASSERT(item.Seen.size() <= 9);
- Cerr << "processing path# " << item.Path << Endl;
+ Ctest << "processing path# " << item.Path << Endl;
auto candidates = checkLayout(item.Group);
if (!candidates) {
@@ -957,7 +957,7 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
auto temp = context.ReallocateGroup(mapper, groupId, {pdiskId}, false, false, false);
TString path = TStringBuilder() << item.Path << "/" << (int)vdiskId.FailRealm << ":"
<< (int)vdiskId.FailDomain << ":" << (int)vdiskId.VDisk << "@" << pdiskId;
- Cerr << "path# " << path << Endl;
+ Ctest << "path# " << path << Endl;
context.DumpGroup(temp);
auto vdiskItems = item.VDiskItems;
@@ -973,7 +973,7 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
}
}
- Cerr << Endl;
+ Ctest << Endl;
}
}
}
diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h
index b3db9d483b7..e69265827bb 100644
--- a/ydb/core/mind/bscontroller/impl.h
+++ b/ydb/core/mind/bscontroller/impl.h
@@ -1377,6 +1377,7 @@ private:
EvVSlotReadyUpdate,
EvVSlotNotReadyHistogramUpdate,
EvProcessIncomingEvent,
+ EvUpdateHostRecords,
};
struct TEvUpdateSystemViews : public TEventLocal<TEvUpdateSystemViews, EvUpdateSystemViews> {};
@@ -1390,6 +1391,14 @@ private:
struct TEvScrub : TEventLocal<TEvScrub, EvScrub> {};
struct TEvVSlotReadyUpdate : TEventLocal<TEvVSlotReadyUpdate, EvVSlotReadyUpdate> {};
struct TEvVSlotNotReadyHistogramUpdate : TEventLocal<TEvVSlotNotReadyHistogramUpdate, EvVSlotNotReadyHistogramUpdate> {};
+
+ struct TEvUpdateHostRecords : TEventLocal<TEvUpdateHostRecords, EvUpdateHostRecords> {
+ THostRecordMap HostRecords;
+
+ TEvUpdateHostRecords(THostRecordMap hostRecords)
+ : HostRecords(std::move(hostRecords))
+ {}
+ };
};
static constexpr TDuration UpdateSystemViewsPeriod = TDuration::Seconds(5);
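
Both ends of the new event are visible in this patch; shown together for reference (producer from the bsc.cpp hunk above, consumer from the self_heal.cpp hunk below). Passing the already-built THostRecordMap spares the self-heal actor from re-parsing TEvNodesInfo:

    // producer (TBlobStorageController::Handle, bsc.cpp):
    Send(SelfHealId, new TEvPrivate::TEvUpdateHostRecords(HostRecords));

    // consumer (TSelfHealActor, self_heal.cpp):
    void Handle(TEvPrivate::TEvUpdateHostRecords::TPtr ev) {
        HostRecords = std::move(ev->Get()->HostRecords);
    }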
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index 86ea82a627e..24c6045625e 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -244,7 +244,7 @@ namespace NKikimr::NBsController {
TIntrusiveList<TGroupRecord, TWithInvalidLayout> GroupsWithInvalidLayout;
std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups;
bool GroupLayoutSanitizer = false;
- std::optional<THostRecordMapImpl> HostRecords;
+ THostRecordMap HostRecords;
public:
TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups)
@@ -579,8 +579,8 @@ namespace NKikimr::NBsController {
}
}
- void Handle(TEvInterconnect::TEvNodesInfo::TPtr ev) {
- HostRecords.emplace(ev->Get());
+ void Handle(TEvPrivate::TEvUpdateHostRecords::TPtr ev) {
+ HostRecords = std::move(ev->Get()->HostRecords);
}
STRICT_STFUNC(StateFunc, {
@@ -589,7 +589,7 @@ namespace NKikimr::NBsController {
hFunc(NMon::TEvRemoteHttpInfo, Handle);
hFunc(TEvReassignerDone, Handle);
cFunc(TEvents::TSystem::Wakeup, HandleWakeup);
- hFunc(TEvInterconnect::TEvNodesInfo, Handle);
+ hFunc(TEvPrivate::TEvUpdateHostRecords, Handle);
})
};