diff options
| author | Alexander Rutkovsky <[email protected]> | 2022-04-08 15:27:10 +0300 |
|---|---|---|
| committer | Alexander Rutkovsky <[email protected]> | 2022-04-08 15:27:10 +0300 |
| commit | 0db69be232f1d207c45edeb606df5f6223a89ba7 (patch) | |
| tree | 158de59a8c8a5779b3b55817a6b22033f6477a33 | |
| parent | f22d9cd81bf3f86db7b45ecabd93397349d88263 (diff) | |
Add group layout sanitizer feature KIKIMR-14580
ref:ee11bc2fb183c18c214c9b4153d83f0b0d3920d7
25 files changed, 1245 insertions, 680 deletions
diff --git a/ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt b/ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt index 46272ff7922..4af5291bff6 100644 --- a/ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt +++ b/ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt @@ -39,6 +39,7 @@ target_sources(ydb-core-blobstorage-ut_blobstorage PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/incorrect_queries.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/main.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/mirror3of4.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/space_check.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/sync.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/replication.cpp diff --git a/ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt b/ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt index 0c713121adf..c7a8da56f7d 100644 --- a/ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt +++ b/ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt @@ -42,6 +42,7 @@ target_sources(ydb-core-blobstorage-ut_blobstorage PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/incorrect_queries.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/main.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/mirror3of4.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/space_check.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/sync.cpp ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/replication.cpp diff --git a/ydb/core/blobstorage/ut_blobstorage/lib/env.h b/ydb/core/blobstorage/ut_blobstorage/lib/env.h index e3dc7cc8f4b..685594fa4c7 100644 --- a/ydb/core/blobstorage/ut_blobstorage/lib/env.h +++ b/ydb/core/blobstorage/ut_blobstorage/lib/env.h @@ -30,6 +30,7 @@ struct TEnvironmentSetup { const ui32 ControllerNodeId = 1; const bool Cache = false; const ui32 NumDataCenters = 0; + const std::function<TNodeLocation(ui32)> LocationGenerator; }; const TSettings Settings; @@ -108,7 +109,11 @@ struct TEnvironmentSetup { Runtime->Start(); auto *appData = Runtime->GetAppData(); appData->DomainsInfo->AddDomain(TDomainsInfo::TDomain::ConstructEmptyDomain("dom", DomainId).Release()); - Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId); + if (Settings.LocationGenerator) { + Runtime->SetupTabletRuntime(Settings.LocationGenerator, Settings.ControllerNodeId); + } else { + Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId); + } SetupStaticStorage(); SetupTablet(); SetupStorage(); @@ -120,7 +125,11 @@ struct TEnvironmentSetup { void StartNode(ui32 nodeId) { Runtime->StartNode(nodeId); - Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId, nodeId); + if (Settings.LocationGenerator) { + Runtime->SetupTabletRuntime(Settings.LocationGenerator, Settings.ControllerNodeId, nodeId); + } else { + Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId, nodeId); + } if (nodeId == Settings.ControllerNodeId) { SetupStaticStorage(); SetupTablet(); @@ -553,12 +562,13 @@ struct TEnvironmentSetup { } } - void UpdateSettings(bool selfHeal, bool donorMode) { + void UpdateSettings(bool selfHeal, bool donorMode, bool groupLayoutSanitizer = false) { NKikimrBlobStorage::TConfigRequest request; auto *cmd = request.AddCommand(); auto *us = cmd->MutableUpdateSettings(); us->AddEnableSelfHeal(selfHeal); us->AddEnableDonorMode(donorMode); + us->AddEnableGroupLayoutSanitizer(groupLayoutSanitizer); auto response = Invoke(request); UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription()); } diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp new file mode 100644 index 00000000000..9e17730c3d6 --- /dev/null +++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp @@ -0,0 +1,103 @@ +#include <ydb/core/blobstorage/ut_blobstorage/lib/env.h> + +Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { + Y_UNIT_TEST(Test3dc) { + const ui32 numRacks = 15; + std::vector<ui32> nodesPerRack(numRacks); + std::vector<ui32> nodeToRack; + for (ui32 numFilledRacks = 0; numFilledRacks < numRacks; ) { +// const ui32 rackId = RandomNumber(numRacks); + const ui32 rackId = numFilledRacks; + nodeToRack.emplace_back(rackId); + numFilledRacks += !nodesPerRack[rackId]++; + } + const ui32 numDatacenters = 3; + std::vector<ui32> rackToDatacenter; + for (ui32 i = 0; i < numRacks; ++i) { + rackToDatacenter.push_back(i % numDatacenters); + } + + std::vector<TNodeLocation> locations; + for (ui32 i = 0; i < nodeToRack.size(); ++i) { + NActorsInterconnect::TNodeLocation proto; + proto.SetDataCenter(ToString(rackToDatacenter[nodeToRack[i]])); + proto.SetRack(ToString(nodeToRack[i])); + proto.SetUnit(ToString(i)); + locations.emplace_back(proto); + } + + TEnvironmentSetup env{{ + .NodeCount = (ui32)nodeToRack.size(), + .Erasure = TBlobStorageGroupType::ErasureMirror3dc, + .LocationGenerator = [&](ui32 nodeId) { return locations[nodeId - 1]; }, + }}; + + auto getGroupsWithIncorrectLayout = [&] { + auto config = env.FetchBaseConfig(); + + std::map<ui32, std::tuple<TString, TString>> nodeIdToLocation; + for (const auto& node : config.GetNode()) { + const auto& location = node.GetLocation(); + nodeIdToLocation.emplace(node.GetNodeId(), std::make_tuple(location.GetDataCenter(), location.GetRack())); + } + + std::map<ui32, std::vector<std::vector<std::tuple<TString, TString>>>> groups; + for (const auto& vslot : config.GetVSlot()) { + auto& group = groups[vslot.GetGroupId()]; + if (group.empty()) { + group.resize(3, {3, {"", ""}}); + } + group[vslot.GetFailRealmIdx()][vslot.GetFailDomainIdx()] = nodeIdToLocation[vslot.GetVSlotId().GetNodeId()]; + } + + std::set<ui32> badGroups; + + for (auto& [groupId, group] : groups) { + std::set<TString> usedRealms; + + for (const auto& row : group) { + TString realm; + std::set<TString> usedRacks; + + for (const auto& [dc, rack] : row) { + Y_VERIFY(dc && rack); + + if (!usedRacks.insert(rack).second) { + badGroups.insert(groupId); + } + + if (!realm) { + if (!usedRealms.insert(dc).second) { + badGroups.insert(groupId); + } + realm = dc; + } else if (realm != dc) { + badGroups.insert(groupId); + } + } + } + } + + return badGroups; + }; + + const ui32 disksPerNode = 1; + const ui32 slotsPerDisk = 3; + env.CreateBoxAndPool(disksPerNode, nodeToRack.size() * disksPerNode * slotsPerDisk / 9); + env.Sim(TDuration::Seconds(30)); + auto before = getGroupsWithIncorrectLayout(); + Cerr << "bad groups before shuffling# " << FormatList(before) << Endl; + UNIT_ASSERT(before.empty()); + env.Cleanup(); + std::random_shuffle(locations.begin(), locations.end()); + env.Initialize(); + env.Sim(TDuration::Seconds(100)); + auto after = getGroupsWithIncorrectLayout(); + Cerr << "bad groups just after shuffling# " << FormatList(after) << Endl; + env.UpdateSettings(true, false, true); + env.Sim(TDuration::Minutes(15)); + auto corrected = getGroupsWithIncorrectLayout(); + Cerr << "bad groups after shuffling and fixing# " << FormatList(corrected) << Endl; +// UNIT_ASSERT(corrected.empty()); + } +} diff --git a/ydb/core/mind/bscontroller/CMakeLists.txt b/ydb/core/mind/bscontroller/CMakeLists.txt index d897fd44761..05311af9ab1 100644 --- a/ydb/core/mind/bscontroller/CMakeLists.txt +++ b/ydb/core/mind/bscontroller/CMakeLists.txt @@ -37,6 +37,7 @@ target_sources(core-mind-bscontroller PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/drop_donor.cpp ${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/get_group.cpp ${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/grouper.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/group_layout_checker.cpp ${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/group_mapper.cpp ${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/group_reconfigure_wipe.cpp ${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/init_scheme.cpp diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp index 779381fd513..bb44c7fbb04 100644 --- a/ydb/core/mind/bscontroller/bsc.cpp +++ b/ydb/core/mind/bscontroller/bsc.cpp @@ -116,7 +116,7 @@ void TBlobStorageController::OnActivateExecutor(const TActorContext&) { } // create self-heal actor - SelfHealId = Register(CreateSelfHealActor(TabletID(), SelfHealUnreassignableGroups)); + SelfHealId = Register(CreateSelfHealActor()); // create stat processor StatProcessorActorId = Register(CreateStatProcessorActor()); @@ -152,6 +152,7 @@ void TBlobStorageController::Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev) { const bool initial = !HostRecords; HostRecords = std::make_shared<THostRecordMap::element_type>(ev->Get()); Schedule(TDuration::Minutes(5), new TEvPrivate::TEvHostRecordsTimeToLiveExceeded); + TActivationContext::Send(ev->Forward(SelfHealId)); if (initial) { Execute(CreateTxInitScheme()); } diff --git a/ydb/core/mind/bscontroller/config_cmd.cpp b/ydb/core/mind/bscontroller/config_cmd.cpp index 2fc925a9cd7..f4bb7d5aa1c 100644 --- a/ydb/core/mind/bscontroller/config_cmd.cpp +++ b/ydb/core/mind/bscontroller/config_cmd.cpp @@ -126,6 +126,13 @@ namespace NKikimr::NBsController { Self->PDiskSpaceColorBorder = static_cast<T::PDiskSpaceColorBorder::Type>(value); db.Table<T>().Key(true).Update<T::PDiskSpaceColorBorder>(Self->PDiskSpaceColorBorder); } + for (bool value : settings.GetEnableGroupLayoutSanitizer()) { + Self->GroupLayoutSanitizer = value; + db.Table<T>().Key(true).Update<T::GroupLayoutSanitizer>(Self->GroupLayoutSanitizer); + auto ev = std::make_unique<TEvControllerUpdateSelfHealInfo>(); + ev->GroupLayoutSanitizer = Self->GroupLayoutSanitizer; + Self->Send(Self->SelfHealId, ev.release()); + } return true; } diff --git a/ydb/core/mind/bscontroller/config_fit_groups.cpp b/ydb/core/mind/bscontroller/config_fit_groups.cpp index c7b2f18384e..dd1513b3549 100644 --- a/ydb/core/mind/bscontroller/config_fit_groups.cpp +++ b/ydb/core/mind/bscontroller/config_fit_groups.cpp @@ -51,7 +51,7 @@ namespace NKikimr { for (ui64 reserve = 0; reserve < min || (reserve - min) * 1000000 / Max<ui64>(1, total) < part; ++reserve, ++total) { TGroupMapper::TGroupDefinition group; try { - AllocateGroup(0, group, nullptr, 0, {}, 0, false); + AllocateGroup(0, group, {}, {}, 0, false); } catch (const TExFitGroupError&) { throw TExError() << "group reserve constraint hit"; } @@ -92,7 +92,7 @@ namespace NKikimr { requiredSpace = ExpectedSlotSize.front(); ExpectedSlotSize.pop_front(); } - AllocateGroup(groupId, group, nullptr, 0, {}, requiredSpace, false); + AllocateGroup(groupId, group, {}, {}, requiredSpace, false); // scan all comprising PDisks for PDiskCategory TMaybe<TPDiskCategory> desiredPDiskCategory; @@ -171,6 +171,7 @@ namespace NKikimr { // mapping for audit log TMap<TVDiskIdShort, TVSlotId> replacedSlots; TStackVec<std::pair<TVSlotId, bool>, 32> replaceQueue; + THashMap<TVDiskIdShort, TPDiskId> replacedDisks; i64 requiredSpace = Min<i64>(); //////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -214,6 +215,7 @@ namespace NKikimr { g[vslot->RingIdx][vslot->FailDomainIdx][vslot->VDiskIdx] = targetPDiskId; replacedSlots.emplace(TVDiskIdShort(vslot->RingIdx, vslot->FailDomainIdx, vslot->VDiskIdx), vslot->VSlotId); replaceQueue.emplace_back(vslot->VSlotId, State.SuppressDonorMode.count(vslot->VSlotId)); + replacedDisks.emplace(vslot->GetShortVDiskId(), vslot->VSlotId.ComprisingPDiskId()); } else { preservedSlots.emplace(vslot->GetVDiskId(), vslot->VSlotId); auto& m = vslot->Metrics; @@ -240,10 +242,6 @@ namespace NKikimr { } } if (hasMissingSlots || !IgnoreGroupSanityChecks) { - TStackVec<TPDiskId, 32> replacedDiskIds; - for (const auto& [vslotId, suppressDonorMode] : replaceQueue) { - replacedDiskIds.push_back(vslotId.ComprisingPDiskId()); - } TGroupMapper::TForbiddenPDisks forbid; for (const auto& vslot : groupInfo->VDisksInGroup) { for (const auto& [vslotId, vdiskId] : vslot->Donors) { @@ -252,8 +250,7 @@ namespace NKikimr { } } } - AllocateGroup(groupId, group, replacedDiskIds.data(), replacedDiskIds.size(), std::move(forbid), - requiredSpace, AllowUnusableDisks); + AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, AllowUnusableDisks); if (!IgnoreVSlotQuotaCheck) { adjustSpaceAvailable = true; for (const auto& [pos, vslotId] : replacedSlots) { @@ -358,9 +355,9 @@ namespace NKikimr { } private: - void AllocateGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace, - bool addExistingDisks) { + void AllocateGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group, + const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid, + i64 requiredSpace, bool addExistingDisks) { if (!Mapper) { Mapper.emplace(Geometry, StoragePool.RandomizeGroupMapping); PopulateGroupMapper(); @@ -379,8 +376,7 @@ namespace NKikimr { } } } - Geometry.AllocateGroup(*Mapper, groupId, group, replacedDiskIds, numReplacedDisks, std::move(forbid), - requiredSpace); + Geometry.AllocateGroup(*Mapper, groupId, group, replacedDisks, std::move(forbid), requiredSpace); for (const TPDiskId pdiskId : removeQ) { Mapper->UnregisterPDisk(pdiskId); } diff --git a/ydb/core/mind/bscontroller/group_geometry_info.h b/ydb/core/mind/bscontroller/group_geometry_info.h index 10e5daedba0..5d37a0dfd2c 100644 --- a/ydb/core/mind/bscontroller/group_geometry_info.h +++ b/ydb/core/mind/bscontroller/group_geometry_info.h @@ -69,12 +69,11 @@ namespace NKikimr::NBsController { ui32 GetDomainLevelEnd() const { return DomainLevelEnd; } void AllocateGroup(TGroupMapper &mapper, TGroupId groupId, TGroupMapper::TGroupDefinition &group, - const TPDiskId replacedDiskIds[], size_t numReplacedDisks, TGroupMapper::TForbiddenPDisks forbid, + const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace) const { TString error; for (const bool requireOperational : {true, false}) { - if (mapper.AllocateGroup(groupId, group, replacedDiskIds, numReplacedDisks, forbid, - requiredSpace, requireOperational, error)) { + if (mapper.AllocateGroup(groupId, group, replacedDisks, forbid, requiredSpace, requireOperational, error)) { return; } } diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp new file mode 100644 index 00000000000..3e4418ce731 --- /dev/null +++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp @@ -0,0 +1,47 @@ +#include "group_layout_checker.h" +#include "group_geometry_info.h" + +namespace NKikimr::NBsController { + + TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout) { + using namespace NLayoutChecker; + + if (layout.empty()) { + return {}; + } + + TGroupLayout group(geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm()); + TDomainMapper mapper; + THashMap<TVDiskIdShort, TPDiskLayoutPosition> map; + for (const auto& [vdiskId, p] : layout) { + const auto& [location, pdiskId] = p; + TPDiskLayoutPosition pos(mapper, location, pdiskId, geom); + group.AddDisk(pos, vdiskId.FailRealm, vdiskId.FailDomain); + map.emplace(vdiskId, pos); + } + + std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard; + for (const auto& [vdiskId, pos] : map) { + scoreboard.emplace_back(group.GetCandidateScore(pos, vdiskId.FailRealm, vdiskId.FailDomain), vdiskId); + } + + auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; }; + std::sort(scoreboard.begin(), scoreboard.end(), comp1); + for (const auto& [score, vdiskId] : scoreboard) { + Cerr << vdiskId << "@" << map[vdiskId].ToString() << " -> " << score.ToString() << Endl; + } + + auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); }; + std::sort(scoreboard.begin(), scoreboard.end(), comp); + TLayoutCheckResult res; + const auto reference = scoreboard.back().first; + if (!reference.SameAs({})) { // not perfectly correct layout + for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) { + Cerr << "candidate# " << scoreboard.back().second << Endl; + res.Candidates.push_back(scoreboard.back().second); + } + } + return res; + } + +} // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h new file mode 100644 index 00000000000..5a317f59ddb --- /dev/null +++ b/ydb/core/mind/bscontroller/group_layout_checker.h @@ -0,0 +1,177 @@ +#pragma once + +#include "defs.h" +#include "types.h" +#include "group_geometry_info.h" + +namespace NKikimr::NBsController { + + struct TLayoutCheckResult { + std::vector<TVDiskIdShort> Candidates; + + explicit operator bool() const { // checks whether fail model is correct + return Candidates.empty(); + } + }; + + TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout); + + namespace NLayoutChecker { + + class TDomainMapper { + std::unordered_map<TString, ui32> FailDomainId; + + public: + ui32 operator ()(TString item) { + return FailDomainId.emplace(std::move(item), FailDomainId.size()).first->second; + } + + ui32 GetIdCount() const { + return FailDomainId.size(); + } + }; + + struct TPDiskLayoutPosition { + ui32 RealmGroup = 0; + ui32 Realm = 0; + ui32 Domain = 0; + + TPDiskLayoutPosition() = default; + + TPDiskLayoutPosition(ui32 realmGroup, ui32 realm, ui32 domain) + : RealmGroup(realmGroup) + , Realm(realm) + , Domain(domain) + {} + + TPDiskLayoutPosition(TDomainMapper& mapper, const TNodeLocation& location, TPDiskId pdiskId, const TGroupGeometryInfo& geom) { + TStringStream realmGroup, realm, domain; + const std::pair<int, TStringStream*> levels[] = { + {geom.GetRealmLevelBegin(), &realmGroup}, + {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelBegin()), &realm}, + {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelEnd()), &domain} + }; + auto addLevel = [&](int key, const TString& value) { + for (const auto& [reference, stream] : levels) { + if (key < reference) { + Save(stream, std::make_tuple(key, value)); + } + } + }; + for (const auto& [key, value] : location.GetItems()) { + addLevel(key, value); + } + addLevel(255, pdiskId.ToString()); // ephemeral level to distinguish between PDisks on the same node + RealmGroup = mapper(realmGroup.Str()); + Realm = mapper(realm.Str()); + Domain = mapper(domain.Str()); + } + + TString ToString() const { + return TStringBuilder() << "{" << RealmGroup << "." << Realm << "." << Domain << "}"; + } + + auto AsTuple() const { + return std::tie(RealmGroup, Realm, Domain); + } + + friend bool operator ==(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) { + return x.AsTuple() == y.AsTuple(); + } + + friend bool operator <(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) { + return x.AsTuple() < y.AsTuple(); + } + }; + + struct TScore { + ui32 RealmInterlace = 0; + ui32 DomainInterlace = 0; + ui32 RealmGroupScatter = 0; + ui32 RealmScatter = 0; + ui32 DomainScatter = 0; + + auto AsTuple() const { + return std::make_tuple(RealmInterlace, DomainInterlace, RealmGroupScatter, RealmScatter, DomainScatter); + } + + bool BetterThan(const TScore& other) const { + return AsTuple() < other.AsTuple(); + } + + bool SameAs(const TScore& other) const { + return AsTuple() == other.AsTuple(); + } + + static TScore Max() { + return {::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>()}; + } + + TString ToString() const { + return TStringBuilder() << "{RealmInterlace# " << RealmInterlace + << " DomainInterlace# " << DomainInterlace + << " RealmGroupScatter# " << RealmGroupScatter + << " RealmScatter# " << RealmScatter + << " DomainScatter# " << DomainScatter + << "}"; + } + }; + + struct TGroupLayout { + const ui32 NumFailDomainsPerFailRealm; + + ui32 NumDisks = 0; + THashMap<ui32, ui32> NumDisksPerRealmGroup; + + TStackVec<ui32, 4> NumDisksInRealm; + TStackVec<THashMap<ui32, ui32>, 4> NumDisksPerRealm; + THashMap<ui32, ui32> NumDisksPerRealmTotal; + + TStackVec<ui32, 32> NumDisksInDomain; + TStackVec<THashMap<ui32, ui32>, 32> NumDisksPerDomain; + THashMap<ui32, ui32> NumDisksPerDomainTotal; + + TGroupLayout(ui32 numFailRealms, ui32 numFailDomainsPerFailRealm) + : NumFailDomainsPerFailRealm(numFailDomainsPerFailRealm) + , NumDisksInRealm(numFailRealms) + , NumDisksPerRealm(numFailRealms) + , NumDisksInDomain(numFailRealms * numFailDomainsPerFailRealm) + , NumDisksPerDomain(numFailRealms * numFailDomainsPerFailRealm) + {} + + void UpdateDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx, ui32 value) { + domainIdx += realmIdx * NumFailDomainsPerFailRealm; + NumDisks += value; + NumDisksPerRealmGroup[pos.RealmGroup] += value; + NumDisksInRealm[realmIdx] += value; + NumDisksPerRealm[realmIdx][pos.Realm] += value; + NumDisksPerRealmTotal[pos.Realm] += value; + NumDisksInDomain[domainIdx] += value; + NumDisksPerDomain[domainIdx][pos.Domain] += value; + NumDisksPerDomainTotal[pos.Domain] += value; + } + + void AddDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) { + UpdateDisk(pos, realmIdx, domainIdx, 1); + } + + void RemoveDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) { + UpdateDisk(pos, realmIdx, domainIdx, Max<ui32>()); + } + + TScore GetCandidateScore(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) { + domainIdx += realmIdx * NumFailDomainsPerFailRealm; + + return { + .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - NumDisksPerRealm[realmIdx][pos.Realm], + .DomainInterlace = NumDisksPerDomainTotal[pos.Domain] - NumDisksPerDomain[domainIdx][pos.Domain], + .RealmGroupScatter = NumDisks - NumDisksPerRealmGroup[pos.RealmGroup], + .RealmScatter = NumDisksInRealm[realmIdx] - NumDisksPerRealm[realmIdx][pos.Realm], + .DomainScatter = NumDisksInDomain[domainIdx] - NumDisksPerDomain[domainIdx][pos.Domain], + }; + } + }; + + } // NLayoutChecker + +} // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp index 945487c7ee2..347a136a8a7 100644 --- a/ydb/core/mind/bscontroller/group_mapper.cpp +++ b/ydb/core/mind/bscontroller/group_mapper.cpp @@ -1,86 +1,19 @@ #include "group_mapper.h" #include "group_geometry_info.h" +#include "group_layout_checker.h" namespace NKikimr::NBsController { class TGroupMapper::TImpl : TNonCopyable { - class TDomainMapper { - std::unordered_map<TString, ui32> FailDomainId; - - public: - ui32 operator ()(TString item) { - return FailDomainId.emplace(std::move(item), FailDomainId.size()).first->second; - } - - ui32 GetIdCount() const { - return FailDomainId.size(); - } - }; - - enum class EPositionItem { - RealmGroup, - Realm, - Domain, - None, - }; - - struct TPDiskLayoutPosition { - ui32 RealmGroup = 0; - ui32 Realm = 0; - ui32 Domain = 0; - - TPDiskLayoutPosition() = default; - - TPDiskLayoutPosition(ui32 realmGroup, ui32 realm, ui32 domain) - : RealmGroup(realmGroup) - , Realm(realm) - , Domain(domain) - {} - - TPDiskLayoutPosition(TDomainMapper& mapper, const TNodeLocation& location, TPDiskId pdiskId, const TGroupGeometryInfo& geom) { - TStringStream realmGroup, realm, domain; - const std::pair<int, TStringStream*> levels[] = { - {geom.GetRealmLevelBegin(), &realmGroup}, - {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelBegin()), &realm}, - {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelEnd()), &domain} - }; - auto addLevel = [&](int key, const TString& value) { - for (const auto& [reference, stream] : levels) { - if (key < reference) { - Save(stream, std::make_tuple(key, value)); - } - } - }; - for (const auto& [key, value] : location.GetItems()) { - addLevel(key, value); - } - addLevel(255, pdiskId.ToString()); // ephemeral level to distinguish between PDisks on the same node - RealmGroup = mapper(realmGroup.Str()); - Realm = mapper(realm.Str()); - Domain = mapper(domain.Str()); - } - - auto AsTuple() const { - return std::tie(RealmGroup, Realm, Domain); - } - - friend bool operator ==(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) { - return x.AsTuple() == y.AsTuple(); - } - - friend bool operator !=(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) { - return x.AsTuple() != y.AsTuple(); - } - - friend bool operator <(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) { - return x.AsTuple() < y.AsTuple(); - } - }; + using TPDiskLayoutPosition = NLayoutChecker::TPDiskLayoutPosition; struct TPDiskInfo : TPDiskRecord { TPDiskLayoutPosition Position; - bool Matching = false; - ui32 NumDomainMatchingDisks = 0; + bool Matching; + ui32 NumDomainMatchingDisks; + ui32 SkipToNextRealmGroup; + ui32 SkipToNextRealm; + ui32 SkipToNextDomain; TPDiskInfo(const TPDiskRecord& pdisk, TPDiskLayoutPosition position) : TPDiskRecord(pdisk) @@ -89,10 +22,6 @@ namespace NKikimr::NBsController { std::sort(Groups.begin(), Groups.end()); } - TString ToString() const { - return Location.ToString(); - } - bool IsUsable() const { return Usable && !Decommitted && NumSlots < MaxSlots; } @@ -127,123 +56,106 @@ namespace NKikimr::NBsController { } }; - struct TAllocateContext { - struct TDomainBound { - ui32 NumChildren = 0; - }; - - struct TRealmBound { - ui32 NumChildren = 0; - TStackVec<THashMap<ui32, TDomainBound>, 8> Items; - - TRealmBound(size_t numFailDomains) - : Items(numFailDomains) - {} - }; - - struct TRealmGroupBound { - ui32 NumChildren = 0; - TStackVec<THashMap<ui32, TRealmBound>, 4> Items; - - TRealmGroupBound(size_t numFailRealms) - : Items(numFailRealms) - {} - }; + using TGroup = std::vector<TPDiskInfo*>; + struct TAllocator { + TImpl& Self; const ui32 NumFailRealms; const ui32 NumFailDomainsPerFailRealm; - THashMap<ui32, TRealmGroupBound> RealmGroup; + const ui32 NumFailDomainsTotal; + const ui32 NumVDisksPerFailDomain; + const ui32 GroupSize; + TStackVec<ui8, 32> RealmIdx; + TStackVec<ui8, 32> DomainIdx; + TStackVec<ui8, 32> DomainThroughIdx; + TStackVec<ui8, 32> VDiskIdx; THashSet<TPDiskId> OldGroupContent; // set of all existing disks in the group, inclusing ones which are replaced - THashSet<TPDiskId> NewGroupContent; // newly generated group content const i64 RequiredSpace; const bool RequireOperational; - TForbiddenPDisks Forbid; + TForbiddenPDisks ForbiddenDisks; + THashMap<ui32, unsigned> LocalityFactor; + NLayoutChecker::TGroupLayout GroupLayout; + std::optional<NLayoutChecker::TScore> BestScore; - TAllocateContext(const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational, - TForbiddenPDisks forbid) - : NumFailRealms(geom.GetNumFailRealms()) + TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational, + TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks) + : Self(self) + , NumFailRealms(geom.GetNumFailRealms()) , NumFailDomainsPerFailRealm(geom.GetNumFailDomainsPerFailRealm()) + , NumFailDomainsTotal(NumFailRealms * NumFailDomainsPerFailRealm) + , NumVDisksPerFailDomain(geom.GetNumVDisksPerFailDomain()) + , GroupSize(NumFailDomainsTotal * NumVDisksPerFailDomain) + , RealmIdx(GroupSize) + , DomainIdx(GroupSize) + , DomainThroughIdx(GroupSize) + , VDiskIdx(GroupSize) , RequiredSpace(requiredSpace) , RequireOperational(requireOperational) - , Forbid(std::move(forbid)) - {} - - bool ProcessExistingGroup(const TGroupDefinition& group, const TPDisks& pdisks, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TString& error) { - OldGroupContent = {replacedDiskIds, replacedDiskIds + numReplacedDisks}; - - for (ui32 failRealmIdx = 0; failRealmIdx < group.size(); ++failRealmIdx) { - const auto& realm = group[failRealmIdx]; - for (ui32 failDomainIdx = 0; failDomainIdx < realm.size(); ++failDomainIdx) { - const auto& domain = realm[failDomainIdx]; - for (const TPDiskId pdiskId : domain) { + , ForbiddenDisks(std::move(forbiddenDisks)) + , GroupLayout(NumFailRealms, NumFailDomainsPerFailRealm) + { + for (const auto& [vdiskId, pdiskId] : replacedDisks) { + OldGroupContent.insert(pdiskId); + } + for (ui32 index = 0, domainThroughIdx = 0, realmIdx = 0; realmIdx < NumFailRealms; ++realmIdx) { + for (ui32 domainIdx = 0; domainIdx < NumFailDomainsPerFailRealm; ++domainIdx, ++domainThroughIdx) { + for (ui32 vdiskIdx = 0; vdiskIdx < NumVDisksPerFailDomain; ++vdiskIdx, ++index) { + RealmIdx[index] = realmIdx; + DomainIdx[index] = domainIdx; + DomainThroughIdx[index] = domainThroughIdx; + VDiskIdx[index] = vdiskIdx; + } + } + } + } + + TGroup ProcessExistingGroup(const TGroupDefinition& group, TString& error) { + TGroup res(GroupSize); + + ui32 index = 0; + for (const auto& realm : group) { + for (const auto& domain : realm) { + for (const auto& pdiskId : domain) { if (pdiskId != TPDiskId()) { - // add to used pdisk set - const bool inserted = OldGroupContent.insert(pdiskId).second; - Y_VERIFY(inserted); - - // find existing pdisk - auto it = pdisks.find(pdiskId); - if (it == pdisks.end()) { - error = TStringBuilder() << "existing group contains missing PDisks"; - return false; + const auto it = Self.PDisks.find(pdiskId); + if (it == Self.PDisks.end()) { + error = TStringBuilder() << "existing group contains missing PDiskId# " << pdiskId; + return {}; } - const TPDiskInfo& pdisk = it->second; + TPDiskInfo& pdisk = it->second; + res[index] = &pdisk; - if (pdisk.Decommitted) { - continue; + const auto [_, inserted] = OldGroupContent.insert(pdiskId); + if (!inserted) { + error = TStringBuilder() << "group contains duplicate PDiskId# " << pdiskId; + return {}; } - if (!AddDisk(pdisk, failRealmIdx, failDomainIdx)) { - error = "group contains duplicate PDisks"; - return false; + if (!pdisk.Decommitted) { + AddUsedDisk(pdisk); + GroupLayout.AddDisk(pdisk.Position, RealmIdx[index], DomainIdx[index]); } } + + ++index; } } } - return true; + return res; } - void UndoAddDisk(const TPDiskInfo& pdisk, ui32 failRealmIdx, ui32 failDomainIdx) { - const size_t num = NewGroupContent.erase(pdisk.PDiskId); - Y_VERIFY(num); - auto realmGroupIt = RealmGroup.find(pdisk.Position.RealmGroup); - Y_VERIFY(realmGroupIt != RealmGroup.end()); - auto& realms = realmGroupIt->second.Items[failRealmIdx]; - auto realmIt = realms.find(pdisk.Position.Realm); - Y_VERIFY(realmIt != realms.end()); - auto& domains = realmIt->second.Items[failDomainIdx]; - auto domainIt = domains.find(pdisk.Position.Domain); - Y_VERIFY(domainIt != domains.end()); - if (!--domainIt->second.NumChildren) { - domains.erase(domainIt); - } - if (!--realmIt->second.NumChildren) { - realms.erase(realmIt); - } - if (!--realmGroupIt->second.NumChildren) { - RealmGroup.erase(realmGroupIt); + void Decompose(const TGroup& in, TGroupDefinition& out) { + for (ui32 i = 0; i < GroupSize; ++i) { + out[RealmIdx[i]][DomainIdx[i]][VDiskIdx[i]] = in[i]->PDiskId; } } - bool AddDisk(const TPDiskInfo& pdisk, ui32 failRealmIdx, ui32 failDomainIdx) { - auto& realmGroup = RealmGroup.try_emplace(pdisk.Position.RealmGroup, NumFailRealms).first->second; - auto& realm = realmGroup.Items[failRealmIdx].try_emplace(pdisk.Position.Realm, NumFailDomainsPerFailRealm).first->second; - auto& domain = realm.Items[failDomainIdx].try_emplace(pdisk.Position.Domain).first->second; - ++realmGroup.NumChildren; - ++realm.NumChildren; - ++domain.NumChildren; - const auto& [_, inserted] = NewGroupContent.insert(pdisk.PDiskId); - return inserted; - } - bool DiskIsUsable(const TPDiskInfo& pdisk) const { if (!pdisk.IsUsable()) { return false; // disk is not usable in this case } - if (OldGroupContent.contains(pdisk.PDiskId) || NewGroupContent.contains(pdisk.PDiskId) || Forbid.contains(pdisk.PDiskId)) { + if (OldGroupContent.contains(pdisk.PDiskId) || ForbiddenDisks.contains(pdisk.PDiskId)) { return false; // can't allow duplicate disks } if (RequireOperational && !pdisk.Operational) { @@ -254,218 +166,313 @@ namespace NKikimr::NBsController { } return true; } - }; - class THelper { - TImpl& Self; - TAllocateContext& Ctx; - std::unordered_map<ui32, unsigned> LocalityFactor; - TDynBitMap ForbiddenEntities; - - public: - THelper(TImpl& self, TAllocateContext& ctx) - : Self(self) - , Ctx(ctx) - { - ForbiddenEntities.Reserve(Self.DomainMapper.GetIdCount()); - for (const TPDiskId& pdiskId : Ctx.NewGroupContent) { - try { - const TPDiskInfo& pdisk = Self.PDisks.at(pdiskId); - AddUsedDisk(pdisk); - Forbid(pdisk); - } catch (const std::out_of_range&) { - Y_FAIL(); - } - } + TPDiskByPosition SetupMatchingDisks(ui32 maxScore) { + TPDiskByPosition res; + res.reserve(Self.PDiskByPosition.size()); - ui32 numMatchingDisksInDomain = 0; - ui32 numMatchingDomainsInRealm = 0; - ui32 numMatchingRealmsInRealmGroup = 0; + ui32 realmGroupBegin = 0; + ui32 realmBegin = 0; + ui32 domainBegin = 0; + TPDiskLayoutPosition prev; - const ui32 numFailRealms = Self.Geom.GetNumFailRealms(); - const ui32 numFailDomainsPerFailRealm = Self.Geom.GetNumFailDomainsPerFailRealm(); - const ui32 numVDisksPerFailDomain = Self.Geom.GetNumVDisksPerFailDomain(); - - auto advance = [&](bool domainExhausted, bool realmExhausted, bool realmGroupExhausted, const TPDiskLayoutPosition& prev) { - if (domainExhausted) { - if (numMatchingDisksInDomain < numVDisksPerFailDomain) { - ForbiddenEntities.Set(prev.Domain); - } else { - ++numMatchingDomainsInRealm; + std::vector<ui32> numMatchingDisksInDomain(Self.DomainMapper.GetIdCount(), 0); + for (const auto& [position, pdisk] : Self.PDiskByPosition) { + pdisk->Matching = pdisk->GetPickerScore() <= maxScore && DiskIsUsable(*pdisk); + if (pdisk->Matching) { + if (position.RealmGroup != prev.RealmGroup) { + for (; realmGroupBegin < res.size(); ++realmGroupBegin) { + res[realmGroupBegin].second->SkipToNextRealmGroup = res.size() - realmGroupBegin; + } } - numMatchingDisksInDomain = 0; - } - if (realmExhausted) { - if (numMatchingDomainsInRealm < numFailDomainsPerFailRealm) { - ForbiddenEntities.Set(prev.Realm); - } else { - ++numMatchingRealmsInRealmGroup; + if (position.Realm != prev.Realm) { + for (; realmBegin < res.size(); ++realmBegin) { + res[realmBegin].second->SkipToNextRealm = res.size() - realmBegin; + } } - numMatchingDomainsInRealm = 0; - } - if (realmGroupExhausted) { - if (numMatchingRealmsInRealmGroup < numFailRealms) { - ForbiddenEntities.Set(prev.RealmGroup); + if (position.Domain != prev.Domain) { + for (; domainBegin < res.size(); ++domainBegin) { + res[domainBegin].second->SkipToNextDomain = res.size() - domainBegin; + } } - numMatchingRealmsInRealmGroup = 0; - } - }; + prev = position; - if (const auto *begin = Self.PDiskByPosition.data(), *end = begin + Self.PDiskByPosition.size(); begin != end) { - --end; - while (begin != end) { - numMatchingDisksInDomain += begin->second->Matching || Ctx.NewGroupContent.contains(begin->second->PDiskId); - const auto& prev = begin++->first; - const auto& cur = begin->first; - advance(prev.Domain != cur.Domain, prev.Realm != cur.Realm, prev.RealmGroup != cur.RealmGroup, prev); + res.emplace_back(position, pdisk); + ++numMatchingDisksInDomain[position.Domain]; } - numMatchingDisksInDomain += begin->second->Matching || Ctx.NewGroupContent.contains(begin->second->PDiskId); - advance(true, true, true, begin->first); } + for (; realmGroupBegin < res.size(); ++realmGroupBegin) { + res[realmGroupBegin].second->SkipToNextRealmGroup = res.size() - realmGroupBegin; + } + for (; realmBegin < res.size(); ++realmBegin) { + res[realmBegin].second->SkipToNextRealm = res.size() - realmBegin; + } + for (; domainBegin < res.size(); ++domainBegin) { + res[domainBegin].second->SkipToNextDomain = res.size() - domainBegin; + } + for (const auto& [position, pdisk] : res) { + pdisk->NumDomainMatchingDisks = numMatchingDisksInDomain[position.Domain]; + } + + return std::move(res); } - TPDiskId AddBestDisk(ui32 realmIdx, ui32 domainIdx) { - TPDiskInfo *pdisk = nullptr; - auto process = [this, &pdisk](TPDiskInfo *candidate) { - if (!pdisk || DiskIsBetter(*candidate, *pdisk)) { - pdisk = candidate; - } + struct TUndoLog { + struct TItem { + ui32 Index; + TPDiskInfo *PDisk; }; - FindMatchingDisksBounded(process, Self.PDiskByPosition.begin(), Self.PDiskByPosition.end(), - Ctx.RealmGroup, {realmIdx, domainIdx}); - if (!pdisk) { - return TPDiskId(); + + std::vector<TItem> Items; + + void Log(ui32 index, TPDiskInfo *pdisk) { + Items.push_back({index, pdisk}); } - const bool success = Ctx.AddDisk(*pdisk, realmIdx, domainIdx); - Y_VERIFY(success); - Forbid(*pdisk); - pdisk->Matching = false; // disable this disk for further selection + + size_t GetPosition() const { + return Items.size(); + } + }; + + void AddDiskViaUndoLog(TUndoLog& undo, TGroup& group, ui32 index, TPDiskInfo *pdisk) { + undo.Log(index, pdisk); + group[index] = pdisk; AddUsedDisk(*pdisk); - return pdisk->PDiskId; + GroupLayout.AddDisk(pdisk->Position, RealmIdx[index], DomainIdx[index]); + BestScore.reset(); // invalidate score } - private: - void Forbid(const TPDiskInfo& pdisk) { - for (const ui32 id : {pdisk.Position.RealmGroup, pdisk.Position.Realm, pdisk.Position.Domain}) { - ForbiddenEntities.Set(id); + void Revert(TUndoLog& undo, TGroup& group, size_t until) { + for (; undo.Items.size() > until; undo.Items.pop_back()) { + const auto& item = undo.Items.back(); + group[item.Index] = nullptr; + RemoveUsedDisk(*item.PDisk); + GroupLayout.RemoveDisk(item.PDisk->Position, RealmIdx[item.Index], DomainIdx[item.Index]); + BestScore.reset(); // invalidate score } } - template<typename T> - struct TBoundTraits {}; + bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group) { + // Determine PDisks that fit our requirements (including score). + auto set = SetupMatchingDisks(maxScore); + + // Determine what we have to fill in -- full group, some realms, domains, or just some cells. + bool emptyGroup = true; + + TDynBitMap emptyRealms; + emptyRealms.Set(0, NumFailRealms); - template<> - struct TBoundTraits<TAllocateContext::TRealmGroupBound> { - static constexpr ui32 TPDiskLayoutPosition::*Ptr = &TPDiskLayoutPosition::RealmGroup; - static constexpr EPositionItem ForbiddenCheckLevel = EPositionItem::RealmGroup; - static constexpr size_t CoordIndex = 0; - static constexpr bool Descend = true; + TDynBitMap emptyDomains; + emptyDomains.Set(0, NumFailDomainsTotal); - static TPDiskLayoutPosition LowerBound(TPDiskLayoutPosition, ui32 id) { - return {id, 0, 0}; + TDynBitMap emptyDisks; + emptyDisks.Set(0, GroupSize); + + for (ui32 i = 0; i < GroupSize; ++i) { + if (group[i]) { + emptyGroup = false; + emptyRealms[RealmIdx[i]] = false; + emptyDomains[DomainThroughIdx[i]] = false; + emptyDisks[i] = false; + } } - static TPDiskLayoutPosition UpperBoundFromLowerBound(TPDiskLayoutPosition lower) { - return {lower.RealmGroup, Max<ui32>(), Max<ui32>()}; + // Allocate new full group and exit if it is absolutely empty. + auto allocate = [&](auto what, ui32 index) { + TDiskRange fullRange(set.begin(), set.end()); + TDynBitMap forbiddenEntities; + forbiddenEntities.Reserve(Self.DomainMapper.GetIdCount()); + if (!AllocateWholeEntity(what, group, undo, index, fullRange, forbiddenEntities)) { + Revert(undo, group, 0); + return false; + } + return true; + }; + + if (emptyGroup) { + return allocate(TAllocateWholeGroup(), 0); } - static bool PrefixEquals(const TPDiskLayoutPosition& a, const TPDiskLayoutPosition& b) { - return a.RealmGroup == b.RealmGroup; + // Fill in missing fail realms. + for (ui32 i = emptyRealms.FirstNonZeroBit(); i != emptyRealms.Size(); i = emptyRealms.NextNonZeroBit(i)) { + if (!allocate(TAllocateWholeRealm(), i)) { + return false; + } + + // remove excessive domains and disk from the set + emptyDomains.Reset(i * NumFailDomainsPerFailRealm, (i + 1) * NumFailDomainsPerFailRealm); + emptyDisks.Reset(i * NumFailDomainsPerFailRealm * NumVDisksPerFailDomain, + (i + 1) * NumFailDomainsPerFailRealm * NumVDisksPerFailDomain); } - }; - template<> - struct TBoundTraits<TAllocateContext::TRealmBound> { - static constexpr ui32 TPDiskLayoutPosition::*Ptr = &TPDiskLayoutPosition::Realm; - static constexpr EPositionItem ForbiddenCheckLevel = EPositionItem::Realm; - static constexpr size_t CoordIndex = 1; - static constexpr bool Descend = true; + // Fill in missing fail domains in some partially filled realms. + for (ui32 i = emptyDomains.FirstNonZeroBit(); i != emptyDomains.Size(); i = emptyDomains.NextNonZeroBit(i)) { + if (!allocate(TAllocateWholeDomain(), i)) { + return false; + } - static TPDiskLayoutPosition LowerBound(TPDiskLayoutPosition prefix, ui32 id) { - return {prefix.RealmGroup, id, 0}; + // remove excessive disks + emptyDisks.Reset(i * NumVDisksPerFailDomain, (i + 1) * NumVDisksPerFailDomain); } - static TPDiskLayoutPosition UpperBoundFromLowerBound(TPDiskLayoutPosition lower) { - return {lower.RealmGroup, lower.Realm, Max<ui32>()}; + // Fill in missing disk cells. + for (ui32 i = emptyDisks.FirstNonZeroBit(); i != emptyDisks.Size(); i = emptyDisks.NextNonZeroBit(i)) { + if (!allocate(TAllocateDisk(), i)) { + return false; + } } - static bool PrefixEquals(const TPDiskLayoutPosition& a, const TPDiskLayoutPosition& b) { - return a.RealmGroup == b.RealmGroup && a.Realm == b.Realm; + return true; + } + + struct TAllocateDisk {}; + + struct TAllocateWholeDomain { + static constexpr auto EntityCount = &TAllocator::NumVDisksPerFailDomain; + static constexpr auto PositionItem = &TPDiskLayoutPosition::Domain; + using TNestedEntity = TAllocateDisk; + + static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) { + return {x, x}; } }; - template<> - struct TBoundTraits<TAllocateContext::TDomainBound> { - static constexpr ui32 TPDiskLayoutPosition::*Ptr = &TPDiskLayoutPosition::Domain; - static constexpr EPositionItem ForbiddenCheckLevel = EPositionItem::Domain; - static constexpr bool Descend = false; + struct TAllocateWholeRealm { + static constexpr auto EntityCount = &TAllocator::NumFailDomainsPerFailRealm; + static constexpr auto PositionItem = &TPDiskLayoutPosition::Realm; + using TNestedEntity = TAllocateWholeDomain; - static TPDiskLayoutPosition LowerBound(TPDiskLayoutPosition prefix, ui32 id) { - return {prefix.RealmGroup, prefix.Realm, id}; + static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) { + return {{x.RealmGroup, x.Realm, 0}, {x.RealmGroup, x.Realm, Max<ui32>()}}; } + }; - static TPDiskLayoutPosition UpperBoundFromLowerBound(TPDiskLayoutPosition lower) { - return lower; - } + struct TAllocateWholeGroup { + static constexpr auto EntityCount = &TAllocator::NumFailRealms; + static constexpr auto PositionItem = &TPDiskLayoutPosition::RealmGroup; + using TNestedEntity = TAllocateWholeRealm; - static bool PrefixEquals(const TPDiskLayoutPosition& a, const TPDiskLayoutPosition& b) { - return a == b; + static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) { + return {{x.RealmGroup, 0, 0}, {x.RealmGroup, Max<ui32>(), Max<ui32>()}}; } }; - template<typename TCallback, typename TBound, typename... TRest> - void FindMatchingDisksBounded(TCallback&& cb, TPDiskByPosition::const_iterator begin, - TPDiskByPosition::const_iterator end, const TBound& bound, - std::tuple<ui32, ui32> posInGroup) { - using Traits = TBoundTraits<typename TBound::mapped_type>; - if (bound && begin != end) { - ui32 max = 0; - for (const auto& [_, item] : bound) { - max = Max(item.NumChildren, max); - } - for (const auto& [id, item] : bound) { - if (item.NumChildren != max) { - continue; - } - const TPDiskLayoutPosition lower = Traits::LowerBound(begin->first, id); - const auto childBegin = std::lower_bound(begin, end, lower, TComparePDiskByPosition()); - const auto childEnd = std::upper_bound(childBegin, end, Traits::UpperBoundFromLowerBound(lower), - TComparePDiskByPosition()); - if constexpr (Traits::Descend) { - const ui32 index = std::get<Traits::CoordIndex>(posInGroup); - FindMatchingDisksBounded(cb, childBegin, childEnd, item.Items[index], posInGroup); - } else { - FindMatchingDisks<EPositionItem::None>(cb, childBegin, childEnd); + using TDiskRange = std::pair<TPDiskByPosition::const_iterator, TPDiskByPosition::const_iterator>; + + template<typename T> + TPDiskLayoutPosition *AllocateWholeEntity(T, TGroup& group, TUndoLog& undo, ui32 parentEntityIndex, + TDiskRange range, TDynBitMap& forbiddenEntities) { + const TDiskRange originalRange(range); + const size_t undoPosition = undo.GetPosition(); + TPDiskLayoutPosition *prefix = nullptr; + ui32 currentEntityId = Max<ui32>(); + for (ui32 index = 0, num = this->*T::EntityCount; index < num; ) { + // allocate nested entity + prefix = AllocateWholeEntity(typename T::TNestedEntity(), group, undo, + parentEntityIndex * num + index, range, forbiddenEntities); + if (prefix) { + if (!index) { + currentEntityId = prefix->*T::PositionItem; + auto [min, max] = T::MakeRange(*prefix); + range.first = std::lower_bound(range.first, range.second, min, TComparePDiskByPosition()); + range.second = std::upper_bound(range.first, range.second, max, TComparePDiskByPosition()); } + ++index; + } else if (index) { + // disable just checked entity (to prevent its selection again) + Y_VERIFY(currentEntityId != Max<ui32>()); + forbiddenEntities.Set(currentEntityId); + // try another entity at this level + Revert(undo, group, undoPosition); + // revert original wide range and start from the beginning + range = originalRange; + index = 0; + currentEntityId = Max<ui32>(); + } else { + // no chance to allocate new entity, exit + return nullptr; } - } else { - FindMatchingDisks<Traits::ForbiddenCheckLevel>(cb, begin, end); } + // disable filled entity from further selection + Y_VERIFY(prefix && currentEntityId != Max<ui32>()); + forbiddenEntities.Set(currentEntityId); + return prefix; } - template<EPositionItem ForbiddenCheckLevel, typename TCallback> - void FindMatchingDisks(TCallback&& cb, TPDiskByPosition::const_iterator begin, TPDiskByPosition::const_iterator end) { - while (begin != end) { - const auto& [position, pdisk] = *begin++; - if constexpr (ForbiddenCheckLevel <= EPositionItem::RealmGroup) { - if (ForbiddenEntities[position.RealmGroup]) { - continue; - } + TPDiskLayoutPosition *AllocateWholeEntity(TAllocateDisk, TGroup& group, TUndoLog& undo, ui32 index, + TDiskRange range, TDynBitMap& forbiddenEntities) { + TPDiskInfo *pdisk = nullptr; + auto process = [this, &pdisk](TPDiskInfo *candidate) { + if (!pdisk || DiskIsBetter(*candidate, *pdisk)) { + pdisk = candidate; } - if constexpr (ForbiddenCheckLevel <= EPositionItem::Realm) { - if (ForbiddenEntities[position.Realm]) { - continue; + }; + FindMatchingDiskBasedOnScore(process, group, RealmIdx[index], DomainIdx[index], + range, forbiddenEntities); + if (pdisk) { + AddDiskViaUndoLog(undo, group, index, pdisk); + pdisk->Matching = false; + return &pdisk->Position; + } else { + return nullptr; + } + } + + NLayoutChecker::TScore CalculateBestScoreWithCache(const TGroup& group) { + if (!BestScore) { + // find the worst disk from a position of layout correctness and use it as a milestone for other + // disks -- they can't be misplaced worse + NLayoutChecker::TScore bestScore; + for (ui32 i = 0; i < GroupSize; ++i) { + if (TPDiskInfo *pdisk = group[i]; pdisk && !pdisk->Decommitted) { + NLayoutChecker::TScore score = GroupLayout.GetCandidateScore(pdisk->Position, RealmIdx[i], + DomainIdx[i]); + if (bestScore.BetterThan(score)) { + bestScore = score; + } } } - if constexpr (ForbiddenCheckLevel <= EPositionItem::Domain) { - if (ForbiddenEntities[position.Domain]) { - continue; - } + BestScore = bestScore; + } + return *BestScore; + } + + template<typename TCallback> + void FindMatchingDiskBasedOnScore(TCallback&& cb, const TGroup& group, ui32 failRealmIdx, ui32 failDomainIdx, + TDiskRange range, TDynBitMap& forbiddenEntities) { + NLayoutChecker::TScore bestScore = CalculateBestScoreWithCache(group); + + std::vector<TPDiskInfo*> candidates; + + while (range.first != range.second) { + const auto& [position, pdisk] = *range.first++; + + if (!pdisk->Matching) { + continue; + } else if (forbiddenEntities[position.RealmGroup]) { + range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealmGroup - 1); + continue; + } else if (forbiddenEntities[position.Realm]) { + range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealm - 1); + continue; + } else if (forbiddenEntities[position.Domain]) { + range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextDomain - 1); + continue; } - if (pdisk->Matching) { - cb(pdisk); + + NLayoutChecker::TScore score = GroupLayout.GetCandidateScore(position, failRealmIdx, failDomainIdx); + if (score.BetterThan(bestScore)) { + candidates.clear(); + candidates.push_back(pdisk); + bestScore = score; + } else if (score.SameAs(bestScore)) { + candidates.push_back(pdisk); } } + + for (TPDiskInfo *pdisk : candidates) { + cb(pdisk); + } } bool DiskIsBetter(TPDiskInfo& pretender, TPDiskInfo& king) const { @@ -497,6 +504,14 @@ namespace NKikimr::NBsController { } } + void RemoveUsedDisk(const TPDiskInfo& pdisk) { + for (ui32 groupId : pdisk.Groups) { + if (!--LocalityFactor[groupId]) { + LocalityFactor.erase(groupId); + } + } + } + unsigned GetLocalityFactor(const TPDiskInfo& pdisk) const { unsigned res = 0; for (ui32 groupId : pdisk.Groups) { @@ -514,7 +529,7 @@ namespace NKikimr::NBsController { private: const TGroupGeometryInfo Geom; const bool Randomize; - TDomainMapper DomainMapper; + NLayoutChecker::TDomainMapper DomainMapper; TPDisks PDisks; TPDiskByPosition PDiskByPosition; bool Dirty = false; @@ -556,7 +571,7 @@ namespace NKikimr::NBsController { it->second.SpaceAvailable += increment; } - TString FormatPDisks(const TAllocateContext& ctx) const { + TString FormatPDisks(const TAllocator& allocator) const { TStringStream s; s << "PDisks# "; @@ -576,11 +591,11 @@ namespace NKikimr::NBsController { s << std::exchange(space, " ") << pdisk->PDiskId; - if (ctx.OldGroupContent.contains(pdisk->PDiskId)) { + if (allocator.OldGroupContent.contains(pdisk->PDiskId)) { s << "*"; } const char *minus = "-"; - if (ctx.Forbid.contains(pdisk->PDiskId)) { + if (allocator.ForbiddenDisks.contains(pdisk->PDiskId)) { s << std::exchange(minus, "") << "f"; } if (!pdisk->Usable) { @@ -590,15 +605,15 @@ namespace NKikimr::NBsController { s << std::exchange(minus, "") << "d"; } if (pdisk->NumSlots >= pdisk->MaxSlots) { - s << std::exchange(minus, "") << "s"; + s << std::exchange(minus, "") << "s[" << pdisk->NumSlots << "/" << pdisk->MaxSlots << "]"; } - if (pdisk->SpaceAvailable < ctx.RequiredSpace) { + if (pdisk->SpaceAvailable < allocator.RequiredSpace) { s << std::exchange(minus, "") << "v"; } if (!pdisk->Operational) { s << std::exchange(minus, "") << "o"; } - if (ctx.DiskIsUsable(*pdisk)) { + if (allocator.DiskIsUsable(*pdisk)) { s << "+"; } @@ -612,154 +627,86 @@ namespace NKikimr::NBsController { return s.Str(); } - bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, + bool AllocateGroup(ui32 groupId, TGroupDefinition& groupDefinition, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, + TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { if (Dirty) { std::sort(PDiskByPosition.begin(), PDiskByPosition.end()); Dirty = false; } - // fill in the allocation context - TAllocateContext ctx(Geom, requiredSpace, requireOperational, std::move(forbid)); - if (!ctx.ProcessExistingGroup(group, PDisks, replacedDiskIds, numReplacedDisks, error)) { - return false; - } - // create group of required size, if it is not created yet - if (!Geom.ResizeGroup(group)) { + if (!Geom.ResizeGroup(groupDefinition)) { error = "incorrect existing group"; return false; } - // if the group is already created, check for missing entities - bool hasMissingEntities = false; - for (const auto& realm : group) { - for (const auto& domain : realm) { - for (const TPDiskId& pdiskId : domain) { - if (pdiskId == TPDiskId()) { - hasMissingEntities = true; - goto out; - } - } - } - } -out: if (!hasMissingEntities) { - return true; // group is okay + // fill in the allocation context + TAllocator allocator(*this, Geom, requiredSpace, requireOperational, std::move(forbid), replacedDisks); + TGroup group = allocator.ProcessExistingGroup(groupDefinition, error); + if (group.empty()) { + return false; } - - // adjust number of slots - for (TPDiskId pdiskId : ctx.OldGroupContent) { - Y_VERIFY_DEBUG(PDisks.contains(pdiskId)); - --PDisks.at(pdiskId).NumSlots; + bool ok = true; + for (TPDiskInfo *pdisk : group) { + if (!pdisk) { + ok = false; + break; + } } - for (size_t i = 0; i < numReplacedDisks; ++i) { - Y_VERIFY_DEBUG(PDisks.contains(replacedDiskIds[i])); - PDisks.at(replacedDiskIds[i]).EraseGroup(groupId); + if (ok) { + return true; } - ui32 minScore = Max<ui32>(); - ui32 maxScore = Min<ui32>(); + // calculate score table + std::vector<ui32> scores; for (const auto& [pdiskId, pdisk] : PDisks) { - const ui32 score = pdisk.GetPickerScore(); - minScore = Min(minScore, score); - maxScore = Max(maxScore, score + 1); - } - - std::optional<TGroupDefinition> outGroup; - - auto tryIteration = [&](ui32 score) { - ui32 numDomainMatchingDisks = 0; - auto domainBegin = PDiskByPosition.begin(); - for (auto it = domainBegin; it != PDiskByPosition.end(); ++it) { - auto& [position, pdisk] = *it; - if (position != domainBegin->first) { - for (; domainBegin != it; ++domainBegin) { - domainBegin->second->NumDomainMatchingDisks = numDomainMatchingDisks; - } - numDomainMatchingDisks = 0; - } - pdisk->Matching = ctx.DiskIsUsable(*pdisk) && pdisk->GetPickerScore() <= score; - numDomainMatchingDisks += pdisk->Matching; - } - for (; domainBegin != PDiskByPosition.end(); ++domainBegin) { - domainBegin->second->NumDomainMatchingDisks = numDomainMatchingDisks; - } - - TStackVec<std::tuple<ui32, ui32, ui32>, 32> undoLog; - THelper helper(*this, ctx); - - auto revert = [&]() { - for (const auto& item : undoLog) { - ui32 realmIdx, domainIdx, vdiskIdx; - std::tie(realmIdx, domainIdx, vdiskIdx) = item; // thanks to Microsoft - auto& pdiskId = group[realmIdx][domainIdx][vdiskIdx]; - const auto it = PDisks.find(pdiskId); - Y_VERIFY(it != PDisks.end()); - ctx.UndoAddDisk(it->second, realmIdx, domainIdx); - pdiskId = TPDiskId(); - } - }; - - for (ui32 realmIdx = 0; realmIdx < ctx.NumFailRealms; ++realmIdx) { - for (ui32 domainIdx = 0; domainIdx < ctx.NumFailDomainsPerFailRealm; ++domainIdx) { - auto& domain = group[realmIdx][domainIdx]; - for (ui32 vdiskIdx = 0; vdiskIdx < domain.size(); ++vdiskIdx) { - if (auto& pdiskId = domain[vdiskIdx]; pdiskId == TPDiskId()) { - pdiskId = helper.AddBestDisk(realmIdx, domainIdx); - if (pdiskId == TPDiskId()) { - revert(); - return false; - } else { - undoLog.emplace_back(realmIdx, domainIdx, vdiskIdx); - } - } - } - } - } - - outGroup = group; - revert(); - return true; - }; - - while (minScore < maxScore) { - const ui32 score = minScore + (maxScore - minScore) / 2; - if (tryIteration(score)) { - maxScore = score; + if (allocator.DiskIsUsable(pdisk)) { + scores.push_back(pdisk.GetPickerScore()); + } + } + std::sort(scores.begin(), scores.end()); + scores.erase(std::unique(scores.begin(), scores.end()), scores.end()); + + // bisect scores to find optimal working one + std::optional<TGroup> result; + ui32 begin = 0, end = scores.size(); + while (begin < end) { + const ui32 mid = begin + (end - begin) / 2; + TAllocator::TUndoLog undo; + if (allocator.FillInGroup(scores[mid], undo, group)) { + result = group; + allocator.Revert(undo, group, 0); + end = mid; } else { - minScore = score + 1; + begin = mid + 1; } } - if (outGroup) { - group = *outGroup; - for (const auto& realm : group) { - for (const auto& domain : realm) { - for (const auto& pdiskId : domain) { - if (const auto it = PDisks.find(pdiskId); it != PDisks.end()) { - ++it->second.NumSlots; - it->second.InsertGroup(groupId); - } else { - Y_FAIL(); - } - } + if (result) { + for (const auto& [vdiskId, pdiskId] : replacedDisks) { + const auto it = PDisks.find(pdiskId); + Y_VERIFY(it != PDisks.end()); + TPDiskInfo& pdisk = it->second; + --pdisk.NumSlots; + pdisk.EraseGroup(groupId); + } + ui32 numZero = 0; + for (ui32 i = 0; i < allocator.GroupSize; ++i) { + if (!group[i]) { + ++numZero; + TPDiskInfo *pdisk = result->at(i); + ++pdisk->NumSlots; + pdisk->InsertGroup(groupId); } } + Y_VERIFY(numZero == allocator.GroupSize || numZero == replacedDisks.size()); + allocator.Decompose(*result, groupDefinition); return true; + } else { + error = "no group options " + FormatPDisks(allocator); + return false; } - - // undo changes to the mapper content - for (TPDiskId pdiskId : ctx.OldGroupContent) { - Y_VERIFY_DEBUG(PDisks.contains(pdiskId)); - ++PDisks.at(pdiskId).NumSlots; - } - for (size_t i = 0; i < numReplacedDisks; ++i) { - Y_VERIFY_DEBUG(PDisks.contains(replacedDiskIds[i])); - PDisks.at(replacedDiskIds[i]).InsertGroup(groupId); - } - error = "no group options " + FormatPDisks(ctx); - return false; } }; @@ -781,10 +728,9 @@ out: if (!hasMissingEntities) { return Impl->AdjustSpaceAvailable(pdiskId, increment); } - bool TGroupMapper::AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { - return Impl->AllocateGroup(groupId, group, replacedDiskIds, numReplacedDisks, std::move(forbid), - requiredSpace, requireOperational, error); + bool TGroupMapper::AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, + TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { + return Impl->AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, requireOperational, error); } } // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/group_mapper.h b/ydb/core/mind/bscontroller/group_mapper.h index 46dbb11c8da..a58e49ab0d1 100644 --- a/ydb/core/mind/bscontroller/group_mapper.h +++ b/ydb/core/mind/bscontroller/group_mapper.h @@ -63,9 +63,8 @@ namespace NKikimr { // failRealmBeginDxLevel, failRealmEndDxLevel, and then by finding possible options to meet requirements // (1) and (2). That is, prefix gives us unique domains in which we can find realms to operate, while // prefix+infix part gives us distinct fail realms we can use while generating groups. - bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, - TString& error); + bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, + TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error); }; } // NBsController diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp index d19bd44cc27..839d75e27e0 100644 --- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp +++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp @@ -2,6 +2,7 @@ #include "group_geometry_info.h" #include "group_mapper.h" +#include "group_layout_checker.h" #include "ut_helpers.h" using namespace NKikimr; @@ -136,16 +137,43 @@ public: } } + ui32 GetDataCenter(TPDiskId pdiskId) const { + const auto it = PDisks.find(pdiskId); + UNIT_ASSERT(it != PDisks.end()); + return it->second.DataCenterId; + } + + TNodeLocation GetLocation(TPDiskId pdiskId) const { + const auto it = PDisks.find(pdiskId); + UNIT_ASSERT(it != PDisks.end()); + return it->second.GetLocation(); + } + + std::vector<std::tuple<ui32, ui32, ui32, ui32>> ExportLayout() const { + std::vector<std::tuple<ui32, ui32, ui32, ui32>> res; + for (const auto& [pdiskId, pdisk] : PDisks) { + res.emplace_back(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId, pdisk.BodyId); + } + return res; + } + + void ImportLayout(const std::vector<std::tuple<ui32, ui32, ui32, ui32>>& v) { + size_t index = 0; + for (auto& [pdiskId, pdisk] : PDisks) { + UNIT_ASSERT(index != v.size()); + std::tie(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId, pdisk.BodyId) = v[index]; + ++index; + } + UNIT_ASSERT(index == v.size()); + } + ui32 AllocateGroup(TGroupMapper& mapper, TGroupMapper::TGroupDefinition& group, bool allowFailure = false) { ui32 groupId = NextGroupId++; TString error; - bool success = mapper.AllocateGroup(groupId, group, nullptr, 0, {}, 0, false, error); + bool success = mapper.AllocateGroup(groupId, group, {}, {}, 0, false, error); if (!success && allowFailure) { return 0; } - if (!success) { - Ctest << "error# " << error << Endl; - } UNIT_ASSERT_C(success, error); TGroupRecord& record = Groups[groupId]; record.Group = group; @@ -161,7 +189,7 @@ public: } TGroupMapper::TGroupDefinition ReallocateGroup(TGroupMapper& mapper, ui32 groupId, const TSet<TPDiskId>& unusableDisks, - bool makeThemForbidden = false, bool requireOperational = false, bool requireError = false) { + bool makeThemForbidden = false, bool requireOperational = false, bool allowError = false) { TGroupRecord& group = Groups.at(groupId); TGroupMapper::TForbiddenPDisks forbid(unusableDisks.begin(), unusableDisks.end()); @@ -170,13 +198,14 @@ public: } // remove unusable disks from the set - std::vector<TPDiskId> replaced; - for (auto& realm : group.Group) { - for (auto& domain : realm) { - for (auto& pdisk : domain) { + THashMap<TVDiskIdShort, TPDiskId> replacedDisks; + for (ui32 i = 0; i < group.Group.size(); ++i) { + for (ui32 j = 0; j < group.Group[i].size(); ++j) { + for (ui32 k = 0; k < group.Group[i][j].size(); ++k) { + auto& pdisk = group.Group[i][j][k]; --PDisks.at(pdisk).NumSlots; if (unusableDisks.count(pdisk)) { - replaced.push_back(std::exchange(pdisk, {})); + replacedDisks.emplace(TVDiskIdShort(i, j, k), std::exchange(pdisk, {})); } } } @@ -185,15 +214,24 @@ public: Ctest << "groupId# " << groupId << " reallocating group# " << FormatGroup(group.Group) << Endl; TString error; - bool success = mapper.AllocateGroup(groupId, group.Group, replaced.data(), replaced.size(), std::move(forbid), - 0, requireOperational, error); + bool success = mapper.AllocateGroup(groupId, group.Group, replacedDisks, std::move(forbid), 0, + requireOperational, error); if (!success) { - if (requireError) { + Ctest << "error# " << error << Endl; + if (allowError) { + // revert group to its original state + for (const auto& [vdiskId, pdiskId] : replacedDisks) { + group.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk] = pdiskId; + } + for (auto& realm : group.Group) { + for (auto& domain : realm) { + for (auto& pdisk : domain) { + ++PDisks.at(pdisk).NumSlots; + } + } + } return {}; } - Ctest << "error# " << error << Endl; - } else { - UNIT_ASSERT(!requireError); } UNIT_ASSERT(success); @@ -210,6 +248,23 @@ public: return group.Group; } + void SetGroup(ui32 groupId, const TGroupMapper::TGroupDefinition& group) { + auto& g = Groups[groupId]; + for (const TPDiskId& pdiskId : g.PDisks) { + --PDisks.at(pdiskId).NumSlots; + } + g.Group = group; + g.PDisks.clear(); + for (const auto& realm : g.Group) { + for (const auto& domain : realm) { + for (const auto& pdisk : domain) { + g.PDisks.push_back(pdisk); + ++PDisks.at(pdisk).NumSlots; + } + } + } + } + TString FormatGroup(const TGroupMapper::TGroupDefinition& group) { TStringStream str; str << "["; @@ -234,23 +289,27 @@ public: return str.Str(); } - void CheckGroupErasure(const TGroupMapper::TGroupDefinition& group) { + void CheckGroupErasure(const TGroupMapper::TGroupDefinition& group, ui32 decommittedDataCenter = 0) { TSet<ui32> dataCenters; for (const auto& realm : group) { TMaybe<ui32> dataCenter; - TSet<std::tuple<ui32, ui32>> domains; + TSet<std::tuple<ui32, ui32, ui32>> domains; for (const auto& domain : realm) { - TMaybe<std::tuple<ui32, ui32>> currentDom; + TMaybe<std::tuple<ui32, ui32, ui32>> currentDom; for (const auto& pdisk : domain) { const TPDiskRecord& record = PDisks.at(pdisk); - if (dataCenter) { - UNIT_ASSERT_VALUES_EQUAL(*dataCenter, record.DataCenterId); - } else { - dataCenter = record.DataCenterId; - const bool inserted = dataCenters.insert(*dataCenter).second; - UNIT_ASSERT(inserted); + if (record.DataCenterId != decommittedDataCenter) { // ignore entries from decommitted data center + if (dataCenter) { + if (*dataCenter != decommittedDataCenter && record.DataCenterId != decommittedDataCenter) { + UNIT_ASSERT_VALUES_EQUAL(*dataCenter, record.DataCenterId); + } + } else { + dataCenter = record.DataCenterId; + const bool inserted = dataCenters.insert(*dataCenter).second; + UNIT_ASSERT(inserted); + } } - std::tuple<ui32, ui32> dom = {record.RoomId, record.RackId}; + auto dom = std::make_tuple(record.DataCenterId, record.RoomId, record.RackId); if (currentDom) { // check that all disks from the same domain reside in the same domain :) UNIT_ASSERT_EQUAL(dom, *currentDom); @@ -297,7 +356,7 @@ public: } void PopulateGroupMapper(TGroupMapper& mapper, ui32 maxSlots = 16, TSet<TPDiskId> unusableDisks = {}, - TSet<TPDiskId> nonoperationalDisks = {}) { + TSet<TPDiskId> nonoperationalDisks = {}, std::optional<ui32> decommittedDataCenter = std::nullopt) { std::map<TPDiskId, std::vector<ui32>> groupDisks; for (const auto& [groupId, group] : Groups) { for (TPDiskId pdiskId : group.PDisks) { @@ -314,11 +373,82 @@ public: .MaxSlots = maxSlots, .Groups{g.begin(), g.end()}, .SpaceAvailable = 0, - .Operational = static_cast<bool>(nonoperationalDisks.count(pair.first)), - .Decommitted = false, + .Operational = !nonoperationalDisks.contains(pair.first), + .Decommitted = decommittedDataCenter == pair.second.DataCenterId, }); } } + + void DumpGroup(const TGroupMapper::TGroupDefinition& group) { + std::set<std::tuple<ui32, ui32, ui32>> locations; + for (const auto& [pdiskId, pdisk] : PDisks) { + locations.emplace(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId); + } + + std::unordered_map<ui32, ui32> dataCenterToColumn; + std::unordered_map<ui32, std::unordered_map<std::tuple<ui32, ui32>, ui32>> rackToColumn; + for (const auto& x : locations) { + const ui32 dataCenterId = std::get<0>(x); + const ui32 roomId = std::get<1>(x); + const ui32 rackId = std::get<2>(x); + dataCenterToColumn.try_emplace(dataCenterId, dataCenterToColumn.size()); + auto& rtc = rackToColumn[dataCenterId]; + rtc.try_emplace(std::make_tuple(roomId, rackId), rtc.size()); + } + + std::vector<std::vector<TString>> cells(dataCenterToColumn.size()); + for (const auto& [dataCenterId, racks] : rackToColumn) { + cells[dataCenterToColumn[dataCenterId]].resize(racks.size()); + } + + ui32 maxCellWidth = 0; + for (ui32 failRealmIdx = 0; failRealmIdx < group.size(); ++failRealmIdx) { + for (ui32 failDomainIdx = 0; failDomainIdx < group[failRealmIdx].size(); ++failDomainIdx) { + for (const TPDiskId& pdiskId : group[failRealmIdx][failDomainIdx]) { + if (pdiskId != TPDiskId()) { + const auto it = PDisks.find(pdiskId); + UNIT_ASSERT(it != PDisks.end()); + const TPDiskRecord& pdisk = it->second; + auto& cell = cells[dataCenterToColumn[pdisk.DataCenterId]] + [rackToColumn[pdisk.DataCenterId][{pdisk.RoomId, pdisk.RackId}]]; + if (cell) { + cell += ", "; + } + cell += TStringBuilder() << failRealmIdx << "/" << failDomainIdx; + maxCellWidth = Max<ui32>(maxCellWidth, cell.size()); + } + } + } + } + + if (!maxCellWidth) { + ++maxCellWidth; + } + + for (ui32 row = 0;; ++row) { + bool done = true; + TStringBuilder s; + for (ui32 column = 0; column < cells.size(); ++column) { + if (row >= cells[column].size()) { + s << TString(maxCellWidth, ' '); + } else if (const auto& cell = cells[column][row]) { + s << cell << TString(maxCellWidth - cell.size(), ' '); + done = false; + } else { + s << TString(maxCellWidth, 'X'); + done = false; + } + if (column != cells.size() - 1) { + s << ' '; + } + } + if (done) { + break; + } else { + Ctest << s << Endl; + } + } + } }; Y_UNIT_TEST_SUITE(TGroupMapperTest) { @@ -591,12 +721,15 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { nonoperationalDisks.insert(pdiskId); }); context.PopulateGroupMapper(mapper, 10, unusableDisks, nonoperationalDisks); + ui32 hasEmpty = false; for (ui32 groupId : groupIds) { - auto group = context.ReallocateGroup(mapper, groupId, unusableDisks, true, true); - group = context.ReallocateGroup(mapper, groupId, unusableDisks); + auto tmp = context.ReallocateGroup(mapper, groupId, unusableDisks, false, true, true); + hasEmpty |= tmp.empty(); + auto group = context.ReallocateGroup(mapper, groupId, unusableDisks); Ctest << "groupId# " << groupId << " new content# " << context.FormatGroup(group) << Endl; context.CheckGroupErasure(group); } + UNIT_ASSERT(hasEmpty); } } @@ -686,155 +819,176 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { Y_UNIT_TEST(ReassignGroupTest3dc) { for (ui32 i = 0; i < 10000; ++i) { Ctest << "iteration# " << i << Endl; - std::map<ui32, std::pair<ui32, ui32>> nodeToLocation; - auto populate = [&](ui32 decommittedDatacenter, const std::set<std::pair<ui32, ui32>>& unusableDisks, - TGroupMapper::TGroupDefinition group) { - auto mapper = std::make_unique<TGroupMapper>(TTestContext::CreateGroupGeometry( - TBlobStorageGroupType::ErasureMirror3dc)); - std::map<TPDiskId, ui32> slots; - for (const auto& realm : group) { - for (const auto& domain : realm) { - for (const auto& pdisk : domain) { - if (pdisk == TPDiskId()) { - ++slots[pdisk]; - } - } - } - } - for (ui32 datacenter = 1, nodeId = 1; datacenter <= 4; ++datacenter) { - for (ui32 rack = 1; rack <= 4; ++rack, ++nodeId) { - NActorsInterconnect::TNodeLocation proto; - proto.SetDataCenter(ToString(datacenter)); - proto.SetModule(""); - proto.SetRack(ToString(rack)); - proto.SetUnit(""); - TPDiskId pdiskId(nodeId, 1); - mapper->RegisterPDisk({ - .PDiskId = pdiskId, - .Location{proto}, - .Usable = datacenter != decommittedDatacenter && !unusableDisks.contains({datacenter, rack}), - .NumSlots = slots[pdiskId], - .MaxSlots = 1, - .Groups{slots[pdiskId], 0}, - .SpaceAvailable = 0, - .Operational = true, - .Decommitted = datacenter == decommittedDatacenter, - }); - nodeToLocation[nodeId] = {datacenter, rack}; - } - } - return mapper; - }; - - auto dumpGroup = [&](const auto& group) { - std::map<std::pair<ui32, ui32>, TString> cells; - - for (ui32 i = 0; i < group.size(); ++i) { - for (ui32 j = 0; j < group[i].size(); ++j) { - const auto& [datacenter, rack] = nodeToLocation[group[i][j][0].NodeId]; - cells[{datacenter, rack}] = TStringBuilder() << i << "/" << j; - } - } - for (ui32 rack = 1; rack <= 4; ++rack) { - for (ui32 datacenter = 1; datacenter <= 4; ++datacenter) { - TString cell = cells[{datacenter, rack}]; - if (!cell) { - cell = "xxx"; - } - Ctest << cell << " "; - } - Ctest << Endl; - } - }; + const ui32 numDataCenters = 5; + const ui32 numRacks = 5; + TTestContext context(numDataCenters, 1, numRacks, 1, 1); TGroupMapper::TGroupDefinition group; + ui32 groupId; { - auto mapper = populate(0, {}, group); - TString error; - bool success = mapper->AllocateGroup(0, group, nullptr, 0, {}, 0, false, error); - UNIT_ASSERT_C(success, error); - Ctest << "After allocation" << Endl; - dumpGroup(group); + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1); + groupId = context.AllocateGroup(mapper, group); + Ctest << "group after allocation:" << Endl; + context.DumpGroup(group); } - ui32 decommittedDatacenter = RandomNumber<ui32>(5); - Ctest << "decommittedDatacenter# " << decommittedDatacenter << Endl; + ui32 decommittedDataCenter = RandomNumber<ui32>(numDataCenters + 1); + Ctest << "decommittedDataCenter# " << decommittedDataCenter << Endl; { + // randomly move some of disks from decommitted datacenter + TSet<TPDiskId> unusableDisks; for (auto& realm : group) { for (auto& domain : realm) { for (auto& pdisk : domain) { - if (nodeToLocation[pdisk.NodeId].first == decommittedDatacenter && RandomNumber(2u)) { - pdisk = {}; // reassign disk + if (context.GetDataCenter(pdisk) == decommittedDataCenter && RandomNumber(2u)) { + unusableDisks.insert(pdisk); } } } } - auto mapper = populate(decommittedDatacenter, {}, group); - TString error; - bool success = mapper->AllocateGroup(0, group, nullptr, 0, {}, 0, false, error); - UNIT_ASSERT_C(success, error); - Ctest << "After decomission" << Endl; - dumpGroup(group); - } - - std::set<std::pair<ui32, ui32>> unusableDisks; - ui32 unusableDatacenter = RandomNumber<ui32>(5); - if (unusableDatacenter) { - for (ui32 rack = 1; rack <= 4; ++rack) { - unusableDisks.emplace(unusableDatacenter, rack); - } + + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1, {}, {}, decommittedDataCenter); + group = context.ReallocateGroup(mapper, groupId, unusableDisks); + Ctest << "group after data center decommission:" << Endl; + context.DumpGroup(group); + } + + TSet<TPDiskId> unusableDisks; + ui32 unusableDataCenter = RandomNumber<ui32>(numDataCenters + 1); + Ctest << "unusableDataCenter# " << unusableDataCenter << Endl; + if (unusableDataCenter) { + context.IteratePDisks([&](const auto& pdiskId, const auto& record) { + if (record.DataCenterId == unusableDataCenter) { + unusableDisks.insert(pdiskId); + } + }); } for (ui32 i = 0; i < 2; ++i) { - unusableDatacenter = RandomNumber<ui32>(5); - if (unusableDatacenter) { - ui32 unusableRack = 1 + RandomNumber<ui32>(4); - unusableDisks.emplace(unusableDatacenter, unusableRack); + if (const ui32 unusableDataCenter = RandomNumber<ui32>(numDataCenters + 1)) { + const ui32 unusableRack = 1 + RandomNumber<ui32>(numRacks); + context.IteratePDisks([&](const auto& pdiskId, const auto& record) { + if (record.DataCenterId == unusableDataCenter && record.RackId == unusableRack) { + unusableDisks.insert(pdiskId); + } + }); } } { - for (auto& realm : group) { - for (auto& domain : realm) { - for (auto& pdisk : domain) { - if (unusableDisks.contains(nodeToLocation[pdisk.NodeId])) { - pdisk = {}; // reassign disk - } - } - } - } - auto mapper = populate(decommittedDatacenter, {}, group); - TString error; - bool success = mapper->AllocateGroup(0, group, nullptr, 0, {}, 0, false, error); - UNIT_ASSERT_C(success, error); - Ctest << "After remapping" << Endl; - dumpGroup(group); + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1); + auto group = context.ReallocateGroup(mapper, groupId, unusableDisks); + Ctest << "group after reallocation:" << Endl; + context.DumpGroup(group); + context.CheckGroupErasure(group, decommittedDataCenter); } - for (ui32 i = 0; i < group.size(); ++i) { - ui32 datacenterForRealm = 0; + Ctest << Endl; + } + } + + Y_UNIT_TEST(SanitizeGroupTest3dc) { + const ui32 numDataCenters = 3; + const ui32 numRacks = 5; + TTestContext context(numDataCenters, 1, numRacks, 1, 1); + TGroupMapper::TGroupDefinition group; + ui32 groupId; + { + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1); + groupId = context.AllocateGroup(mapper, group); + Ctest << "group after allocation:" << Endl; + context.DumpGroup(group); + } + auto checkLayout = [&](const auto& group) { + TGroupGeometryInfo geom = TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc); + THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout; + for (ui32 i = 0; i < group.size(); ++i) { for (ui32 j = 0; j < group[i].size(); ++j) { - std::set<ui32> racksInRealm; - - for (const auto& pdiskId : group[i][j]) { - const auto& [datacenter, rack] = nodeToLocation[pdiskId.NodeId]; - if (!datacenterForRealm) { - datacenterForRealm = datacenter; - } else if (datacenterForRealm == datacenter) { - // it's okay - } else if (decommittedDatacenter && (datacenter == decommittedDatacenter || datacenterForRealm == decommittedDatacenter)) { - // it's okay too, decomitted datacenter is partially broken - } else { - UNIT_FAIL("incorrect datacenter for realm"); - } - UNIT_ASSERT(racksInRealm.insert(rack).second); + for (ui32 k = 0; k < group[i][j].size(); ++k) { + layout.emplace(TVDiskIdShort(i, j, k), std::make_pair(context.GetLocation(group[i][j][k]), + group[i][j][k])); } } } + return CheckGroupLayout(geom, layout); + }; - Ctest << Endl; + UNIT_ASSERT(checkLayout(group)); + + for (ui32 n = 0; n < 1000; ++n) { + Ctest << Endl << "iteration# " << n << Endl; + + auto layout = context.ExportLayout(); + std::random_shuffle(layout.begin(), layout.end()); + context.ImportLayout(layout); + + Ctest << "group after layout shuffling:" << Endl; + context.DumpGroup(group); + + struct TQueueItem { + TGroupMapper::TGroupDefinition Group; + TString Path; + TSet<TGroupMapper::TGroupDefinition> Seen; + TSet<TVDiskIdShort> VDiskItems; + TSet<TPDiskId> PDiskItems; + }; + std::deque<TQueueItem> queue; + for (queue.push_back({.Group = group}); !queue.empty(); ) { + TQueueItem item = std::move(queue.front()); + queue.pop_front(); + const auto [it, inserted] = item.Seen.insert(item.Group); + UNIT_ASSERT(inserted); + UNIT_ASSERT(item.Seen.size() <= 9); + Cerr << "processing path# " << item.Path << Endl; + + auto candidates = checkLayout(item.Group); + if (!candidates) { + for (const TVDiskIdShort& vdiskId : candidates.Candidates) { + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.SetGroup(groupId, item.Group); + context.PopulateGroupMapper(mapper, 2); + const TPDiskId& pdiskId = item.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk]; + auto temp = context.ReallocateGroup(mapper, groupId, {pdiskId}, false, false, false); + TString path = TStringBuilder() << item.Path << "/" << (int)vdiskId.FailRealm << ":" + << (int)vdiskId.FailDomain << ":" << (int)vdiskId.VDisk << "@" << pdiskId; + Cerr << "path# " << path << Endl; + context.DumpGroup(temp); + + auto vdiskItems = item.VDiskItems; +// const auto [it1, inserted1] = vdiskItems.insert(vdiskId); +// UNIT_ASSERT_C(inserted1, "Duplicate group cell# " << vdiskId); + + auto pdiskItems = item.PDiskItems; +// const auto [it2, inserted2] = pdiskItems.insert(pdiskId); +// UNIT_ASSERT_C(inserted2, "Duplicate origin PDisk# " << pdiskId); + + queue.push_front({.Group = std::move(temp), .Path = std::move(path), .Seen = item.Seen, + .VDiskItems = std::move(vdiskItems), .PDiskItems = std::move(pdiskItems)}); + } + } + + Cerr << Endl; + } } } + + Y_UNIT_TEST(CheckNotToBreakFailModel) { + TTestContext context(4, 1, 3, 1, 1); + TGroupMapper::TGroupDefinition group; + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1); + ui32 groupId = context.AllocateGroup(mapper, group); + Ctest << "group after allocation:" << Endl; + context.DumpGroup(group); + group = context.ReallocateGroup(mapper, groupId, {group[0][0][0]}, false, false, true); + Ctest << "group after reallocation:" << Endl; + context.DumpGroup(group); + UNIT_ASSERT(group.empty()); + } } diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h index ad8eea28c77..2b32da63a9a 100644 --- a/ydb/core/mind/bscontroller/impl.h +++ b/ydb/core/mind/bscontroller/impl.h @@ -78,6 +78,7 @@ public: class TConfigState; class TGroupSelector; class TGroupFitter; + class TSelfHealActor; using TVSlotReadyTimestampQ = std::list<std::pair<TInstant, TVSlotInfo*>>; @@ -422,8 +423,7 @@ public: } bool AcceptsNewSlots() const { - return Status == NKikimrBlobStorage::EDriveStatus::ACTIVE - && DecommitStatus == NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE; + return Status == NKikimrBlobStorage::EDriveStatus::ACTIVE; } bool Decommitted() const { @@ -1344,6 +1344,7 @@ private: THashMap<TPDiskId, ui32> StaticPDiskSlotUsage; std::unique_ptr<TStoragePoolStat> StoragePoolStat; bool StopGivingGroups = false; + bool GroupLayoutSanitizer = false; NKikimrBlobStorage::TSerialManagementStage::E SerialManagementStage = NKikimrBlobStorage::TSerialManagementStage::DISCOVER_SERIAL; @@ -1569,6 +1570,16 @@ private: void Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev); void HandleHostRecordsTimeToLiveExceeded(); +public: + // Self-heal actor's main purpose is to monitor FAULTY pdisks and to slightly move groups out of them; every move + // should not render group unusable, also it should not exceed its fail model. It also takes into account replication + // broker features such as only one vslot over PDisk is being replicated at a moment. + // + // It interacts with BS_CONTROLLER and group observer (which provides information about group state on a per-vdisk + // basis). BS_CONTROLLER reports faulty PDisks and all involved groups in a push notification manner. + IActor *CreateSelfHealActor(); + +private: //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Online state void Handle(TEvBlobStorage::TEvControllerRegisterNode::TPtr &ev); diff --git a/ydb/core/mind/bscontroller/load_everything.cpp b/ydb/core/mind/bscontroller/load_everything.cpp index 06fc1a8679c..8f33e2a1b63 100644 --- a/ydb/core/mind/bscontroller/load_everything.cpp +++ b/ydb/core/mind/bscontroller/load_everything.cpp @@ -83,6 +83,7 @@ public: Self->GroupReservePart = state.GetValue<T::GroupReservePart>(); Self->MaxScrubbedDisksAtOnce = state.GetValue<T::MaxScrubbedDisksAtOnce>(); Self->PDiskSpaceColorBorder = state.GetValue<T::PDiskSpaceColorBorder>(); + Self->GroupLayoutSanitizer = state.GetValue<T::GroupLayoutSanitizer>(); Self->SysViewChangedSettings = true; } } diff --git a/ydb/core/mind/bscontroller/scheme.h b/ydb/core/mind/bscontroller/scheme.h index 58f33878a98..cd47ac234d5 100644 --- a/ydb/core/mind/bscontroller/scheme.h +++ b/ydb/core/mind/bscontroller/scheme.h @@ -85,11 +85,13 @@ struct Schema : NIceDb::Schema { struct GroupReservePart : Column<15, NScheme::NTypeIds::Uint32> { static constexpr Type Default = 0; }; // parts per million struct MaxScrubbedDisksAtOnce : Column<16, NScheme::NTypeIds::Uint32> { static constexpr Type Default = Max<ui32>(); }; // no limit struct PDiskSpaceColorBorder : Column<17, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::TPDiskSpaceColor::E; static constexpr Type Default = NKikimrBlobStorage::TPDiskSpaceColor::GREEN; }; + struct GroupLayoutSanitizer : Column<18, NScheme::NTypeIds::Bool> { static constexpr Type Default = false; }; using TKey = TableKey<FixedKey>; using TColumns = TableColumns<FixedKey, NextGroupID, SchemaVersion, NextOperationLogIndex, DefaultMaxSlots, InstanceId, SelfHealEnable, DonorModeEnable, ScrubPeriodicity, SerialManagementStage, NextStoragePoolId, - PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder>; + PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder, + GroupLayoutSanitizer>; }; struct VSlot : Table<5> { diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp index b732d98e937..86ea82a627e 100644 --- a/ydb/core/mind/bscontroller/self_heal.cpp +++ b/ydb/core/mind/bscontroller/self_heal.cpp @@ -2,6 +2,8 @@ #include "impl.h" #include "vdisk_status_tracker.h" #include "config.h" +#include "group_geometry_info.h" +#include "group_layout_checker.h" namespace NKikimr::NBsController { @@ -111,7 +113,9 @@ namespace NKikimr::NBsController { void Handle(TEvBlobStorage::TEvVStatusResult::TPtr& ev) { const auto& record = ev->Get()->Record; - STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH03, "Reassigner TEvVStatusResult", (GroupId, GroupId), (Response, record)); + STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH03, "Reassigner TEvVStatusResult", (GroupId, GroupId), + (Status, record.GetStatus()), (JoinedGroup, record.GetJoinedGroup()), + (Replicated, record.GetReplicated())); bool diskIsOk = false; if (record.GetStatus() == NKikimrProto::RACE) { @@ -169,6 +173,13 @@ namespace NKikimr::NBsController { if (!record.GetResponse().GetSuccess()) { STLOG(PRI_WARN, BS_SELFHEAL, BSSH07, "Reassigner ReassignGroupDisk request failed", (GroupId, GroupId), (VDiskToReplace, VDiskToReplace), (Response, record)); + } else { + TString items = "none"; + for (const auto& item : record.GetResponse().GetStatus(0).GetReassignedItem()) { + items = TStringBuilder() << VDiskIDFromVDiskID(item.GetVDiskId()) << ": " + << TVSlotId(item.GetFrom()) << " -> " << TVSlotId(item.GetTo()); + } + STLOG(PRI_INFO, BS_SELFHEAL, BSSH09, "Reassigner succeeded", (GroupId, GroupId), (Items, items)); } Finish(record.GetResponse().GetSuccess()); } @@ -204,23 +215,36 @@ namespace NKikimr::NBsController { }) }; - class TSelfHealActor : public TActorBootstrapped<TSelfHealActor> { + class TBlobStorageController::TSelfHealActor : public TActorBootstrapped<TSelfHealActor> { static constexpr TDuration MinRetryTimeout = TDuration::Seconds(1); static constexpr TDuration MaxRetryTimeout = TDuration::Seconds(60); - struct TGroupRecord { + struct TWithFaultyDisks {}; + struct TWithInvalidLayout {}; + + struct TGroupRecord + : TIntrusiveListItem<TGroupRecord, TWithFaultyDisks> + , TIntrusiveListItem<TGroupRecord, TWithInvalidLayout> + { + const TGroupId GroupId; TEvControllerUpdateSelfHealInfo::TGroupContent Content; TActorId ReassignerActorId; // reassigner in flight TDuration RetryTimeout = MinRetryTimeout; TInstant NextRetryTimestamp = TInstant::Zero(); THashMap<TVDiskID, TVDiskStatusTracker> VDiskStatus; + bool LayoutValid = false; + + TGroupRecord(TGroupId groupId) : GroupId(groupId) {} }; const ui64 TabletId; TActorId ControllerId; THashMap<TGroupId, TGroupRecord> Groups; - TSet<TGroupId> GroupsWithFaultyDisks; + TIntrusiveList<TGroupRecord, TWithFaultyDisks> GroupsWithFaultyDisks; + TIntrusiveList<TGroupRecord, TWithInvalidLayout> GroupsWithInvalidLayout; std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups; + bool GroupLayoutSanitizer = false; + std::optional<THostRecordMapImpl> HostRecords; public: TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups) @@ -236,11 +260,17 @@ namespace NKikimr::NBsController { void Handle(TEvControllerUpdateSelfHealInfo::TPtr& ev) { const TInstant now = TActivationContext::Now(); + if (const auto& setting = ev->Get()->GroupLayoutSanitizer) { + GroupLayoutSanitizer = *setting; + } for (const auto& [groupId, data] : ev->Get()->GroupsToUpdate) { if (data) { - auto& g = Groups[groupId]; + const auto [it, inserted] = Groups.try_emplace(groupId, groupId); + auto& g = it->second; bool hasFaultyDisks = false; g.Content = std::move(*data); + g.LayoutValid = false; + GroupsWithInvalidLayout.PushBack(&g); for (const auto& [vdiskId, vdisk] : g.Content.VDisks) { g.VDiskStatus[vdiskId].Update(vdisk.VDiskStatus, now); hasFaultyDisks |= vdisk.Faulty; @@ -253,9 +283,9 @@ namespace NKikimr::NBsController { } } if (hasFaultyDisks) { - GroupsWithFaultyDisks.insert(groupId); + GroupsWithFaultyDisks.PushBack(&g); } else { - GroupsWithFaultyDisks.erase(groupId); + GroupsWithFaultyDisks.Remove(&g); } } else { // find the group to delete @@ -272,7 +302,6 @@ namespace NKikimr::NBsController { } // remove the group - GroupsWithFaultyDisks.erase(groupId); Groups.erase(it); } } @@ -293,29 +322,40 @@ namespace NKikimr::NBsController { ui64 counter = 0; - for (const TGroupId groupId : GroupsWithFaultyDisks) { - // find the group to process - const auto it = Groups.find(groupId); - Y_VERIFY(it != Groups.end()); - TGroupRecord& group = it->second; - + for (TGroupRecord& group : GroupsWithFaultyDisks) { if (group.ReassignerActorId || now < group.NextRetryTimestamp) { continue; // we are already running reassigner for this group } // check if it is possible to move anything out if (const auto v = FindVDiskToReplace(group.VDiskStatus, group.Content, now)) { - group.ReassignerActorId = Register(new TReassignerActor(ControllerId, groupId, group.Content, *v)); + group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v)); } else { ++counter; // this group can't be reassigned right now } } + if (GroupLayoutSanitizer) { + for (auto it = GroupsWithInvalidLayout.begin(); it != GroupsWithInvalidLayout.end(); ) { + TGroupRecord& group = *it++; + Y_VERIFY(!group.LayoutValid); + if (group.ReassignerActorId || now < group.NextRetryTimestamp) { + // nothing to do + } else if (const auto v = FindVDiskToReplaceByLayout(group, now)) { + group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v)); + } else if (group.LayoutValid) { + GroupsWithInvalidLayout.Remove(&group); + } else { + ++counter; + } + } + } + UnreassignableGroups->store(counter); } std::optional<TVDiskID> FindVDiskToReplace(const THashMap<TVDiskID, TVDiskStatusTracker>& tracker, - const TEvControllerUpdateSelfHealInfo::TGroupContent& content, const TInstant now) { + const TEvControllerUpdateSelfHealInfo::TGroupContent& content, TInstant now) { auto status = [&](const TVDiskID& id) { try { return tracker.at(id).GetStatus(now); @@ -362,6 +402,41 @@ namespace NKikimr::NBsController { } } + std::optional<TVDiskID> FindVDiskToReplaceByLayout(TGroupRecord& group, TInstant now) { + THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout; + for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { + Y_VERIFY(HostRecords); + if (!vdisk.Decommitted) { + layout.emplace(vdiskId, std::make_pair(HostRecords->GetLocation(vdisk.Location.NodeId), + vdisk.Location.ComprisingPDiskId())); + } + } + const TLayoutCheckResult checkResult = CheckGroupLayout(*group.Content.Geometry, layout); + if (checkResult) { // group is valid + group.LayoutValid = true; + return std::nullopt; + } + + THashSet<TVDiskIdShort> badDisks; + for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { + const auto it = group.VDiskStatus.find(vdiskId); + if (it == group.VDiskStatus.end() || it->second.GetStatus(now) != NKikimrBlobStorage::EVDiskStatus::READY || vdisk.Bad) { + badDisks.insert(vdiskId); + } + } + if (badDisks.empty()) { + return TVDiskID(group.GroupId, group.Content.Generation, checkResult.Candidates.front()); + } else if (badDisks.size() == 1) { + for (const auto& vdiskId : checkResult.Candidates) { + if (badDisks.contains(vdiskId)) { + return TVDiskID(group.GroupId, group.Content.Generation, vdiskId); + } + } + } + + return std::nullopt; + } + void HandleWakeup() { CheckGroups(); Schedule(TDuration::Seconds(10), new TEvents::TEvWakeup); @@ -439,9 +514,8 @@ namespace NKikimr::NBsController { TABLE_CLASS("table-sortable table") { TABLEHEAD() { ui32 numCols = 0; - for (const auto& id : GroupsWithFaultyDisks) { - const auto& info = Groups.at(id); - numCols = Max<ui32>(numCols, info.Content.VDisks.size()); + for (const TGroupRecord& group : GroupsWithFaultyDisks) { + numCols = Max<ui32>(numCols, group.Content.VDisks.size()); } TABLER() { @@ -452,20 +526,19 @@ namespace NKikimr::NBsController { } } TABLEBODY() { - for (const auto& id : GroupsWithFaultyDisks) { - const auto& info = Groups.at(id); + for (const TGroupRecord& group : GroupsWithFaultyDisks) { TABLER() { out << "<td rowspan='2'><a href='?TabletID=" << TabletId - << "&page=GroupDetail&GroupId=" << id << "'>" - << id << "</a>:" << info.Content.Generation << "</td>"; + << "&page=GroupDetail&GroupId=" << group.GroupId << "'>" + << group.GroupId << "</a>:" << group.Content.Generation << "</td>"; - for (const auto& [vdiskId, vdisk] : info.Content.VDisks) { + for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { TABLED() { out << vdiskId.ToString(); out << "<br/>"; out << vdisk.VDiskStatus; out << "<br/><strong>"; - if (const auto it = info.VDiskStatus.find(vdiskId); it != info.VDiskStatus.end()) { + if (const auto it = group.VDiskStatus.find(vdiskId); it != group.VDiskStatus.end()) { if (const auto& status = it->second.GetStatus(now)) { out << *status; } else { @@ -479,7 +552,7 @@ namespace NKikimr::NBsController { } } TABLER() { - for (const auto& [vdiskId, vdisk] : info.Content.VDisks) { + for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { TABLED() { const auto& l = vdisk.Location; if (vdisk.Faulty) { @@ -506,17 +579,22 @@ namespace NKikimr::NBsController { } } + void Handle(TEvInterconnect::TEvNodesInfo::TPtr ev) { + HostRecords.emplace(ev->Get()); + } + STRICT_STFUNC(StateFunc, { cFunc(TEvents::TSystem::Poison, PassAway); hFunc(TEvControllerUpdateSelfHealInfo, Handle); hFunc(NMon::TEvRemoteHttpInfo, Handle); hFunc(TEvReassignerDone, Handle); cFunc(TEvents::TSystem::Wakeup, HandleWakeup); + hFunc(TEvInterconnect::TEvNodesInfo, Handle); }) }; - IActor *CreateSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups) { - return new TSelfHealActor(tabletId, std::move(unreassignableGroups)); + IActor *TBlobStorageController::CreateSelfHealActor() { + return new TSelfHealActor(TabletID(), SelfHealUnreassignableGroups); } void TBlobStorageController::InitializeSelfHealState() { @@ -525,10 +603,13 @@ namespace NKikimr::NBsController { ev->GroupsToUpdate.emplace(groupId, TEvControllerUpdateSelfHealInfo::TGroupContent()); } FillInSelfHealGroups(*ev, nullptr); + ev->GroupLayoutSanitizer = GroupLayoutSanitizer; Send(SelfHealId, ev.Release()); } void TBlobStorageController::FillInSelfHealGroups(TEvControllerUpdateSelfHealInfo& msg, TConfigState *state) { + THashMap<TBoxStoragePoolId, std::shared_ptr<TGroupGeometryInfo>> geomCache; + for (auto& [groupId, group] : msg.GroupsToUpdate) { if (!group) { continue; @@ -540,11 +621,24 @@ namespace NKikimr::NBsController { group->Generation = p->Generation; group->Type = TBlobStorageGroupType(p->ErasureSpecies); + if (auto it = geomCache.find(p->StoragePoolId); it != geomCache.end()) { + group->Geometry = it->second; + } else { + const TMap<TBoxStoragePoolId, TStoragePoolInfo>& storagePools = state + ? state->StoragePools.Get() + : StoragePools; + const auto spIt = storagePools.find(p->StoragePoolId); + Y_VERIFY(spIt != storagePools.end()); + group->Geometry = std::make_unique<TGroupGeometryInfo>(group->Type, spIt->second.GetGroupGeometry()); + geomCache.emplace(p->StoragePoolId, group->Geometry); + } + for (const TVSlotInfo *slot : p->VDisksInGroup) { group->VDisks[slot->GetVDiskId()] = { slot->VSlotId, slot->PDisk->ShouldBeSettledBySelfHeal(), slot->PDisk->BadInTermsOfSelfHeal(), + slot->PDisk->Decommitted(), slot->GetStatus(), }; } diff --git a/ydb/core/mind/bscontroller/self_heal.h b/ydb/core/mind/bscontroller/self_heal.h index 287f05d4670..b2740f4800b 100644 --- a/ydb/core/mind/bscontroller/self_heal.h +++ b/ydb/core/mind/bscontroller/self_heal.h @@ -6,29 +6,26 @@ namespace NKikimr::NBsController { + class TGroupGeometryInfo; + struct TEvControllerUpdateSelfHealInfo : TEventLocal<TEvControllerUpdateSelfHealInfo, TEvBlobStorage::EvControllerUpdateSelfHealInfo> { struct TGroupContent { struct TVDiskInfo { TVSlotId Location; bool Faulty; bool Bad; + bool Decommitted; NKikimrBlobStorage::EVDiskStatus VDiskStatus; }; ui32 Generation; TBlobStorageGroupType Type; TMap<TVDiskID, TVDiskInfo> VDisks; + std::shared_ptr<TGroupGeometryInfo> Geometry; }; THashMap<TGroupId, std::optional<TGroupContent>> GroupsToUpdate; // groups with faulty groups that are changed or got faulty PDisks for the first time TVector<std::pair<TVDiskID, NKikimrBlobStorage::EVDiskStatus>> VDiskStatusUpdate; + std::optional<bool> GroupLayoutSanitizer; }; - // Self-heal actor's main purpose is to monitor FAULTY pdisks and to slightly move groups out of them; every move - // should not render group unusable, also it should not exceed its fail model. It also takes into account replication - // broker features such as only one vslot over PDisk is being replicated at a moment. - // - // It interacts with BS_CONTROLLER and group observer (which provides information about group state on a per-vdisk - // basis). BS_CONTROLLER reports faulty PDisks and all involved groups in a push notification manner. - IActor *CreateSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups); - } // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/sys_view.cpp b/ydb/core/mind/bscontroller/sys_view.cpp index d233cfc67bc..756035d051c 100644 --- a/ydb/core/mind/bscontroller/sys_view.cpp +++ b/ydb/core/mind/bscontroller/sys_view.cpp @@ -310,7 +310,7 @@ public: TGroupMapper::TGroupDefinition group; TString error; std::deque<ui64> groupSizes; - while (mapper.AllocateGroup(groupSizes.size(), group, nullptr, 0, {}, 0, false, error)) { + while (mapper.AllocateGroup(groupSizes.size(), group, {}, {}, 0, false, error)) { std::vector<TGroupDiskInfo> disks; std::deque<NKikimrBlobStorage::TPDiskMetrics> pdiskMetrics; std::deque<NKikimrBlobStorage::TVDiskMetrics> vdiskMetrics; diff --git a/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp b/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp index 5c2d0aad201..d4205abea84 100644 --- a/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp +++ b/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp @@ -1,6 +1,7 @@ #include <library/cpp/testing/unittest/registar.h> #include <ydb/core/util/testactorsys.h> #include <ydb/core/mind/bscontroller/self_heal.h> +#include <ydb/core/mind/bscontroller/impl.h> using namespace NActors; using namespace NKikimr; @@ -13,8 +14,8 @@ void RunTestCase(TCallback&& callback) { TTestActorSystem runtime(1); runtime.Start(); const TActorId& parentId = runtime.AllocateEdgeActor(1); - std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups = std::make_shared<std::atomic_uint64_t>(); - const TActorId& selfHealId = runtime.Register(CreateSelfHealActor(1, UnreassignableGroups), parentId, {}, {}, 1); + TBlobStorageController Controller({}, new TTabletStorageInfo(1, TTabletTypes::FLAT_BS_CONTROLLER)); + const TActorId& selfHealId = runtime.Register(Controller.CreateSelfHealActor(), parentId, {}, {}, 1); callback(selfHealId, parentId, runtime); runtime.Stop(); } diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto index 9215c37d3ff..bc7f54aa6e9 100644 --- a/ydb/core/protos/blobstorage_config.proto +++ b/ydb/core/protos/blobstorage_config.proto @@ -434,6 +434,7 @@ message TUpdateSettings { repeated uint32 GroupReservePartPPM = 7; repeated uint32 MaxScrubbedDisksAtOnce = 8; repeated NKikimrBlobStorage.TPDiskSpaceColor.E PDiskSpaceColorBorder = 9; + repeated bool EnableGroupLayoutSanitizer = 10; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/ydb/core/util/testactorsys.cpp b/ydb/core/util/testactorsys.cpp index a678744a2f1..ead159d35ed 100644 --- a/ydb/core/util/testactorsys.cpp +++ b/ydb/core/util/testactorsys.cpp @@ -133,16 +133,24 @@ TActorId TTestActorSystem::CreateTestBootstrapper(TTabletStorageInfo *info, std: } void TTestActorSystem::SetupTabletRuntime(ui32 numDataCenters, ui32 stateStorageNodeId, ui32 targetNodeId) { - auto setup = MakeIntrusive<TTableNameserverSetup>(); - ui32 nodeCountInDC = (MaxNodeId + numDataCenters - 1) / numDataCenters; - for (ui32 nodeId : GetNodes()) { - const TString name = Sprintf("127.0.0.%u", nodeId); + const ui32 nodeCountInDC = (MaxNodeId + numDataCenters - 1) / numDataCenters; + auto locationGenerator = [&](ui32 nodeId) { const ui32 dcNum = (nodeId + nodeCountInDC - 1) / nodeCountInDC; NActorsInterconnect::TNodeLocation location; location.SetDataCenter(ToString(dcNum)); location.SetRack(ToString(nodeId)); location.SetUnit(ToString(nodeId)); - setup->StaticNodeTable[nodeId] = {name, name, name, 19001, TNodeLocation(location)}; + return TNodeLocation(location); + }; + SetupTabletRuntime(locationGenerator, stateStorageNodeId, targetNodeId); +} + +void TTestActorSystem::SetupTabletRuntime(const std::function<TNodeLocation(ui32)>& locationGenerator, + ui32 stateStorageNodeId, ui32 targetNodeId) { + auto setup = MakeIntrusive<TTableNameserverSetup>(); + for (ui32 nodeId : GetNodes()) { + const TString name = Sprintf("127.0.0.%u", nodeId); + setup->StaticNodeTable[nodeId] = {name, name, name, 19001, locationGenerator(nodeId)}; } for (ui32 nodeId : GetNodes()) { diff --git a/ydb/core/util/testactorsys.h b/ydb/core/util/testactorsys.h index 6bbe4dd3eb3..5037f7ab715 100644 --- a/ydb/core/util/testactorsys.h +++ b/ydb/core/util/testactorsys.h @@ -661,6 +661,8 @@ public: // tablet-related utility functions void SetupTabletRuntime(ui32 numDataCenters = 1, ui32 stateStorageNodeId = 0, ui32 targetNodeId = 0); + void SetupTabletRuntime(const std::function<TNodeLocation(ui32)>& locationGenerator, ui32 stateStorageNodeId = 0, + ui32 targetNodeId = 0); static NTabletPipe::TClientConfig GetPipeConfigWithRetries(); void SendToPipe(ui64 tabletId, const TActorId& sender, IEventBase* payload, ui64 cookie, const NKikimr::NTabletPipe::TClientConfig& pipeConfig); static TTabletStorageInfo *CreateTestTabletInfo(ui64 tabletId, TTabletTypes::EType tabletType, TBlobStorageGroupType::EErasureSpecies erasure, ui32 groupId); diff --git a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema index 28608d95cd9..928757d0259 100644 --- a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema +++ b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema @@ -12,6 +12,11 @@ "ColumnType": "Uint32" }, { + "ColumnId": 18, + "ColumnName": "GroupLayoutSanitizer", + "ColumnType": "Bool" + }, + { "ColumnId": 1, "ColumnName": "FixedKey", "ColumnType": "Bool" @@ -92,6 +97,7 @@ "0": { "Columns": [ 17, + 18, 1, 2, 4, |
