summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexander Rutkovsky <[email protected]>2022-04-08 15:27:10 +0300
committerAlexander Rutkovsky <[email protected]>2022-04-08 15:27:10 +0300
commit0db69be232f1d207c45edeb606df5f6223a89ba7 (patch)
tree158de59a8c8a5779b3b55817a6b22033f6477a33
parentf22d9cd81bf3f86db7b45ecabd93397349d88263 (diff)
Add group layout sanitizer feature KIKIMR-14580
ref:ee11bc2fb183c18c214c9b4153d83f0b0d3920d7
-rw-r--r--ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt1
-rw-r--r--ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt1
-rw-r--r--ydb/core/blobstorage/ut_blobstorage/lib/env.h16
-rw-r--r--ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp103
-rw-r--r--ydb/core/mind/bscontroller/CMakeLists.txt1
-rw-r--r--ydb/core/mind/bscontroller/bsc.cpp3
-rw-r--r--ydb/core/mind/bscontroller/config_cmd.cpp7
-rw-r--r--ydb/core/mind/bscontroller/config_fit_groups.cpp22
-rw-r--r--ydb/core/mind/bscontroller/group_geometry_info.h5
-rw-r--r--ydb/core/mind/bscontroller/group_layout_checker.cpp47
-rw-r--r--ydb/core/mind/bscontroller/group_layout_checker.h177
-rw-r--r--ydb/core/mind/bscontroller/group_mapper.cpp864
-rw-r--r--ydb/core/mind/bscontroller/group_mapper.h5
-rw-r--r--ydb/core/mind/bscontroller/group_mapper_ut.cpp456
-rw-r--r--ydb/core/mind/bscontroller/impl.h15
-rw-r--r--ydb/core/mind/bscontroller/load_everything.cpp1
-rw-r--r--ydb/core/mind/bscontroller/scheme.h4
-rw-r--r--ydb/core/mind/bscontroller/self_heal.cpp150
-rw-r--r--ydb/core/mind/bscontroller/self_heal.h13
-rw-r--r--ydb/core/mind/bscontroller/sys_view.cpp2
-rw-r--r--ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp5
-rw-r--r--ydb/core/protos/blobstorage_config.proto1
-rw-r--r--ydb/core/util/testactorsys.cpp18
-rw-r--r--ydb/core/util/testactorsys.h2
-rw-r--r--ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema6
25 files changed, 1245 insertions, 680 deletions
diff --git a/ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt b/ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt
index 46272ff7922..4af5291bff6 100644
--- a/ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt
+++ b/ydb/core/blobstorage/ut_blobstorage/CMakeLists.darwin.txt
@@ -39,6 +39,7 @@ target_sources(ydb-core-blobstorage-ut_blobstorage PRIVATE
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/incorrect_queries.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/main.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/mirror3of4.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/space_check.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/sync.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/replication.cpp
diff --git a/ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt b/ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt
index 0c713121adf..c7a8da56f7d 100644
--- a/ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt
+++ b/ydb/core/blobstorage/ut_blobstorage/CMakeLists.linux.txt
@@ -42,6 +42,7 @@ target_sources(ydb-core-blobstorage-ut_blobstorage PRIVATE
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/incorrect_queries.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/main.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/mirror3of4.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/space_check.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/sync.cpp
${CMAKE_SOURCE_DIR}/ydb/core/blobstorage/ut_blobstorage/replication.cpp
diff --git a/ydb/core/blobstorage/ut_blobstorage/lib/env.h b/ydb/core/blobstorage/ut_blobstorage/lib/env.h
index e3dc7cc8f4b..685594fa4c7 100644
--- a/ydb/core/blobstorage/ut_blobstorage/lib/env.h
+++ b/ydb/core/blobstorage/ut_blobstorage/lib/env.h
@@ -30,6 +30,7 @@ struct TEnvironmentSetup {
const ui32 ControllerNodeId = 1;
const bool Cache = false;
const ui32 NumDataCenters = 0;
+ const std::function<TNodeLocation(ui32)> LocationGenerator;
};
const TSettings Settings;
@@ -108,7 +109,11 @@ struct TEnvironmentSetup {
Runtime->Start();
auto *appData = Runtime->GetAppData();
appData->DomainsInfo->AddDomain(TDomainsInfo::TDomain::ConstructEmptyDomain("dom", DomainId).Release());
- Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId);
+ if (Settings.LocationGenerator) {
+ Runtime->SetupTabletRuntime(Settings.LocationGenerator, Settings.ControllerNodeId);
+ } else {
+ Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId);
+ }
SetupStaticStorage();
SetupTablet();
SetupStorage();
@@ -120,7 +125,11 @@ struct TEnvironmentSetup {
void StartNode(ui32 nodeId) {
Runtime->StartNode(nodeId);
- Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId, nodeId);
+ if (Settings.LocationGenerator) {
+ Runtime->SetupTabletRuntime(Settings.LocationGenerator, Settings.ControllerNodeId, nodeId);
+ } else {
+ Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId, nodeId);
+ }
if (nodeId == Settings.ControllerNodeId) {
SetupStaticStorage();
SetupTablet();
@@ -553,12 +562,13 @@ struct TEnvironmentSetup {
}
}
- void UpdateSettings(bool selfHeal, bool donorMode) {
+ void UpdateSettings(bool selfHeal, bool donorMode, bool groupLayoutSanitizer = false) {
NKikimrBlobStorage::TConfigRequest request;
auto *cmd = request.AddCommand();
auto *us = cmd->MutableUpdateSettings();
us->AddEnableSelfHeal(selfHeal);
us->AddEnableDonorMode(donorMode);
+ us->AddEnableGroupLayoutSanitizer(groupLayoutSanitizer);
auto response = Invoke(request);
UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
}
diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
new file mode 100644
index 00000000000..9e17730c3d6
--- /dev/null
+++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
@@ -0,0 +1,103 @@
+#include <ydb/core/blobstorage/ut_blobstorage/lib/env.h>
+
+Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
+ Y_UNIT_TEST(Test3dc) {
+ const ui32 numRacks = 15;
+ std::vector<ui32> nodesPerRack(numRacks);
+ std::vector<ui32> nodeToRack;
+ for (ui32 numFilledRacks = 0; numFilledRacks < numRacks; ) {
+// const ui32 rackId = RandomNumber(numRacks);
+ const ui32 rackId = numFilledRacks;
+ nodeToRack.emplace_back(rackId);
+ numFilledRacks += !nodesPerRack[rackId]++;
+ }
+ const ui32 numDatacenters = 3;
+ std::vector<ui32> rackToDatacenter;
+ for (ui32 i = 0; i < numRacks; ++i) {
+ rackToDatacenter.push_back(i % numDatacenters);
+ }
+
+ std::vector<TNodeLocation> locations;
+ for (ui32 i = 0; i < nodeToRack.size(); ++i) {
+ NActorsInterconnect::TNodeLocation proto;
+ proto.SetDataCenter(ToString(rackToDatacenter[nodeToRack[i]]));
+ proto.SetRack(ToString(nodeToRack[i]));
+ proto.SetUnit(ToString(i));
+ locations.emplace_back(proto);
+ }
+
+ TEnvironmentSetup env{{
+ .NodeCount = (ui32)nodeToRack.size(),
+ .Erasure = TBlobStorageGroupType::ErasureMirror3dc,
+ .LocationGenerator = [&](ui32 nodeId) { return locations[nodeId - 1]; },
+ }};
+
+ auto getGroupsWithIncorrectLayout = [&] {
+ auto config = env.FetchBaseConfig();
+
+ std::map<ui32, std::tuple<TString, TString>> nodeIdToLocation;
+ for (const auto& node : config.GetNode()) {
+ const auto& location = node.GetLocation();
+ nodeIdToLocation.emplace(node.GetNodeId(), std::make_tuple(location.GetDataCenter(), location.GetRack()));
+ }
+
+ std::map<ui32, std::vector<std::vector<std::tuple<TString, TString>>>> groups;
+ for (const auto& vslot : config.GetVSlot()) {
+ auto& group = groups[vslot.GetGroupId()];
+ if (group.empty()) {
+ group.resize(3, {3, {"", ""}});
+ }
+ group[vslot.GetFailRealmIdx()][vslot.GetFailDomainIdx()] = nodeIdToLocation[vslot.GetVSlotId().GetNodeId()];
+ }
+
+ std::set<ui32> badGroups;
+
+ for (auto& [groupId, group] : groups) {
+ std::set<TString> usedRealms;
+
+ for (const auto& row : group) {
+ TString realm;
+ std::set<TString> usedRacks;
+
+ for (const auto& [dc, rack] : row) {
+ Y_VERIFY(dc && rack);
+
+ if (!usedRacks.insert(rack).second) {
+ badGroups.insert(groupId);
+ }
+
+ if (!realm) {
+ if (!usedRealms.insert(dc).second) {
+ badGroups.insert(groupId);
+ }
+ realm = dc;
+ } else if (realm != dc) {
+ badGroups.insert(groupId);
+ }
+ }
+ }
+ }
+
+ return badGroups;
+ };
+
+ const ui32 disksPerNode = 1;
+ const ui32 slotsPerDisk = 3;
+ env.CreateBoxAndPool(disksPerNode, nodeToRack.size() * disksPerNode * slotsPerDisk / 9);
+ env.Sim(TDuration::Seconds(30));
+ auto before = getGroupsWithIncorrectLayout();
+ Cerr << "bad groups before shuffling# " << FormatList(before) << Endl;
+ UNIT_ASSERT(before.empty());
+ env.Cleanup();
+ std::random_shuffle(locations.begin(), locations.end());
+ env.Initialize();
+ env.Sim(TDuration::Seconds(100));
+ auto after = getGroupsWithIncorrectLayout();
+ Cerr << "bad groups just after shuffling# " << FormatList(after) << Endl;
+ env.UpdateSettings(true, false, true);
+ env.Sim(TDuration::Minutes(15));
+ auto corrected = getGroupsWithIncorrectLayout();
+ Cerr << "bad groups after shuffling and fixing# " << FormatList(corrected) << Endl;
+// UNIT_ASSERT(corrected.empty());
+ }
+}
diff --git a/ydb/core/mind/bscontroller/CMakeLists.txt b/ydb/core/mind/bscontroller/CMakeLists.txt
index d897fd44761..05311af9ab1 100644
--- a/ydb/core/mind/bscontroller/CMakeLists.txt
+++ b/ydb/core/mind/bscontroller/CMakeLists.txt
@@ -37,6 +37,7 @@ target_sources(core-mind-bscontroller PRIVATE
${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/drop_donor.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/get_group.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/grouper.cpp
+ ${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/group_layout_checker.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/group_mapper.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/group_reconfigure_wipe.cpp
${CMAKE_SOURCE_DIR}/ydb/core/mind/bscontroller/init_scheme.cpp
diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp
index 779381fd513..bb44c7fbb04 100644
--- a/ydb/core/mind/bscontroller/bsc.cpp
+++ b/ydb/core/mind/bscontroller/bsc.cpp
@@ -116,7 +116,7 @@ void TBlobStorageController::OnActivateExecutor(const TActorContext&) {
}
// create self-heal actor
- SelfHealId = Register(CreateSelfHealActor(TabletID(), SelfHealUnreassignableGroups));
+ SelfHealId = Register(CreateSelfHealActor());
// create stat processor
StatProcessorActorId = Register(CreateStatProcessorActor());
@@ -152,6 +152,7 @@ void TBlobStorageController::Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev) {
const bool initial = !HostRecords;
HostRecords = std::make_shared<THostRecordMap::element_type>(ev->Get());
Schedule(TDuration::Minutes(5), new TEvPrivate::TEvHostRecordsTimeToLiveExceeded);
+ TActivationContext::Send(ev->Forward(SelfHealId));
if (initial) {
Execute(CreateTxInitScheme());
}
diff --git a/ydb/core/mind/bscontroller/config_cmd.cpp b/ydb/core/mind/bscontroller/config_cmd.cpp
index 2fc925a9cd7..f4bb7d5aa1c 100644
--- a/ydb/core/mind/bscontroller/config_cmd.cpp
+++ b/ydb/core/mind/bscontroller/config_cmd.cpp
@@ -126,6 +126,13 @@ namespace NKikimr::NBsController {
Self->PDiskSpaceColorBorder = static_cast<T::PDiskSpaceColorBorder::Type>(value);
db.Table<T>().Key(true).Update<T::PDiskSpaceColorBorder>(Self->PDiskSpaceColorBorder);
}
+ for (bool value : settings.GetEnableGroupLayoutSanitizer()) {
+ Self->GroupLayoutSanitizer = value;
+ db.Table<T>().Key(true).Update<T::GroupLayoutSanitizer>(Self->GroupLayoutSanitizer);
+ auto ev = std::make_unique<TEvControllerUpdateSelfHealInfo>();
+ ev->GroupLayoutSanitizer = Self->GroupLayoutSanitizer;
+ Self->Send(Self->SelfHealId, ev.release());
+ }
return true;
}
diff --git a/ydb/core/mind/bscontroller/config_fit_groups.cpp b/ydb/core/mind/bscontroller/config_fit_groups.cpp
index c7b2f18384e..dd1513b3549 100644
--- a/ydb/core/mind/bscontroller/config_fit_groups.cpp
+++ b/ydb/core/mind/bscontroller/config_fit_groups.cpp
@@ -51,7 +51,7 @@ namespace NKikimr {
for (ui64 reserve = 0; reserve < min || (reserve - min) * 1000000 / Max<ui64>(1, total) < part; ++reserve, ++total) {
TGroupMapper::TGroupDefinition group;
try {
- AllocateGroup(0, group, nullptr, 0, {}, 0, false);
+ AllocateGroup(0, group, {}, {}, 0, false);
} catch (const TExFitGroupError&) {
throw TExError() << "group reserve constraint hit";
}
@@ -92,7 +92,7 @@ namespace NKikimr {
requiredSpace = ExpectedSlotSize.front();
ExpectedSlotSize.pop_front();
}
- AllocateGroup(groupId, group, nullptr, 0, {}, requiredSpace, false);
+ AllocateGroup(groupId, group, {}, {}, requiredSpace, false);
// scan all comprising PDisks for PDiskCategory
TMaybe<TPDiskCategory> desiredPDiskCategory;
@@ -171,6 +171,7 @@ namespace NKikimr {
// mapping for audit log
TMap<TVDiskIdShort, TVSlotId> replacedSlots;
TStackVec<std::pair<TVSlotId, bool>, 32> replaceQueue;
+ THashMap<TVDiskIdShort, TPDiskId> replacedDisks;
i64 requiredSpace = Min<i64>();
////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -214,6 +215,7 @@ namespace NKikimr {
g[vslot->RingIdx][vslot->FailDomainIdx][vslot->VDiskIdx] = targetPDiskId;
replacedSlots.emplace(TVDiskIdShort(vslot->RingIdx, vslot->FailDomainIdx, vslot->VDiskIdx), vslot->VSlotId);
replaceQueue.emplace_back(vslot->VSlotId, State.SuppressDonorMode.count(vslot->VSlotId));
+ replacedDisks.emplace(vslot->GetShortVDiskId(), vslot->VSlotId.ComprisingPDiskId());
} else {
preservedSlots.emplace(vslot->GetVDiskId(), vslot->VSlotId);
auto& m = vslot->Metrics;
@@ -240,10 +242,6 @@ namespace NKikimr {
}
}
if (hasMissingSlots || !IgnoreGroupSanityChecks) {
- TStackVec<TPDiskId, 32> replacedDiskIds;
- for (const auto& [vslotId, suppressDonorMode] : replaceQueue) {
- replacedDiskIds.push_back(vslotId.ComprisingPDiskId());
- }
TGroupMapper::TForbiddenPDisks forbid;
for (const auto& vslot : groupInfo->VDisksInGroup) {
for (const auto& [vslotId, vdiskId] : vslot->Donors) {
@@ -252,8 +250,7 @@ namespace NKikimr {
}
}
}
- AllocateGroup(groupId, group, replacedDiskIds.data(), replacedDiskIds.size(), std::move(forbid),
- requiredSpace, AllowUnusableDisks);
+ AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, AllowUnusableDisks);
if (!IgnoreVSlotQuotaCheck) {
adjustSpaceAvailable = true;
for (const auto& [pos, vslotId] : replacedSlots) {
@@ -358,9 +355,9 @@ namespace NKikimr {
}
private:
- void AllocateGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace,
- bool addExistingDisks) {
+ void AllocateGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group,
+ const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid,
+ i64 requiredSpace, bool addExistingDisks) {
if (!Mapper) {
Mapper.emplace(Geometry, StoragePool.RandomizeGroupMapping);
PopulateGroupMapper();
@@ -379,8 +376,7 @@ namespace NKikimr {
}
}
}
- Geometry.AllocateGroup(*Mapper, groupId, group, replacedDiskIds, numReplacedDisks, std::move(forbid),
- requiredSpace);
+ Geometry.AllocateGroup(*Mapper, groupId, group, replacedDisks, std::move(forbid), requiredSpace);
for (const TPDiskId pdiskId : removeQ) {
Mapper->UnregisterPDisk(pdiskId);
}
diff --git a/ydb/core/mind/bscontroller/group_geometry_info.h b/ydb/core/mind/bscontroller/group_geometry_info.h
index 10e5daedba0..5d37a0dfd2c 100644
--- a/ydb/core/mind/bscontroller/group_geometry_info.h
+++ b/ydb/core/mind/bscontroller/group_geometry_info.h
@@ -69,12 +69,11 @@ namespace NKikimr::NBsController {
ui32 GetDomainLevelEnd() const { return DomainLevelEnd; }
void AllocateGroup(TGroupMapper &mapper, TGroupId groupId, TGroupMapper::TGroupDefinition &group,
- const TPDiskId replacedDiskIds[], size_t numReplacedDisks, TGroupMapper::TForbiddenPDisks forbid,
+ const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid,
i64 requiredSpace) const {
TString error;
for (const bool requireOperational : {true, false}) {
- if (mapper.AllocateGroup(groupId, group, replacedDiskIds, numReplacedDisks, forbid,
- requiredSpace, requireOperational, error)) {
+ if (mapper.AllocateGroup(groupId, group, replacedDisks, forbid, requiredSpace, requireOperational, error)) {
return;
}
}
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp
new file mode 100644
index 00000000000..3e4418ce731
--- /dev/null
+++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp
@@ -0,0 +1,47 @@
+#include "group_layout_checker.h"
+#include "group_geometry_info.h"
+
+namespace NKikimr::NBsController {
+
+ TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout) {
+ using namespace NLayoutChecker;
+
+ if (layout.empty()) {
+ return {};
+ }
+
+ TGroupLayout group(geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm());
+ TDomainMapper mapper;
+ THashMap<TVDiskIdShort, TPDiskLayoutPosition> map;
+ for (const auto& [vdiskId, p] : layout) {
+ const auto& [location, pdiskId] = p;
+ TPDiskLayoutPosition pos(mapper, location, pdiskId, geom);
+ group.AddDisk(pos, vdiskId.FailRealm, vdiskId.FailDomain);
+ map.emplace(vdiskId, pos);
+ }
+
+ std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard;
+ for (const auto& [vdiskId, pos] : map) {
+ scoreboard.emplace_back(group.GetCandidateScore(pos, vdiskId.FailRealm, vdiskId.FailDomain), vdiskId);
+ }
+
+ auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; };
+ std::sort(scoreboard.begin(), scoreboard.end(), comp1);
+ for (const auto& [score, vdiskId] : scoreboard) {
+ Cerr << vdiskId << "@" << map[vdiskId].ToString() << " -> " << score.ToString() << Endl;
+ }
+
+ auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); };
+ std::sort(scoreboard.begin(), scoreboard.end(), comp);
+ TLayoutCheckResult res;
+ const auto reference = scoreboard.back().first;
+ if (!reference.SameAs({})) { // not perfectly correct layout
+ for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) {
+ Cerr << "candidate# " << scoreboard.back().second << Endl;
+ res.Candidates.push_back(scoreboard.back().second);
+ }
+ }
+ return res;
+ }
+
+} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h
new file mode 100644
index 00000000000..5a317f59ddb
--- /dev/null
+++ b/ydb/core/mind/bscontroller/group_layout_checker.h
@@ -0,0 +1,177 @@
+#pragma once
+
+#include "defs.h"
+#include "types.h"
+#include "group_geometry_info.h"
+
+namespace NKikimr::NBsController {
+
+ struct TLayoutCheckResult {
+ std::vector<TVDiskIdShort> Candidates;
+
+ explicit operator bool() const { // checks whether fail model is correct
+ return Candidates.empty();
+ }
+ };
+
+ TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout);
+
+ namespace NLayoutChecker {
+
+ class TDomainMapper {
+ std::unordered_map<TString, ui32> FailDomainId;
+
+ public:
+ ui32 operator ()(TString item) {
+ return FailDomainId.emplace(std::move(item), FailDomainId.size()).first->second;
+ }
+
+ ui32 GetIdCount() const {
+ return FailDomainId.size();
+ }
+ };
+
+ struct TPDiskLayoutPosition {
+ ui32 RealmGroup = 0;
+ ui32 Realm = 0;
+ ui32 Domain = 0;
+
+ TPDiskLayoutPosition() = default;
+
+ TPDiskLayoutPosition(ui32 realmGroup, ui32 realm, ui32 domain)
+ : RealmGroup(realmGroup)
+ , Realm(realm)
+ , Domain(domain)
+ {}
+
+ TPDiskLayoutPosition(TDomainMapper& mapper, const TNodeLocation& location, TPDiskId pdiskId, const TGroupGeometryInfo& geom) {
+ TStringStream realmGroup, realm, domain;
+ const std::pair<int, TStringStream*> levels[] = {
+ {geom.GetRealmLevelBegin(), &realmGroup},
+ {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelBegin()), &realm},
+ {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelEnd()), &domain}
+ };
+ auto addLevel = [&](int key, const TString& value) {
+ for (const auto& [reference, stream] : levels) {
+ if (key < reference) {
+ Save(stream, std::make_tuple(key, value));
+ }
+ }
+ };
+ for (const auto& [key, value] : location.GetItems()) {
+ addLevel(key, value);
+ }
+ addLevel(255, pdiskId.ToString()); // ephemeral level to distinguish between PDisks on the same node
+ RealmGroup = mapper(realmGroup.Str());
+ Realm = mapper(realm.Str());
+ Domain = mapper(domain.Str());
+ }
+
+ TString ToString() const {
+ return TStringBuilder() << "{" << RealmGroup << "." << Realm << "." << Domain << "}";
+ }
+
+ auto AsTuple() const {
+ return std::tie(RealmGroup, Realm, Domain);
+ }
+
+ friend bool operator ==(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) {
+ return x.AsTuple() == y.AsTuple();
+ }
+
+ friend bool operator <(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) {
+ return x.AsTuple() < y.AsTuple();
+ }
+ };
+
+ struct TScore {
+ ui32 RealmInterlace = 0;
+ ui32 DomainInterlace = 0;
+ ui32 RealmGroupScatter = 0;
+ ui32 RealmScatter = 0;
+ ui32 DomainScatter = 0;
+
+ auto AsTuple() const {
+ return std::make_tuple(RealmInterlace, DomainInterlace, RealmGroupScatter, RealmScatter, DomainScatter);
+ }
+
+ bool BetterThan(const TScore& other) const {
+ return AsTuple() < other.AsTuple();
+ }
+
+ bool SameAs(const TScore& other) const {
+ return AsTuple() == other.AsTuple();
+ }
+
+ static TScore Max() {
+ return {::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>()};
+ }
+
+ TString ToString() const {
+ return TStringBuilder() << "{RealmInterlace# " << RealmInterlace
+ << " DomainInterlace# " << DomainInterlace
+ << " RealmGroupScatter# " << RealmGroupScatter
+ << " RealmScatter# " << RealmScatter
+ << " DomainScatter# " << DomainScatter
+ << "}";
+ }
+ };
+
+ struct TGroupLayout {
+ const ui32 NumFailDomainsPerFailRealm;
+
+ ui32 NumDisks = 0;
+ THashMap<ui32, ui32> NumDisksPerRealmGroup;
+
+ TStackVec<ui32, 4> NumDisksInRealm;
+ TStackVec<THashMap<ui32, ui32>, 4> NumDisksPerRealm;
+ THashMap<ui32, ui32> NumDisksPerRealmTotal;
+
+ TStackVec<ui32, 32> NumDisksInDomain;
+ TStackVec<THashMap<ui32, ui32>, 32> NumDisksPerDomain;
+ THashMap<ui32, ui32> NumDisksPerDomainTotal;
+
+ TGroupLayout(ui32 numFailRealms, ui32 numFailDomainsPerFailRealm)
+ : NumFailDomainsPerFailRealm(numFailDomainsPerFailRealm)
+ , NumDisksInRealm(numFailRealms)
+ , NumDisksPerRealm(numFailRealms)
+ , NumDisksInDomain(numFailRealms * numFailDomainsPerFailRealm)
+ , NumDisksPerDomain(numFailRealms * numFailDomainsPerFailRealm)
+ {}
+
+ void UpdateDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx, ui32 value) {
+ domainIdx += realmIdx * NumFailDomainsPerFailRealm;
+ NumDisks += value;
+ NumDisksPerRealmGroup[pos.RealmGroup] += value;
+ NumDisksInRealm[realmIdx] += value;
+ NumDisksPerRealm[realmIdx][pos.Realm] += value;
+ NumDisksPerRealmTotal[pos.Realm] += value;
+ NumDisksInDomain[domainIdx] += value;
+ NumDisksPerDomain[domainIdx][pos.Domain] += value;
+ NumDisksPerDomainTotal[pos.Domain] += value;
+ }
+
+ void AddDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
+ UpdateDisk(pos, realmIdx, domainIdx, 1);
+ }
+
+ void RemoveDisk(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
+ UpdateDisk(pos, realmIdx, domainIdx, Max<ui32>());
+ }
+
+ TScore GetCandidateScore(const TPDiskLayoutPosition& pos, ui32 realmIdx, ui32 domainIdx) {
+ domainIdx += realmIdx * NumFailDomainsPerFailRealm;
+
+ return {
+ .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - NumDisksPerRealm[realmIdx][pos.Realm],
+ .DomainInterlace = NumDisksPerDomainTotal[pos.Domain] - NumDisksPerDomain[domainIdx][pos.Domain],
+ .RealmGroupScatter = NumDisks - NumDisksPerRealmGroup[pos.RealmGroup],
+ .RealmScatter = NumDisksInRealm[realmIdx] - NumDisksPerRealm[realmIdx][pos.Realm],
+ .DomainScatter = NumDisksInDomain[domainIdx] - NumDisksPerDomain[domainIdx][pos.Domain],
+ };
+ }
+ };
+
+ } // NLayoutChecker
+
+} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp
index 945487c7ee2..347a136a8a7 100644
--- a/ydb/core/mind/bscontroller/group_mapper.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper.cpp
@@ -1,86 +1,19 @@
#include "group_mapper.h"
#include "group_geometry_info.h"
+#include "group_layout_checker.h"
namespace NKikimr::NBsController {
class TGroupMapper::TImpl : TNonCopyable {
- class TDomainMapper {
- std::unordered_map<TString, ui32> FailDomainId;
-
- public:
- ui32 operator ()(TString item) {
- return FailDomainId.emplace(std::move(item), FailDomainId.size()).first->second;
- }
-
- ui32 GetIdCount() const {
- return FailDomainId.size();
- }
- };
-
- enum class EPositionItem {
- RealmGroup,
- Realm,
- Domain,
- None,
- };
-
- struct TPDiskLayoutPosition {
- ui32 RealmGroup = 0;
- ui32 Realm = 0;
- ui32 Domain = 0;
-
- TPDiskLayoutPosition() = default;
-
- TPDiskLayoutPosition(ui32 realmGroup, ui32 realm, ui32 domain)
- : RealmGroup(realmGroup)
- , Realm(realm)
- , Domain(domain)
- {}
-
- TPDiskLayoutPosition(TDomainMapper& mapper, const TNodeLocation& location, TPDiskId pdiskId, const TGroupGeometryInfo& geom) {
- TStringStream realmGroup, realm, domain;
- const std::pair<int, TStringStream*> levels[] = {
- {geom.GetRealmLevelBegin(), &realmGroup},
- {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelBegin()), &realm},
- {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelEnd()), &domain}
- };
- auto addLevel = [&](int key, const TString& value) {
- for (const auto& [reference, stream] : levels) {
- if (key < reference) {
- Save(stream, std::make_tuple(key, value));
- }
- }
- };
- for (const auto& [key, value] : location.GetItems()) {
- addLevel(key, value);
- }
- addLevel(255, pdiskId.ToString()); // ephemeral level to distinguish between PDisks on the same node
- RealmGroup = mapper(realmGroup.Str());
- Realm = mapper(realm.Str());
- Domain = mapper(domain.Str());
- }
-
- auto AsTuple() const {
- return std::tie(RealmGroup, Realm, Domain);
- }
-
- friend bool operator ==(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) {
- return x.AsTuple() == y.AsTuple();
- }
-
- friend bool operator !=(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) {
- return x.AsTuple() != y.AsTuple();
- }
-
- friend bool operator <(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) {
- return x.AsTuple() < y.AsTuple();
- }
- };
+ using TPDiskLayoutPosition = NLayoutChecker::TPDiskLayoutPosition;
struct TPDiskInfo : TPDiskRecord {
TPDiskLayoutPosition Position;
- bool Matching = false;
- ui32 NumDomainMatchingDisks = 0;
+ bool Matching;
+ ui32 NumDomainMatchingDisks;
+ ui32 SkipToNextRealmGroup;
+ ui32 SkipToNextRealm;
+ ui32 SkipToNextDomain;
TPDiskInfo(const TPDiskRecord& pdisk, TPDiskLayoutPosition position)
: TPDiskRecord(pdisk)
@@ -89,10 +22,6 @@ namespace NKikimr::NBsController {
std::sort(Groups.begin(), Groups.end());
}
- TString ToString() const {
- return Location.ToString();
- }
-
bool IsUsable() const {
return Usable && !Decommitted && NumSlots < MaxSlots;
}
@@ -127,123 +56,106 @@ namespace NKikimr::NBsController {
}
};
- struct TAllocateContext {
- struct TDomainBound {
- ui32 NumChildren = 0;
- };
-
- struct TRealmBound {
- ui32 NumChildren = 0;
- TStackVec<THashMap<ui32, TDomainBound>, 8> Items;
-
- TRealmBound(size_t numFailDomains)
- : Items(numFailDomains)
- {}
- };
-
- struct TRealmGroupBound {
- ui32 NumChildren = 0;
- TStackVec<THashMap<ui32, TRealmBound>, 4> Items;
-
- TRealmGroupBound(size_t numFailRealms)
- : Items(numFailRealms)
- {}
- };
+ using TGroup = std::vector<TPDiskInfo*>;
+ struct TAllocator {
+ TImpl& Self;
const ui32 NumFailRealms;
const ui32 NumFailDomainsPerFailRealm;
- THashMap<ui32, TRealmGroupBound> RealmGroup;
+ const ui32 NumFailDomainsTotal;
+ const ui32 NumVDisksPerFailDomain;
+ const ui32 GroupSize;
+ TStackVec<ui8, 32> RealmIdx;
+ TStackVec<ui8, 32> DomainIdx;
+ TStackVec<ui8, 32> DomainThroughIdx;
+ TStackVec<ui8, 32> VDiskIdx;
            THashSet<TPDiskId> OldGroupContent; // set of all existing disks in the group, including ones which are replaced
- THashSet<TPDiskId> NewGroupContent; // newly generated group content
const i64 RequiredSpace;
const bool RequireOperational;
- TForbiddenPDisks Forbid;
+ TForbiddenPDisks ForbiddenDisks;
+ THashMap<ui32, unsigned> LocalityFactor;
+ NLayoutChecker::TGroupLayout GroupLayout;
+ std::optional<NLayoutChecker::TScore> BestScore;
- TAllocateContext(const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
- TForbiddenPDisks forbid)
- : NumFailRealms(geom.GetNumFailRealms())
+ TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
+ TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks)
+ : Self(self)
+ , NumFailRealms(geom.GetNumFailRealms())
, NumFailDomainsPerFailRealm(geom.GetNumFailDomainsPerFailRealm())
+ , NumFailDomainsTotal(NumFailRealms * NumFailDomainsPerFailRealm)
+ , NumVDisksPerFailDomain(geom.GetNumVDisksPerFailDomain())
+ , GroupSize(NumFailDomainsTotal * NumVDisksPerFailDomain)
+ , RealmIdx(GroupSize)
+ , DomainIdx(GroupSize)
+ , DomainThroughIdx(GroupSize)
+ , VDiskIdx(GroupSize)
, RequiredSpace(requiredSpace)
, RequireOperational(requireOperational)
- , Forbid(std::move(forbid))
- {}
-
- bool ProcessExistingGroup(const TGroupDefinition& group, const TPDisks& pdisks, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TString& error) {
- OldGroupContent = {replacedDiskIds, replacedDiskIds + numReplacedDisks};
-
- for (ui32 failRealmIdx = 0; failRealmIdx < group.size(); ++failRealmIdx) {
- const auto& realm = group[failRealmIdx];
- for (ui32 failDomainIdx = 0; failDomainIdx < realm.size(); ++failDomainIdx) {
- const auto& domain = realm[failDomainIdx];
- for (const TPDiskId pdiskId : domain) {
+ , ForbiddenDisks(std::move(forbiddenDisks))
+ , GroupLayout(NumFailRealms, NumFailDomainsPerFailRealm)
+ {
+ for (const auto& [vdiskId, pdiskId] : replacedDisks) {
+ OldGroupContent.insert(pdiskId);
+ }
+ for (ui32 index = 0, domainThroughIdx = 0, realmIdx = 0; realmIdx < NumFailRealms; ++realmIdx) {
+ for (ui32 domainIdx = 0; domainIdx < NumFailDomainsPerFailRealm; ++domainIdx, ++domainThroughIdx) {
+ for (ui32 vdiskIdx = 0; vdiskIdx < NumVDisksPerFailDomain; ++vdiskIdx, ++index) {
+ RealmIdx[index] = realmIdx;
+ DomainIdx[index] = domainIdx;
+ DomainThroughIdx[index] = domainThroughIdx;
+ VDiskIdx[index] = vdiskIdx;
+ }
+ }
+ }
+ }
+
+ TGroup ProcessExistingGroup(const TGroupDefinition& group, TString& error) {
+ TGroup res(GroupSize);
+
+ ui32 index = 0;
+ for (const auto& realm : group) {
+ for (const auto& domain : realm) {
+ for (const auto& pdiskId : domain) {
if (pdiskId != TPDiskId()) {
- // add to used pdisk set
- const bool inserted = OldGroupContent.insert(pdiskId).second;
- Y_VERIFY(inserted);
-
- // find existing pdisk
- auto it = pdisks.find(pdiskId);
- if (it == pdisks.end()) {
- error = TStringBuilder() << "existing group contains missing PDisks";
- return false;
+ const auto it = Self.PDisks.find(pdiskId);
+ if (it == Self.PDisks.end()) {
+ error = TStringBuilder() << "existing group contains missing PDiskId# " << pdiskId;
+ return {};
}
- const TPDiskInfo& pdisk = it->second;
+ TPDiskInfo& pdisk = it->second;
+ res[index] = &pdisk;
- if (pdisk.Decommitted) {
- continue;
+ const auto [_, inserted] = OldGroupContent.insert(pdiskId);
+ if (!inserted) {
+ error = TStringBuilder() << "group contains duplicate PDiskId# " << pdiskId;
+ return {};
}
- if (!AddDisk(pdisk, failRealmIdx, failDomainIdx)) {
- error = "group contains duplicate PDisks";
- return false;
+ if (!pdisk.Decommitted) {
+ AddUsedDisk(pdisk);
+ GroupLayout.AddDisk(pdisk.Position, RealmIdx[index], DomainIdx[index]);
}
}
+
+ ++index;
}
}
}
- return true;
+ return res;
}
- void UndoAddDisk(const TPDiskInfo& pdisk, ui32 failRealmIdx, ui32 failDomainIdx) {
- const size_t num = NewGroupContent.erase(pdisk.PDiskId);
- Y_VERIFY(num);
- auto realmGroupIt = RealmGroup.find(pdisk.Position.RealmGroup);
- Y_VERIFY(realmGroupIt != RealmGroup.end());
- auto& realms = realmGroupIt->second.Items[failRealmIdx];
- auto realmIt = realms.find(pdisk.Position.Realm);
- Y_VERIFY(realmIt != realms.end());
- auto& domains = realmIt->second.Items[failDomainIdx];
- auto domainIt = domains.find(pdisk.Position.Domain);
- Y_VERIFY(domainIt != domains.end());
- if (!--domainIt->second.NumChildren) {
- domains.erase(domainIt);
- }
- if (!--realmIt->second.NumChildren) {
- realms.erase(realmIt);
- }
- if (!--realmGroupIt->second.NumChildren) {
- RealmGroup.erase(realmGroupIt);
+ void Decompose(const TGroup& in, TGroupDefinition& out) {
+ for (ui32 i = 0; i < GroupSize; ++i) {
+ out[RealmIdx[i]][DomainIdx[i]][VDiskIdx[i]] = in[i]->PDiskId;
}
}
- bool AddDisk(const TPDiskInfo& pdisk, ui32 failRealmIdx, ui32 failDomainIdx) {
- auto& realmGroup = RealmGroup.try_emplace(pdisk.Position.RealmGroup, NumFailRealms).first->second;
- auto& realm = realmGroup.Items[failRealmIdx].try_emplace(pdisk.Position.Realm, NumFailDomainsPerFailRealm).first->second;
- auto& domain = realm.Items[failDomainIdx].try_emplace(pdisk.Position.Domain).first->second;
- ++realmGroup.NumChildren;
- ++realm.NumChildren;
- ++domain.NumChildren;
- const auto& [_, inserted] = NewGroupContent.insert(pdisk.PDiskId);
- return inserted;
- }
-
bool DiskIsUsable(const TPDiskInfo& pdisk) const {
if (!pdisk.IsUsable()) {
return false; // disk is not usable in this case
}
- if (OldGroupContent.contains(pdisk.PDiskId) || NewGroupContent.contains(pdisk.PDiskId) || Forbid.contains(pdisk.PDiskId)) {
+ if (OldGroupContent.contains(pdisk.PDiskId) || ForbiddenDisks.contains(pdisk.PDiskId)) {
return false; // can't allow duplicate disks
}
if (RequireOperational && !pdisk.Operational) {
@@ -254,218 +166,313 @@ namespace NKikimr::NBsController {
}
return true;
}
- };
- class THelper {
- TImpl& Self;
- TAllocateContext& Ctx;
- std::unordered_map<ui32, unsigned> LocalityFactor;
- TDynBitMap ForbiddenEntities;
-
- public:
- THelper(TImpl& self, TAllocateContext& ctx)
- : Self(self)
- , Ctx(ctx)
- {
- ForbiddenEntities.Reserve(Self.DomainMapper.GetIdCount());
- for (const TPDiskId& pdiskId : Ctx.NewGroupContent) {
- try {
- const TPDiskInfo& pdisk = Self.PDisks.at(pdiskId);
- AddUsedDisk(pdisk);
- Forbid(pdisk);
- } catch (const std::out_of_range&) {
- Y_FAIL();
- }
- }
+ TPDiskByPosition SetupMatchingDisks(ui32 maxScore) {
+ TPDiskByPosition res;
+ res.reserve(Self.PDiskByPosition.size());
- ui32 numMatchingDisksInDomain = 0;
- ui32 numMatchingDomainsInRealm = 0;
- ui32 numMatchingRealmsInRealmGroup = 0;
+ ui32 realmGroupBegin = 0;
+ ui32 realmBegin = 0;
+ ui32 domainBegin = 0;
+ TPDiskLayoutPosition prev;
- const ui32 numFailRealms = Self.Geom.GetNumFailRealms();
- const ui32 numFailDomainsPerFailRealm = Self.Geom.GetNumFailDomainsPerFailRealm();
- const ui32 numVDisksPerFailDomain = Self.Geom.GetNumVDisksPerFailDomain();
-
- auto advance = [&](bool domainExhausted, bool realmExhausted, bool realmGroupExhausted, const TPDiskLayoutPosition& prev) {
- if (domainExhausted) {
- if (numMatchingDisksInDomain < numVDisksPerFailDomain) {
- ForbiddenEntities.Set(prev.Domain);
- } else {
- ++numMatchingDomainsInRealm;
+ std::vector<ui32> numMatchingDisksInDomain(Self.DomainMapper.GetIdCount(), 0);
+ for (const auto& [position, pdisk] : Self.PDiskByPosition) {
+ pdisk->Matching = pdisk->GetPickerScore() <= maxScore && DiskIsUsable(*pdisk);
+ if (pdisk->Matching) {
+ if (position.RealmGroup != prev.RealmGroup) {
+ for (; realmGroupBegin < res.size(); ++realmGroupBegin) {
+ res[realmGroupBegin].second->SkipToNextRealmGroup = res.size() - realmGroupBegin;
+ }
}
- numMatchingDisksInDomain = 0;
- }
- if (realmExhausted) {
- if (numMatchingDomainsInRealm < numFailDomainsPerFailRealm) {
- ForbiddenEntities.Set(prev.Realm);
- } else {
- ++numMatchingRealmsInRealmGroup;
+ if (position.Realm != prev.Realm) {
+ for (; realmBegin < res.size(); ++realmBegin) {
+ res[realmBegin].second->SkipToNextRealm = res.size() - realmBegin;
+ }
}
- numMatchingDomainsInRealm = 0;
- }
- if (realmGroupExhausted) {
- if (numMatchingRealmsInRealmGroup < numFailRealms) {
- ForbiddenEntities.Set(prev.RealmGroup);
+ if (position.Domain != prev.Domain) {
+ for (; domainBegin < res.size(); ++domainBegin) {
+ res[domainBegin].second->SkipToNextDomain = res.size() - domainBegin;
+ }
}
- numMatchingRealmsInRealmGroup = 0;
- }
- };
+ prev = position;
- if (const auto *begin = Self.PDiskByPosition.data(), *end = begin + Self.PDiskByPosition.size(); begin != end) {
- --end;
- while (begin != end) {
- numMatchingDisksInDomain += begin->second->Matching || Ctx.NewGroupContent.contains(begin->second->PDiskId);
- const auto& prev = begin++->first;
- const auto& cur = begin->first;
- advance(prev.Domain != cur.Domain, prev.Realm != cur.Realm, prev.RealmGroup != cur.RealmGroup, prev);
+ res.emplace_back(position, pdisk);
+ ++numMatchingDisksInDomain[position.Domain];
}
- numMatchingDisksInDomain += begin->second->Matching || Ctx.NewGroupContent.contains(begin->second->PDiskId);
- advance(true, true, true, begin->first);
}
+ for (; realmGroupBegin < res.size(); ++realmGroupBegin) {
+ res[realmGroupBegin].second->SkipToNextRealmGroup = res.size() - realmGroupBegin;
+ }
+ for (; realmBegin < res.size(); ++realmBegin) {
+ res[realmBegin].second->SkipToNextRealm = res.size() - realmBegin;
+ }
+ for (; domainBegin < res.size(); ++domainBegin) {
+ res[domainBegin].second->SkipToNextDomain = res.size() - domainBegin;
+ }
+ for (const auto& [position, pdisk] : res) {
+ pdisk->NumDomainMatchingDisks = numMatchingDisksInDomain[position.Domain];
+ }
+
+ return std::move(res);
}
- TPDiskId AddBestDisk(ui32 realmIdx, ui32 domainIdx) {
- TPDiskInfo *pdisk = nullptr;
- auto process = [this, &pdisk](TPDiskInfo *candidate) {
- if (!pdisk || DiskIsBetter(*candidate, *pdisk)) {
- pdisk = candidate;
- }
+ struct TUndoLog {
+ struct TItem {
+ ui32 Index;
+ TPDiskInfo *PDisk;
};
- FindMatchingDisksBounded(process, Self.PDiskByPosition.begin(), Self.PDiskByPosition.end(),
- Ctx.RealmGroup, {realmIdx, domainIdx});
- if (!pdisk) {
- return TPDiskId();
+
+ std::vector<TItem> Items;
+
+ void Log(ui32 index, TPDiskInfo *pdisk) {
+ Items.push_back({index, pdisk});
}
- const bool success = Ctx.AddDisk(*pdisk, realmIdx, domainIdx);
- Y_VERIFY(success);
- Forbid(*pdisk);
- pdisk->Matching = false; // disable this disk for further selection
+
+ size_t GetPosition() const {
+ return Items.size();
+ }
+ };
+
+ void AddDiskViaUndoLog(TUndoLog& undo, TGroup& group, ui32 index, TPDiskInfo *pdisk) {
+ undo.Log(index, pdisk);
+ group[index] = pdisk;
AddUsedDisk(*pdisk);
- return pdisk->PDiskId;
+ GroupLayout.AddDisk(pdisk->Position, RealmIdx[index], DomainIdx[index]);
+ BestScore.reset(); // invalidate score
}
- private:
- void Forbid(const TPDiskInfo& pdisk) {
- for (const ui32 id : {pdisk.Position.RealmGroup, pdisk.Position.Realm, pdisk.Position.Domain}) {
- ForbiddenEntities.Set(id);
+ void Revert(TUndoLog& undo, TGroup& group, size_t until) {
+ for (; undo.Items.size() > until; undo.Items.pop_back()) {
+ const auto& item = undo.Items.back();
+ group[item.Index] = nullptr;
+ RemoveUsedDisk(*item.PDisk);
+ GroupLayout.RemoveDisk(item.PDisk->Position, RealmIdx[item.Index], DomainIdx[item.Index]);
+ BestScore.reset(); // invalidate score
}
}
- template<typename T>
- struct TBoundTraits {};
+ bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group) {
+ // Determine PDisks that fit our requirements (including score).
+ auto set = SetupMatchingDisks(maxScore);
+
+ // Determine what we have to fill in -- full group, some realms, domains, or just some cells.
+ bool emptyGroup = true;
+
+ TDynBitMap emptyRealms;
+ emptyRealms.Set(0, NumFailRealms);
- template<>
- struct TBoundTraits<TAllocateContext::TRealmGroupBound> {
- static constexpr ui32 TPDiskLayoutPosition::*Ptr = &TPDiskLayoutPosition::RealmGroup;
- static constexpr EPositionItem ForbiddenCheckLevel = EPositionItem::RealmGroup;
- static constexpr size_t CoordIndex = 0;
- static constexpr bool Descend = true;
+ TDynBitMap emptyDomains;
+ emptyDomains.Set(0, NumFailDomainsTotal);
- static TPDiskLayoutPosition LowerBound(TPDiskLayoutPosition, ui32 id) {
- return {id, 0, 0};
+ TDynBitMap emptyDisks;
+ emptyDisks.Set(0, GroupSize);
+
+ for (ui32 i = 0; i < GroupSize; ++i) {
+ if (group[i]) {
+ emptyGroup = false;
+ emptyRealms[RealmIdx[i]] = false;
+ emptyDomains[DomainThroughIdx[i]] = false;
+ emptyDisks[i] = false;
+ }
}
- static TPDiskLayoutPosition UpperBoundFromLowerBound(TPDiskLayoutPosition lower) {
- return {lower.RealmGroup, Max<ui32>(), Max<ui32>()};
+ // Allocate new full group and exit if it is absolutely empty.
+ auto allocate = [&](auto what, ui32 index) {
+ TDiskRange fullRange(set.begin(), set.end());
+ TDynBitMap forbiddenEntities;
+ forbiddenEntities.Reserve(Self.DomainMapper.GetIdCount());
+ if (!AllocateWholeEntity(what, group, undo, index, fullRange, forbiddenEntities)) {
+ Revert(undo, group, 0);
+ return false;
+ }
+ return true;
+ };
+
+ if (emptyGroup) {
+ return allocate(TAllocateWholeGroup(), 0);
}
- static bool PrefixEquals(const TPDiskLayoutPosition& a, const TPDiskLayoutPosition& b) {
- return a.RealmGroup == b.RealmGroup;
+ // Fill in missing fail realms.
+ for (ui32 i = emptyRealms.FirstNonZeroBit(); i != emptyRealms.Size(); i = emptyRealms.NextNonZeroBit(i)) {
+ if (!allocate(TAllocateWholeRealm(), i)) {
+ return false;
+ }
+
+ // remove excessive domains and disk from the set
+ emptyDomains.Reset(i * NumFailDomainsPerFailRealm, (i + 1) * NumFailDomainsPerFailRealm);
+ emptyDisks.Reset(i * NumFailDomainsPerFailRealm * NumVDisksPerFailDomain,
+ (i + 1) * NumFailDomainsPerFailRealm * NumVDisksPerFailDomain);
}
- };
- template<>
- struct TBoundTraits<TAllocateContext::TRealmBound> {
- static constexpr ui32 TPDiskLayoutPosition::*Ptr = &TPDiskLayoutPosition::Realm;
- static constexpr EPositionItem ForbiddenCheckLevel = EPositionItem::Realm;
- static constexpr size_t CoordIndex = 1;
- static constexpr bool Descend = true;
+ // Fill in missing fail domains in some partially filled realms.
+ for (ui32 i = emptyDomains.FirstNonZeroBit(); i != emptyDomains.Size(); i = emptyDomains.NextNonZeroBit(i)) {
+ if (!allocate(TAllocateWholeDomain(), i)) {
+ return false;
+ }
- static TPDiskLayoutPosition LowerBound(TPDiskLayoutPosition prefix, ui32 id) {
- return {prefix.RealmGroup, id, 0};
+ // remove excessive disks
+ emptyDisks.Reset(i * NumVDisksPerFailDomain, (i + 1) * NumVDisksPerFailDomain);
}
- static TPDiskLayoutPosition UpperBoundFromLowerBound(TPDiskLayoutPosition lower) {
- return {lower.RealmGroup, lower.Realm, Max<ui32>()};
+ // Fill in missing disk cells.
+ for (ui32 i = emptyDisks.FirstNonZeroBit(); i != emptyDisks.Size(); i = emptyDisks.NextNonZeroBit(i)) {
+ if (!allocate(TAllocateDisk(), i)) {
+ return false;
+ }
}
- static bool PrefixEquals(const TPDiskLayoutPosition& a, const TPDiskLayoutPosition& b) {
- return a.RealmGroup == b.RealmGroup && a.Realm == b.Realm;
+ return true;
+ }
+
+ struct TAllocateDisk {};
+
+ struct TAllocateWholeDomain {
+ static constexpr auto EntityCount = &TAllocator::NumVDisksPerFailDomain;
+ static constexpr auto PositionItem = &TPDiskLayoutPosition::Domain;
+ using TNestedEntity = TAllocateDisk;
+
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+ return {x, x};
}
};
- template<>
- struct TBoundTraits<TAllocateContext::TDomainBound> {
- static constexpr ui32 TPDiskLayoutPosition::*Ptr = &TPDiskLayoutPosition::Domain;
- static constexpr EPositionItem ForbiddenCheckLevel = EPositionItem::Domain;
- static constexpr bool Descend = false;
+ struct TAllocateWholeRealm {
+ static constexpr auto EntityCount = &TAllocator::NumFailDomainsPerFailRealm;
+ static constexpr auto PositionItem = &TPDiskLayoutPosition::Realm;
+ using TNestedEntity = TAllocateWholeDomain;
- static TPDiskLayoutPosition LowerBound(TPDiskLayoutPosition prefix, ui32 id) {
- return {prefix.RealmGroup, prefix.Realm, id};
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+ return {{x.RealmGroup, x.Realm, 0}, {x.RealmGroup, x.Realm, Max<ui32>()}};
}
+ };
- static TPDiskLayoutPosition UpperBoundFromLowerBound(TPDiskLayoutPosition lower) {
- return lower;
- }
+ struct TAllocateWholeGroup {
+ static constexpr auto EntityCount = &TAllocator::NumFailRealms;
+ static constexpr auto PositionItem = &TPDiskLayoutPosition::RealmGroup;
+ using TNestedEntity = TAllocateWholeRealm;
- static bool PrefixEquals(const TPDiskLayoutPosition& a, const TPDiskLayoutPosition& b) {
- return a == b;
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x) {
+ return {{x.RealmGroup, 0, 0}, {x.RealmGroup, Max<ui32>(), Max<ui32>()}};
}
};
- template<typename TCallback, typename TBound, typename... TRest>
- void FindMatchingDisksBounded(TCallback&& cb, TPDiskByPosition::const_iterator begin,
- TPDiskByPosition::const_iterator end, const TBound& bound,
- std::tuple<ui32, ui32> posInGroup) {
- using Traits = TBoundTraits<typename TBound::mapped_type>;
- if (bound && begin != end) {
- ui32 max = 0;
- for (const auto& [_, item] : bound) {
- max = Max(item.NumChildren, max);
- }
- for (const auto& [id, item] : bound) {
- if (item.NumChildren != max) {
- continue;
- }
- const TPDiskLayoutPosition lower = Traits::LowerBound(begin->first, id);
- const auto childBegin = std::lower_bound(begin, end, lower, TComparePDiskByPosition());
- const auto childEnd = std::upper_bound(childBegin, end, Traits::UpperBoundFromLowerBound(lower),
- TComparePDiskByPosition());
- if constexpr (Traits::Descend) {
- const ui32 index = std::get<Traits::CoordIndex>(posInGroup);
- FindMatchingDisksBounded(cb, childBegin, childEnd, item.Items[index], posInGroup);
- } else {
- FindMatchingDisks<EPositionItem::None>(cb, childBegin, childEnd);
+ using TDiskRange = std::pair<TPDiskByPosition::const_iterator, TPDiskByPosition::const_iterator>;
+
+ template<typename T>
+ TPDiskLayoutPosition *AllocateWholeEntity(T, TGroup& group, TUndoLog& undo, ui32 parentEntityIndex,
+ TDiskRange range, TDynBitMap& forbiddenEntities) {
+ const TDiskRange originalRange(range);
+ const size_t undoPosition = undo.GetPosition();
+ TPDiskLayoutPosition *prefix = nullptr;
+ ui32 currentEntityId = Max<ui32>();
+ for (ui32 index = 0, num = this->*T::EntityCount; index < num; ) {
+ // allocate nested entity
+ prefix = AllocateWholeEntity(typename T::TNestedEntity(), group, undo,
+ parentEntityIndex * num + index, range, forbiddenEntities);
+ if (prefix) {
+ if (!index) {
+ currentEntityId = prefix->*T::PositionItem;
+ auto [min, max] = T::MakeRange(*prefix);
+ range.first = std::lower_bound(range.first, range.second, min, TComparePDiskByPosition());
+ range.second = std::upper_bound(range.first, range.second, max, TComparePDiskByPosition());
}
+ ++index;
+ } else if (index) {
+ // disable just checked entity (to prevent its selection again)
+ Y_VERIFY(currentEntityId != Max<ui32>());
+ forbiddenEntities.Set(currentEntityId);
+ // try another entity at this level
+ Revert(undo, group, undoPosition);
+ // revert original wide range and start from the beginning
+ range = originalRange;
+ index = 0;
+ currentEntityId = Max<ui32>();
+ } else {
+ // no chance to allocate new entity, exit
+ return nullptr;
}
- } else {
- FindMatchingDisks<Traits::ForbiddenCheckLevel>(cb, begin, end);
}
+ // disable filled entity from further selection
+ Y_VERIFY(prefix && currentEntityId != Max<ui32>());
+ forbiddenEntities.Set(currentEntityId);
+ return prefix;
}
- template<EPositionItem ForbiddenCheckLevel, typename TCallback>
- void FindMatchingDisks(TCallback&& cb, TPDiskByPosition::const_iterator begin, TPDiskByPosition::const_iterator end) {
- while (begin != end) {
- const auto& [position, pdisk] = *begin++;
- if constexpr (ForbiddenCheckLevel <= EPositionItem::RealmGroup) {
- if (ForbiddenEntities[position.RealmGroup]) {
- continue;
- }
+ TPDiskLayoutPosition *AllocateWholeEntity(TAllocateDisk, TGroup& group, TUndoLog& undo, ui32 index,
+ TDiskRange range, TDynBitMap& forbiddenEntities) {
+ TPDiskInfo *pdisk = nullptr;
+ auto process = [this, &pdisk](TPDiskInfo *candidate) {
+ if (!pdisk || DiskIsBetter(*candidate, *pdisk)) {
+ pdisk = candidate;
}
- if constexpr (ForbiddenCheckLevel <= EPositionItem::Realm) {
- if (ForbiddenEntities[position.Realm]) {
- continue;
+ };
+ FindMatchingDiskBasedOnScore(process, group, RealmIdx[index], DomainIdx[index],
+ range, forbiddenEntities);
+ if (pdisk) {
+ AddDiskViaUndoLog(undo, group, index, pdisk);
+ pdisk->Matching = false;
+ return &pdisk->Position;
+ } else {
+ return nullptr;
+ }
+ }
+
+ NLayoutChecker::TScore CalculateBestScoreWithCache(const TGroup& group) {
+ if (!BestScore) {
+ // find the worst disk from a position of layout correctness and use it as a milestone for other
+ // disks -- they can't be misplaced worse
+ NLayoutChecker::TScore bestScore;
+ for (ui32 i = 0; i < GroupSize; ++i) {
+ if (TPDiskInfo *pdisk = group[i]; pdisk && !pdisk->Decommitted) {
+ NLayoutChecker::TScore score = GroupLayout.GetCandidateScore(pdisk->Position, RealmIdx[i],
+ DomainIdx[i]);
+ if (bestScore.BetterThan(score)) {
+ bestScore = score;
+ }
}
}
- if constexpr (ForbiddenCheckLevel <= EPositionItem::Domain) {
- if (ForbiddenEntities[position.Domain]) {
- continue;
- }
+ BestScore = bestScore;
+ }
+ return *BestScore;
+ }
+
+ template<typename TCallback>
+ void FindMatchingDiskBasedOnScore(TCallback&& cb, const TGroup& group, ui32 failRealmIdx, ui32 failDomainIdx,
+ TDiskRange range, TDynBitMap& forbiddenEntities) {
+ NLayoutChecker::TScore bestScore = CalculateBestScoreWithCache(group);
+
+ std::vector<TPDiskInfo*> candidates;
+
+ while (range.first != range.second) {
+ const auto& [position, pdisk] = *range.first++;
+
+ if (!pdisk->Matching) {
+ continue;
+ } else if (forbiddenEntities[position.RealmGroup]) {
+ range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealmGroup - 1);
+ continue;
+ } else if (forbiddenEntities[position.Realm]) {
+ range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealm - 1);
+ continue;
+ } else if (forbiddenEntities[position.Domain]) {
+ range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextDomain - 1);
+ continue;
}
- if (pdisk->Matching) {
- cb(pdisk);
+
+ NLayoutChecker::TScore score = GroupLayout.GetCandidateScore(position, failRealmIdx, failDomainIdx);
+ if (score.BetterThan(bestScore)) {
+ candidates.clear();
+ candidates.push_back(pdisk);
+ bestScore = score;
+ } else if (score.SameAs(bestScore)) {
+ candidates.push_back(pdisk);
}
}
+
+ for (TPDiskInfo *pdisk : candidates) {
+ cb(pdisk);
+ }
}
bool DiskIsBetter(TPDiskInfo& pretender, TPDiskInfo& king) const {
@@ -497,6 +504,14 @@ namespace NKikimr::NBsController {
}
}
+ void RemoveUsedDisk(const TPDiskInfo& pdisk) {
+ for (ui32 groupId : pdisk.Groups) {
+ if (!--LocalityFactor[groupId]) {
+ LocalityFactor.erase(groupId);
+ }
+ }
+ }
+
unsigned GetLocalityFactor(const TPDiskInfo& pdisk) const {
unsigned res = 0;
for (ui32 groupId : pdisk.Groups) {
@@ -514,7 +529,7 @@ namespace NKikimr::NBsController {
private:
const TGroupGeometryInfo Geom;
const bool Randomize;
- TDomainMapper DomainMapper;
+ NLayoutChecker::TDomainMapper DomainMapper;
TPDisks PDisks;
TPDiskByPosition PDiskByPosition;
bool Dirty = false;
@@ -556,7 +571,7 @@ namespace NKikimr::NBsController {
it->second.SpaceAvailable += increment;
}
- TString FormatPDisks(const TAllocateContext& ctx) const {
+ TString FormatPDisks(const TAllocator& allocator) const {
TStringStream s;
s << "PDisks# ";
@@ -576,11 +591,11 @@ namespace NKikimr::NBsController {
s << std::exchange(space, " ") << pdisk->PDiskId;
- if (ctx.OldGroupContent.contains(pdisk->PDiskId)) {
+ if (allocator.OldGroupContent.contains(pdisk->PDiskId)) {
s << "*";
}
const char *minus = "-";
- if (ctx.Forbid.contains(pdisk->PDiskId)) {
+ if (allocator.ForbiddenDisks.contains(pdisk->PDiskId)) {
s << std::exchange(minus, "") << "f";
}
if (!pdisk->Usable) {
@@ -590,15 +605,15 @@ namespace NKikimr::NBsController {
s << std::exchange(minus, "") << "d";
}
if (pdisk->NumSlots >= pdisk->MaxSlots) {
- s << std::exchange(minus, "") << "s";
+ s << std::exchange(minus, "") << "s[" << pdisk->NumSlots << "/" << pdisk->MaxSlots << "]";
}
- if (pdisk->SpaceAvailable < ctx.RequiredSpace) {
+ if (pdisk->SpaceAvailable < allocator.RequiredSpace) {
s << std::exchange(minus, "") << "v";
}
if (!pdisk->Operational) {
s << std::exchange(minus, "") << "o";
}
- if (ctx.DiskIsUsable(*pdisk)) {
+ if (allocator.DiskIsUsable(*pdisk)) {
s << "+";
}
@@ -612,154 +627,86 @@ namespace NKikimr::NBsController {
return s.Str();
}
- bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational,
+ bool AllocateGroup(ui32 groupId, TGroupDefinition& groupDefinition, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks,
+ TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational,
TString& error) {
if (Dirty) {
std::sort(PDiskByPosition.begin(), PDiskByPosition.end());
Dirty = false;
}
- // fill in the allocation context
- TAllocateContext ctx(Geom, requiredSpace, requireOperational, std::move(forbid));
- if (!ctx.ProcessExistingGroup(group, PDisks, replacedDiskIds, numReplacedDisks, error)) {
- return false;
- }
-
// create group of required size, if it is not created yet
- if (!Geom.ResizeGroup(group)) {
+ if (!Geom.ResizeGroup(groupDefinition)) {
error = "incorrect existing group";
return false;
}
- // if the group is already created, check for missing entities
- bool hasMissingEntities = false;
- for (const auto& realm : group) {
- for (const auto& domain : realm) {
- for (const TPDiskId& pdiskId : domain) {
- if (pdiskId == TPDiskId()) {
- hasMissingEntities = true;
- goto out;
- }
- }
- }
- }
-out: if (!hasMissingEntities) {
- return true; // group is okay
+ // fill in the allocation context
+ TAllocator allocator(*this, Geom, requiredSpace, requireOperational, std::move(forbid), replacedDisks);
+ TGroup group = allocator.ProcessExistingGroup(groupDefinition, error);
+ if (group.empty()) {
+ return false;
}
-
- // adjust number of slots
- for (TPDiskId pdiskId : ctx.OldGroupContent) {
- Y_VERIFY_DEBUG(PDisks.contains(pdiskId));
- --PDisks.at(pdiskId).NumSlots;
+ bool ok = true;
+ for (TPDiskInfo *pdisk : group) {
+ if (!pdisk) {
+ ok = false;
+ break;
+ }
}
- for (size_t i = 0; i < numReplacedDisks; ++i) {
- Y_VERIFY_DEBUG(PDisks.contains(replacedDiskIds[i]));
- PDisks.at(replacedDiskIds[i]).EraseGroup(groupId);
+ if (ok) {
+ return true;
}
- ui32 minScore = Max<ui32>();
- ui32 maxScore = Min<ui32>();
+ // calculate score table
+ std::vector<ui32> scores;
for (const auto& [pdiskId, pdisk] : PDisks) {
- const ui32 score = pdisk.GetPickerScore();
- minScore = Min(minScore, score);
- maxScore = Max(maxScore, score + 1);
- }
-
- std::optional<TGroupDefinition> outGroup;
-
- auto tryIteration = [&](ui32 score) {
- ui32 numDomainMatchingDisks = 0;
- auto domainBegin = PDiskByPosition.begin();
- for (auto it = domainBegin; it != PDiskByPosition.end(); ++it) {
- auto& [position, pdisk] = *it;
- if (position != domainBegin->first) {
- for (; domainBegin != it; ++domainBegin) {
- domainBegin->second->NumDomainMatchingDisks = numDomainMatchingDisks;
- }
- numDomainMatchingDisks = 0;
- }
- pdisk->Matching = ctx.DiskIsUsable(*pdisk) && pdisk->GetPickerScore() <= score;
- numDomainMatchingDisks += pdisk->Matching;
- }
- for (; domainBegin != PDiskByPosition.end(); ++domainBegin) {
- domainBegin->second->NumDomainMatchingDisks = numDomainMatchingDisks;
- }
-
- TStackVec<std::tuple<ui32, ui32, ui32>, 32> undoLog;
- THelper helper(*this, ctx);
-
- auto revert = [&]() {
- for (const auto& item : undoLog) {
- ui32 realmIdx, domainIdx, vdiskIdx;
- std::tie(realmIdx, domainIdx, vdiskIdx) = item; // thanks to Microsoft
- auto& pdiskId = group[realmIdx][domainIdx][vdiskIdx];
- const auto it = PDisks.find(pdiskId);
- Y_VERIFY(it != PDisks.end());
- ctx.UndoAddDisk(it->second, realmIdx, domainIdx);
- pdiskId = TPDiskId();
- }
- };
-
- for (ui32 realmIdx = 0; realmIdx < ctx.NumFailRealms; ++realmIdx) {
- for (ui32 domainIdx = 0; domainIdx < ctx.NumFailDomainsPerFailRealm; ++domainIdx) {
- auto& domain = group[realmIdx][domainIdx];
- for (ui32 vdiskIdx = 0; vdiskIdx < domain.size(); ++vdiskIdx) {
- if (auto& pdiskId = domain[vdiskIdx]; pdiskId == TPDiskId()) {
- pdiskId = helper.AddBestDisk(realmIdx, domainIdx);
- if (pdiskId == TPDiskId()) {
- revert();
- return false;
- } else {
- undoLog.emplace_back(realmIdx, domainIdx, vdiskIdx);
- }
- }
- }
- }
- }
-
- outGroup = group;
- revert();
- return true;
- };
-
- while (minScore < maxScore) {
- const ui32 score = minScore + (maxScore - minScore) / 2;
- if (tryIteration(score)) {
- maxScore = score;
+ if (allocator.DiskIsUsable(pdisk)) {
+ scores.push_back(pdisk.GetPickerScore());
+ }
+ }
+ std::sort(scores.begin(), scores.end());
+ scores.erase(std::unique(scores.begin(), scores.end()), scores.end());
+
+ // bisect scores to find optimal working one
+ std::optional<TGroup> result;
+ ui32 begin = 0, end = scores.size();
+ while (begin < end) {
+ const ui32 mid = begin + (end - begin) / 2;
+ TAllocator::TUndoLog undo;
+ if (allocator.FillInGroup(scores[mid], undo, group)) {
+ result = group;
+ allocator.Revert(undo, group, 0);
+ end = mid;
} else {
- minScore = score + 1;
+ begin = mid + 1;
}
}
- if (outGroup) {
- group = *outGroup;
- for (const auto& realm : group) {
- for (const auto& domain : realm) {
- for (const auto& pdiskId : domain) {
- if (const auto it = PDisks.find(pdiskId); it != PDisks.end()) {
- ++it->second.NumSlots;
- it->second.InsertGroup(groupId);
- } else {
- Y_FAIL();
- }
- }
+ if (result) {
+ for (const auto& [vdiskId, pdiskId] : replacedDisks) {
+ const auto it = PDisks.find(pdiskId);
+ Y_VERIFY(it != PDisks.end());
+ TPDiskInfo& pdisk = it->second;
+ --pdisk.NumSlots;
+ pdisk.EraseGroup(groupId);
+ }
+ ui32 numZero = 0;
+ for (ui32 i = 0; i < allocator.GroupSize; ++i) {
+ if (!group[i]) {
+ ++numZero;
+ TPDiskInfo *pdisk = result->at(i);
+ ++pdisk->NumSlots;
+ pdisk->InsertGroup(groupId);
}
}
+ Y_VERIFY(numZero == allocator.GroupSize || numZero == replacedDisks.size());
+ allocator.Decompose(*result, groupDefinition);
return true;
+ } else {
+ error = "no group options " + FormatPDisks(allocator);
+ return false;
}
-
- // undo changes to the mapper content
- for (TPDiskId pdiskId : ctx.OldGroupContent) {
- Y_VERIFY_DEBUG(PDisks.contains(pdiskId));
- ++PDisks.at(pdiskId).NumSlots;
- }
- for (size_t i = 0; i < numReplacedDisks; ++i) {
- Y_VERIFY_DEBUG(PDisks.contains(replacedDiskIds[i]));
- PDisks.at(replacedDiskIds[i]).InsertGroup(groupId);
- }
- error = "no group options " + FormatPDisks(ctx);
- return false;
}
};
@@ -781,10 +728,9 @@ out: if (!hasMissingEntities) {
return Impl->AdjustSpaceAvailable(pdiskId, increment);
}
- bool TGroupMapper::AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) {
- return Impl->AllocateGroup(groupId, group, replacedDiskIds, numReplacedDisks, std::move(forbid),
- requiredSpace, requireOperational, error);
+ bool TGroupMapper::AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks,
+ TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) {
+ return Impl->AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, requireOperational, error);
}
} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/group_mapper.h b/ydb/core/mind/bscontroller/group_mapper.h
index 46dbb11c8da..a58e49ab0d1 100644
--- a/ydb/core/mind/bscontroller/group_mapper.h
+++ b/ydb/core/mind/bscontroller/group_mapper.h
@@ -63,9 +63,8 @@ namespace NKikimr {
// failRealmBeginDxLevel, failRealmEndDxLevel, and then by finding possible options to meet requirements
// (1) and (2). That is, prefix gives us unique domains in which we can find realms to operate, while
// prefix+infix part gives us distinct fail realms we can use while generating groups.
- bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational,
- TString& error);
+ bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks,
+ TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error);
};
} // NBsController
diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
index d19bd44cc27..839d75e27e0 100644
--- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
@@ -2,6 +2,7 @@
#include "group_geometry_info.h"
#include "group_mapper.h"
+#include "group_layout_checker.h"
#include "ut_helpers.h"
using namespace NKikimr;
@@ -136,16 +137,43 @@ public:
}
}
+ ui32 GetDataCenter(TPDiskId pdiskId) const {
+ const auto it = PDisks.find(pdiskId);
+ UNIT_ASSERT(it != PDisks.end());
+ return it->second.DataCenterId;
+ }
+
+ TNodeLocation GetLocation(TPDiskId pdiskId) const {
+ const auto it = PDisks.find(pdiskId);
+ UNIT_ASSERT(it != PDisks.end());
+ return it->second.GetLocation();
+ }
+
+ std::vector<std::tuple<ui32, ui32, ui32, ui32>> ExportLayout() const {
+ std::vector<std::tuple<ui32, ui32, ui32, ui32>> res;
+ for (const auto& [pdiskId, pdisk] : PDisks) {
+ res.emplace_back(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId, pdisk.BodyId);
+ }
+ return res;
+ }
+
+ void ImportLayout(const std::vector<std::tuple<ui32, ui32, ui32, ui32>>& v) {
+ size_t index = 0;
+ for (auto& [pdiskId, pdisk] : PDisks) {
+ UNIT_ASSERT(index != v.size());
+ std::tie(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId, pdisk.BodyId) = v[index];
+ ++index;
+ }
+ UNIT_ASSERT(index == v.size());
+ }
+
ui32 AllocateGroup(TGroupMapper& mapper, TGroupMapper::TGroupDefinition& group, bool allowFailure = false) {
ui32 groupId = NextGroupId++;
TString error;
- bool success = mapper.AllocateGroup(groupId, group, nullptr, 0, {}, 0, false, error);
+ bool success = mapper.AllocateGroup(groupId, group, {}, {}, 0, false, error);
if (!success && allowFailure) {
return 0;
}
- if (!success) {
- Ctest << "error# " << error << Endl;
- }
UNIT_ASSERT_C(success, error);
TGroupRecord& record = Groups[groupId];
record.Group = group;
@@ -161,7 +189,7 @@ public:
}
TGroupMapper::TGroupDefinition ReallocateGroup(TGroupMapper& mapper, ui32 groupId, const TSet<TPDiskId>& unusableDisks,
- bool makeThemForbidden = false, bool requireOperational = false, bool requireError = false) {
+ bool makeThemForbidden = false, bool requireOperational = false, bool allowError = false) {
TGroupRecord& group = Groups.at(groupId);
TGroupMapper::TForbiddenPDisks forbid(unusableDisks.begin(), unusableDisks.end());
@@ -170,13 +198,14 @@ public:
}
// remove unusable disks from the set
- std::vector<TPDiskId> replaced;
- for (auto& realm : group.Group) {
- for (auto& domain : realm) {
- for (auto& pdisk : domain) {
+ THashMap<TVDiskIdShort, TPDiskId> replacedDisks;
+ for (ui32 i = 0; i < group.Group.size(); ++i) {
+ for (ui32 j = 0; j < group.Group[i].size(); ++j) {
+ for (ui32 k = 0; k < group.Group[i][j].size(); ++k) {
+ auto& pdisk = group.Group[i][j][k];
--PDisks.at(pdisk).NumSlots;
if (unusableDisks.count(pdisk)) {
- replaced.push_back(std::exchange(pdisk, {}));
+ replacedDisks.emplace(TVDiskIdShort(i, j, k), std::exchange(pdisk, {}));
}
}
}
@@ -185,15 +214,24 @@ public:
Ctest << "groupId# " << groupId << " reallocating group# " << FormatGroup(group.Group) << Endl;
TString error;
- bool success = mapper.AllocateGroup(groupId, group.Group, replaced.data(), replaced.size(), std::move(forbid),
- 0, requireOperational, error);
+ bool success = mapper.AllocateGroup(groupId, group.Group, replacedDisks, std::move(forbid), 0,
+ requireOperational, error);
if (!success) {
- if (requireError) {
+ Ctest << "error# " << error << Endl;
+ if (allowError) {
+ // revert group to its original state
+ for (const auto& [vdiskId, pdiskId] : replacedDisks) {
+ group.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk] = pdiskId;
+ }
+ for (auto& realm : group.Group) {
+ for (auto& domain : realm) {
+ for (auto& pdisk : domain) {
+ ++PDisks.at(pdisk).NumSlots;
+ }
+ }
+ }
return {};
}
- Ctest << "error# " << error << Endl;
- } else {
- UNIT_ASSERT(!requireError);
}
UNIT_ASSERT(success);
@@ -210,6 +248,23 @@ public:
return group.Group;
}
+ void SetGroup(ui32 groupId, const TGroupMapper::TGroupDefinition& group) {
+ auto& g = Groups[groupId];
+ for (const TPDiskId& pdiskId : g.PDisks) {
+ --PDisks.at(pdiskId).NumSlots;
+ }
+ g.Group = group;
+ g.PDisks.clear();
+ for (const auto& realm : g.Group) {
+ for (const auto& domain : realm) {
+ for (const auto& pdisk : domain) {
+ g.PDisks.push_back(pdisk);
+ ++PDisks.at(pdisk).NumSlots;
+ }
+ }
+ }
+ }
+
TString FormatGroup(const TGroupMapper::TGroupDefinition& group) {
TStringStream str;
str << "[";
@@ -234,23 +289,27 @@ public:
return str.Str();
}
- void CheckGroupErasure(const TGroupMapper::TGroupDefinition& group) {
+ void CheckGroupErasure(const TGroupMapper::TGroupDefinition& group, ui32 decommittedDataCenter = 0) {
TSet<ui32> dataCenters;
for (const auto& realm : group) {
TMaybe<ui32> dataCenter;
- TSet<std::tuple<ui32, ui32>> domains;
+ TSet<std::tuple<ui32, ui32, ui32>> domains;
for (const auto& domain : realm) {
- TMaybe<std::tuple<ui32, ui32>> currentDom;
+ TMaybe<std::tuple<ui32, ui32, ui32>> currentDom;
for (const auto& pdisk : domain) {
const TPDiskRecord& record = PDisks.at(pdisk);
- if (dataCenter) {
- UNIT_ASSERT_VALUES_EQUAL(*dataCenter, record.DataCenterId);
- } else {
- dataCenter = record.DataCenterId;
- const bool inserted = dataCenters.insert(*dataCenter).second;
- UNIT_ASSERT(inserted);
+ if (record.DataCenterId != decommittedDataCenter) { // ignore entries from decommitted data center
+ if (dataCenter) {
+ if (*dataCenter != decommittedDataCenter && record.DataCenterId != decommittedDataCenter) {
+ UNIT_ASSERT_VALUES_EQUAL(*dataCenter, record.DataCenterId);
+ }
+ } else {
+ dataCenter = record.DataCenterId;
+ const bool inserted = dataCenters.insert(*dataCenter).second;
+ UNIT_ASSERT(inserted);
+ }
}
- std::tuple<ui32, ui32> dom = {record.RoomId, record.RackId};
+ auto dom = std::make_tuple(record.DataCenterId, record.RoomId, record.RackId);
if (currentDom) {
// check that all disks from the same domain reside in the same domain :)
UNIT_ASSERT_EQUAL(dom, *currentDom);
@@ -297,7 +356,7 @@ public:
}
void PopulateGroupMapper(TGroupMapper& mapper, ui32 maxSlots = 16, TSet<TPDiskId> unusableDisks = {},
- TSet<TPDiskId> nonoperationalDisks = {}) {
+ TSet<TPDiskId> nonoperationalDisks = {}, std::optional<ui32> decommittedDataCenter = std::nullopt) {
std::map<TPDiskId, std::vector<ui32>> groupDisks;
for (const auto& [groupId, group] : Groups) {
for (TPDiskId pdiskId : group.PDisks) {
@@ -314,11 +373,82 @@ public:
.MaxSlots = maxSlots,
.Groups{g.begin(), g.end()},
.SpaceAvailable = 0,
- .Operational = static_cast<bool>(nonoperationalDisks.count(pair.first)),
- .Decommitted = false,
+ .Operational = !nonoperationalDisks.contains(pair.first),
+ .Decommitted = decommittedDataCenter == pair.second.DataCenterId,
});
}
}
+
+ void DumpGroup(const TGroupMapper::TGroupDefinition& group) {
+ std::set<std::tuple<ui32, ui32, ui32>> locations;
+ for (const auto& [pdiskId, pdisk] : PDisks) {
+ locations.emplace(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId);
+ }
+
+ std::unordered_map<ui32, ui32> dataCenterToColumn;
+ std::unordered_map<ui32, std::unordered_map<std::tuple<ui32, ui32>, ui32>> rackToColumn;
+ for (const auto& x : locations) {
+ const ui32 dataCenterId = std::get<0>(x);
+ const ui32 roomId = std::get<1>(x);
+ const ui32 rackId = std::get<2>(x);
+ dataCenterToColumn.try_emplace(dataCenterId, dataCenterToColumn.size());
+ auto& rtc = rackToColumn[dataCenterId];
+ rtc.try_emplace(std::make_tuple(roomId, rackId), rtc.size());
+ }
+
+ std::vector<std::vector<TString>> cells(dataCenterToColumn.size());
+ for (const auto& [dataCenterId, racks] : rackToColumn) {
+ cells[dataCenterToColumn[dataCenterId]].resize(racks.size());
+ }
+
+ ui32 maxCellWidth = 0;
+ for (ui32 failRealmIdx = 0; failRealmIdx < group.size(); ++failRealmIdx) {
+ for (ui32 failDomainIdx = 0; failDomainIdx < group[failRealmIdx].size(); ++failDomainIdx) {
+ for (const TPDiskId& pdiskId : group[failRealmIdx][failDomainIdx]) {
+ if (pdiskId != TPDiskId()) {
+ const auto it = PDisks.find(pdiskId);
+ UNIT_ASSERT(it != PDisks.end());
+ const TPDiskRecord& pdisk = it->second;
+ auto& cell = cells[dataCenterToColumn[pdisk.DataCenterId]]
+ [rackToColumn[pdisk.DataCenterId][{pdisk.RoomId, pdisk.RackId}]];
+ if (cell) {
+ cell += ", ";
+ }
+ cell += TStringBuilder() << failRealmIdx << "/" << failDomainIdx;
+ maxCellWidth = Max<ui32>(maxCellWidth, cell.size());
+ }
+ }
+ }
+ }
+
+ if (!maxCellWidth) {
+ ++maxCellWidth;
+ }
+
+ for (ui32 row = 0;; ++row) {
+ bool done = true;
+ TStringBuilder s;
+ for (ui32 column = 0; column < cells.size(); ++column) {
+ if (row >= cells[column].size()) {
+ s << TString(maxCellWidth, ' ');
+ } else if (const auto& cell = cells[column][row]) {
+ s << cell << TString(maxCellWidth - cell.size(), ' ');
+ done = false;
+ } else {
+ s << TString(maxCellWidth, 'X');
+ done = false;
+ }
+ if (column != cells.size() - 1) {
+ s << ' ';
+ }
+ }
+ if (done) {
+ break;
+ } else {
+ Ctest << s << Endl;
+ }
+ }
+ }
};
Y_UNIT_TEST_SUITE(TGroupMapperTest) {
@@ -591,12 +721,15 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
nonoperationalDisks.insert(pdiskId);
});
context.PopulateGroupMapper(mapper, 10, unusableDisks, nonoperationalDisks);
+ ui32 hasEmpty = false;
for (ui32 groupId : groupIds) {
- auto group = context.ReallocateGroup(mapper, groupId, unusableDisks, true, true);
- group = context.ReallocateGroup(mapper, groupId, unusableDisks);
+ auto tmp = context.ReallocateGroup(mapper, groupId, unusableDisks, false, true, true);
+ hasEmpty |= tmp.empty();
+ auto group = context.ReallocateGroup(mapper, groupId, unusableDisks);
Ctest << "groupId# " << groupId << " new content# " << context.FormatGroup(group) << Endl;
context.CheckGroupErasure(group);
}
+ UNIT_ASSERT(hasEmpty);
}
}
@@ -686,155 +819,176 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
Y_UNIT_TEST(ReassignGroupTest3dc) {
for (ui32 i = 0; i < 10000; ++i) {
Ctest << "iteration# " << i << Endl;
- std::map<ui32, std::pair<ui32, ui32>> nodeToLocation;
- auto populate = [&](ui32 decommittedDatacenter, const std::set<std::pair<ui32, ui32>>& unusableDisks,
- TGroupMapper::TGroupDefinition group) {
- auto mapper = std::make_unique<TGroupMapper>(TTestContext::CreateGroupGeometry(
- TBlobStorageGroupType::ErasureMirror3dc));
- std::map<TPDiskId, ui32> slots;
- for (const auto& realm : group) {
- for (const auto& domain : realm) {
- for (const auto& pdisk : domain) {
- if (pdisk == TPDiskId()) {
- ++slots[pdisk];
- }
- }
- }
- }
- for (ui32 datacenter = 1, nodeId = 1; datacenter <= 4; ++datacenter) {
- for (ui32 rack = 1; rack <= 4; ++rack, ++nodeId) {
- NActorsInterconnect::TNodeLocation proto;
- proto.SetDataCenter(ToString(datacenter));
- proto.SetModule("");
- proto.SetRack(ToString(rack));
- proto.SetUnit("");
- TPDiskId pdiskId(nodeId, 1);
- mapper->RegisterPDisk({
- .PDiskId = pdiskId,
- .Location{proto},
- .Usable = datacenter != decommittedDatacenter && !unusableDisks.contains({datacenter, rack}),
- .NumSlots = slots[pdiskId],
- .MaxSlots = 1,
- .Groups{slots[pdiskId], 0},
- .SpaceAvailable = 0,
- .Operational = true,
- .Decommitted = datacenter == decommittedDatacenter,
- });
- nodeToLocation[nodeId] = {datacenter, rack};
- }
- }
- return mapper;
- };
-
- auto dumpGroup = [&](const auto& group) {
- std::map<std::pair<ui32, ui32>, TString> cells;
-
- for (ui32 i = 0; i < group.size(); ++i) {
- for (ui32 j = 0; j < group[i].size(); ++j) {
- const auto& [datacenter, rack] = nodeToLocation[group[i][j][0].NodeId];
- cells[{datacenter, rack}] = TStringBuilder() << i << "/" << j;
- }
- }
- for (ui32 rack = 1; rack <= 4; ++rack) {
- for (ui32 datacenter = 1; datacenter <= 4; ++datacenter) {
- TString cell = cells[{datacenter, rack}];
- if (!cell) {
- cell = "xxx";
- }
- Ctest << cell << " ";
- }
- Ctest << Endl;
- }
- };
+ const ui32 numDataCenters = 5;
+ const ui32 numRacks = 5;
+ TTestContext context(numDataCenters, 1, numRacks, 1, 1);
TGroupMapper::TGroupDefinition group;
+ ui32 groupId;
{
- auto mapper = populate(0, {}, group);
- TString error;
- bool success = mapper->AllocateGroup(0, group, nullptr, 0, {}, 0, false, error);
- UNIT_ASSERT_C(success, error);
- Ctest << "After allocation" << Endl;
- dumpGroup(group);
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1);
+ groupId = context.AllocateGroup(mapper, group);
+ Ctest << "group after allocation:" << Endl;
+ context.DumpGroup(group);
}
- ui32 decommittedDatacenter = RandomNumber<ui32>(5);
- Ctest << "decommittedDatacenter# " << decommittedDatacenter << Endl;
+ ui32 decommittedDataCenter = RandomNumber<ui32>(numDataCenters + 1);
+ Ctest << "decommittedDataCenter# " << decommittedDataCenter << Endl;
{
+                // randomly mark some of the disks in the decommitted data center as unusable, forcing them to move
+ TSet<TPDiskId> unusableDisks;
for (auto& realm : group) {
for (auto& domain : realm) {
for (auto& pdisk : domain) {
- if (nodeToLocation[pdisk.NodeId].first == decommittedDatacenter && RandomNumber(2u)) {
- pdisk = {}; // reassign disk
+ if (context.GetDataCenter(pdisk) == decommittedDataCenter && RandomNumber(2u)) {
+ unusableDisks.insert(pdisk);
}
}
}
}
- auto mapper = populate(decommittedDatacenter, {}, group);
- TString error;
- bool success = mapper->AllocateGroup(0, group, nullptr, 0, {}, 0, false, error);
- UNIT_ASSERT_C(success, error);
- Ctest << "After decomission" << Endl;
- dumpGroup(group);
- }
-
- std::set<std::pair<ui32, ui32>> unusableDisks;
- ui32 unusableDatacenter = RandomNumber<ui32>(5);
- if (unusableDatacenter) {
- for (ui32 rack = 1; rack <= 4; ++rack) {
- unusableDisks.emplace(unusableDatacenter, rack);
- }
+
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1, {}, {}, decommittedDataCenter);
+ group = context.ReallocateGroup(mapper, groupId, unusableDisks);
+ Ctest << "group after data center decommission:" << Endl;
+ context.DumpGroup(group);
+ }
+
+ TSet<TPDiskId> unusableDisks;
+ ui32 unusableDataCenter = RandomNumber<ui32>(numDataCenters + 1);
+ Ctest << "unusableDataCenter# " << unusableDataCenter << Endl;
+ if (unusableDataCenter) {
+ context.IteratePDisks([&](const auto& pdiskId, const auto& record) {
+ if (record.DataCenterId == unusableDataCenter) {
+ unusableDisks.insert(pdiskId);
+ }
+ });
}
for (ui32 i = 0; i < 2; ++i) {
- unusableDatacenter = RandomNumber<ui32>(5);
- if (unusableDatacenter) {
- ui32 unusableRack = 1 + RandomNumber<ui32>(4);
- unusableDisks.emplace(unusableDatacenter, unusableRack);
+ if (const ui32 unusableDataCenter = RandomNumber<ui32>(numDataCenters + 1)) {
+ const ui32 unusableRack = 1 + RandomNumber<ui32>(numRacks);
+ context.IteratePDisks([&](const auto& pdiskId, const auto& record) {
+ if (record.DataCenterId == unusableDataCenter && record.RackId == unusableRack) {
+ unusableDisks.insert(pdiskId);
+ }
+ });
}
}
{
- for (auto& realm : group) {
- for (auto& domain : realm) {
- for (auto& pdisk : domain) {
- if (unusableDisks.contains(nodeToLocation[pdisk.NodeId])) {
- pdisk = {}; // reassign disk
- }
- }
- }
- }
- auto mapper = populate(decommittedDatacenter, {}, group);
- TString error;
- bool success = mapper->AllocateGroup(0, group, nullptr, 0, {}, 0, false, error);
- UNIT_ASSERT_C(success, error);
- Ctest << "After remapping" << Endl;
- dumpGroup(group);
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1);
+ auto group = context.ReallocateGroup(mapper, groupId, unusableDisks);
+ Ctest << "group after reallocation:" << Endl;
+ context.DumpGroup(group);
+ context.CheckGroupErasure(group, decommittedDataCenter);
}
- for (ui32 i = 0; i < group.size(); ++i) {
- ui32 datacenterForRealm = 0;
+ Ctest << Endl;
+ }
+ }
+
+ Y_UNIT_TEST(SanitizeGroupTest3dc) {
+ const ui32 numDataCenters = 3;
+ const ui32 numRacks = 5;
+ TTestContext context(numDataCenters, 1, numRacks, 1, 1);
+ TGroupMapper::TGroupDefinition group;
+ ui32 groupId;
+ {
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1);
+ groupId = context.AllocateGroup(mapper, group);
+ Ctest << "group after allocation:" << Endl;
+ context.DumpGroup(group);
+ }
+ auto checkLayout = [&](const auto& group) {
+ TGroupGeometryInfo geom = TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc);
+ THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout;
+ for (ui32 i = 0; i < group.size(); ++i) {
for (ui32 j = 0; j < group[i].size(); ++j) {
- std::set<ui32> racksInRealm;
-
- for (const auto& pdiskId : group[i][j]) {
- const auto& [datacenter, rack] = nodeToLocation[pdiskId.NodeId];
- if (!datacenterForRealm) {
- datacenterForRealm = datacenter;
- } else if (datacenterForRealm == datacenter) {
- // it's okay
- } else if (decommittedDatacenter && (datacenter == decommittedDatacenter || datacenterForRealm == decommittedDatacenter)) {
- // it's okay too, decomitted datacenter is partially broken
- } else {
- UNIT_FAIL("incorrect datacenter for realm");
- }
- UNIT_ASSERT(racksInRealm.insert(rack).second);
+ for (ui32 k = 0; k < group[i][j].size(); ++k) {
+ layout.emplace(TVDiskIdShort(i, j, k), std::make_pair(context.GetLocation(group[i][j][k]),
+ group[i][j][k]));
}
}
}
+ return CheckGroupLayout(geom, layout);
+ };
- Ctest << Endl;
+ UNIT_ASSERT(checkLayout(group));
+
+ for (ui32 n = 0; n < 1000; ++n) {
+ Ctest << Endl << "iteration# " << n << Endl;
+
+ auto layout = context.ExportLayout();
+ std::random_shuffle(layout.begin(), layout.end());
+ context.ImportLayout(layout);
+
+ Ctest << "group after layout shuffling:" << Endl;
+ context.DumpGroup(group);
+
+ struct TQueueItem {
+ TGroupMapper::TGroupDefinition Group;
+ TString Path;
+ TSet<TGroupMapper::TGroupDefinition> Seen;
+ TSet<TVDiskIdShort> VDiskItems;
+ TSet<TPDiskId> PDiskItems;
+ };
+ std::deque<TQueueItem> queue;
+ for (queue.push_back({.Group = group}); !queue.empty(); ) {
+ TQueueItem item = std::move(queue.front());
+ queue.pop_front();
+ const auto [it, inserted] = item.Seen.insert(item.Group);
+ UNIT_ASSERT(inserted);
+ UNIT_ASSERT(item.Seen.size() <= 9);
+ Cerr << "processing path# " << item.Path << Endl;
+
+ auto candidates = checkLayout(item.Group);
+ if (!candidates) {
+ for (const TVDiskIdShort& vdiskId : candidates.Candidates) {
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.SetGroup(groupId, item.Group);
+ context.PopulateGroupMapper(mapper, 2);
+ const TPDiskId& pdiskId = item.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk];
+ auto temp = context.ReallocateGroup(mapper, groupId, {pdiskId}, false, false, false);
+ TString path = TStringBuilder() << item.Path << "/" << (int)vdiskId.FailRealm << ":"
+ << (int)vdiskId.FailDomain << ":" << (int)vdiskId.VDisk << "@" << pdiskId;
+ Cerr << "path# " << path << Endl;
+ context.DumpGroup(temp);
+
+ auto vdiskItems = item.VDiskItems;
+// const auto [it1, inserted1] = vdiskItems.insert(vdiskId);
+// UNIT_ASSERT_C(inserted1, "Duplicate group cell# " << vdiskId);
+
+ auto pdiskItems = item.PDiskItems;
+// const auto [it2, inserted2] = pdiskItems.insert(pdiskId);
+// UNIT_ASSERT_C(inserted2, "Duplicate origin PDisk# " << pdiskId);
+
+ queue.push_front({.Group = std::move(temp), .Path = std::move(path), .Seen = item.Seen,
+ .VDiskItems = std::move(vdiskItems), .PDiskItems = std::move(pdiskItems)});
+ }
+ }
+
+ Cerr << Endl;
+ }
}
}
+
+ Y_UNIT_TEST(CheckNotToBreakFailModel) {
+ TTestContext context(4, 1, 3, 1, 1);
+ TGroupMapper::TGroupDefinition group;
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1);
+ ui32 groupId = context.AllocateGroup(mapper, group);
+ Ctest << "group after allocation:" << Endl;
+ context.DumpGroup(group);
+ group = context.ReallocateGroup(mapper, groupId, {group[0][0][0]}, false, false, true);
+ Ctest << "group after reallocation:" << Endl;
+ context.DumpGroup(group);
+ UNIT_ASSERT(group.empty());
+ }
}
diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h
index ad8eea28c77..2b32da63a9a 100644
--- a/ydb/core/mind/bscontroller/impl.h
+++ b/ydb/core/mind/bscontroller/impl.h
@@ -78,6 +78,7 @@ public:
class TConfigState;
class TGroupSelector;
class TGroupFitter;
+ class TSelfHealActor;
using TVSlotReadyTimestampQ = std::list<std::pair<TInstant, TVSlotInfo*>>;
@@ -422,8 +423,7 @@ public:
}
bool AcceptsNewSlots() const {
- return Status == NKikimrBlobStorage::EDriveStatus::ACTIVE
- && DecommitStatus == NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE;
+ return Status == NKikimrBlobStorage::EDriveStatus::ACTIVE;
}
bool Decommitted() const {
@@ -1344,6 +1344,7 @@ private:
THashMap<TPDiskId, ui32> StaticPDiskSlotUsage;
std::unique_ptr<TStoragePoolStat> StoragePoolStat;
bool StopGivingGroups = false;
+ bool GroupLayoutSanitizer = false;
NKikimrBlobStorage::TSerialManagementStage::E SerialManagementStage
= NKikimrBlobStorage::TSerialManagementStage::DISCOVER_SERIAL;
@@ -1569,6 +1570,16 @@ private:
void Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev);
void HandleHostRecordsTimeToLiveExceeded();
+public:
+ // Self-heal actor's main purpose is to monitor FAULTY pdisks and to slightly move groups out of them; every move
+ // should not render group unusable, also it should not exceed its fail model. It also takes into account replication
+ // broker features such as only one vslot over PDisk is being replicated at a moment.
+ //
+ // It interacts with BS_CONTROLLER and group observer (which provides information about group state on a per-vdisk
+ // basis). BS_CONTROLLER reports faulty PDisks and all involved groups in a push notification manner.
+ IActor *CreateSelfHealActor();
+
+private:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Online state
void Handle(TEvBlobStorage::TEvControllerRegisterNode::TPtr &ev);
diff --git a/ydb/core/mind/bscontroller/load_everything.cpp b/ydb/core/mind/bscontroller/load_everything.cpp
index 06fc1a8679c..8f33e2a1b63 100644
--- a/ydb/core/mind/bscontroller/load_everything.cpp
+++ b/ydb/core/mind/bscontroller/load_everything.cpp
@@ -83,6 +83,7 @@ public:
Self->GroupReservePart = state.GetValue<T::GroupReservePart>();
Self->MaxScrubbedDisksAtOnce = state.GetValue<T::MaxScrubbedDisksAtOnce>();
Self->PDiskSpaceColorBorder = state.GetValue<T::PDiskSpaceColorBorder>();
+ Self->GroupLayoutSanitizer = state.GetValue<T::GroupLayoutSanitizer>();
Self->SysViewChangedSettings = true;
}
}
diff --git a/ydb/core/mind/bscontroller/scheme.h b/ydb/core/mind/bscontroller/scheme.h
index 58f33878a98..cd47ac234d5 100644
--- a/ydb/core/mind/bscontroller/scheme.h
+++ b/ydb/core/mind/bscontroller/scheme.h
@@ -85,11 +85,13 @@ struct Schema : NIceDb::Schema {
struct GroupReservePart : Column<15, NScheme::NTypeIds::Uint32> { static constexpr Type Default = 0; }; // parts per million
struct MaxScrubbedDisksAtOnce : Column<16, NScheme::NTypeIds::Uint32> { static constexpr Type Default = Max<ui32>(); }; // no limit
struct PDiskSpaceColorBorder : Column<17, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::TPDiskSpaceColor::E; static constexpr Type Default = NKikimrBlobStorage::TPDiskSpaceColor::GREEN; };
+ struct GroupLayoutSanitizer : Column<18, NScheme::NTypeIds::Bool> { static constexpr Type Default = false; };
using TKey = TableKey<FixedKey>;
using TColumns = TableColumns<FixedKey, NextGroupID, SchemaVersion, NextOperationLogIndex, DefaultMaxSlots,
InstanceId, SelfHealEnable, DonorModeEnable, ScrubPeriodicity, SerialManagementStage, NextStoragePoolId,
- PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder>;
+ PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder,
+ GroupLayoutSanitizer>;
};
struct VSlot : Table<5> {
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index b732d98e937..86ea82a627e 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -2,6 +2,8 @@
#include "impl.h"
#include "vdisk_status_tracker.h"
#include "config.h"
+#include "group_geometry_info.h"
+#include "group_layout_checker.h"
namespace NKikimr::NBsController {
@@ -111,7 +113,9 @@ namespace NKikimr::NBsController {
void Handle(TEvBlobStorage::TEvVStatusResult::TPtr& ev) {
const auto& record = ev->Get()->Record;
- STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH03, "Reassigner TEvVStatusResult", (GroupId, GroupId), (Response, record));
+ STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH03, "Reassigner TEvVStatusResult", (GroupId, GroupId),
+ (Status, record.GetStatus()), (JoinedGroup, record.GetJoinedGroup()),
+ (Replicated, record.GetReplicated()));
bool diskIsOk = false;
if (record.GetStatus() == NKikimrProto::RACE) {
@@ -169,6 +173,13 @@ namespace NKikimr::NBsController {
if (!record.GetResponse().GetSuccess()) {
STLOG(PRI_WARN, BS_SELFHEAL, BSSH07, "Reassigner ReassignGroupDisk request failed", (GroupId, GroupId),
(VDiskToReplace, VDiskToReplace), (Response, record));
+ } else {
+ TString items = "none";
+ for (const auto& item : record.GetResponse().GetStatus(0).GetReassignedItem()) {
+ items = TStringBuilder() << VDiskIDFromVDiskID(item.GetVDiskId()) << ": "
+ << TVSlotId(item.GetFrom()) << " -> " << TVSlotId(item.GetTo());
+ }
+ STLOG(PRI_INFO, BS_SELFHEAL, BSSH09, "Reassigner succeeded", (GroupId, GroupId), (Items, items));
}
Finish(record.GetResponse().GetSuccess());
}
@@ -204,23 +215,36 @@ namespace NKikimr::NBsController {
})
};
- class TSelfHealActor : public TActorBootstrapped<TSelfHealActor> {
+ class TBlobStorageController::TSelfHealActor : public TActorBootstrapped<TSelfHealActor> {
static constexpr TDuration MinRetryTimeout = TDuration::Seconds(1);
static constexpr TDuration MaxRetryTimeout = TDuration::Seconds(60);
- struct TGroupRecord {
+ struct TWithFaultyDisks {};
+ struct TWithInvalidLayout {};
+
+ struct TGroupRecord
+ : TIntrusiveListItem<TGroupRecord, TWithFaultyDisks>
+ , TIntrusiveListItem<TGroupRecord, TWithInvalidLayout>
+ {
+ const TGroupId GroupId;
TEvControllerUpdateSelfHealInfo::TGroupContent Content;
TActorId ReassignerActorId; // reassigner in flight
TDuration RetryTimeout = MinRetryTimeout;
TInstant NextRetryTimestamp = TInstant::Zero();
THashMap<TVDiskID, TVDiskStatusTracker> VDiskStatus;
+ bool LayoutValid = false;
+
+ TGroupRecord(TGroupId groupId) : GroupId(groupId) {}
};
const ui64 TabletId;
TActorId ControllerId;
THashMap<TGroupId, TGroupRecord> Groups;
- TSet<TGroupId> GroupsWithFaultyDisks;
+ TIntrusiveList<TGroupRecord, TWithFaultyDisks> GroupsWithFaultyDisks;
+ TIntrusiveList<TGroupRecord, TWithInvalidLayout> GroupsWithInvalidLayout;
std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups;
+ bool GroupLayoutSanitizer = false;
+ std::optional<THostRecordMapImpl> HostRecords;
public:
TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups)
@@ -236,11 +260,17 @@ namespace NKikimr::NBsController {
void Handle(TEvControllerUpdateSelfHealInfo::TPtr& ev) {
const TInstant now = TActivationContext::Now();
+ if (const auto& setting = ev->Get()->GroupLayoutSanitizer) {
+ GroupLayoutSanitizer = *setting;
+ }
for (const auto& [groupId, data] : ev->Get()->GroupsToUpdate) {
if (data) {
- auto& g = Groups[groupId];
+ const auto [it, inserted] = Groups.try_emplace(groupId, groupId);
+ auto& g = it->second;
bool hasFaultyDisks = false;
g.Content = std::move(*data);
+ g.LayoutValid = false;
+ GroupsWithInvalidLayout.PushBack(&g);
for (const auto& [vdiskId, vdisk] : g.Content.VDisks) {
g.VDiskStatus[vdiskId].Update(vdisk.VDiskStatus, now);
hasFaultyDisks |= vdisk.Faulty;
@@ -253,9 +283,9 @@ namespace NKikimr::NBsController {
}
}
if (hasFaultyDisks) {
- GroupsWithFaultyDisks.insert(groupId);
+ GroupsWithFaultyDisks.PushBack(&g);
} else {
- GroupsWithFaultyDisks.erase(groupId);
+ GroupsWithFaultyDisks.Remove(&g);
}
} else {
// find the group to delete
@@ -272,7 +302,6 @@ namespace NKikimr::NBsController {
}
// remove the group
- GroupsWithFaultyDisks.erase(groupId);
Groups.erase(it);
}
}
@@ -293,29 +322,40 @@ namespace NKikimr::NBsController {
ui64 counter = 0;
- for (const TGroupId groupId : GroupsWithFaultyDisks) {
- // find the group to process
- const auto it = Groups.find(groupId);
- Y_VERIFY(it != Groups.end());
- TGroupRecord& group = it->second;
-
+ for (TGroupRecord& group : GroupsWithFaultyDisks) {
if (group.ReassignerActorId || now < group.NextRetryTimestamp) {
continue; // we are already running reassigner for this group
}
// check if it is possible to move anything out
if (const auto v = FindVDiskToReplace(group.VDiskStatus, group.Content, now)) {
- group.ReassignerActorId = Register(new TReassignerActor(ControllerId, groupId, group.Content, *v));
+ group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v));
} else {
++counter; // this group can't be reassigned right now
}
}
+ if (GroupLayoutSanitizer) {
+ for (auto it = GroupsWithInvalidLayout.begin(); it != GroupsWithInvalidLayout.end(); ) {
+ TGroupRecord& group = *it++;
+ Y_VERIFY(!group.LayoutValid);
+ if (group.ReassignerActorId || now < group.NextRetryTimestamp) {
+ // nothing to do
+ } else if (const auto v = FindVDiskToReplaceByLayout(group, now)) {
+ group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v));
+ } else if (group.LayoutValid) {
+ GroupsWithInvalidLayout.Remove(&group);
+ } else {
+ ++counter;
+ }
+ }
+ }
+
UnreassignableGroups->store(counter);
}
std::optional<TVDiskID> FindVDiskToReplace(const THashMap<TVDiskID, TVDiskStatusTracker>& tracker,
- const TEvControllerUpdateSelfHealInfo::TGroupContent& content, const TInstant now) {
+ const TEvControllerUpdateSelfHealInfo::TGroupContent& content, TInstant now) {
auto status = [&](const TVDiskID& id) {
try {
return tracker.at(id).GetStatus(now);
@@ -362,6 +402,41 @@ namespace NKikimr::NBsController {
}
}
+ std::optional<TVDiskID> FindVDiskToReplaceByLayout(TGroupRecord& group, TInstant now) {
+ THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout;
+ for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
+ Y_VERIFY(HostRecords);
+ if (!vdisk.Decommitted) {
+ layout.emplace(vdiskId, std::make_pair(HostRecords->GetLocation(vdisk.Location.NodeId),
+ vdisk.Location.ComprisingPDiskId()));
+ }
+ }
+ const TLayoutCheckResult checkResult = CheckGroupLayout(*group.Content.Geometry, layout);
+ if (checkResult) { // group is valid
+ group.LayoutValid = true;
+ return std::nullopt;
+ }
+
+ THashSet<TVDiskIdShort> badDisks;
+ for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
+ const auto it = group.VDiskStatus.find(vdiskId);
+ if (it == group.VDiskStatus.end() || it->second.GetStatus(now) != NKikimrBlobStorage::EVDiskStatus::READY || vdisk.Bad) {
+ badDisks.insert(vdiskId);
+ }
+ }
+ if (badDisks.empty()) {
+ return TVDiskID(group.GroupId, group.Content.Generation, checkResult.Candidates.front());
+ } else if (badDisks.size() == 1) {
+ for (const auto& vdiskId : checkResult.Candidates) {
+ if (badDisks.contains(vdiskId)) {
+ return TVDiskID(group.GroupId, group.Content.Generation, vdiskId);
+ }
+ }
+ }
+
+ return std::nullopt;
+ }
+
void HandleWakeup() {
CheckGroups();
Schedule(TDuration::Seconds(10), new TEvents::TEvWakeup);
@@ -439,9 +514,8 @@ namespace NKikimr::NBsController {
TABLE_CLASS("table-sortable table") {
TABLEHEAD() {
ui32 numCols = 0;
- for (const auto& id : GroupsWithFaultyDisks) {
- const auto& info = Groups.at(id);
- numCols = Max<ui32>(numCols, info.Content.VDisks.size());
+ for (const TGroupRecord& group : GroupsWithFaultyDisks) {
+ numCols = Max<ui32>(numCols, group.Content.VDisks.size());
}
TABLER() {
@@ -452,20 +526,19 @@ namespace NKikimr::NBsController {
}
}
TABLEBODY() {
- for (const auto& id : GroupsWithFaultyDisks) {
- const auto& info = Groups.at(id);
+ for (const TGroupRecord& group : GroupsWithFaultyDisks) {
TABLER() {
out << "<td rowspan='2'><a href='?TabletID=" << TabletId
- << "&page=GroupDetail&GroupId=" << id << "'>"
- << id << "</a>:" << info.Content.Generation << "</td>";
+ << "&page=GroupDetail&GroupId=" << group.GroupId << "'>"
+ << group.GroupId << "</a>:" << group.Content.Generation << "</td>";
- for (const auto& [vdiskId, vdisk] : info.Content.VDisks) {
+ for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
TABLED() {
out << vdiskId.ToString();
out << "<br/>";
out << vdisk.VDiskStatus;
out << "<br/><strong>";
- if (const auto it = info.VDiskStatus.find(vdiskId); it != info.VDiskStatus.end()) {
+ if (const auto it = group.VDiskStatus.find(vdiskId); it != group.VDiskStatus.end()) {
if (const auto& status = it->second.GetStatus(now)) {
out << *status;
} else {
@@ -479,7 +552,7 @@ namespace NKikimr::NBsController {
}
}
TABLER() {
- for (const auto& [vdiskId, vdisk] : info.Content.VDisks) {
+ for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
TABLED() {
const auto& l = vdisk.Location;
if (vdisk.Faulty) {
@@ -506,17 +579,22 @@ namespace NKikimr::NBsController {
}
}
+ void Handle(TEvInterconnect::TEvNodesInfo::TPtr ev) {
+ HostRecords.emplace(ev->Get());
+ }
+
STRICT_STFUNC(StateFunc, {
cFunc(TEvents::TSystem::Poison, PassAway);
hFunc(TEvControllerUpdateSelfHealInfo, Handle);
hFunc(NMon::TEvRemoteHttpInfo, Handle);
hFunc(TEvReassignerDone, Handle);
cFunc(TEvents::TSystem::Wakeup, HandleWakeup);
+ hFunc(TEvInterconnect::TEvNodesInfo, Handle);
})
};
- IActor *CreateSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups) {
- return new TSelfHealActor(tabletId, std::move(unreassignableGroups));
+ IActor *TBlobStorageController::CreateSelfHealActor() {
+ return new TSelfHealActor(TabletID(), SelfHealUnreassignableGroups);
}
void TBlobStorageController::InitializeSelfHealState() {
@@ -525,10 +603,13 @@ namespace NKikimr::NBsController {
ev->GroupsToUpdate.emplace(groupId, TEvControllerUpdateSelfHealInfo::TGroupContent());
}
FillInSelfHealGroups(*ev, nullptr);
+ ev->GroupLayoutSanitizer = GroupLayoutSanitizer;
Send(SelfHealId, ev.Release());
}
void TBlobStorageController::FillInSelfHealGroups(TEvControllerUpdateSelfHealInfo& msg, TConfigState *state) {
+ THashMap<TBoxStoragePoolId, std::shared_ptr<TGroupGeometryInfo>> geomCache;
+
for (auto& [groupId, group] : msg.GroupsToUpdate) {
if (!group) {
continue;
@@ -540,11 +621,24 @@ namespace NKikimr::NBsController {
group->Generation = p->Generation;
group->Type = TBlobStorageGroupType(p->ErasureSpecies);
+ if (auto it = geomCache.find(p->StoragePoolId); it != geomCache.end()) {
+ group->Geometry = it->second;
+ } else {
+ const TMap<TBoxStoragePoolId, TStoragePoolInfo>& storagePools = state
+ ? state->StoragePools.Get()
+ : StoragePools;
+ const auto spIt = storagePools.find(p->StoragePoolId);
+ Y_VERIFY(spIt != storagePools.end());
+ group->Geometry = std::make_unique<TGroupGeometryInfo>(group->Type, spIt->second.GetGroupGeometry());
+ geomCache.emplace(p->StoragePoolId, group->Geometry);
+ }
+
for (const TVSlotInfo *slot : p->VDisksInGroup) {
group->VDisks[slot->GetVDiskId()] = {
slot->VSlotId,
slot->PDisk->ShouldBeSettledBySelfHeal(),
slot->PDisk->BadInTermsOfSelfHeal(),
+ slot->PDisk->Decommitted(),
slot->GetStatus(),
};
}
diff --git a/ydb/core/mind/bscontroller/self_heal.h b/ydb/core/mind/bscontroller/self_heal.h
index 287f05d4670..b2740f4800b 100644
--- a/ydb/core/mind/bscontroller/self_heal.h
+++ b/ydb/core/mind/bscontroller/self_heal.h
@@ -6,29 +6,26 @@
namespace NKikimr::NBsController {
+ class TGroupGeometryInfo;
+
struct TEvControllerUpdateSelfHealInfo : TEventLocal<TEvControllerUpdateSelfHealInfo, TEvBlobStorage::EvControllerUpdateSelfHealInfo> {
struct TGroupContent {
struct TVDiskInfo {
TVSlotId Location;
bool Faulty;
bool Bad;
+ bool Decommitted;
NKikimrBlobStorage::EVDiskStatus VDiskStatus;
};
ui32 Generation;
TBlobStorageGroupType Type;
TMap<TVDiskID, TVDiskInfo> VDisks;
+ std::shared_ptr<TGroupGeometryInfo> Geometry;
};
THashMap<TGroupId, std::optional<TGroupContent>> GroupsToUpdate; // groups with faulty groups that are changed or got faulty PDisks for the first time
TVector<std::pair<TVDiskID, NKikimrBlobStorage::EVDiskStatus>> VDiskStatusUpdate;
+ std::optional<bool> GroupLayoutSanitizer;
};
- // Self-heal actor's main purpose is to monitor FAULTY pdisks and to slightly move groups out of them; every move
- // should not render group unusable, also it should not exceed its fail model. It also takes into account replication
- // broker features such as only one vslot over PDisk is being replicated at a moment.
- //
- // It interacts with BS_CONTROLLER and group observer (which provides information about group state on a per-vdisk
- // basis). BS_CONTROLLER reports faulty PDisks and all involved groups in a push notification manner.
- IActor *CreateSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups);
-
} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/sys_view.cpp b/ydb/core/mind/bscontroller/sys_view.cpp
index d233cfc67bc..756035d051c 100644
--- a/ydb/core/mind/bscontroller/sys_view.cpp
+++ b/ydb/core/mind/bscontroller/sys_view.cpp
@@ -310,7 +310,7 @@ public:
TGroupMapper::TGroupDefinition group;
TString error;
std::deque<ui64> groupSizes;
- while (mapper.AllocateGroup(groupSizes.size(), group, nullptr, 0, {}, 0, false, error)) {
+ while (mapper.AllocateGroup(groupSizes.size(), group, {}, {}, 0, false, error)) {
std::vector<TGroupDiskInfo> disks;
std::deque<NKikimrBlobStorage::TPDiskMetrics> pdiskMetrics;
std::deque<NKikimrBlobStorage::TVDiskMetrics> vdiskMetrics;
diff --git a/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp b/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp
index 5c2d0aad201..d4205abea84 100644
--- a/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp
+++ b/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp
@@ -1,6 +1,7 @@
#include <library/cpp/testing/unittest/registar.h>
#include <ydb/core/util/testactorsys.h>
#include <ydb/core/mind/bscontroller/self_heal.h>
+#include <ydb/core/mind/bscontroller/impl.h>
using namespace NActors;
using namespace NKikimr;
@@ -13,8 +14,8 @@ void RunTestCase(TCallback&& callback) {
TTestActorSystem runtime(1);
runtime.Start();
const TActorId& parentId = runtime.AllocateEdgeActor(1);
- std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups = std::make_shared<std::atomic_uint64_t>();
- const TActorId& selfHealId = runtime.Register(CreateSelfHealActor(1, UnreassignableGroups), parentId, {}, {}, 1);
+ TBlobStorageController Controller({}, new TTabletStorageInfo(1, TTabletTypes::FLAT_BS_CONTROLLER));
+ const TActorId& selfHealId = runtime.Register(Controller.CreateSelfHealActor(), parentId, {}, {}, 1);
callback(selfHealId, parentId, runtime);
runtime.Stop();
}
diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto
index 9215c37d3ff..bc7f54aa6e9 100644
--- a/ydb/core/protos/blobstorage_config.proto
+++ b/ydb/core/protos/blobstorage_config.proto
@@ -434,6 +434,7 @@ message TUpdateSettings {
repeated uint32 GroupReservePartPPM = 7;
repeated uint32 MaxScrubbedDisksAtOnce = 8;
repeated NKikimrBlobStorage.TPDiskSpaceColor.E PDiskSpaceColorBorder = 9;
+ repeated bool EnableGroupLayoutSanitizer = 10;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/ydb/core/util/testactorsys.cpp b/ydb/core/util/testactorsys.cpp
index a678744a2f1..ead159d35ed 100644
--- a/ydb/core/util/testactorsys.cpp
+++ b/ydb/core/util/testactorsys.cpp
@@ -133,16 +133,24 @@ TActorId TTestActorSystem::CreateTestBootstrapper(TTabletStorageInfo *info, std:
}
void TTestActorSystem::SetupTabletRuntime(ui32 numDataCenters, ui32 stateStorageNodeId, ui32 targetNodeId) {
- auto setup = MakeIntrusive<TTableNameserverSetup>();
- ui32 nodeCountInDC = (MaxNodeId + numDataCenters - 1) / numDataCenters;
- for (ui32 nodeId : GetNodes()) {
- const TString name = Sprintf("127.0.0.%u", nodeId);
+ const ui32 nodeCountInDC = (MaxNodeId + numDataCenters - 1) / numDataCenters;
+ auto locationGenerator = [&](ui32 nodeId) {
const ui32 dcNum = (nodeId + nodeCountInDC - 1) / nodeCountInDC;
NActorsInterconnect::TNodeLocation location;
location.SetDataCenter(ToString(dcNum));
location.SetRack(ToString(nodeId));
location.SetUnit(ToString(nodeId));
- setup->StaticNodeTable[nodeId] = {name, name, name, 19001, TNodeLocation(location)};
+ return TNodeLocation(location);
+ };
+ SetupTabletRuntime(locationGenerator, stateStorageNodeId, targetNodeId);
+}
+
+void TTestActorSystem::SetupTabletRuntime(const std::function<TNodeLocation(ui32)>& locationGenerator,
+ ui32 stateStorageNodeId, ui32 targetNodeId) {
+ auto setup = MakeIntrusive<TTableNameserverSetup>();
+ for (ui32 nodeId : GetNodes()) {
+ const TString name = Sprintf("127.0.0.%u", nodeId);
+ setup->StaticNodeTable[nodeId] = {name, name, name, 19001, locationGenerator(nodeId)};
}
for (ui32 nodeId : GetNodes()) {
diff --git a/ydb/core/util/testactorsys.h b/ydb/core/util/testactorsys.h
index 6bbe4dd3eb3..5037f7ab715 100644
--- a/ydb/core/util/testactorsys.h
+++ b/ydb/core/util/testactorsys.h
@@ -661,6 +661,8 @@ public:
// tablet-related utility functions
void SetupTabletRuntime(ui32 numDataCenters = 1, ui32 stateStorageNodeId = 0, ui32 targetNodeId = 0);
+ void SetupTabletRuntime(const std::function<TNodeLocation(ui32)>& locationGenerator, ui32 stateStorageNodeId = 0,
+ ui32 targetNodeId = 0);
static NTabletPipe::TClientConfig GetPipeConfigWithRetries();
void SendToPipe(ui64 tabletId, const TActorId& sender, IEventBase* payload, ui64 cookie, const NKikimr::NTabletPipe::TClientConfig& pipeConfig);
static TTabletStorageInfo *CreateTestTabletInfo(ui64 tabletId, TTabletTypes::EType tabletType, TBlobStorageGroupType::EErasureSpecies erasure, ui32 groupId);
diff --git a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema
index 28608d95cd9..928757d0259 100644
--- a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema
+++ b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema
@@ -12,6 +12,11 @@
"ColumnType": "Uint32"
},
{
+ "ColumnId": 18,
+ "ColumnName": "GroupLayoutSanitizer",
+ "ColumnType": "Bool"
+ },
+ {
"ColumnId": 1,
"ColumnName": "FixedKey",
"ColumnType": "Bool"
@@ -92,6 +97,7 @@
"0": {
"Columns": [
17,
+ 18,
1,
2,
4,