author     Alexander Rutkovsky <alexvru@mail.ru>          2022-06-06 14:52:24 +0300
committer  Daniil Cherednik <dcherednik@yandex-team.ru>   2022-06-06 14:52:24 +0300
commit     1c9a9bebfd56f7648a7879824fa93adeab288978 (patch)
tree       1b28da8263e45f4beb6ce7f6b21f66db48b5887e
parent     3ad62aa372aacec6af48bb3e8c2e82a768022f8b (diff)
download   ydb-1c9a9bebfd56f7648a7879824fa93adeab288978.tar.gz
PR from branch users/alexvru/merge/22-2/KIKIMR-14580
PR from branch users/alexvru/bsdc/KIKIMR-12959/fix-bug KIKIMR-12959 REVIEW: 2492305
Fix some problems and improve code KIKIMR-14580 REVIEW: 2509145
Improve code by adding TEntityId KIKIMR-14580 REVIEW: 2500367
Add group layout sanitizer feature KIKIMR-14580 REVIEW: 2443980
Fix group mapper bug KIKIMR-14580 REVIEW: 2430098
Refactor group mapper to support partially aligned groups according to majority principle KIKIMR-14580 REVIEW: 2425466
Separate DecommitStatus for drive KIKIMR-14580 REVIEW: 2421023
Support DECOMMIT_* status in BS_CONTROLLER KIKIMR-14580 REVIEW: 2416336
REVIEW: 2515731
x-ydb-stable-ref: 2c2ad79ee0c925ddb5845f5543245862a930a68a
-rw-r--r--  ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp | 9
-rw-r--r--  ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h | 7
-rw-r--r--  ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp | 31
-rw-r--r--  ydb/core/blobstorage/ut_blobstorage/decommit_3dc.cpp | 89
-rw-r--r--  ydb/core/blobstorage/ut_blobstorage/lib/env.h | 45
-rw-r--r--  ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp | 103
-rw-r--r--  ydb/core/blobstorage/ut_blobstorage/ya.make | 2
-rw-r--r--  ydb/core/cms/sentinel.cpp | 1
-rw-r--r--  ydb/core/cms/sentinel_ut.cpp | 2
-rw-r--r--  ydb/core/mind/bscontroller/bsc.cpp | 3
-rw-r--r--  ydb/core/mind/bscontroller/cmds_drive_status.cpp | 25
-rw-r--r--  ydb/core/mind/bscontroller/cmds_storage_pool.cpp | 1
-rw-r--r--  ydb/core/mind/bscontroller/config.cpp | 1
-rw-r--r--  ydb/core/mind/bscontroller/config_cmd.cpp | 7
-rw-r--r--  ydb/core/mind/bscontroller/config_fit_groups.cpp | 40
-rw-r--r--  ydb/core/mind/bscontroller/config_fit_pdisks.cpp | 8
-rw-r--r--  ydb/core/mind/bscontroller/group_geometry_info.h | 9
-rw-r--r--  ydb/core/mind/bscontroller/group_layout_checker.cpp | 47
-rw-r--r--  ydb/core/mind/bscontroller/group_layout_checker.h | 233
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper.cpp | 1086
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper.h | 33
-rw-r--r--  ydb/core/mind/bscontroller/group_mapper_ut.cpp | 425
-rw-r--r--  ydb/core/mind/bscontroller/impl.h | 66
-rw-r--r--  ydb/core/mind/bscontroller/load_everything.cpp | 5
-rw-r--r--  ydb/core/mind/bscontroller/scheme.h | 7
-rw-r--r--  ydb/core/mind/bscontroller/self_heal.cpp | 150
-rw-r--r--  ydb/core/mind/bscontroller/self_heal.h | 13
-rw-r--r--  ydb/core/mind/bscontroller/sys_view.cpp | 27
-rw-r--r--  ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp | 5
-rw-r--r--  ydb/core/mind/bscontroller/ya.make | 2
-rw-r--r--  ydb/core/protos/blobstorage_config.proto | 12
-rw-r--r--  ydb/core/protos/out/out.cpp | 4
-rw-r--r--  ydb/core/protos/sys_view.proto | 1
-rw-r--r--  ydb/core/sys_view/common/schema.h | 4
-rw-r--r--  ydb/core/sys_view/storage/pdisks.cpp | 1
-rw-r--r--  ydb/core/sys_view/ut_kqp.cpp | 9
-rw-r--r--  ydb/core/util/testactorsys.cpp | 22
-rw-r--r--  ydb/core/util/testactorsys.h | 4
-rw-r--r--  ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema | 12
39 files changed, 1814 insertions, 737 deletions
diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
index 054d5be665..7a7cb76ed6 100644
--- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
+++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp
@@ -299,7 +299,7 @@ TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype)
{}
TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms,
- ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain)
+ ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain, bool finalize)
: GType(gtype)
{
FailRealms = {numFailRealms, {
@@ -307,6 +307,9 @@ TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype, ui32 nu
{numVDisksPerFailDomain, TVDiskInfo{}}
}}
}};
+ if (finalize) {
+ FinalizeConstruction();
+ }
}
TBlobStorageGroupInfo::TTopology::~TTopology() = default;
@@ -400,10 +403,6 @@ ui32 TBlobStorageGroupInfo::TTopology::GetOrderNumber(const TVDiskIdShort &vdisk
return FailRealms[vdisk.FailRealm].FailDomains[vdisk.FailDomain].VDisks[vdisk.VDisk].OrderNumber;
}
-ui32 TBlobStorageGroupInfo::TTopology::GetNumVDisksPerFailDomain() const {
- return FailRealms[0].FailDomains[0].VDisks.size();
-}
-
void TBlobStorageGroupInfo::TTopology::PickSubgroup(ui32 hash, TBlobStorageGroupInfo::TOrderNums &orderNums) const {
return BlobMapper->PickSubgroup(hash, orderNums);
}
diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
index e2d1445be4..2b89538bc8 100644
--- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
+++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h
@@ -191,7 +191,8 @@ public:
TVector<TVDiskIdShort> VDiskIdForOrderNumber;
TTopology(TBlobStorageGroupType gtype);
- TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain);
+ TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain,
+ bool finalize = false);
TTopology(const TTopology&) = delete;
TTopology &operator =(const TTopology&) = delete;
TTopology(TTopology&&) = default;
@@ -221,7 +222,9 @@ public:
// get the total number of VDisks in the blobstorage group
ui32 GetTotalVDisksNum() const { return TotalVDisks; }
// get number of VDisks per fail domain
- ui32 GetNumVDisksPerFailDomain() const;
+ ui32 GetNumVDisksPerFailDomain() const { return FailRealms[0].FailDomains[0].VDisks.size(); }
+ // get number of fail domains per fail realm
+ ui32 GetNumFailDomainsPerFailRealm() const { return FailRealms[0].FailDomains.size(); }
// get quorum checker
const IQuorumChecker& GetQuorumChecker() const { return *QuorumChecker; }
diff --git a/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp b/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp
index 1e84f5d8f6..86cc23e1e9 100644
--- a/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp
+++ b/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp
@@ -1,4 +1,5 @@
#include "pdisk_mock.h"
+#include <ydb/core/blobstorage/base/blobstorage_events.h>
#include <ydb/core/util/stlog.h>
#include <ydb/core/util/interval_set.h>
@@ -238,7 +239,7 @@ TPDiskMockState::TPtr TPDiskMockState::Snapshot() {
return res;
}
-class TPDiskMockActor : public TActor<TPDiskMockActor> {
+class TPDiskMockActor : public TActorBootstrapped<TPDiskMockActor> {
enum {
EvResume = EventSpaceBegin(TEvents::ES_PRIVATE),
};
@@ -251,8 +252,7 @@ class TPDiskMockActor : public TActor<TPDiskMockActor> {
public:
TPDiskMockActor(TPDiskMockState::TPtr state)
- : TActor(&TThis::StateFunc)
- , State(std::move(state)) // to keep ownership
+ : State(std::move(state)) // to keep ownership
, Impl(*State->Impl)
, Prefix(TStringBuilder() << "PDiskMock[" << Impl.NodeId << ":" << Impl.PDiskId << "] ")
{
@@ -263,6 +263,30 @@ public:
}
}
+ void Bootstrap() {
+ Become(&TThis::StateFunc);
+ ReportMetrics();
+ }
+
+ void ReportMetrics() {
+ ui32 usedChunks = 0;
+ for (const auto& [ownerId, owner] : Impl.Owners) {
+ usedChunks += owner.CommittedChunks.size() + owner.ReservedChunks.size();
+ }
+ Y_VERIFY(usedChunks <= Impl.TotalChunks);
+
+ auto ev = std::make_unique<TEvBlobStorage::TEvControllerUpdateDiskStatus>();
+ auto& record = ev->Record;
+ auto *p = record.AddPDisksMetrics();
+ p->SetPDiskId(Impl.PDiskId);
+ p->SetAvailableSize((Impl.TotalChunks - usedChunks) * Impl.ChunkSize);
+ p->SetTotalSize(Impl.TotalChunks * Impl.ChunkSize);
+ p->SetState(NKikimrBlobStorage::TPDiskState::Normal);
+ Send(MakeBlobStorageNodeWardenID(SelfId().NodeId()), ev.release());
+
+ Schedule(TDuration::Seconds(5), new TEvents::TEvWakeup);
+ }
+
void Handle(NPDisk::TEvYardInit::TPtr ev) {
// report message and validate PDisk guid
auto *msg = ev->Get();
@@ -663,6 +687,7 @@ public:
hFunc(NPDisk::TEvSlay, Handle);
hFunc(NPDisk::TEvHarakiri, Handle);
hFunc(NPDisk::TEvConfigureScheduler, Handle);
+ cFunc(TEvents::TSystem::Wakeup, ReportMetrics);
)
};
diff --git a/ydb/core/blobstorage/ut_blobstorage/decommit_3dc.cpp b/ydb/core/blobstorage/ut_blobstorage/decommit_3dc.cpp
new file mode 100644
index 0000000000..811d791db6
--- /dev/null
+++ b/ydb/core/blobstorage/ut_blobstorage/decommit_3dc.cpp
@@ -0,0 +1,89 @@
+#include <ydb/core/blobstorage/ut_blobstorage/lib/env.h>
+
+Y_UNIT_TEST_SUITE(Decommit3dc) {
+ Y_UNIT_TEST(Test) {
+ for (ui32 numNodes : {12, 16})
+ for (ui32 numGroups : {1, 7})
+ for (bool resetToNone : {false, true})
+ for (ui32 numDecommitted = 0; numDecommitted <= numNodes / 4; ++numDecommitted) {
+ Cerr << "numNodes# " << numNodes
+ << " numGroups# " << numGroups
+ << " resetToNone# " << (resetToNone ? "true" : "false")
+ << " numDecommitted# " << numDecommitted
+ << Endl;
+
+ TEnvironmentSetup env{{
+ .NodeCount = numNodes,
+ .Erasure = TBlobStorageGroupType::ErasureMirror3dc,
+ .NumDataCenters = 4,
+ }};
+
+ env.UpdateSettings(true, true);
+ env.CreateBoxAndPool(1, numGroups);
+ env.Sim(TDuration::Seconds(30));
+ auto config = env.FetchBaseConfig();
+
+ std::set<ui32> nodesToSettle;
+ TString datacenterToSettle;
+ for (const auto& node : config.GetNode()) {
+ const auto& location = node.GetLocation();
+ if (!datacenterToSettle) {
+ datacenterToSettle = location.GetDataCenter();
+ }
+ if (datacenterToSettle == location.GetDataCenter()) {
+ nodesToSettle.insert(node.GetNodeId());
+ }
+ }
+
+ std::set<std::pair<ui32, ui32>> pdisksToSettle;
+ for (const auto& pdisk : config.GetPDisk()) {
+ if (nodesToSettle.count(pdisk.GetNodeId())) {
+ pdisksToSettle.emplace(pdisk.GetNodeId(), pdisk.GetPDiskId());
+ env.UpdateDriveStatus(pdisk.GetNodeId(), pdisk.GetPDiskId(), {},
+ NKikimrBlobStorage::EDecommitStatus::DECOMMIT_PENDING);
+ }
+ }
+
+ UNIT_ASSERT_VALUES_EQUAL(pdisksToSettle.size(), numNodes / 4);
+ std::set<std::pair<ui32, ui32>> movedOutPDisks;
+ auto it = pdisksToSettle.begin();
+ for (ui32 i = 0; i < numDecommitted; ++i, ++it) {
+ UNIT_ASSERT(it != pdisksToSettle.end());
+ movedOutPDisks.insert(*it);
+ env.UpdateDriveStatus(it->first, it->second, {}, NKikimrBlobStorage::EDecommitStatus::DECOMMIT_IMMINENT);
+
+ env.Sim(TDuration::Seconds(60));
+ auto config = env.FetchBaseConfig();
+ for (const auto& vslot : config.GetVSlot()) {
+ const auto& vslotId = vslot.GetVSlotId();
+ UNIT_ASSERT(!movedOutPDisks.count({vslotId.GetNodeId(), vslotId.GetPDiskId()}));
+ }
+ }
+ if (resetToNone) {
+ for (const auto& [nodeId, pdiskId] : pdisksToSettle) {
+ env.UpdateDriveStatus(nodeId, pdiskId, {}, NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE);
+ }
+ movedOutPDisks.clear();
+ }
+ for (const auto& [nodeId, pdiskId] : pdisksToSettle) {
+ Cerr << "nodeId# " << nodeId << " pdiskId# " << pdiskId << Endl;
+
+ env.UpdateDriveStatus(nodeId, pdiskId, NKikimrBlobStorage::EDriveStatus::FAULTY, {});
+ movedOutPDisks.emplace(nodeId, pdiskId);
+
+ env.Sim(TDuration::Seconds(90));
+ auto config = env.FetchBaseConfig();
+ for (const auto& vslot : config.GetVSlot()) {
+ const auto& vslotId = vslot.GetVSlotId();
+ Cerr << vslotId.GetNodeId() << ":" << vslotId.GetPDiskId() << " " << vslot.GetFailRealmIdx()
+ << " " << vslot.GetFailDomainIdx() << Endl;
+ UNIT_ASSERT(!movedOutPDisks.count({vslotId.GetNodeId(), vslotId.GetPDiskId()}) ||
+ (numNodes == 12 && numDecommitted == 0));
+ }
+
+ env.UpdateDriveStatus(nodeId, pdiskId, NKikimrBlobStorage::EDriveStatus::ACTIVE, {});
+ movedOutPDisks.erase({nodeId, pdiskId});
+ }
+ }
+ }
+}
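
The test above exercises the new per-drive decommission states: DECOMMIT_PENDING keeps existing slots but stops new allocations on the drive, DECOMMIT_IMMINENT additionally asks self-heal to move the existing slots away, and DECOMMIT_NONE returns the drive to normal service. A minimal sketch of the placement predicates this behaviour implies, using hypothetical simplified enums (the real AcceptsNewSlots()/Decommitted() helpers live in impl.h and are not reproduced in this hunk):

    // Sketch only: hypothetical simplified types, not the real NKikimrBlobStorage enums
    // or the BS_CONTROLLER TPDiskInfo; illustrates the semantics the test relies on.
    #include <iostream>

    enum class EDriveStatus { ACTIVE, INACTIVE, BROKEN, FAULTY, TO_BE_REMOVED };
    enum class EDecommitStatus { DECOMMIT_NONE, DECOMMIT_PENDING, DECOMMIT_IMMINENT };

    struct TPDiskSketch {
        EDriveStatus Status = EDriveStatus::ACTIVE;
        EDecommitStatus DecommitStatus = EDecommitStatus::DECOMMIT_NONE;

        // new slots are only placed on an ACTIVE drive that is not being decommissioned
        bool AcceptsNewSlots() const {
            return Status == EDriveStatus::ACTIVE
                && DecommitStatus == EDecommitStatus::DECOMMIT_NONE;
        }

        // FAULTY/TO_BE_REMOVED drives and DECOMMIT_IMMINENT drives get existing slots moved away
        bool WantsSlotsMovedOut() const {
            return Status == EDriveStatus::FAULTY
                || Status == EDriveStatus::TO_BE_REMOVED
                || DecommitStatus == EDecommitStatus::DECOMMIT_IMMINENT;
        }
    };

    int main() {
        TPDiskSketch d;
        d.DecommitStatus = EDecommitStatus::DECOMMIT_PENDING;
        std::cout << d.AcceptsNewSlots() << d.WantsSlotsMovedOut() << "\n"; // 00: keep slots, place no new ones
        d.DecommitStatus = EDecommitStatus::DECOMMIT_IMMINENT;
        std::cout << d.AcceptsNewSlots() << d.WantsSlotsMovedOut() << "\n"; // 01: slots are drained
    }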
diff --git a/ydb/core/blobstorage/ut_blobstorage/lib/env.h b/ydb/core/blobstorage/ut_blobstorage/lib/env.h
index d5ab6cdeee..63329eac32 100644
--- a/ydb/core/blobstorage/ut_blobstorage/lib/env.h
+++ b/ydb/core/blobstorage/ut_blobstorage/lib/env.h
@@ -29,6 +29,8 @@ struct TEnvironmentSetup {
const std::function<void(TTestActorSystem&)> PrepareRuntime;
const ui32 ControllerNodeId = 1;
const bool Cache = false;
+ const ui32 NumDataCenters = 0;
+ const std::function<TNodeLocation(ui32)> LocationGenerator;
};
const TSettings Settings;
@@ -93,6 +95,11 @@ struct TEnvironmentSetup {
Cerr << "RandomSeed# " << seed << Endl;
}
+ ui32 GetNumDataCenters() const {
+ return Settings.NumDataCenters ? Settings.NumDataCenters :
+ Settings.Erasure.GetErasure() == TBlobStorageGroupType::ErasureMirror3dc ? 3 : 1;
+ }
+
void Initialize() {
Runtime = std::make_unique<TTestActorSystem>(Settings.NodeCount);
if (Settings.PrepareRuntime) {
@@ -102,8 +109,11 @@ struct TEnvironmentSetup {
Runtime->Start();
auto *appData = Runtime->GetAppData();
appData->DomainsInfo->AddDomain(TDomainsInfo::TDomain::ConstructEmptyDomain("dom", DomainId).Release());
- Runtime->SetupTabletRuntime(Settings.Erasure.GetErasure() == TBlobStorageGroupType::ErasureMirror3dc,
- Settings.ControllerNodeId);
+ if (Settings.LocationGenerator) {
+ Runtime->SetupTabletRuntime(Settings.LocationGenerator, Settings.ControllerNodeId);
+ } else {
+ Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId);
+ }
SetupStaticStorage();
SetupTablet();
SetupStorage();
@@ -115,8 +125,11 @@ struct TEnvironmentSetup {
void StartNode(ui32 nodeId) {
Runtime->StartNode(nodeId);
- Runtime->SetupTabletRuntime(Settings.Erasure.GetErasure() == TBlobStorageGroupType::ErasureMirror3dc,
- Settings.ControllerNodeId, nodeId);
+ if (Settings.LocationGenerator) {
+ Runtime->SetupTabletRuntime(Settings.LocationGenerator, Settings.ControllerNodeId, nodeId);
+ } else {
+ Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId, nodeId);
+ }
if (nodeId == Settings.ControllerNodeId) {
SetupStaticStorage();
SetupTablet();
@@ -549,6 +562,17 @@ struct TEnvironmentSetup {
}
}
+ void UpdateSettings(bool selfHeal, bool donorMode, bool groupLayoutSanitizer = false) {
+ NKikimrBlobStorage::TConfigRequest request;
+ auto *cmd = request.AddCommand();
+ auto *us = cmd->MutableUpdateSettings();
+ us->AddEnableSelfHeal(selfHeal);
+ us->AddEnableDonorMode(donorMode);
+ us->AddEnableGroupLayoutSanitizer(groupLayoutSanitizer);
+ auto response = Invoke(request);
+ UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
+ }
+
void EnableDonorMode() {
NKikimrBlobStorage::TConfigRequest request;
request.AddCommand()->MutableEnableDonorMode()->SetEnable(true);
@@ -556,6 +580,19 @@ struct TEnvironmentSetup {
UNIT_ASSERT(response.GetSuccess());
}
+ void UpdateDriveStatus(ui32 nodeId, ui32 pdiskId, NKikimrBlobStorage::EDriveStatus status,
+ NKikimrBlobStorage::EDecommitStatus decommitStatus) {
+ NKikimrBlobStorage::TConfigRequest request;
+ auto *cmd = request.AddCommand();
+ auto *ds = cmd->MutableUpdateDriveStatus();
+ ds->MutableHostKey()->SetNodeId(nodeId);
+ ds->SetPDiskId(pdiskId);
+ ds->SetStatus(status);
+ ds->SetDecommitStatus(decommitStatus);
+ auto response = Invoke(request);
+ UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
+ }
+
void SetScrubPeriodicity(TDuration periodicity) {
NKikimrBlobStorage::TConfigRequest request;
request.AddCommand()->MutableSetScrubPeriodicity()->SetScrubPeriodicity(periodicity.Seconds());
diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
new file mode 100644
index 0000000000..f74e2d4841
--- /dev/null
+++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
@@ -0,0 +1,103 @@
+#include <ydb/core/blobstorage/ut_blobstorage/lib/env.h>
+
+Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
+ Y_UNIT_TEST(Test3dc) {
+ const ui32 numRacks = 15;
+ std::vector<ui32> nodesPerRack(numRacks);
+ std::vector<ui32> nodeToRack;
+ for (ui32 numFilledRacks = 0; numFilledRacks < numRacks; ) {
+// const ui32 rackId = RandomNumber(numRacks);
+ const ui32 rackId = numFilledRacks;
+ nodeToRack.emplace_back(rackId);
+ numFilledRacks += !nodesPerRack[rackId]++;
+ }
+ const ui32 numDatacenters = 3;
+ std::vector<ui32> rackToDatacenter;
+ for (ui32 i = 0; i < numRacks; ++i) {
+ rackToDatacenter.push_back(i % numDatacenters);
+ }
+
+ std::vector<TNodeLocation> locations;
+ for (ui32 i = 0; i < nodeToRack.size(); ++i) {
+ NActorsInterconnect::TNodeLocation proto;
+ proto.SetDataCenter(ToString(rackToDatacenter[nodeToRack[i]]));
+ proto.SetRack(ToString(nodeToRack[i]));
+ proto.SetUnit(ToString(i));
+ locations.emplace_back(proto);
+ }
+
+ TEnvironmentSetup env{{
+ .NodeCount = (ui32)nodeToRack.size(),
+ .Erasure = TBlobStorageGroupType::ErasureMirror3dc,
+ .LocationGenerator = [&](ui32 nodeId) { return locations[nodeId - 1]; },
+ }};
+
+ auto getGroupsWithIncorrectLayout = [&] {
+ auto config = env.FetchBaseConfig();
+
+ std::map<ui32, std::tuple<TString, TString>> nodeIdToLocation;
+ for (const auto& node : config.GetNode()) {
+ const auto& location = node.GetLocation();
+ nodeIdToLocation.emplace(node.GetNodeId(), std::make_tuple(location.GetDataCenter(), location.GetRack()));
+ }
+
+ std::map<ui32, std::vector<std::vector<std::tuple<TString, TString>>>> groups;
+ for (const auto& vslot : config.GetVSlot()) {
+ auto& group = groups[vslot.GetGroupId()];
+ if (group.empty()) {
+ group.resize(3, {3, {"", ""}});
+ }
+ group[vslot.GetFailRealmIdx()][vslot.GetFailDomainIdx()] = nodeIdToLocation[vslot.GetVSlotId().GetNodeId()];
+ }
+
+ std::set<ui32> badGroups;
+
+ for (auto& [groupId, group] : groups) {
+ std::set<TString> usedRealms;
+
+ for (const auto& row : group) {
+ TString realm;
+ std::set<TString> usedRacks;
+
+ for (const auto& [dc, rack] : row) {
+ Y_VERIFY(dc && rack);
+
+ if (!usedRacks.insert(rack).second) {
+ badGroups.insert(groupId);
+ }
+
+ if (!realm) {
+ if (!usedRealms.insert(dc).second) {
+ badGroups.insert(groupId);
+ }
+ realm = dc;
+ } else if (realm != dc) {
+ badGroups.insert(groupId);
+ }
+ }
+ }
+ }
+
+ return badGroups;
+ };
+
+ const ui32 disksPerNode = 1;
+ const ui32 slotsPerDisk = 3;
+ env.CreateBoxAndPool(disksPerNode, nodeToRack.size() * disksPerNode * slotsPerDisk / 9);
+ env.Sim(TDuration::Seconds(30));
+ auto before = getGroupsWithIncorrectLayout();
+ Cerr << "bad groups before shuffling# " << FormatList(before) << Endl;
+ UNIT_ASSERT(before.empty());
+ env.Cleanup();
+ std::random_shuffle(locations.begin(), locations.end());
+ env.Initialize();
+ env.Sim(TDuration::Seconds(100));
+ auto after = getGroupsWithIncorrectLayout();
+ Cerr << "bad groups just after shuffling# " << FormatList(after) << Endl;
+ env.UpdateSettings(true, false, true);
+ env.Sim(TDuration::Minutes(15));
+ auto corrected = getGroupsWithIncorrectLayout();
+ Cerr << "bad groups after shuffling and fixing# " << FormatList(corrected) << Endl;
+ UNIT_ASSERT(corrected.empty());
+ }
+}
diff --git a/ydb/core/blobstorage/ut_blobstorage/ya.make b/ydb/core/blobstorage/ut_blobstorage/ya.make
index efe91d91cd..9b33efe1a9 100644
--- a/ydb/core/blobstorage/ut_blobstorage/ya.make
+++ b/ydb/core/blobstorage/ut_blobstorage/ya.make
@@ -11,6 +11,7 @@ TIMEOUT(600)
SRCS(
block_race.cpp
counting_events.cpp
+ decommit_3dc.cpp
defrag.cpp
donor.cpp
encryption.cpp
@@ -18,6 +19,7 @@ SRCS(
incorrect_queries.cpp
main.cpp
mirror3of4.cpp
+ sanitize_groups.cpp
space_check.cpp
sync.cpp
replication.cpp
diff --git a/ydb/core/cms/sentinel.cpp b/ydb/core/cms/sentinel.cpp
index 638b78fe09..1823586d28 100644
--- a/ydb/core/cms/sentinel.cpp
+++ b/ydb/core/cms/sentinel.cpp
@@ -145,7 +145,6 @@ bool TPDiskStatus::IsNewStatusGood() const {
switch (Compute(Current, unused)) {
case EPDiskStatus::INACTIVE:
case EPDiskStatus::ACTIVE:
- case EPDiskStatus::SPARE:
return true;
case EPDiskStatus::UNKNOWN:
diff --git a/ydb/core/cms/sentinel_ut.cpp b/ydb/core/cms/sentinel_ut.cpp
index 62c0fdf309..f916cf6005 100644
--- a/ydb/core/cms/sentinel_ut.cpp
+++ b/ydb/core/cms/sentinel_ut.cpp
@@ -48,8 +48,8 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
EPDiskStatus::ACTIVE,
EPDiskStatus::INACTIVE,
EPDiskStatus::BROKEN,
- EPDiskStatus::SPARE,
EPDiskStatus::FAULTY,
+ EPDiskStatus::TO_BE_REMOVED,
};
for (const EPDiskStatus status : AllStatuses) {
diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp
index 779381fd51..bd9f83d86d 100644
--- a/ydb/core/mind/bscontroller/bsc.cpp
+++ b/ydb/core/mind/bscontroller/bsc.cpp
@@ -116,7 +116,7 @@ void TBlobStorageController::OnActivateExecutor(const TActorContext&) {
}
// create self-heal actor
- SelfHealId = Register(CreateSelfHealActor(TabletID(), SelfHealUnreassignableGroups));
+ SelfHealId = Register(CreateSelfHealActor());
// create stat processor
StatProcessorActorId = Register(CreateStatProcessorActor());
@@ -152,6 +152,7 @@ void TBlobStorageController::Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev) {
const bool initial = !HostRecords;
HostRecords = std::make_shared<THostRecordMap::element_type>(ev->Get());
Schedule(TDuration::Minutes(5), new TEvPrivate::TEvHostRecordsTimeToLiveExceeded);
+ Send(SelfHealId, new TEvPrivate::TEvUpdateHostRecords(HostRecords));
if (initial) {
Execute(CreateTxInitScheme());
}
diff --git a/ydb/core/mind/bscontroller/cmds_drive_status.cpp b/ydb/core/mind/bscontroller/cmds_drive_status.cpp
index 8abaaa5979..25fe500b2c 100644
--- a/ydb/core/mind/bscontroller/cmds_drive_status.cpp
+++ b/ydb/core/mind/bscontroller/cmds_drive_status.cpp
@@ -24,16 +24,20 @@ namespace NKikimr::NBsController {
}
TPDiskInfo *pdisk = PDisks.FindForUpdate(pdiskId);
- if (cmd.GetStatus() != pdisk->Status) {
- const bool wasGoodExpectedStatus = pdisk->HasGoodExpectedStatus();
- pdisk->Status = cmd.GetStatus();
+ const bool wasGoodExpectedStatus = pdisk->HasGoodExpectedStatus();
+ if (const auto s = cmd.GetStatus(); s != NKikimrBlobStorage::EDriveStatus::UNKNOWN && s != pdisk->Status) {
+ pdisk->Status = s;
pdisk->StatusTimestamp = Timestamp;
- if (pdisk->HasGoodExpectedStatus() != wasGoodExpectedStatus) {
- for (const auto& [id, slot] : pdisk->VSlotsOnPDisk) {
- if (slot->Group) {
- TGroupInfo *group = Groups.FindForUpdate(slot->Group->ID);
- group->CalculateGroupStatus();
- }
+ }
+ if (const auto ds = cmd.GetDecommitStatus(); ds != NKikimrBlobStorage::EDecommitStatus::DECOMMIT_UNSET &&
+ ds != pdisk->DecommitStatus) {
+ pdisk->DecommitStatus = ds;
+ }
+ if (wasGoodExpectedStatus != pdisk->HasGoodExpectedStatus()) {
+ for (const auto& [id, slot] : pdisk->VSlotsOnPDisk) {
+ if (slot->Group) {
+ TGroupInfo *group = Groups.FindForUpdate(slot->Group->ID);
+ group->CalculateGroupStatus();
}
}
}
@@ -44,7 +48,8 @@ namespace NKikimr::NBsController {
(IcPort, host.GetIcPort()),
(NodeId, host.GetNodeId()),
(Path, cmd.GetPath()),
- (Status, cmd.GetStatus()));
+ (Status, cmd.GetStatus()),
+ (DecommitStatus, cmd.GetDecommitStatus()));
}
void TBlobStorageController::TConfigState::ExecuteStep(const NKikimrBlobStorage::TReadDriveStatus& cmd, TStatus& status) {
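
After this change either field of TUpdateDriveStatus can be applied on its own: a drive status of UNKNOWN and a decommit status of DECOMMIT_UNSET both mean "keep the stored value". A minimal sketch of that sentinel-based update, with hypothetical plain enums standing in for the protobuf types:

    // Sketch only: plain enums instead of the NKikimrBlobStorage protobuf types.
    #include <iostream>

    enum class EDriveStatus { UNKNOWN, ACTIVE, FAULTY, TO_BE_REMOVED };
    enum class EDecommitStatus { DECOMMIT_UNSET, DECOMMIT_NONE, DECOMMIT_PENDING, DECOMMIT_IMMINENT };

    struct TPDiskSketch {
        EDriveStatus Status = EDriveStatus::ACTIVE;
        EDecommitStatus DecommitStatus = EDecommitStatus::DECOMMIT_NONE;
    };

    void ApplyUpdate(TPDiskSketch& pdisk, EDriveStatus status, EDecommitStatus decommit) {
        if (status != EDriveStatus::UNKNOWN && status != pdisk.Status) {
            pdisk.Status = status; // the real code also refreshes StatusTimestamp here
        }
        if (decommit != EDecommitStatus::DECOMMIT_UNSET && decommit != pdisk.DecommitStatus) {
            pdisk.DecommitStatus = decommit;
        }
    }

    int main() {
        TPDiskSketch pdisk;
        // mark the drive for decommission without touching its drive status
        ApplyUpdate(pdisk, EDriveStatus::UNKNOWN, EDecommitStatus::DECOMMIT_PENDING);
        std::cout << (pdisk.Status == EDriveStatus::ACTIVE) << " "
                  << (pdisk.DecommitStatus == EDecommitStatus::DECOMMIT_PENDING) << "\n"; // 1 1
    }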
diff --git a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
index 51bacc0571..0ca115ec3e 100644
--- a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
+++ b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
@@ -465,6 +465,7 @@ namespace NKikimr::NBsController {
x->SetNumStaticSlots(pdisk.StaticSlotUsage);
x->SetDriveStatus(NKikimrBlobStorage::EDriveStatus::ACTIVE);
x->SetExpectedSlotCount(pdisk.ExpectedSlotCount);
+ x->SetDecommitStatus(NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE);
if (pdisk.PDiskMetrics) {
x->MutablePDiskMetrics()->CopyFrom(*pdisk.PDiskMetrics);
x->MutablePDiskMetrics()->ClearPDiskId();
diff --git a/ydb/core/mind/bscontroller/config.cpp b/ydb/core/mind/bscontroller/config.cpp
index 47b522f24d..b43b155504 100644
--- a/ydb/core/mind/bscontroller/config.cpp
+++ b/ydb/core/mind/bscontroller/config.cpp
@@ -794,6 +794,7 @@ namespace NKikimr::NBsController {
pb->SetDriveStatus(pdisk.Status);
pb->SetExpectedSlotCount(pdisk.ExpectedSlotCount);
pb->SetDriveStatusChangeTimestamp(pdisk.StatusTimestamp.GetValue());
+ pb->SetDecommitStatus(pdisk.DecommitStatus);
pb->MutablePDiskMetrics()->CopyFrom(pdisk.Metrics);
pb->MutablePDiskMetrics()->ClearPDiskId();
}
diff --git a/ydb/core/mind/bscontroller/config_cmd.cpp b/ydb/core/mind/bscontroller/config_cmd.cpp
index 2fc925a9cd..f4bb7d5aa1 100644
--- a/ydb/core/mind/bscontroller/config_cmd.cpp
+++ b/ydb/core/mind/bscontroller/config_cmd.cpp
@@ -126,6 +126,13 @@ namespace NKikimr::NBsController {
Self->PDiskSpaceColorBorder = static_cast<T::PDiskSpaceColorBorder::Type>(value);
db.Table<T>().Key(true).Update<T::PDiskSpaceColorBorder>(Self->PDiskSpaceColorBorder);
}
+ for (bool value : settings.GetEnableGroupLayoutSanitizer()) {
+ Self->GroupLayoutSanitizer = value;
+ db.Table<T>().Key(true).Update<T::GroupLayoutSanitizer>(Self->GroupLayoutSanitizer);
+ auto ev = std::make_unique<TEvControllerUpdateSelfHealInfo>();
+ ev->GroupLayoutSanitizer = Self->GroupLayoutSanitizer;
+ Self->Send(Self->SelfHealId, ev.release());
+ }
return true;
}
diff --git a/ydb/core/mind/bscontroller/config_fit_groups.cpp b/ydb/core/mind/bscontroller/config_fit_groups.cpp
index cb8801b0f5..dd1513b354 100644
--- a/ydb/core/mind/bscontroller/config_fit_groups.cpp
+++ b/ydb/core/mind/bscontroller/config_fit_groups.cpp
@@ -51,7 +51,7 @@ namespace NKikimr {
for (ui64 reserve = 0; reserve < min || (reserve - min) * 1000000 / Max<ui64>(1, total) < part; ++reserve, ++total) {
TGroupMapper::TGroupDefinition group;
try {
- AllocateGroup(0, group, nullptr, 0, {}, 0, false);
+ AllocateGroup(0, group, {}, {}, 0, false);
} catch (const TExFitGroupError&) {
throw TExError() << "group reserve constraint hit";
}
@@ -92,7 +92,7 @@ namespace NKikimr {
requiredSpace = ExpectedSlotSize.front();
ExpectedSlotSize.pop_front();
}
- AllocateGroup(groupId, group, nullptr, 0, {}, requiredSpace, false);
+ AllocateGroup(groupId, group, {}, {}, requiredSpace, false);
// scan all comprising PDisks for PDiskCategory
TMaybe<TPDiskCategory> desiredPDiskCategory;
@@ -171,6 +171,7 @@ namespace NKikimr {
// mapping for audit log
TMap<TVDiskIdShort, TVSlotId> replacedSlots;
TStackVec<std::pair<TVSlotId, bool>, 32> replaceQueue;
+ THashMap<TVDiskIdShort, TPDiskId> replacedDisks;
i64 requiredSpace = Min<i64>();
////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -197,9 +198,6 @@ namespace NKikimr {
replace = true;
break;
- case NKikimrBlobStorage::EDriveStatus::SPARE:
- break;
-
case NKikimrBlobStorage::EDriveStatus::FAULTY:
case NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED:
// groups are moved out asynchronously
@@ -217,6 +215,7 @@ namespace NKikimr {
g[vslot->RingIdx][vslot->FailDomainIdx][vslot->VDiskIdx] = targetPDiskId;
replacedSlots.emplace(TVDiskIdShort(vslot->RingIdx, vslot->FailDomainIdx, vslot->VDiskIdx), vslot->VSlotId);
replaceQueue.emplace_back(vslot->VSlotId, State.SuppressDonorMode.count(vslot->VSlotId));
+ replacedDisks.emplace(vslot->GetShortVDiskId(), vslot->VSlotId.ComprisingPDiskId());
} else {
preservedSlots.emplace(vslot->GetVDiskId(), vslot->VSlotId);
auto& m = vslot->Metrics;
@@ -243,10 +242,6 @@ namespace NKikimr {
}
}
if (hasMissingSlots || !IgnoreGroupSanityChecks) {
- TStackVec<TPDiskId, 32> replacedDiskIds;
- for (const auto& [vslotId, suppressDonorMode] : replaceQueue) {
- replacedDiskIds.push_back(vslotId.ComprisingPDiskId());
- }
TGroupMapper::TForbiddenPDisks forbid;
for (const auto& vslot : groupInfo->VDisksInGroup) {
for (const auto& [vslotId, vdiskId] : vslot->Donors) {
@@ -255,8 +250,7 @@ namespace NKikimr {
}
}
}
- AllocateGroup(groupId, group, replacedDiskIds.data(), replacedDiskIds.size(), std::move(forbid),
- requiredSpace, AllowUnusableDisks);
+ AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, AllowUnusableDisks);
if (!IgnoreVSlotQuotaCheck) {
adjustSpaceAvailable = true;
for (const auto& [pos, vslotId] : replacedSlots) {
@@ -361,9 +355,9 @@ namespace NKikimr {
}
private:
- void AllocateGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace,
- bool addExistingDisks) {
+ void AllocateGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group,
+ const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid,
+ i64 requiredSpace, bool addExistingDisks) {
if (!Mapper) {
Mapper.emplace(Geometry, StoragePool.RandomizeGroupMapping);
PopulateGroupMapper();
@@ -382,8 +376,7 @@ namespace NKikimr {
}
}
}
- Geometry.AllocateGroup(*Mapper, groupId, group, replacedDiskIds, numReplacedDisks, std::move(forbid),
- requiredSpace);
+ Geometry.AllocateGroup(*Mapper, groupId, group, replacedDisks, std::move(forbid), requiredSpace);
for (const TPDiskId pdiskId : removeQ) {
Mapper->UnregisterPDisk(pdiskId);
}
@@ -451,7 +444,7 @@ namespace NKikimr {
}
}
- if (info.Status != NKikimrBlobStorage::EDriveStatus::ACTIVE) {
+ if (!info.AcceptsNewSlots()) {
usable = false;
}
@@ -460,8 +453,17 @@ namespace NKikimr {
}
// register PDisk in the mapper
- return Mapper->RegisterPDisk(id, State.HostRecords->GetLocation(id.NodeId), usable, numSlots,
- info.ExpectedSlotCount, groups.data(), groups.size(), availableSpace, info.Operational);
+ return Mapper->RegisterPDisk({
+ .PDiskId = id,
+ .Location = State.HostRecords->GetLocation(id.NodeId),
+ .Usable = usable,
+ .NumSlots = numSlots,
+ .MaxSlots = info.ExpectedSlotCount,
+ .Groups = std::move(groups),
+ .SpaceAvailable = availableSpace,
+ .Operational = info.Operational,
+ .Decommitted = info.Decommitted(),
+ });
}
std::map<TVDiskIdShort, TVSlotInfo*> CreateVSlotsForGroup(TGroupInfo *groupInfo,
diff --git a/ydb/core/mind/bscontroller/config_fit_pdisks.cpp b/ydb/core/mind/bscontroller/config_fit_pdisks.cpp
index 3112a24317..6baddf3bba 100644
--- a/ydb/core/mind/bscontroller/config_fit_pdisks.cpp
+++ b/ydb/core/mind/bscontroller/config_fit_pdisks.cpp
@@ -89,7 +89,8 @@ namespace NKikimr {
state.PDisks.ConstructInplaceNewEntry(pdiskId, *hostId, TString(), category.GetRaw(),
*driveInfo->Guid, false, false, 1000, driveInfo->PDiskConfig.GetOrElse(TString()),
driveInfo->BoxId, DefaultMaxSlots, NKikimrBlobStorage::EDriveStatus::ACTIVE,
- TInstant::Zero(), serial.Serial, TString(), fsPath, staticSlotUsage);
+ TInstant::Zero(), NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE, serial.Serial,
+ TString(), fsPath, staticSlotUsage);
const TPDiskLocation location(nodeId, serial.Serial);
state.PDiskLocationMap.emplace(location, pdiskId);
@@ -234,8 +235,9 @@ namespace NKikimr {
state.PDisks.ConstructInplaceNewEntry(pdiskId, hostId, path, category.GetRaw(),
guid, driveInfo.SharedWithOs, driveInfo.ReadCentric, 1000,
driveInfo.PDiskConfig.GetOrElse(TString()), boxId, DefaultMaxSlots,
- NKikimrBlobStorage::EDriveStatus::ACTIVE, TInstant::Zero(), currentSerial, currentSerial,
- TString(), staticSlotUsage);
+ NKikimrBlobStorage::EDriveStatus::ACTIVE, TInstant::Zero(),
+ NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE,
+ currentSerial, currentSerial, TString(), staticSlotUsage);
// insert PDisk into location map
state.PDiskLocationMap.emplace(location, pdiskId);
diff --git a/ydb/core/mind/bscontroller/group_geometry_info.h b/ydb/core/mind/bscontroller/group_geometry_info.h
index 10e5daedba..a74698f1f8 100644
--- a/ydb/core/mind/bscontroller/group_geometry_info.h
+++ b/ydb/core/mind/bscontroller/group_geometry_info.h
@@ -69,12 +69,11 @@ namespace NKikimr::NBsController {
ui32 GetDomainLevelEnd() const { return DomainLevelEnd; }
void AllocateGroup(TGroupMapper &mapper, TGroupId groupId, TGroupMapper::TGroupDefinition &group,
- const TPDiskId replacedDiskIds[], size_t numReplacedDisks, TGroupMapper::TForbiddenPDisks forbid,
+ const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid,
i64 requiredSpace) const {
TString error;
for (const bool requireOperational : {true, false}) {
- if (mapper.AllocateGroup(groupId, group, replacedDiskIds, numReplacedDisks, forbid,
- requiredSpace, requireOperational, error)) {
+ if (mapper.AllocateGroup(groupId, group, replacedDisks, forbid, requiredSpace, requireOperational, error)) {
return;
}
}
@@ -115,6 +114,10 @@ namespace NKikimr::NBsController {
return true;
}
+ TBlobStorageGroupType GetType() const {
+ return Type;
+ }
+
TBlobStorageGroupType::EErasureSpecies GetErasure() const {
return Type.GetErasure();
}
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp
new file mode 100644
index 0000000000..8ab76e3e4f
--- /dev/null
+++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp
@@ -0,0 +1,47 @@
+#include "group_layout_checker.h"
+#include "group_geometry_info.h"
+
+namespace NKikimr::NBsController {
+
+ TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout) {
+ using namespace NLayoutChecker;
+
+ if (layout.empty()) {
+ return {};
+ }
+
+ TBlobStorageGroupInfo::TTopology topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(),
+ geom.GetNumVDisksPerFailDomain(), true);
+ TGroupLayout group(topology);
+ TDomainMapper mapper;
+ THashMap<TVDiskIdShort, TPDiskLayoutPosition> map;
+ for (const auto& [vdiskId, p] : layout) {
+ const auto& [location, pdiskId] = p;
+ TPDiskLayoutPosition pos(mapper, location, pdiskId, geom);
+ group.AddDisk(pos, topology.GetOrderNumber(vdiskId));
+ map.emplace(vdiskId, pos);
+ }
+
+ std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard;
+ for (const auto& [vdiskId, pos] : map) {
+ scoreboard.emplace_back(group.GetCandidateScore(pos, topology.GetOrderNumber(vdiskId)), vdiskId);
+ }
+
+ auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; };
+ std::sort(scoreboard.begin(), scoreboard.end(), comp1);
+
+ auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); };
+ std::sort(scoreboard.begin(), scoreboard.end(), comp);
+ TLayoutCheckResult res;
+ const auto reference = scoreboard.back().first;
+ if (!reference.SameAs({})) { // not perfectly correct layout
+ for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) {
+ res.Candidates.push_back(scoreboard.back().second);
+ }
+ }
+ return res;
+ }
+
+} // NKikimr::NBsController
+
+Y_DECLARE_OUT_SPEC(, NKikimr::NBsController::NLayoutChecker::TEntityId, stream, value) { value.Output(stream); }
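
CheckGroupLayout above scores every VDisk position of the group and, when the layout is not perfect, returns as candidates all disks whose score ties the worst one. A minimal sketch of that selection step, with a simplified score (a lexicographically compared tuple standing in for TScore and integers standing in for TVDiskIdShort):

    // Sketch only: simplified types; mirrors the "sort by score, take every tie with
    // the worst" step of CheckGroupLayout, not the real BS_CONTROLLER implementation.
    #include <algorithm>
    #include <iostream>
    #include <tuple>
    #include <utility>
    #include <vector>

    using TScore = std::tuple<unsigned, unsigned>; // e.g. {RealmInterlace, DomainInterlace}

    int main() {
        // score per disk; a zero score means "perfectly placed"
        std::vector<std::pair<TScore, int>> scoreboard{
            {{0, 0}, 0}, {{1, 0}, 1}, {{0, 2}, 2}, {{1, 0}, 3},
        };
        // better (smaller) scores first, worst scores at the back
        std::sort(scoreboard.begin(), scoreboard.end(),
                  [](const auto& x, const auto& y) { return x.first < y.first; });

        std::vector<int> candidates;
        const TScore worst = scoreboard.back().first;
        if (worst != TScore{}) { // layout is not perfect
            // every disk that ties the worst score is a candidate for reassignment
            for (; !scoreboard.empty() && scoreboard.back().first == worst; scoreboard.pop_back()) {
                candidates.push_back(scoreboard.back().second);
            }
        }
        for (int id : candidates) {
            std::cout << "candidate disk " << id << "\n"; // disks 3 and 1
        }
    }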
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h
new file mode 100644
index 0000000000..407f0b7c7f
--- /dev/null
+++ b/ydb/core/mind/bscontroller/group_layout_checker.h
@@ -0,0 +1,233 @@
+#pragma once
+
+#include "defs.h"
+#include "types.h"
+#include "group_geometry_info.h"
+
+namespace NKikimr::NBsController {
+
+ namespace NLayoutChecker {
+
+ struct TEntityId {
+ ui32 Value = ::Max<ui32>();
+
+ public:
+ bool operator ==(const TEntityId& other) const { return Value == other.Value; }
+ bool operator !=(const TEntityId& other) const { return Value != other.Value; }
+ bool operator < (const TEntityId& other) const { return Value < other.Value; }
+ bool operator <=(const TEntityId& other) const { return Value <= other.Value; }
+ bool operator > (const TEntityId& other) const { return Value > other.Value; }
+ bool operator >=(const TEntityId& other) const { return Value >= other.Value; }
+
+ size_t Index() const {
+ return Value;
+ }
+
+ size_t Hash() const {
+ return THash<ui32>()(Value);
+ }
+
+ static constexpr TEntityId Min() { return {.Value = 0}; };
+ static constexpr TEntityId Max() { return {.Value = ::Max<ui32>()}; };
+
+ TString ToString() const { return TStringBuilder() << Value; }
+ void Output(IOutputStream& s) const { s << Value; }
+
+ private:
+ friend class TDomainMapper;
+
+ static TEntityId SequentialValue(size_t index) {
+ return TEntityId{static_cast<ui32>(index)};
+ }
+ };
+
+ } // NLayoutChecker
+
+} // NKikimr::NBsController
+
+template<>
+struct THash<NKikimr::NBsController::NLayoutChecker::TEntityId> {
+ template<typename T>
+ size_t operator()(const T& id) const { return id.Hash(); }
+};
+
+namespace NKikimr::NBsController {
+
+ namespace NLayoutChecker {
+
+ class TDomainMapper {
+ std::unordered_map<TString, TEntityId> FailDomainId;
+
+ public:
+ TEntityId operator ()(TString item) {
+ return FailDomainId.emplace(std::move(item), TEntityId::SequentialValue(FailDomainId.size())).first->second;
+ }
+
+ size_t GetIdCount() const {
+ return FailDomainId.size();
+ }
+ };
+
+ struct TPDiskLayoutPosition {
+ TEntityId RealmGroup;
+ TEntityId Realm;
+ TEntityId Domain;
+
+ TPDiskLayoutPosition() = default;
+
+ TPDiskLayoutPosition(TEntityId realmGroup, TEntityId realm, TEntityId domain)
+ : RealmGroup(realmGroup)
+ , Realm(realm)
+ , Domain(domain)
+ {}
+
+ TPDiskLayoutPosition(TDomainMapper& mapper, const TNodeLocation& location, TPDiskId pdiskId, const TGroupGeometryInfo& geom) {
+ TStringStream realmGroup, realm, domain;
+ const std::pair<int, TStringStream*> levels[] = {
+ {geom.GetRealmLevelBegin(), &realmGroup},
+ {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelBegin()), &realm},
+ {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelEnd()), &domain}
+ };
+ auto addLevel = [&](int key, const TString& value) {
+ for (const auto& [reference, stream] : levels) {
+ if (key < reference) {
+ Save(stream, std::make_tuple(key, value));
+ }
+ }
+ };
+ for (const auto& [key, value] : location.GetItems()) {
+ addLevel(key, value);
+ }
+ addLevel(255, pdiskId.ToString()); // ephemeral level to distinguish between PDisks on the same node
+ RealmGroup = mapper(realmGroup.Str());
+ Realm = mapper(realm.Str());
+ Domain = mapper(domain.Str());
+ }
+
+ TString ToString() const {
+ return TStringBuilder() << "{" << RealmGroup << "." << Realm << "." << Domain << "}";
+ }
+
+ auto AsTuple() const {
+ return std::tie(RealmGroup, Realm, Domain);
+ }
+
+ friend bool operator ==(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) {
+ return x.AsTuple() == y.AsTuple();
+ }
+
+ friend bool operator <(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) {
+ return x.AsTuple() < y.AsTuple();
+ }
+ };
+
+ struct TScore {
+ ui32 RealmInterlace = 0;
+ ui32 DomainInterlace = 0;
+ ui32 RealmGroupScatter = 0;
+ ui32 RealmScatter = 0;
+ ui32 DomainScatter = 0;
+
+ auto AsTuple() const {
+ return std::make_tuple(RealmInterlace, DomainInterlace, RealmGroupScatter, RealmScatter, DomainScatter);
+ }
+
+ bool BetterThan(const TScore& other) const {
+ return AsTuple() < other.AsTuple();
+ }
+
+ bool SameAs(const TScore& other) const {
+ return AsTuple() == other.AsTuple();
+ }
+
+ static TScore Max() {
+ return {::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>()};
+ }
+
+ TString ToString() const {
+ return TStringBuilder() << "{RealmInterlace# " << RealmInterlace
+ << " DomainInterlace# " << DomainInterlace
+ << " RealmGroupScatter# " << RealmGroupScatter
+ << " RealmScatter# " << RealmScatter
+ << " DomainScatter# " << DomainScatter
+ << "}";
+ }
+ };
+
+ struct TGroupLayout {
+ const TBlobStorageGroupInfo::TTopology& Topology;
+
+ ui32 NumDisks = 0;
+ THashMap<TEntityId, ui32> NumDisksPerRealmGroup;
+
+ TStackVec<ui32, 4> NumDisksInRealm;
+ TStackVec<THashMap<TEntityId, ui32>, 4> NumDisksPerRealm;
+ THashMap<TEntityId, ui32> NumDisksPerRealmTotal;
+
+ TStackVec<ui32, 32> NumDisksInDomain;
+ TStackVec<THashMap<TEntityId, ui32>, 32> NumDisksPerDomain;
+ THashMap<TEntityId, ui32> NumDisksPerDomainTotal;
+
+ TGroupLayout(const TBlobStorageGroupInfo::TTopology& topology)
+ : Topology(topology)
+ , NumDisksInRealm(Topology.GetTotalFailRealmsNum())
+ , NumDisksPerRealm(Topology.GetTotalFailRealmsNum())
+ , NumDisksInDomain(Topology.GetTotalFailDomainsNum())
+ , NumDisksPerDomain(Topology.GetTotalFailDomainsNum())
+ {}
+
+ void UpdateDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber, ui32 value) {
+ NumDisks += value;
+ NumDisksPerRealmGroup[pos.RealmGroup] += value;
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+ NumDisksInRealm[vdisk.FailRealm] += value;
+ NumDisksPerRealm[vdisk.FailRealm][pos.Realm] += value;
+ NumDisksPerRealmTotal[pos.Realm] += value;
+ const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk);
+ NumDisksInDomain[domainIdx] += value;
+ NumDisksPerDomain[domainIdx][pos.Domain] += value;
+ NumDisksPerDomainTotal[pos.Domain] += value;
+ }
+
+ void AddDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+ UpdateDisk(pos, orderNumber, 1);
+ }
+
+ void RemoveDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+ UpdateDisk(pos, orderNumber, Max<ui32>());
+ }
+
+ TScore GetCandidateScore(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+ const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk);
+
+ return {
+ .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm],
+ .DomainInterlace = NumDisksPerDomainTotal[pos.Domain] - NumDisksPerDomain[domainIdx][pos.Domain],
+ .RealmGroupScatter = NumDisks - NumDisksPerRealmGroup[pos.RealmGroup],
+ .RealmScatter = NumDisksInRealm[vdisk.FailRealm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm],
+ .DomainScatter = NumDisksInDomain[domainIdx] - NumDisksPerDomain[domainIdx][pos.Domain],
+ };
+ }
+
+ TScore GetExcludedDiskScore(const TPDiskLayoutPosition& pos, ui32 orderNumber) {
+ RemoveDisk(pos, orderNumber);
+ const TScore score = GetCandidateScore(pos, orderNumber);
+ AddDisk(pos, orderNumber);
+ return score;
+ }
+ };
+
+ } // NLayoutChecker
+
+ struct TLayoutCheckResult {
+ std::vector<TVDiskIdShort> Candidates;
+
+ explicit operator bool() const { // checks whether fail model is correct
+ return Candidates.empty();
+ }
+ };
+
+ TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout);
+
+} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp
index 84b5ac6d7a..5619534fb3 100644
--- a/ydb/core/mind/bscontroller/group_mapper.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper.cpp
@@ -1,105 +1,29 @@
#include "group_mapper.h"
#include "group_geometry_info.h"
+#include "group_layout_checker.h"
namespace NKikimr::NBsController {
- class TGroupMapper::TImpl : TNonCopyable {
- class TDomainMapper {
- std::unordered_map<TString, ui32> FailDomainId;
-
- public:
- ui32 operator ()(TString item) {
- return FailDomainId.emplace(std::move(item), FailDomainId.size() + 1).first->second;
- }
- };
-
- struct TPDiskLayoutPosition {
- ui32 RealmGroup = 0;
- ui32 RealmInGroup = 0;
- ui32 DomainGroup = 0;
- ui32 DomainInGroup = 0;
-
- TPDiskLayoutPosition() = default;
-
- TPDiskLayoutPosition(TDomainMapper& mapper, const TNodeLocation& location, TPDiskId pdiskId, const TGroupGeometryInfo& geom) {
- TStringStream realmGroup, realmInGroup, domainGroup, domainInGroup;
- const std::pair<int, TStringStream*> levels[] = {
- {geom.GetRealmLevelBegin(), &realmGroup},
- {geom.GetRealmLevelEnd(), &realmInGroup},
- {geom.GetDomainLevelBegin(), &domainGroup},
- {geom.GetDomainLevelEnd(), &domainInGroup}
- };
- auto addLevel = [&](int key, const TString& value) {
- for (const auto& [reference, stream] : levels) {
- if (key < reference) {
- Save(stream, std::make_tuple(key, value));
- }
- }
- };
- for (const auto& [key, value] : location.GetItems()) {
- addLevel(key, value);
- }
- addLevel(255, pdiskId.ToString()); // ephemeral level to distinguish between PDisks on the same node
- RealmGroup = mapper(realmGroup.Str());
- RealmInGroup = mapper(realmInGroup.Str());
- DomainGroup = mapper(domainGroup.Str());
- DomainInGroup = mapper(domainInGroup.Str());
- }
-
- auto AsTuple() const {
- return std::tie(RealmGroup, RealmInGroup, DomainGroup, DomainInGroup);
- }
-
- friend bool operator <(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) {
- return x.AsTuple() < y.AsTuple();
- }
+ using namespace NLayoutChecker;
- size_t GetDiffIndex(const TPDiskLayoutPosition& other) const {
- return RealmGroup != other.RealmGroup ? 0 :
- RealmInGroup != other.RealmInGroup ? 1 :
- DomainGroup != other.DomainGroup ? 2 :
- DomainInGroup != other.DomainInGroup ? 3 : 4;
- }
- };
-
- struct TFailDomainInfo;
-
- struct TPDiskInfo {
- const TNodeLocation Location;
- i64 SpaceAvailable;
- bool Usable;
- ui32 NumSlots;
- const ui32 MaxSlots;
+ class TGroupMapper::TImpl : TNonCopyable {
+ struct TPDiskInfo : TPDiskRecord {
TPDiskLayoutPosition Position;
- const TPDiskId PDiskId;
- TStackVec<ui32, 32> Groups;
- const bool Operational;
- TFailDomainInfo *FailDomain;
- bool Matching = false;
-
- TPDiskInfo(TNodeLocation location, bool usable, ui32 numSlots, ui32 maxSlots, TPDiskLayoutPosition position,
- const TPDiskId& pdiskId, const ui32 groupIds[], size_t numGroups, i64 spaceAvailable,
- bool operational, TFailDomainInfo *failDomain)
- : Location(std::move(location))
- , SpaceAvailable(spaceAvailable)
- , Usable(usable)
- , NumSlots(numSlots)
- , MaxSlots(maxSlots)
+ bool Matching;
+ ui32 NumDomainMatchingDisks;
+ ui32 SkipToNextRealmGroup;
+ ui32 SkipToNextRealm;
+ ui32 SkipToNextDomain;
+
+ TPDiskInfo(const TPDiskRecord& pdisk, TPDiskLayoutPosition position)
+ : TPDiskRecord(pdisk)
, Position(std::move(position))
- , PDiskId(pdiskId)
- , Groups(groupIds, groupIds + numGroups)
- , Operational(operational)
- , FailDomain(failDomain)
{
std::sort(Groups.begin(), Groups.end());
}
- TString ToString() const {
- return Location.ToString();
- }
-
bool IsUsable() const {
- return Usable && NumSlots < MaxSlots;
+ return Usable && !Decommitted && NumSlots < MaxSlots;
}
void InsertGroup(ui32 groupId) {
@@ -120,222 +44,428 @@ namespace NKikimr::NBsController {
};
using TPDisks = THashMap<TPDiskId, TPDiskInfo>;
+ using TPDiskByPosition = std::vector<std::pair<TPDiskLayoutPosition, TPDiskInfo*>>;
- struct TFailDomainInfo : std::vector<TPDisks::value_type*> {
- bool Matching;
- ui32 NumMatchingDisks;
- };
-
- struct TFailDomainGroup : std::unordered_map<ui32, TFailDomainInfo> {
- bool Matching;
- };
-
- struct TFailRealmInfo : std::unordered_map<ui32, TFailDomainGroup> {
- bool Matching;
- };
-
- struct TFailRealmGroup : std::unordered_map<ui32, TFailRealmInfo> {
- bool Matching;
- };
+ struct TComparePDiskByPosition {
+ bool operator ()(const TPDiskByPosition::value_type& x, const TPDiskLayoutPosition& y) const {
+ return x.first < y;
+ }
- struct TBox : std::unordered_map<ui32, TFailRealmGroup> {
- auto& operator ()(const TPDiskLayoutPosition& p) {
- return (*this)[p.RealmGroup][p.RealmInGroup][p.DomainGroup][p.DomainInGroup];
+ bool operator ()(const TPDiskLayoutPosition& x, const TPDiskByPosition::value_type& y) const {
+ return x < y.first;
}
};
- struct TAllocateContext {
- const ui32 NumFailRealms;
- const ui32 NumFailDomainsPerFailRealm;
- ui32 RealmGroup = 0; // the realm group we are forced to use (if forced, of course)
- TStackVec<ui32, 8> RealmInGroup; // indexed by realm number
- TStackVec<ui32, 8> DomainGroup; // indexed by realm number
- TStackVec<ui32, 32> DomainInGroup; // indexed by realm/domain
+ using TGroup = std::vector<TPDiskInfo*>;
+
+ struct TAllocator {
+ TImpl& Self;
+ const TBlobStorageGroupInfo::TTopology Topology;
THashSet<TPDiskId> OldGroupContent; // set of all existing disks in the group, inclusing ones which are replaced
- THashSet<TPDiskId> NewGroupContent; // newly generated group content
const i64 RequiredSpace;
const bool RequireOperational;
- TForbiddenPDisks Forbid;
-
- TAllocateContext(const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
- TForbiddenPDisks forbid)
- : NumFailRealms(geom.GetNumFailRealms())
- , NumFailDomainsPerFailRealm(geom.GetNumFailDomainsPerFailRealm())
- , RealmInGroup(NumFailRealms, 0)
- , DomainGroup(NumFailRealms, 0)
- , DomainInGroup(NumFailRealms * NumFailDomainsPerFailRealm, 0)
+ TForbiddenPDisks ForbiddenDisks;
+ THashMap<ui32, unsigned> LocalityFactor;
+ TGroupLayout GroupLayout;
+ std::optional<TScore> WorstScore;
+
+ TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
+ TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks)
+ : Self(self)
+ , Topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), geom.GetNumVDisksPerFailDomain(), true)
, RequiredSpace(requiredSpace)
, RequireOperational(requireOperational)
- , Forbid(std::move(forbid))
- {}
-
- bool ProcessExistingGroup(const TGroupDefinition& group, const TPDisks& pdisks, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TString& error) {
- OldGroupContent = {replacedDiskIds, replacedDiskIds + numReplacedDisks};
-
- for (ui32 failRealmIdx = 0, domainThroughIdx = 0; failRealmIdx < group.size(); ++failRealmIdx) {
- const auto& realm = group[failRealmIdx];
- for (ui32 failDomainIdx = 0; failDomainIdx < realm.size(); ++failDomainIdx, ++domainThroughIdx) {
- const auto& domain = realm[failDomainIdx];
- for (const TPDiskId pdiskId : domain) {
- if (pdiskId != TPDiskId()) {
- // add to used pdisk set
- const bool inserted = OldGroupContent.insert(pdiskId).second;
- Y_VERIFY(inserted);
-
- // find existing pdisk
- auto it = pdisks.find(pdiskId);
- if (it == pdisks.end()) {
- error = TStringBuilder() << "existing group contains missing PDisks";
- return false;
- }
- const TPDiskInfo& pdisk = it->second;
-
- // register the disk in context
- if (!AddDisk(pdisk, failRealmIdx, domainThroughIdx, error)) {
- return false;
- }
+ , ForbiddenDisks(std::move(forbiddenDisks))
+ , GroupLayout(Topology)
+ {
+ for (const auto& [vdiskId, pdiskId] : replacedDisks) {
+ OldGroupContent.insert(pdiskId);
+ }
+ }
+
+ TGroup ProcessExistingGroup(const TGroupDefinition& group, TString& error) {
+ TGroup res(Topology.GetTotalVDisksNum());
+
+ struct TExError { TString error; };
+
+ try {
+ Traverse(group, [&](TVDiskIdShort vdisk, TPDiskId pdiskId) {
+ if (pdiskId != TPDiskId()) {
+ const ui32 orderNumber = Topology.GetOrderNumber(vdisk);
+
+ const auto it = Self.PDisks.find(pdiskId);
+ if (it == Self.PDisks.end()) {
+ throw TExError{TStringBuilder() << "existing group contains missing PDiskId# " << pdiskId};
+ }
+ TPDiskInfo& pdisk = it->second;
+ res[orderNumber] = &pdisk;
+
+ const auto [_, inserted] = OldGroupContent.insert(pdiskId);
+ if (!inserted) {
+ throw TExError{TStringBuilder() << "group contains duplicate PDiskId# " << pdiskId};
+ }
+
+ if (!pdisk.Decommitted) {
+ AddUsedDisk(pdisk);
+ GroupLayout.AddDisk(pdisk.Position, orderNumber);
}
}
- }
+ });
+ } catch (const TExError& e) {
+ error = e.error;
+ return {};
}
- return true;
+
+ return res;
}
- bool AddDisk(const TPDiskInfo& pdisk, ui32 failRealmIdx, ui32 domainThroughIdx, TString& error) {
- auto update = [](ui32& place, ui32 value) {
- const ui32 prev = std::exchange(place, value);
- return !prev || prev == value;
- };
- if (!update(RealmGroup, pdisk.Position.RealmGroup)) {
- error = "group contains PDisks from different realm groups";
- } else if (!update(RealmInGroup[failRealmIdx], pdisk.Position.RealmInGroup)) {
- error = "group contains PDisks from different realms within same realm";
- } else if (!update(DomainGroup[failRealmIdx], pdisk.Position.DomainGroup)) {
- error = "group contains PDisks from different domain groups within same realm";
- } else if (!update(DomainInGroup[domainThroughIdx], pdisk.Position.DomainInGroup)) {
- error = "group contains PDisks from different domain groups within same realm";
- } else if (!NewGroupContent.insert(pdisk.PDiskId).second) {
- error = "group contains duplicate PDisks";
- } else {
- return true;
+ void Decompose(const TGroup& in, TGroupDefinition& out) {
+ for (ui32 i = 0; i < in.size(); ++i) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(i);
+ out[vdisk.FailRealm][vdisk.FailDomain][vdisk.VDisk] = in[i]->PDiskId;
}
- return false;
}
bool DiskIsUsable(const TPDiskInfo& pdisk) const {
if (!pdisk.IsUsable()) {
return false; // disk is not usable in this case
}
- if (OldGroupContent.count(pdisk.PDiskId) || NewGroupContent.count(pdisk.PDiskId) || Forbid.count(pdisk.PDiskId)) {
+ if (OldGroupContent.contains(pdisk.PDiskId) || ForbiddenDisks.contains(pdisk.PDiskId)) {
return false; // can't allow duplicate disks
}
if (RequireOperational && !pdisk.Operational) {
return false;
}
- return pdisk.SpaceAvailable >= RequiredSpace;
+ if (pdisk.SpaceAvailable < RequiredSpace) {
+ return false;
+ }
+ return true;
}
- };
- class THelper {
- TImpl& Self;
- TAllocateContext& Ctx;
- std::unordered_map<ui32, unsigned> LocalityFactor;
+ TPDiskByPosition SetupMatchingDisks(ui32 maxScore) {
+ TPDiskByPosition res;
+ res.reserve(Self.PDiskByPosition.size());
- public:
- THelper(TImpl& self, TAllocateContext& ctx)
- : Self(self)
- , Ctx(ctx)
- {
- for (const TPDiskId& pdiskId : Ctx.NewGroupContent) {
- if (const auto it = Self.PDisks.find(pdiskId); it != Self.PDisks.end()) {
- for (ui32 groupId : it->second.Groups) {
- ++LocalityFactor[groupId];
+ ui32 realmGroupBegin = 0;
+ ui32 realmBegin = 0;
+ ui32 domainBegin = 0;
+ TPDiskLayoutPosition prev;
+
+ std::vector<ui32> numMatchingDisksInDomain(Self.DomainMapper.GetIdCount(), 0);
+ for (const auto& [position, pdisk] : Self.PDiskByPosition) {
+ pdisk->Matching = pdisk->GetPickerScore() <= maxScore && DiskIsUsable(*pdisk);
+ if (pdisk->Matching) {
+ if (position.RealmGroup != prev.RealmGroup) {
+ for (; realmGroupBegin < res.size(); ++realmGroupBegin) {
+ res[realmGroupBegin].second->SkipToNextRealmGroup = res.size() - realmGroupBegin;
+ }
}
- } else {
- Y_FAIL();
- }
- }
- if (const ui32 id = Ctx.RealmGroup) {
- auto& realmGroup = Self.Box.at(id);
- Y_VERIFY_DEBUG(realmGroup.Matching);
- realmGroup.Matching = false;
- for (size_t i = 0; i < Ctx.NumFailRealms; ++i) {
- if (const ui32 id = Ctx.RealmInGroup[i]) {
- auto& realm = realmGroup.at(id);
- Y_VERIFY_DEBUG(realm.Matching);
- realm.Matching = false;
- if (const ui32 id = Ctx.DomainGroup[i]) {
- auto& domainGroup = realm.at(id);
- Y_VERIFY_DEBUG(domainGroup.Matching);
- domainGroup.Matching = false;
- for (size_t j = 0; j < Ctx.NumFailDomainsPerFailRealm; ++j) {
- const size_t domainThroughIdx = i * Ctx.NumFailDomainsPerFailRealm + j;
- if (const ui32 id = Ctx.DomainInGroup[domainThroughIdx]) {
- auto& domain = domainGroup.at(id);
- Y_VERIFY_DEBUG(domain.Matching);
- domain.Matching = false;
- }
- }
+ if (position.Realm != prev.Realm) {
+ for (; realmBegin < res.size(); ++realmBegin) {
+ res[realmBegin].second->SkipToNextRealm = res.size() - realmBegin;
}
}
+ if (position.Domain != prev.Domain) {
+ for (; domainBegin < res.size(); ++domainBegin) {
+ res[domainBegin].second->SkipToNextDomain = res.size() - domainBegin;
+ }
+ }
+ prev = position;
+
+ res.emplace_back(position, pdisk);
+ ++numMatchingDisksInDomain[position.Domain.Index()];
}
}
+ for (; realmGroupBegin < res.size(); ++realmGroupBegin) {
+ res[realmGroupBegin].second->SkipToNextRealmGroup = res.size() - realmGroupBegin;
+ }
+ for (; realmBegin < res.size(); ++realmBegin) {
+ res[realmBegin].second->SkipToNextRealm = res.size() - realmBegin;
+ }
+ for (; domainBegin < res.size(); ++domainBegin) {
+ res[domainBegin].second->SkipToNextDomain = res.size() - domainBegin;
+ }
+ for (const auto& [position, pdisk] : res) {
+ pdisk->NumDomainMatchingDisks = numMatchingDisksInDomain[position.Domain.Index()];
+ }
+
+ return std::move(res);
}
- TPDiskId AddBestDisk(ui32 realmIdx, ui32 domainThroughIdx) {
- TPDiskInfo *pdisk = nullptr;
- auto descend = [&](auto& level, ui32& id, ui32 TPDiskLayoutPosition::*pos) -> auto& {
- if (!id) {
- pdisk = pdisk ? pdisk : FindBestDisk(level);
- id = pdisk->Position.*pos;
- level[id].Matching = false; // do not allow this level for further selection
- } else if (pdisk) {
- Y_VERIFY(id == pdisk->Position.*pos);
- }
- return level[id];
+ struct TUndoLog {
+ struct TItem {
+ ui32 Index;
+ TPDiskInfo *PDisk;
};
- auto& realmGroup = descend(Self.Box, Ctx.RealmGroup, &TPDiskLayoutPosition::RealmGroup);
- auto& realm = descend(realmGroup, Ctx.RealmInGroup[realmIdx], &TPDiskLayoutPosition::RealmInGroup);
- auto& domainGroup = descend(realm, Ctx.DomainGroup[realmIdx], &TPDiskLayoutPosition::DomainGroup);
- auto& domain = descend(domainGroup, Ctx.DomainInGroup[domainThroughIdx], &TPDiskLayoutPosition::DomainInGroup);
- pdisk = pdisk ? pdisk : FindBestDisk(domain);
- TString error;
- const bool success = Ctx.AddDisk(*pdisk, realmIdx, domainThroughIdx, error);
- Y_VERIFY(success, "AddDisk: %s", error.data());
- pdisk->Matching = false; // disable this disk for further selection
+ std::vector<TItem> Items;
+
+ void Log(ui32 index, TPDiskInfo *pdisk) {
+ Items.push_back({index, pdisk});
+ }
+ size_t GetPosition() const {
+ return Items.size();
+ }
+ };
+
+ void AddDiskViaUndoLog(TUndoLog& undo, TGroup& group, ui32 index, TPDiskInfo *pdisk) {
+ undo.Log(index, pdisk);
+ group[index] = pdisk;
AddUsedDisk(*pdisk);
+ GroupLayout.AddDisk(pdisk->Position, index);
+ WorstScore.reset(); // invalidate score
+ }
- return pdisk->PDiskId;
+ void Revert(TUndoLog& undo, TGroup& group, size_t until) {
+ for (; undo.Items.size() > until; undo.Items.pop_back()) {
+ const auto& item = undo.Items.back();
+ group[item.Index] = nullptr;
+ RemoveUsedDisk(*item.PDisk);
+ GroupLayout.RemoveDisk(item.PDisk->Position, item.Index);
+ WorstScore.reset(); // invalidate score
+ }
}
- private:
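+        // Try to fill every empty position of the group using only PDisks whose picker score does not exceed
+        // maxScore; every placement is recorded in the undo log so the caller can revert the attempt.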
+ bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group) {
+ // determine PDisks that fit our requirements (including score)
+ auto v = SetupMatchingDisks(maxScore);
+
+ // find which entities we need to allocate -- whole group, some realms, maybe some domains within specific realms?
+ bool isEmptyGroup = true;
+ std::vector<bool> isEmptyRealm(Topology.GetTotalFailRealmsNum(), true);
+ std::vector<bool> isEmptyDomain(Topology.GetTotalFailDomainsNum(), true);
+ for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) {
+ if (group[orderNumber]) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+ isEmptyGroup = false;
+ isEmptyRealm[vdisk.FailRealm] = false;
+ const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk);
+ isEmptyDomain[domainIdx] = false;
+ }
+ }
+
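+            // helper: allocate a whole entity of the given kind (group, realm, domain or single disk) at the
+            // given index; on failure all placements made during this FillInGroup call are rolled back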
+ auto allocate = [&](auto what, ui32 index) {
+ TDynBitMap forbiddenEntities;
+ forbiddenEntities.Reserve(Self.DomainMapper.GetIdCount());
+ if (!AllocateWholeEntity(what, group, undo, index, {v.begin(), v.end()}, forbiddenEntities)) {
+ Revert(undo, group, 0);
+ return false;
+ }
+ return true;
+ };
+
+ if (isEmptyGroup) {
+ return allocate(TAllocateWholeGroup(), 0);
+ }
+
+ const ui32 numFailDomainsPerFailRealm = Topology.GetNumFailDomainsPerFailRealm();
+ const ui32 numVDisksPerFailDomain = Topology.GetNumVDisksPerFailDomain();
+ ui32 domainOrderNumber = 0;
+ ui32 orderNumber = 0;
+
+ // scan all fail realms and allocate missing realms or their parts
+ for (ui32 failRealmIdx = 0; failRealmIdx < isEmptyRealm.size(); ++failRealmIdx) {
+ if (isEmptyRealm[failRealmIdx]) {
+ // we have an empty realm -- we have to allocate it fully
+ if (!allocate(TAllocateWholeRealm(), failRealmIdx)) {
+ return false;
+ }
+ // skip to next realm
+ domainOrderNumber += numFailDomainsPerFailRealm;
+ orderNumber += numVDisksPerFailDomain * numFailDomainsPerFailRealm;
+ continue;
+ }
+
+ // scan through domains of this realm, find unallocated ones
+ for (ui32 failDomainIdx = 0; failDomainIdx < numFailDomainsPerFailRealm; ++failDomainIdx, ++domainOrderNumber) {
+ if (isEmptyDomain[domainOrderNumber]) {
+ // try to allocate full domain
+ if (!allocate(TAllocateWholeDomain(), domainOrderNumber)) {
+ return false;
+ }
+ // skip to next domain
+ orderNumber += numVDisksPerFailDomain;
+ continue;
+ }
+
+ // scan individual disks of the domain and fill gaps
+ for (ui32 vdiskIdx = 0; vdiskIdx < numVDisksPerFailDomain; ++vdiskIdx, ++orderNumber) {
+ if (!group[orderNumber] && !allocate(TAllocateDisk(), orderNumber)) {
+ return false;
+ }
+ }
+ }
+ }
+
+ Y_VERIFY(domainOrderNumber == Topology.GetTotalFailDomainsNum());
+ Y_VERIFY(orderNumber == Topology.GetTotalVDisksNum());
+
+ return true;
+ }
+
+ using TAllocateResult = TPDiskLayoutPosition*;
+
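+        // Allocation entities form a hierarchy: whole group -> whole realm -> whole domain -> single disk.
+        // Each level reports how many nested entities it contains and how to narrow the candidate PDisk range
+        // to the realm group/realm/domain picked for its first nested entity.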
+ struct TAllocateDisk {};
+
+ struct TAllocateWholeDomain {
+ static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetNumVDisksPerFailDomain;
+ using TNestedEntity = TAllocateDisk;
+
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+ scope = x.Domain;
+ return {x, x};
+ }
+ };
+
+ struct TAllocateWholeRealm {
+ static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetNumFailDomainsPerFailRealm;
+ using TNestedEntity = TAllocateWholeDomain;
+
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+ scope = x.Realm;
+ return {{x.RealmGroup, x.Realm, TEntityId::Min()}, {x.RealmGroup, x.Realm, TEntityId::Max()}};
+ }
+ };
+
+ struct TAllocateWholeGroup {
+ static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetTotalFailRealmsNum;
+ using TNestedEntity = TAllocateWholeRealm;
+
+ static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) {
+ scope = x.RealmGroup;
+ return {{x.RealmGroup, TEntityId::Min(), TEntityId::Min()}, {x.RealmGroup, TEntityId::Max(), TEntityId::Max()}};
+ }
+ };
+
+ using TDiskRange = std::pair<TPDiskByPosition::const_iterator, TPDiskByPosition::const_iterator>;
+
template<typename T>
- TPDiskInfo *FindBestDisk(T& level) {
- TPDiskInfo *res = nullptr;
- for (auto& [id, item] : level) {
- if (item.Matching) {
- TPDiskInfo *candidate = FindBestDisk(item);
- Y_VERIFY(candidate);
- res = !res || DiskIsBetter(*candidate, *res) ? candidate : res;
+ TAllocateResult AllocateWholeEntity(T, TGroup& group, TUndoLog& undo, ui32 parentEntityIndex, TDiskRange range,
+ TDynBitMap& forbiddenEntities) {
+ // number of enclosed child entities within this one
+ const ui32 entityCount = (Topology.*T::GetEntityCount)();
+ Y_VERIFY(entityCount);
+ parentEntityIndex *= entityCount;
+ // remember current undo stack size
+ const size_t undoPosition = undo.GetPosition();
+
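+            // place the first nested entity anywhere in the range, then pin the range to the realm group/realm/
+            // domain it landed in and allocate the remaining siblings there; if any sibling fails, mark that
+            // entity forbidden, roll back and retry with another one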
+ for (;;) {
+ auto [from, to] = range;
+ TPDiskLayoutPosition *prefix;
+ TEntityId scope;
+
+ for (ui32 index = 0;; ++index) {
+ // allocate nested entity
+ prefix = AllocateWholeEntity(typename T::TNestedEntity(), group, undo, parentEntityIndex + index,
+ {from, to}, forbiddenEntities);
+
+ if (prefix) {
+ if (!index) {
+ // reduce range to specific realm/domain entity
+ auto [min, max] = T::MakeRange(*prefix, scope);
+ from = std::lower_bound(from, to, min, TComparePDiskByPosition());
+ to = std::upper_bound(from, to, max, TComparePDiskByPosition());
+ }
+ if (index + 1 == entityCount) {
+                        // the entity is now fully allocated -- exclude it from further selection
+ forbiddenEntities.Set(scope.Index());
+ return prefix;
+ }
+ } else if (index) {
+ // disable just checked entity (to prevent its selection again)
+ forbiddenEntities.Set(scope.Index());
+ // try another entity at this level
+ Revert(undo, group, undoPosition);
+ // break the loop and retry
+ break;
+ } else {
+ // no chance to allocate new entity, exit
+ return {};
+ }
}
}
- Y_VERIFY(res);
- return res;
}
- TPDiskInfo *FindBestDisk(TFailDomainInfo& level) {
- TPDiskInfo *res = nullptr;
- for (TPDisks::value_type *it : level) {
- auto& [pdiskId, pdisk] = *it;
- if (pdisk.Matching) {
- res = !res || DiskIsBetter(pdisk, *res) ? &pdisk : res;
+ TAllocateResult AllocateWholeEntity(TAllocateDisk, TGroup& group, TUndoLog& undo, ui32 index, TDiskRange range,
+ TDynBitMap& forbiddenEntities) {
+ TPDiskInfo *pdisk = group[index];
+ Y_VERIFY(!pdisk);
+ auto process = [this, &pdisk](TPDiskInfo *candidate) {
+ if (!pdisk || DiskIsBetter(*candidate, *pdisk)) {
+ pdisk = candidate;
+ }
+ };
+ FindMatchingDiskBasedOnScore(process, group, index, range, forbiddenEntities);
+ if (pdisk) {
+ AddDiskViaUndoLog(undo, group, index, pdisk);
+ pdisk->Matching = false;
+ return &pdisk->Position;
+ } else {
+ return {};
+ }
+ }
+
+ TScore CalculateWorstScoreWithCache(const TGroup& group) {
+ if (!WorstScore) {
+                // find the worst disk in terms of layout correctness and use it as a baseline for the other
+                // disks -- a candidate cannot be placed any worse than that
+ TScore worstScore;
+ for (ui32 i = 0; i < Topology.GetTotalVDisksNum(); ++i) {
+ if (TPDiskInfo *pdisk = group[i]; pdisk && !pdisk->Decommitted) {
+ // calculate score for this pdisk, removing it from the set first -- to prevent counting itself
+ const TScore score = GroupLayout.GetExcludedDiskScore(pdisk->Position, i);
+ if (worstScore.BetterThan(score)) {
+ worstScore = score;
+ }
+ }
}
+ WorstScore = worstScore;
+ }
+ return *WorstScore;
+ }
+
+ template<typename TCallback>
+ void FindMatchingDiskBasedOnScore(
+ TCallback&& cb, // callback to be invoked for every matching candidate
+ const TGroup& group, // group with peer disks
+ ui32 orderNumber, // order number of disk being allocated
+ TDiskRange range, // range of PDisk candidates to scan
+ TDynBitMap& forbiddenEntities) { // a set of forbidden TEntityId's prevented from allocation
+            // start from the worst score of the current group layout -- a new disk must not make the failure
+            // model inconsistency any worse than it already is
+ TScore bestScore = CalculateWorstScoreWithCache(group);
+
+ std::vector<TPDiskInfo*> candidates;
+
+ // scan the candidate range
+ while (range.first != range.second) {
+ const auto& [position, pdisk] = *range.first++;
+
+ // skip inappropriate disks, whole realm groups, realms and domains
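+                // the SkipToNext* hints were precomputed in SetupMatchingDisks; the iterator has already been
+                // advanced by one element, hence the "- 1" when jumping over a forbidden entity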
+ if (!pdisk->Matching) {
+ // just do nothing, skip this candidate disk
+ } else if (forbiddenEntities[position.RealmGroup.Index()]) {
+ range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealmGroup - 1);
+ } else if (forbiddenEntities[position.Realm.Index()]) {
+ range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealm - 1);
+ } else if (forbiddenEntities[position.Domain.Index()]) {
+ range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextDomain - 1);
+ } else {
+ const TScore score = GroupLayout.GetCandidateScore(position, orderNumber);
+ if (score.BetterThan(bestScore)) {
+ candidates.clear();
+ bestScore = score;
+ }
+ if (score.SameAs(bestScore)) {
+ candidates.push_back(pdisk);
+ }
+ }
+ }
+
+ for (TPDiskInfo *pdisk : candidates) {
+ cb(pdisk);
}
- Y_VERIFY(res);
- return res;
}
bool DiskIsBetter(TPDiskInfo& pretender, TPDiskInfo& king) const {
@@ -344,10 +474,8 @@ namespace NKikimr::NBsController {
} else if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) {
return true;
} else {
- const TFailDomainInfo& pretenderDomain = Self.Box(pretender.Position);
- const TFailDomainInfo& kingDomain = Self.Box(king.Position);
- if (pretenderDomain.NumMatchingDisks != kingDomain.NumMatchingDisks) {
- return pretenderDomain.NumMatchingDisks > kingDomain.NumMatchingDisks;
+ if (pretender.NumDomainMatchingDisks != king.NumDomainMatchingDisks) {
+ return pretender.NumDomainMatchingDisks > king.NumDomainMatchingDisks;
}
return pretender.PDiskId < king.PDiskId;
}
@@ -369,6 +497,14 @@ namespace NKikimr::NBsController {
}
}
+ void RemoveUsedDisk(const TPDiskInfo& pdisk) {
+ for (ui32 groupId : pdisk.Groups) {
+ if (!--LocalityFactor[groupId]) {
+ LocalityFactor.erase(groupId);
+ }
+ }
+ }
+
unsigned GetLocalityFactor(const TPDiskInfo& pdisk) const {
unsigned res = 0;
for (ui32 groupId : pdisk.Groups) {
@@ -388,7 +524,8 @@ namespace NKikimr::NBsController {
const bool Randomize;
TDomainMapper DomainMapper;
TPDisks PDisks;
- TBox Box;
+ TPDiskByPosition PDiskByPosition;
+ bool Dirty = false;
public:
TImpl(TGroupGeometryInfo geom, bool randomize)
@@ -396,18 +533,17 @@ namespace NKikimr::NBsController {
, Randomize(randomize)
{}
- bool RegisterPDisk(TPDiskId pdiskId, TNodeLocation location, bool usable, ui32 numSlots, ui32 maxSlots,
- const ui32 groupIds[], size_t numGroups, i64 spaceAvailable, bool operational) {
+ bool RegisterPDisk(const TPDiskRecord& pdisk) {
// calculate disk position
- const TPDiskLayoutPosition p(DomainMapper, location, pdiskId, Geom);
+ const TPDiskLayoutPosition p(DomainMapper, pdisk.Location, pdisk.PDiskId, Geom);
// insert PDisk into specific map
TPDisks::iterator it;
bool inserted;
- std::tie(it, inserted) = PDisks.try_emplace(pdiskId, std::move(location), usable, numSlots, maxSlots,
- p, pdiskId, groupIds, numGroups, spaceAvailable, operational, &Box(p));
+ std::tie(it, inserted) = PDisks.try_emplace(pdisk.PDiskId, pdisk, p);
if (inserted) {
- it->second.FailDomain->push_back(&*it);
+ PDiskByPosition.emplace_back(it->second.Position, &it->second);
+ Dirty = true;
}
return inserted;
@@ -416,16 +552,9 @@ namespace NKikimr::NBsController {
void UnregisterPDisk(TPDiskId pdiskId) {
const auto it = PDisks.find(pdiskId);
Y_VERIFY(it != PDisks.end());
- TFailDomainInfo& fdom = *it->second.FailDomain;
- bool erased = false;
- for (auto x = fdom.begin(); x != fdom.end(); ++x) {
- if (*x == &*it) {
- fdom.erase(x);
- erased = true;
- break;
- }
- }
- Y_VERIFY(erased);
+ auto x = std::remove(PDiskByPosition.begin(), PDiskByPosition.end(), std::make_pair(it->second.Position, &it->second));
+ Y_VERIFY(x + 1 == PDiskByPosition.end());
+ PDiskByPosition.pop_back();
PDisks.erase(it);
}
@@ -435,332 +564,142 @@ namespace NKikimr::NBsController {
it->second.SpaceAvailable += increment;
}
- TString FormatPDisks(const TAllocateContext& ctx) const {
+ TString FormatPDisks(const TAllocator& allocator) const {
TStringStream s;
s << "PDisks# ";
- bool first1 = true;
- for (const auto& [id, realmGroup] : Box) {
- s << (std::exchange(first1, false) ? "" : " | ") << "<";
- bool first2 = true;
- for (const auto& [id, realm] : realmGroup) {
- s << (std::exchange(first2, false) ? "" : " ") << "{";
- bool first3 = true;
- for (const auto& [id, domainGroup] : realm) {
- s << (std::exchange(first3, false) ? "" : " | ") << "[";
- bool first4 = true;
- for (const auto& [id, domain] : domainGroup) {
- if (!domain.empty()) {
- s << (std::exchange(first4, false) ? "" : " ") << "(";
- std::vector<TPDisks::value_type*> v(domain);
- auto comp = [](const auto *x, const auto *y) { return x->first < y->first; };
- std::sort(v.begin(), v.end(), comp);
- bool first5 = true;
- for (const TPDisks::value_type *it : v) {
- s << (std::exchange(first5, false) ? "" : " ")
- << it->first.NodeId << ":" << it->first.PDiskId;
- if (ctx.OldGroupContent.count(it->first)) {
- s << "*";
- }
- const char *minus = "-";
- if (ctx.Forbid.count(it->second.PDiskId)) {
- s << std::exchange(minus, "") << "f";
- }
- if (!it->second.Usable) {
- s << std::exchange(minus, "") << "u";
- }
- if (it->second.NumSlots >= it->second.MaxSlots) {
- s << std::exchange(minus, "") << "m";
- }
- if (it->second.NumSlots >= it->second.MaxSlots) {
- s << std::exchange(minus, "") << "s";
- }
- if (it->second.SpaceAvailable < ctx.RequiredSpace) {
- s << std::exchange(minus, "") << "v";
- }
- if (!it->second.Operational) {
- s << std::exchange(minus, "") << "o";
- }
- if (ctx.DiskIsUsable(it->second)) {
- s << "+";
- }
- }
- s << ")";
- }
- }
- s << "]";
+ if (!PDiskByPosition.empty()) {
+ s << "{[(";
+ TPDiskLayoutPosition prevPosition = PDiskByPosition.front().first;
+ const char *space = "";
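+                // legend: '*' member of the old group, 'f' forbidden, 'u' not usable, 'd' decommitted,
+                // 's' out of free slots, 'v' not enough space, 'o' not operational, '+' usable candidate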
+ for (const auto& [position, pdisk] : PDiskByPosition) {
+ if (prevPosition != position) {
+ s << (prevPosition.Domain != position.Domain ? ")" : "")
+ << (prevPosition.Realm != position.Realm ? "]" : "")
+ << (prevPosition.RealmGroup != position.RealmGroup ? "} {" : "")
+ << (prevPosition.Realm != position.Realm ? "[" : "")
+ << (prevPosition.Domain != position.Domain ? "(" : "");
+ space = "";
+ }
+
+ s << std::exchange(space, " ") << pdisk->PDiskId;
+
+ if (allocator.OldGroupContent.contains(pdisk->PDiskId)) {
+ s << "*";
+ }
+ const char *minus = "-";
+ if (allocator.ForbiddenDisks.contains(pdisk->PDiskId)) {
+ s << std::exchange(minus, "") << "f";
+ }
+ if (!pdisk->Usable) {
+ s << std::exchange(minus, "") << "u";
}
- s << "}";
+ if (pdisk->Decommitted) {
+ s << std::exchange(minus, "") << "d";
+ }
+ if (pdisk->NumSlots >= pdisk->MaxSlots) {
+ s << std::exchange(minus, "") << "s[" << pdisk->NumSlots << "/" << pdisk->MaxSlots << "]";
+ }
+ if (pdisk->SpaceAvailable < allocator.RequiredSpace) {
+ s << std::exchange(minus, "") << "v";
+ }
+ if (!pdisk->Operational) {
+ s << std::exchange(minus, "") << "o";
+ }
+ if (allocator.DiskIsUsable(*pdisk)) {
+ s << "+";
+ }
+
+ prevPosition = position;
}
- s << ">";
+ s << ")]}";
+ } else {
+ s << "<empty>";
}
return s.Str();
}
- bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational,
+ bool AllocateGroup(ui32 groupId, TGroupDefinition& groupDefinition, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks,
+ TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational,
TString& error) {
- // fill in the allocation context
- TAllocateContext ctx(Geom, requiredSpace, requireOperational, std::move(forbid));
- if (!ctx.ProcessExistingGroup(group, PDisks, replacedDiskIds, numReplacedDisks, error)) {
- return false;
+ if (Dirty) {
+ std::sort(PDiskByPosition.begin(), PDiskByPosition.end());
+ Dirty = false;
}
// create group of required size, if it is not created yet
- if (!Geom.ResizeGroup(group)) {
+ if (!Geom.ResizeGroup(groupDefinition)) {
error = "incorrect existing group";
return false;
}
- // if the group is already created, check for missing entities
- bool hasMissingEntities = false;
- for (const auto& realm : group) {
- for (const auto& domain : realm) {
- for (const TPDiskId& pdiskId : domain) {
- if (pdiskId == TPDiskId()) {
- hasMissingEntities = true;
- break;
- }
- }
- if (hasMissingEntities) {
- break;
- }
- }
- if (hasMissingEntities) {
- break;
- }
- }
- if (!hasMissingEntities) {
- return true; // group is okay
- }
-
- // adjust number of slots
- for (TPDiskId pdiskId : ctx.OldGroupContent) {
- --PDisks.at(pdiskId).NumSlots;
- }
- for (size_t i = 0; i < numReplacedDisks; ++i) {
- PDisks.at(replacedDiskIds[i]).EraseGroup(groupId);
- }
-
- // check if we can map a group only with PDisks that have NumSlots <= nums
- if (!FindMatchingDisks(ctx)) {
- // undo changes to the mapper content
- for (TPDiskId pdiskId : ctx.OldGroupContent) {
- ++PDisks.at(pdiskId).NumSlots;
- }
- for (size_t i = 0; i < numReplacedDisks; ++i) {
- PDisks.at(replacedDiskIds[i]).InsertGroup(groupId);
- }
- error = "no group options " + FormatPDisks(ctx);
+ // fill in the allocation context
+ TAllocator allocator(*this, Geom, requiredSpace, requireOperational, std::move(forbid), replacedDisks);
+ TGroup group = allocator.ProcessExistingGroup(groupDefinition, error);
+ if (group.empty()) {
return false;
}
-
- // fill in missing PDisk entities
- THelper helper(*this, ctx);
- for (ui32 realmIdx = 0, domainThroughIdx = 0; realmIdx < ctx.NumFailRealms; ++realmIdx) {
- for (ui32 domainIdx = 0; domainIdx < ctx.NumFailDomainsPerFailRealm; ++domainIdx, ++domainThroughIdx) {
- for (TPDiskId& pdiskId : group[realmIdx][domainIdx]) {
- if (pdiskId == TPDiskId()) {
- pdiskId = helper.AddBestDisk(realmIdx, domainThroughIdx);
- }
- }
- }
- }
-
- // adjust number of slots
- for (TPDiskId pdiskId : ctx.NewGroupContent) {
- if (const auto it = PDisks.find(pdiskId); it != PDisks.end()) {
- ++it->second.NumSlots;
- it->second.InsertGroup(groupId);
- } else {
- Y_FAIL();
+ bool ok = true;
+ for (TPDiskInfo *pdisk : group) {
+ if (!pdisk) {
+ ok = false;
+ break;
}
}
-
- return true;
- }
-
- class TScorePicker {
- static constexpr size_t MaxStaticItems = 16;
- ui32 NumFake = 0;
- TStackVec<ui32, MaxStaticItems> StaticHist;
- std::map<ui32, ui32> ScoreHist;
-
- public:
- static constexpr ui32 FakeScore = Max<ui32>();
-
- public:
- void AddFake() {
- ++NumFake;
+ if (ok) {
+ return true;
}
- void Add(ui32 score) {
- if (score < MaxStaticItems) {
- if (StaticHist.size() <= score) {
- StaticHist.resize(score + 1);
- }
- ++StaticHist[score];
- } else {
- ++ScoreHist[score];
+ // calculate score table
+ std::vector<ui32> scores;
+ for (const auto& [pdiskId, pdisk] : PDisks) {
+ if (allocator.DiskIsUsable(pdisk)) {
+ scores.push_back(pdisk.GetPickerScore());
}
}
-
- std::optional<ui32> CalculateMaxScore(ui32 threshold) const {
- if (threshold <= NumFake) {
- return FakeScore;
+ std::sort(scores.begin(), scores.end());
+ scores.erase(std::unique(scores.begin(), scores.end()), scores.end());
+
+            // binary search over the score table for the lowest score threshold at which a complete group can
+            // still be filled in
+ std::optional<TGroup> result;
+ ui32 begin = 0, end = scores.size();
+ while (begin < end) {
+ const ui32 mid = begin + (end - begin) / 2;
+ TAllocator::TUndoLog undo;
+ if (allocator.FillInGroup(scores[mid], undo, group)) {
+ result = group;
+ allocator.Revert(undo, group, 0);
+ end = mid;
} else {
- threshold -= NumFake;
- }
- for (size_t score = 0; score < StaticHist.size(); ++score) {
- if (threshold <= StaticHist[score]) {
- return score;
- } else {
- threshold -= StaticHist[score];
- }
+ begin = mid + 1;
}
- for (const auto& [score, num] : ScoreHist) {
- if (threshold <= num) {
- return score;
- } else {
- threshold -= num;
- }
- }
- Y_VERIFY_DEBUG(threshold);
- return std::nullopt;
}
- void Cascade(TScorePicker& parent, ui32 threshold) {
- if (std::optional<ui32> maxScore = CalculateMaxScore(threshold)) {
- if (*maxScore != FakeScore) {
- parent.Add(*maxScore);
- } else {
- parent.AddFake();
- }
+ if (result) {
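+                // allocation succeeded: release the slots of the replaced disks, then register the group on the
+                // disks that fill positions which were previously empty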
+ for (const auto& [vdiskId, pdiskId] : replacedDisks) {
+ const auto it = PDisks.find(pdiskId);
+ Y_VERIFY(it != PDisks.end());
+ TPDiskInfo& pdisk = it->second;
+ --pdisk.NumSlots;
+ pdisk.EraseGroup(groupId);
}
- }
- };
-
- bool FindMatchingDisks(const TAllocateContext& ctx) {
- struct TMatchingDisk {
- TPDiskInfo *PDisk;
- bool IsInGroup;
- ui32 Score = 0;
-
- TPDiskInfo *operator ->() const { return PDisk; }
- bool operator <(const TMatchingDisk& other) const { return PDisk->Position < other->Position; }
- };
-
- TScorePicker realmGroupPicker;
- std::vector<TMatchingDisk> matchingDisks;
- for (auto& [id, realmGroup] : Box) {
- TScorePicker realmPicker;
- realmGroup.Matching = false;
- for (auto& [id, realm] : realmGroup) {
- TScorePicker domainGroupPicker;
- realm.Matching = false;
- for (auto& [id, domainGroup] : realm) {
- TScorePicker domainPicker;
- domainGroup.Matching = false;
- for (auto& [id, domain] : domainGroup) {
- TScorePicker diskPicker;
- domain.Matching = false;
- domain.NumMatchingDisks = 0;
- for (TPDisks::value_type *it : domain) {
- auto& [pdiskId, pdisk] = *it;
- pdisk.Matching = ctx.DiskIsUsable(pdisk);
- if (pdisk.Matching) {
- const ui32 score = pdisk.GetPickerScore();
- matchingDisks.push_back(TMatchingDisk{&pdisk, false, score});
- diskPicker.Add(score);
- } else if (ctx.NewGroupContent.count(pdiskId)) {
- // we create a fake record to keep correct count of fail realms and domains, but
- // this particular one never gets selected
- matchingDisks.push_back(TMatchingDisk{&pdisk, true});
- diskPicker.AddFake();
- }
- }
- diskPicker.Cascade(domainPicker, Geom.GetNumVDisksPerFailDomain());
- }
- domainPicker.Cascade(domainGroupPicker, Geom.GetNumFailDomainsPerFailRealm());
+ ui32 numZero = 0;
+ for (ui32 i = 0; i < allocator.Topology.GetTotalVDisksNum(); ++i) {
+ if (!group[i]) {
+ ++numZero;
+ TPDiskInfo *pdisk = result->at(i);
+ ++pdisk->NumSlots;
+ pdisk->InsertGroup(groupId);
}
- domainGroupPicker.Cascade(realmPicker, 1);
}
- realmPicker.Cascade(realmGroupPicker, Geom.GetNumFailRealms());
- }
-
- bool boxMatching = false;
-
- if (const std::optional<ui32> maxScore = realmGroupPicker.CalculateMaxScore(1)) {
- Y_VERIFY_DEBUG(*maxScore != TScorePicker::FakeScore);
-
- // remove all mismatched candidate disks and sort them according to their position
- auto remove = [maxScore = *maxScore](const TMatchingDisk& p) {
- p->Matching = p->Matching && p.Score <= maxScore;
- return !p->Matching && !p.IsInGroup;
- };
- const auto begin = matchingDisks.begin();
- const auto end = matchingDisks.end();
- matchingDisks.erase(std::remove_if(begin, end, remove), end);
- std::sort(matchingDisks.begin(), matchingDisks.end());
-
- std::optional<TPDiskLayoutPosition> prev;
-
- ui32 numMatchingRealmGroupsInBox = 0;
- ui32 numMatchingRealmsInRealmGroup = 0;
- ui32 numMatchingDomainGroupsInRealm = 0;
- ui32 numMatchingDomainsInDomainGroup = 0;
- ui32 numMatchingDisksInDomain = 0;
-
- TFailRealmGroup *realmGroup = nullptr;
- TFailRealmInfo *realm = nullptr;
- TFailDomainGroup *domainGroup = nullptr;
- TFailDomainInfo *domain = nullptr;
-
- for (const auto& disk : matchingDisks) {
- const auto& cur = disk->Position;
- switch (prev ? prev->GetDiffIndex(cur) : 0) {
- case 0:
- numMatchingRealmsInRealmGroup = 0;
- realmGroup = &Box[cur.RealmGroup];
- [[fallthrough]];
- case 1:
- numMatchingDomainGroupsInRealm = 0;
- realm = &(*realmGroup)[cur.RealmInGroup];
- [[fallthrough]];
- case 2:
- numMatchingDomainsInDomainGroup = 0;
- domainGroup = &(*realm)[cur.DomainGroup];
- [[fallthrough]];
- case 3:
- numMatchingDisksInDomain = 0;
- domain = &(*domainGroup)[cur.DomainInGroup];
- prev = cur;
- [[fallthrough]];
- case 4:
- break;
- }
-
- ++domain->NumMatchingDisks;
- const std::tuple<ui32&, ui32, bool&> items[] = {
- {numMatchingDisksInDomain, Geom.GetNumVDisksPerFailDomain(), domain->Matching },
- {numMatchingDomainsInDomainGroup, Geom.GetNumFailDomainsPerFailRealm(), domainGroup->Matching},
- {numMatchingDomainGroupsInRealm, 1, realm->Matching },
- {numMatchingRealmsInRealmGroup, Geom.GetNumFailRealms(), realmGroup->Matching },
- {numMatchingRealmGroupsInBox, 1, boxMatching },
- };
- for (const auto& item : items) {
- if (++std::get<0>(item) == std::get<1>(item)) {
- std::get<2>(item) = true;
- } else {
- break;
- }
- }
- }
- Y_VERIFY(boxMatching);
+ Y_VERIFY(numZero == allocator.Topology.GetTotalVDisksNum() || numZero == replacedDisks.size());
+ allocator.Decompose(*result, groupDefinition);
+ return true;
+ } else {
+ error = "no group options " + FormatPDisks(allocator);
+ return false;
}
-
- return boxMatching;
}
};
@@ -770,10 +709,8 @@ namespace NKikimr::NBsController {
TGroupMapper::~TGroupMapper() = default;
- bool TGroupMapper::RegisterPDisk(TPDiskId pdiskId, TNodeLocation location, bool usable, ui32 numSlots, ui32 maxSlots,
- const ui32 groupIds[], size_t numGroups, i64 spaceAvailable, bool operational) {
- return Impl->RegisterPDisk(pdiskId, std::move(location), usable, numSlots, maxSlots, groupIds, numGroups,
- spaceAvailable, operational);
+ bool TGroupMapper::RegisterPDisk(const TPDiskRecord& pdisk) {
+ return Impl->RegisterPDisk(pdisk);
}
void TGroupMapper::UnregisterPDisk(TPDiskId pdiskId) {
@@ -784,10 +721,9 @@ namespace NKikimr::NBsController {
return Impl->AdjustSpaceAvailable(pdiskId, increment);
}
- bool TGroupMapper::AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) {
- return Impl->AllocateGroup(groupId, group, replacedDiskIds, numReplacedDisks, std::move(forbid),
- requiredSpace, requireOperational, error);
+ bool TGroupMapper::AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks,
+ TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) {
+ return Impl->AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, requireOperational, error);
}
} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/group_mapper.h b/ydb/core/mind/bscontroller/group_mapper.h
index f66ba3171c..991a636bf3 100644
--- a/ydb/core/mind/bscontroller/group_mapper.h
+++ b/ydb/core/mind/bscontroller/group_mapper.h
@@ -18,13 +18,37 @@ namespace NKikimr {
using TGroupDefinition = TVector<TVector<TVector<TPDiskId>>>; // Realm/Domain/Disk
using TForbiddenPDisks = std::unordered_set<TPDiskId, THash<TPDiskId>>;
+ template<typename T>
+ static void Traverse(const TGroupDefinition& group, T&& callback) {
+ for (ui32 failRealmIdx = 0; failRealmIdx != group.size(); ++failRealmIdx) {
+ const auto& realm = group[failRealmIdx];
+ for (ui32 failDomainIdx = 0; failDomainIdx != realm.size(); ++failDomainIdx) {
+ const auto& domain = realm[failDomainIdx];
+ for (ui32 vdiskIdx = 0; vdiskIdx != domain.size(); ++vdiskIdx) {
+ callback(TVDiskIdShort(failRealmIdx, failDomainIdx, vdiskIdx), domain[vdiskIdx]);
+ }
+ }
+ }
+ }
+
+ struct TPDiskRecord {
+ const TPDiskId PDiskId;
+ const TNodeLocation Location;
+ const bool Usable;
+ ui32 NumSlots;
+ const ui32 MaxSlots;
+ TStackVec<ui32, 16> Groups;
+ i64 SpaceAvailable;
+ const bool Operational;
+ const bool Decommitted;
+ };
+
public:
TGroupMapper(TGroupGeometryInfo geom, bool randomize = false);
~TGroupMapper();
// Register PDisk inside mapper to use it in subsequent map operations
- bool RegisterPDisk(TPDiskId pdiskId, TNodeLocation location, bool usable, ui32 numSlots, ui32 maxSlots,
- const ui32 groupIds[], size_t numGroups, i64 spaceAvailable, bool operational);
+ bool RegisterPDisk(const TPDiskRecord& pdisk);
// Remove PDisk from the table.
void UnregisterPDisk(TPDiskId pdiskId);
@@ -52,9 +76,8 @@ namespace NKikimr {
// failRealmBeginDxLevel, failRealmEndDxLevel, and then by finding possible options to meet requirements
// (1) and (2). That is, prefix gives us unique domains in which we can find realms to operate, while
// prefix+infix part gives us distinct fail realms we can use while generating groups.
- bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[],
- size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational,
- TString& error);
+ bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks,
+ TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error);
};
} // NBsController
diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
index 0dcb5be086..1be3878296 100644
--- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
@@ -2,6 +2,7 @@
#include "group_geometry_info.h"
#include "group_mapper.h"
+#include "group_layout_checker.h"
#include "ut_helpers.h"
using namespace NKikimr;
@@ -136,17 +137,44 @@ public:
}
}
+ ui32 GetDataCenter(TPDiskId pdiskId) const {
+ const auto it = PDisks.find(pdiskId);
+ UNIT_ASSERT(it != PDisks.end());
+ return it->second.DataCenterId;
+ }
+
+ TNodeLocation GetLocation(TPDiskId pdiskId) const {
+ const auto it = PDisks.find(pdiskId);
+ UNIT_ASSERT(it != PDisks.end());
+ return it->second.GetLocation();
+ }
+
+ std::vector<std::tuple<ui32, ui32, ui32, ui32>> ExportLayout() const {
+ std::vector<std::tuple<ui32, ui32, ui32, ui32>> res;
+ for (const auto& [pdiskId, pdisk] : PDisks) {
+ res.emplace_back(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId, pdisk.BodyId);
+ }
+ return res;
+ }
+
+ void ImportLayout(const std::vector<std::tuple<ui32, ui32, ui32, ui32>>& v) {
+ size_t index = 0;
+ for (auto& [pdiskId, pdisk] : PDisks) {
+ UNIT_ASSERT(index != v.size());
+ std::tie(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId, pdisk.BodyId) = v[index];
+ ++index;
+ }
+ UNIT_ASSERT(index == v.size());
+ }
+
ui32 AllocateGroup(TGroupMapper& mapper, TGroupMapper::TGroupDefinition& group, bool allowFailure = false) {
ui32 groupId = NextGroupId++;
TString error;
- bool success = mapper.AllocateGroup(groupId, group, nullptr, 0, {}, 0, false, error);
+ bool success = mapper.AllocateGroup(groupId, group, {}, {}, 0, false, error);
if (!success && allowFailure) {
return 0;
}
- if (!success) {
- Ctest << "error# " << error << Endl;
- }
- UNIT_ASSERT(success);
+ UNIT_ASSERT_C(success, error);
TGroupRecord& record = Groups[groupId];
record.Group = group;
for (const auto& realm : group) {
@@ -161,7 +189,7 @@ public:
}
TGroupMapper::TGroupDefinition ReallocateGroup(TGroupMapper& mapper, ui32 groupId, const TSet<TPDiskId>& unusableDisks,
- bool makeThemForbidden = false, bool requireOperational = false, bool requireError = false) {
+ bool makeThemForbidden = false, bool requireOperational = false, bool allowError = false) {
TGroupRecord& group = Groups.at(groupId);
TGroupMapper::TForbiddenPDisks forbid(unusableDisks.begin(), unusableDisks.end());
@@ -170,13 +198,14 @@ public:
}
// remove unusable disks from the set
- std::vector<TPDiskId> replaced;
- for (auto& realm : group.Group) {
- for (auto& domain : realm) {
- for (auto& pdisk : domain) {
+ THashMap<TVDiskIdShort, TPDiskId> replacedDisks;
+ for (ui32 i = 0; i < group.Group.size(); ++i) {
+ for (ui32 j = 0; j < group.Group[i].size(); ++j) {
+ for (ui32 k = 0; k < group.Group[i][j].size(); ++k) {
+ auto& pdisk = group.Group[i][j][k];
--PDisks.at(pdisk).NumSlots;
if (unusableDisks.count(pdisk)) {
- replaced.push_back(std::exchange(pdisk, {}));
+ replacedDisks.emplace(TVDiskIdShort(i, j, k), std::exchange(pdisk, {}));
}
}
}
@@ -185,15 +214,24 @@ public:
Ctest << "groupId# " << groupId << " reallocating group# " << FormatGroup(group.Group) << Endl;
TString error;
- bool success = mapper.AllocateGroup(groupId, group.Group, replaced.data(), replaced.size(), std::move(forbid),
- 0, requireOperational, error);
+ bool success = mapper.AllocateGroup(groupId, group.Group, replacedDisks, std::move(forbid), 0,
+ requireOperational, error);
if (!success) {
- if (requireError) {
+ Ctest << "error# " << error << Endl;
+ if (allowError) {
+ // revert group to its original state
+ for (const auto& [vdiskId, pdiskId] : replacedDisks) {
+ group.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk] = pdiskId;
+ }
+ for (auto& realm : group.Group) {
+ for (auto& domain : realm) {
+ for (auto& pdisk : domain) {
+ ++PDisks.at(pdisk).NumSlots;
+ }
+ }
+ }
return {};
}
- Ctest << "error# " << error << Endl;
- } else {
- UNIT_ASSERT(!requireError);
}
UNIT_ASSERT(success);
@@ -210,6 +248,23 @@ public:
return group.Group;
}
+ void SetGroup(ui32 groupId, const TGroupMapper::TGroupDefinition& group) {
+ auto& g = Groups[groupId];
+ for (const TPDiskId& pdiskId : g.PDisks) {
+ --PDisks.at(pdiskId).NumSlots;
+ }
+ g.Group = group;
+ g.PDisks.clear();
+ for (const auto& realm : g.Group) {
+ for (const auto& domain : realm) {
+ for (const auto& pdisk : domain) {
+ g.PDisks.push_back(pdisk);
+ ++PDisks.at(pdisk).NumSlots;
+ }
+ }
+ }
+ }
+
TString FormatGroup(const TGroupMapper::TGroupDefinition& group) {
TStringStream str;
str << "[";
@@ -234,23 +289,27 @@ public:
return str.Str();
}
- void CheckGroupErasure(const TGroupMapper::TGroupDefinition& group) {
+ void CheckGroupErasure(const TGroupMapper::TGroupDefinition& group, ui32 decommittedDataCenter = 0) {
TSet<ui32> dataCenters;
for (const auto& realm : group) {
TMaybe<ui32> dataCenter;
- TSet<std::tuple<ui32, ui32>> domains;
+ TSet<std::tuple<ui32, ui32, ui32>> domains;
for (const auto& domain : realm) {
- TMaybe<std::tuple<ui32, ui32>> currentDom;
+ TMaybe<std::tuple<ui32, ui32, ui32>> currentDom;
for (const auto& pdisk : domain) {
const TPDiskRecord& record = PDisks.at(pdisk);
- if (dataCenter) {
- UNIT_ASSERT_VALUES_EQUAL(*dataCenter, record.DataCenterId);
- } else {
- dataCenter = record.DataCenterId;
- const bool inserted = dataCenters.insert(*dataCenter).second;
- UNIT_ASSERT(inserted);
+ if (record.DataCenterId != decommittedDataCenter) { // ignore entries from decommitted data center
+ if (dataCenter) {
+ if (*dataCenter != decommittedDataCenter && record.DataCenterId != decommittedDataCenter) {
+ UNIT_ASSERT_VALUES_EQUAL(*dataCenter, record.DataCenterId);
+ }
+ } else {
+ dataCenter = record.DataCenterId;
+ const bool inserted = dataCenters.insert(*dataCenter).second;
+ UNIT_ASSERT(inserted);
+ }
}
- std::tuple<ui32, ui32> dom = {record.RoomId, record.RackId};
+ auto dom = std::make_tuple(record.DataCenterId, record.RoomId, record.RackId);
if (currentDom) {
// check that all disks from the same domain reside in the same domain :)
UNIT_ASSERT_EQUAL(dom, *currentDom);
@@ -297,7 +356,7 @@ public:
}
void PopulateGroupMapper(TGroupMapper& mapper, ui32 maxSlots = 16, TSet<TPDiskId> unusableDisks = {},
- TSet<TPDiskId> nonoperationalDisks = {}) {
+ TSet<TPDiskId> nonoperationalDisks = {}, std::optional<ui32> decommittedDataCenter = std::nullopt) {
std::map<TPDiskId, std::vector<ui32>> groupDisks;
for (const auto& [groupId, group] : Groups) {
for (TPDiskId pdiskId : group.PDisks) {
@@ -306,8 +365,88 @@ public:
}
for (const auto& pair : PDisks) {
auto& g = groupDisks[pair.first];
- mapper.RegisterPDisk(pair.first, pair.second.GetLocation(), !unusableDisks.count(pair.first),
- pair.second.NumSlots, maxSlots, g.data(), g.size(), 0, nonoperationalDisks.count(pair.first));
+ mapper.RegisterPDisk({
+ .PDiskId = pair.first,
+ .Location = pair.second.GetLocation(),
+ .Usable = !unusableDisks.count(pair.first),
+ .NumSlots = pair.second.NumSlots,
+ .MaxSlots = maxSlots,
+ .Groups{g.begin(), g.end()},
+ .SpaceAvailable = 0,
+ .Operational = !nonoperationalDisks.contains(pair.first),
+ .Decommitted = decommittedDataCenter == pair.second.DataCenterId,
+ });
+ }
+ }
+
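+    // Render the group as a text grid: one column per data center and one row per (room, rack); each cell lists
+    // the failRealm/failDomain indices of the vdisks placed there, while 'X' marks racks the group does not use.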
+ void DumpGroup(const TGroupMapper::TGroupDefinition& group) {
+ std::set<std::tuple<ui32, ui32, ui32>> locations;
+ for (const auto& [pdiskId, pdisk] : PDisks) {
+ locations.emplace(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId);
+ }
+
+ std::unordered_map<ui32, ui32> dataCenterToColumn;
+ std::unordered_map<ui32, std::unordered_map<std::tuple<ui32, ui32>, ui32>> rackToColumn;
+ for (const auto& x : locations) {
+ const ui32 dataCenterId = std::get<0>(x);
+ const ui32 roomId = std::get<1>(x);
+ const ui32 rackId = std::get<2>(x);
+ dataCenterToColumn.try_emplace(dataCenterId, dataCenterToColumn.size());
+ auto& rtc = rackToColumn[dataCenterId];
+ rtc.try_emplace(std::make_tuple(roomId, rackId), rtc.size());
+ }
+
+ std::vector<std::vector<TString>> cells(dataCenterToColumn.size());
+ for (const auto& [dataCenterId, racks] : rackToColumn) {
+ cells[dataCenterToColumn[dataCenterId]].resize(racks.size());
+ }
+
+ ui32 maxCellWidth = 0;
+ for (ui32 failRealmIdx = 0; failRealmIdx < group.size(); ++failRealmIdx) {
+ for (ui32 failDomainIdx = 0; failDomainIdx < group[failRealmIdx].size(); ++failDomainIdx) {
+ for (const TPDiskId& pdiskId : group[failRealmIdx][failDomainIdx]) {
+ if (pdiskId != TPDiskId()) {
+ const auto it = PDisks.find(pdiskId);
+ UNIT_ASSERT(it != PDisks.end());
+ const TPDiskRecord& pdisk = it->second;
+ auto& cell = cells[dataCenterToColumn[pdisk.DataCenterId]]
+ [rackToColumn[pdisk.DataCenterId][{pdisk.RoomId, pdisk.RackId}]];
+ if (cell) {
+ cell += ", ";
+ }
+ cell += TStringBuilder() << failRealmIdx << "/" << failDomainIdx;
+ maxCellWidth = Max<ui32>(maxCellWidth, cell.size());
+ }
+ }
+ }
+ }
+
+ if (!maxCellWidth) {
+ ++maxCellWidth;
+ }
+
+ for (ui32 row = 0;; ++row) {
+ bool done = true;
+ TStringBuilder s;
+ for (ui32 column = 0; column < cells.size(); ++column) {
+ if (row >= cells[column].size()) {
+ s << TString(maxCellWidth, ' ');
+ } else if (const auto& cell = cells[column][row]) {
+ s << cell << TString(maxCellWidth - cell.size(), ' ');
+ done = false;
+ } else {
+ s << TString(maxCellWidth, 'X');
+ done = false;
+ }
+ if (column != cells.size() - 1) {
+ s << ' ';
+ }
+ }
+ if (done) {
+ break;
+ } else {
+ Ctest << s << Endl;
+ }
}
}
};
@@ -468,6 +607,50 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
}
}
+ Y_UNIT_TEST(NonUniformClusterMirror3dcWithUnusableDomain) {
+ std::vector<std::vector<ui32>> disposition{
+ { // datacenter1
+ 4, 4, 4, 4, 4, 2, 2, 4, 2, 5, 5, 5,
+ },
+ { // datacenter2
+ 2, 2, 2, 2, 2, 2, 1, 1, 2, 4, 8, 8, 9,
+ },
+ { // datacenter3
+ 4, 4, 1, 3, 4, 4, 2, 6, 9, 8,
+ },
+ { // datacenter4
+ 1,
+ },
+ };
+ TTestContext context(disposition, 4);
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 9);
+ for (ui32 i = 0; i < context.GetTotalDisks() - 4; ++i) {
+ Ctest << i << "/" << (context.GetTotalDisks() - 4) << Endl;
+ TGroupMapper::TGroupDefinition group;
+ context.AllocateGroup(mapper, group);
+ context.CheckGroupErasure(group);
+
+ TVector<ui32> slots = context.GetSlots();
+ UNIT_ASSERT(slots);
+ ui32 min = Max<ui32>();
+ ui32 max = 0;
+ for (const ui32 x : slots) {
+ if (x) {
+ min = Min(min, x);
+ max = Max(max, x);
+ }
+ }
+ UNIT_ASSERT_C(max - min <= 1, Sprintf("min# %" PRIu32 " max# %" PRIu32, min, max));
+ }
+ TVector<ui32> slots = context.GetSlots();
+ for (ui32 numSlots : slots) {
+ if (numSlots) {
+ UNIT_ASSERT_VALUES_EQUAL(9, numSlots);
+ }
+ }
+ }
+
Y_UNIT_TEST(MakeDisksUnusable) {
TTestContext context(1, 1, 10, 1, 1);
TVector<ui32> groupIds;
@@ -538,12 +721,15 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
nonoperationalDisks.insert(pdiskId);
});
context.PopulateGroupMapper(mapper, 10, unusableDisks, nonoperationalDisks);
+ ui32 hasEmpty = false;
for (ui32 groupId : groupIds) {
- auto group = context.ReallocateGroup(mapper, groupId, unusableDisks, true, true);
- group = context.ReallocateGroup(mapper, groupId, unusableDisks);
+ auto tmp = context.ReallocateGroup(mapper, groupId, unusableDisks, false, true, true);
+ hasEmpty |= tmp.empty();
+ auto group = context.ReallocateGroup(mapper, groupId, unusableDisks);
Ctest << "groupId# " << groupId << " new content# " << context.FormatGroup(group) << Endl;
context.CheckGroupErasure(group);
}
+ UNIT_ASSERT(hasEmpty);
}
}
@@ -630,4 +816,179 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
}
}
+ Y_UNIT_TEST(ReassignGroupTest3dc) {
+ for (ui32 i = 0; i < 10000; ++i) {
+ Ctest << "iteration# " << i << Endl;
+
+ const ui32 numDataCenters = 5;
+ const ui32 numRacks = 5;
+ TTestContext context(numDataCenters, 1, numRacks, 1, 1);
+
+ TGroupMapper::TGroupDefinition group;
+ ui32 groupId;
+ {
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1);
+ groupId = context.AllocateGroup(mapper, group);
+ Ctest << "group after allocation:" << Endl;
+ context.DumpGroup(group);
+ }
+
+ ui32 decommittedDataCenter = RandomNumber<ui32>(numDataCenters + 1);
+ Ctest << "decommittedDataCenter# " << decommittedDataCenter << Endl;
+ {
+                // randomly pick some disks in the decommitted data center to be moved out
+ TSet<TPDiskId> unusableDisks;
+ for (auto& realm : group) {
+ for (auto& domain : realm) {
+ for (auto& pdisk : domain) {
+ if (context.GetDataCenter(pdisk) == decommittedDataCenter && RandomNumber(2u)) {
+ unusableDisks.insert(pdisk);
+ }
+ }
+ }
+ }
+
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1, {}, {}, decommittedDataCenter);
+ group = context.ReallocateGroup(mapper, groupId, unusableDisks);
+ Ctest << "group after data center decommission:" << Endl;
+ context.DumpGroup(group);
+ }
+
+ TSet<TPDiskId> unusableDisks;
+ ui32 unusableDataCenter = RandomNumber<ui32>(numDataCenters + 1);
+ Ctest << "unusableDataCenter# " << unusableDataCenter << Endl;
+ if (unusableDataCenter) {
+ context.IteratePDisks([&](const auto& pdiskId, const auto& record) {
+ if (record.DataCenterId == unusableDataCenter) {
+ unusableDisks.insert(pdiskId);
+ }
+ });
+ }
+
+ for (ui32 i = 0; i < 2; ++i) {
+ if (const ui32 unusableDataCenter = RandomNumber<ui32>(numDataCenters + 1)) {
+ const ui32 unusableRack = 1 + RandomNumber<ui32>(numRacks);
+ context.IteratePDisks([&](const auto& pdiskId, const auto& record) {
+ if (record.DataCenterId == unusableDataCenter && record.RackId == unusableRack) {
+ unusableDisks.insert(pdiskId);
+ }
+ });
+ }
+ }
+
+ {
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1);
+ auto group = context.ReallocateGroup(mapper, groupId, unusableDisks);
+ Ctest << "group after reallocation:" << Endl;
+ context.DumpGroup(group);
+ context.CheckGroupErasure(group, decommittedDataCenter);
+ }
+
+ Ctest << Endl;
+ }
+ }
+
+ Y_UNIT_TEST(SanitizeGroupTest3dc) {
+ const ui32 numDataCenters = 3;
+ const ui32 numRacks = 5;
+ TTestContext context(numDataCenters, 1, numRacks, 1, 1);
+ TGroupMapper::TGroupDefinition group;
+ ui32 groupId;
+ {
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1);
+ groupId = context.AllocateGroup(mapper, group);
+ Ctest << "group after allocation:" << Endl;
+ context.DumpGroup(group);
+ }
+
+ auto checkLayout = [&](const auto& group) {
+ TGroupGeometryInfo geom = TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc);
+ THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout;
+ for (ui32 i = 0; i < group.size(); ++i) {
+ for (ui32 j = 0; j < group[i].size(); ++j) {
+ for (ui32 k = 0; k < group[i][j].size(); ++k) {
+ layout.emplace(TVDiskIdShort(i, j, k), std::make_pair(context.GetLocation(group[i][j][k]),
+ group[i][j][k]));
+ }
+ }
+ }
+ return CheckGroupLayout(geom, layout);
+ };
+
+ UNIT_ASSERT(checkLayout(group));
+
+ for (ui32 n = 0; n < 1000; ++n) {
+ Ctest << Endl << "iteration# " << n << Endl;
+
+ auto layout = context.ExportLayout();
+ std::random_shuffle(layout.begin(), layout.end());
+ context.ImportLayout(layout);
+
+ Ctest << "group after layout shuffling:" << Endl;
+ context.DumpGroup(group);
+
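+            // depth-first exploration of sanitizer moves: repeatedly take a vdisk the layout checker flags as
+            // misplaced, reallocate it and recurse; the Seen set ensures no layout repeats along a path and
+            // bounds the number of steps, so sanitization must converge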
+ struct TQueueItem {
+ TGroupMapper::TGroupDefinition Group;
+ TString Path;
+ TSet<TGroupMapper::TGroupDefinition> Seen;
+ TSet<TVDiskIdShort> VDiskItems;
+ TSet<TPDiskId> PDiskItems;
+ };
+ std::deque<TQueueItem> queue;
+ for (queue.push_back({.Group = group}); !queue.empty(); ) {
+ TQueueItem item = std::move(queue.front());
+ queue.pop_front();
+ const auto [it, inserted] = item.Seen.insert(item.Group);
+ UNIT_ASSERT(inserted);
+ UNIT_ASSERT(item.Seen.size() <= 9);
+ Ctest << "processing path# " << item.Path << Endl;
+
+ auto candidates = checkLayout(item.Group);
+ if (!candidates) {
+ for (const TVDiskIdShort& vdiskId : candidates.Candidates) {
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.SetGroup(groupId, item.Group);
+ context.PopulateGroupMapper(mapper, 2);
+ const TPDiskId& pdiskId = item.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk];
+ auto temp = context.ReallocateGroup(mapper, groupId, {pdiskId}, false, false, false);
+ TString path = TStringBuilder() << item.Path << "/" << (int)vdiskId.FailRealm << ":"
+ << (int)vdiskId.FailDomain << ":" << (int)vdiskId.VDisk << "@" << pdiskId;
+ Ctest << "path# " << path << Endl;
+ context.DumpGroup(temp);
+
+ auto vdiskItems = item.VDiskItems;
+// const auto [it1, inserted1] = vdiskItems.insert(vdiskId);
+// UNIT_ASSERT_C(inserted1, "Duplicate group cell# " << vdiskId);
+
+ auto pdiskItems = item.PDiskItems;
+// const auto [it2, inserted2] = pdiskItems.insert(pdiskId);
+// UNIT_ASSERT_C(inserted2, "Duplicate origin PDisk# " << pdiskId);
+
+ queue.push_front({.Group = std::move(temp), .Path = std::move(path), .Seen = item.Seen,
+ .VDiskItems = std::move(vdiskItems), .PDiskItems = std::move(pdiskItems)});
+ }
+ }
+
+ Ctest << Endl;
+ }
+ }
+ }
+
+ Y_UNIT_TEST(CheckNotToBreakFailModel) {
+ TTestContext context(4, 1, 3, 1, 1);
+ TGroupMapper::TGroupDefinition group;
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+ context.PopulateGroupMapper(mapper, 1);
+ ui32 groupId = context.AllocateGroup(mapper, group);
+ Ctest << "group after allocation:" << Endl;
+ context.DumpGroup(group);
+ group = context.ReallocateGroup(mapper, groupId, {group[0][0][0]}, false, false, true);
+ Ctest << "group after reallocation:" << Endl;
+ context.DumpGroup(group);
+ UNIT_ASSERT(group.empty());
+ }
}
diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h
index abe31a576b..8910303ce8 100644
--- a/ydb/core/mind/bscontroller/impl.h
+++ b/ydb/core/mind/bscontroller/impl.h
@@ -78,6 +78,7 @@ public:
class TConfigState;
class TGroupSelector;
class TGroupFitter;
+ class TSelfHealActor;
using TVSlotReadyTimestampQ = std::list<std::pair<TInstant, TVSlotInfo*>>;
@@ -294,6 +295,7 @@ public:
NKikimrBlobStorage::EDriveStatus Status;
TInstant StatusTimestamp;
+ NKikimrBlobStorage::EDecommitStatus DecommitStatus;
TString ExpectedSerial;
TString LastSeenSerial;
TString LastSeenPath;
@@ -313,7 +315,8 @@ public:
Table::Timestamp,
Table::ExpectedSerial,
Table::LastSeenSerial,
- Table::LastSeenPath
+ Table::LastSeenPath,
+ Table::DecommitStatus
> adapter(
&TPDiskInfo::Path,
&TPDiskInfo::Kind,
@@ -326,7 +329,8 @@ public:
&TPDiskInfo::StatusTimestamp,
&TPDiskInfo::ExpectedSerial,
&TPDiskInfo::LastSeenSerial,
- &TPDiskInfo::LastSeenPath
+ &TPDiskInfo::LastSeenPath,
+ &TPDiskInfo::DecommitStatus
);
callback(&adapter);
}
@@ -343,6 +347,7 @@ public:
ui32 defaultMaxSlots,
NKikimrBlobStorage::EDriveStatus status,
TInstant statusTimestamp,
+ NKikimrBlobStorage::EDecommitStatus decommitStatus,
const TString& expectedSerial,
const TString& lastSeenSerial,
const TString& lastSeenPath,
@@ -358,6 +363,7 @@ public:
, BoxId(boxId)
, Status(status)
, StatusTimestamp(statusTimestamp)
+ , DecommitStatus(decommitStatus)
, ExpectedSerial(expectedSerial)
, LastSeenSerial(lastSeenSerial)
, LastSeenPath(lastSeenPath)
@@ -401,23 +407,40 @@ public:
}
bool ShouldBeSettledBySelfHeal() const {
- switch (Status) {
- case NKikimrBlobStorage::EDriveStatus::FAULTY:
- case NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED:
- return true;
- default:
- return false;
- }
+ return Status == NKikimrBlobStorage::EDriveStatus::FAULTY
+ || Status == NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED
+ || DecommitStatus == NKikimrBlobStorage::EDecommitStatus::DECOMMIT_IMMINENT;
}
bool BadInTermsOfSelfHeal() const {
- return ShouldBeSettledBySelfHeal() || Status == NKikimrBlobStorage::EDriveStatus::INACTIVE;
+ return Status == NKikimrBlobStorage::EDriveStatus::FAULTY
+ || Status == NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED
+ || Status == NKikimrBlobStorage::EDriveStatus::INACTIVE;
}
std::tuple<bool, bool> GetSelfHealStatusTuple() const {
return {ShouldBeSettledBySelfHeal(), BadInTermsOfSelfHeal()};
}
+ bool AcceptsNewSlots() const {
+ return Status == NKikimrBlobStorage::EDriveStatus::ACTIVE;
+ }
+
+ bool Decommitted() const {
+ switch (DecommitStatus) {
+ case NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE:
+ return false;
+ case NKikimrBlobStorage::EDecommitStatus::DECOMMIT_PENDING:
+ case NKikimrBlobStorage::EDecommitStatus::DECOMMIT_IMMINENT:
+ return true;
+ case NKikimrBlobStorage::EDecommitStatus::DECOMMIT_UNSET:
+ case NKikimrBlobStorage::EDecommitStatus::EDecommitStatus_INT_MIN_SENTINEL_DO_NOT_USE_:
+ case NKikimrBlobStorage::EDecommitStatus::EDecommitStatus_INT_MAX_SENTINEL_DO_NOT_USE_:
+ break;
+ }
+ Y_FAIL("unexpected EDecommitStatus");
+ }
+
bool HasGoodExpectedStatus() const {
switch (Status) {
case NKikimrBlobStorage::EDriveStatus::UNKNOWN:
@@ -426,7 +449,6 @@ public:
case NKikimrBlobStorage::EDriveStatus::ACTIVE:
case NKikimrBlobStorage::EDriveStatus::INACTIVE:
- case NKikimrBlobStorage::EDriveStatus::SPARE:
case NKikimrBlobStorage::EDriveStatus::FAULTY:
case NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED:
return true;
@@ -488,7 +510,7 @@ public:
struct TGroupStatus {
// status derived from the actual state of VDisks (IsReady() to be exact)
NKikimrBlobStorage::TGroupStatus::E OperatingStatus = NKikimrBlobStorage::TGroupStatus::UNKNOWN;
- // status derived by adding underlying PDisk status (FAULTY&BROKEN are assumed to be not working ones)
+        // status derived by also taking the underlying PDisk status into account (certain statuses are treated as non-working)
NKikimrBlobStorage::TGroupStatus::E ExpectedStatus = NKikimrBlobStorage::TGroupStatus::UNKNOWN;
} Status;
@@ -1322,6 +1344,7 @@ private:
THashMap<TPDiskId, ui32> StaticPDiskSlotUsage;
std::unique_ptr<TStoragePoolStat> StoragePoolStat;
bool StopGivingGroups = false;
+ bool GroupLayoutSanitizer = false;
NKikimrBlobStorage::TSerialManagementStage::E SerialManagementStage
= NKikimrBlobStorage::TSerialManagementStage::DISCOVER_SERIAL;
@@ -1347,6 +1370,7 @@ private:
EvVSlotReadyUpdate,
EvVSlotNotReadyHistogramUpdate,
EvProcessIncomingEvent,
+ EvUpdateHostRecords,
};
struct TEvUpdateSystemViews : public TEventLocal<TEvUpdateSystemViews, EvUpdateSystemViews> {};
@@ -1360,6 +1384,14 @@ private:
struct TEvScrub : TEventLocal<TEvScrub, EvScrub> {};
struct TEvVSlotReadyUpdate : TEventLocal<TEvVSlotReadyUpdate, EvVSlotReadyUpdate> {};
struct TEvVSlotNotReadyHistogramUpdate : TEventLocal<TEvVSlotNotReadyHistogramUpdate, EvVSlotNotReadyHistogramUpdate> {};
+
+ struct TEvUpdateHostRecords : TEventLocal<TEvUpdateHostRecords, EvUpdateHostRecords> {
+ THostRecordMap HostRecords;
+
+ TEvUpdateHostRecords(THostRecordMap hostRecords)
+ : HostRecords(std::move(hostRecords))
+ {}
+ };
};
static constexpr TDuration UpdateSystemViewsPeriod = TDuration::Seconds(5);
@@ -1546,6 +1578,16 @@ private:
void Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev);
void HandleHostRecordsTimeToLiveExceeded();
+public:
+    // The self-heal actor's main purpose is to monitor FAULTY pdisks and to gradually move groups away from them;
+    // no single move may render a group unusable or exceed its fail model. It also respects replication broker
+    // constraints, such as replicating only one vslot per PDisk at a time.
+    //
+    // It interacts with BS_CONTROLLER and the group observer (which provides per-vdisk information about group
+    // state). BS_CONTROLLER reports faulty PDisks and all affected groups via push notifications.
+ IActor *CreateSelfHealActor();
+
+private:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Online state
void Handle(TEvBlobStorage::TEvControllerRegisterNode::TPtr &ev);
diff --git a/ydb/core/mind/bscontroller/load_everything.cpp b/ydb/core/mind/bscontroller/load_everything.cpp
index 8fa32c53f7..8f33e2a1b6 100644
--- a/ydb/core/mind/bscontroller/load_everything.cpp
+++ b/ydb/core/mind/bscontroller/load_everything.cpp
@@ -83,6 +83,7 @@ public:
Self->GroupReservePart = state.GetValue<T::GroupReservePart>();
Self->MaxScrubbedDisksAtOnce = state.GetValue<T::MaxScrubbedDisksAtOnce>();
Self->PDiskSpaceColorBorder = state.GetValue<T::PDiskSpaceColorBorder>();
+ Self->GroupLayoutSanitizer = state.GetValue<T::GroupLayoutSanitizer>();
Self->SysViewChangedSettings = true;
}
}
@@ -285,8 +286,8 @@ public:
disks.GetValue<T::Guid>(), getOpt(T::SharedWithOs()), getOpt(T::ReadCentric()),
disks.GetValueOrDefault<T::NextVSlotId>(), disks.GetValue<T::PDiskConfig>(), boxId,
Self->DefaultMaxSlots, disks.GetValue<T::Status>(), disks.GetValue<T::Timestamp>(),
- disks.GetValue<T::ExpectedSerial>(), disks.GetValue<T::LastSeenSerial>(),
- disks.GetValue<T::LastSeenPath>(), staticSlotUsage);
+ disks.GetValue<T::DecommitStatus>(), disks.GetValue<T::ExpectedSerial>(),
+ disks.GetValue<T::LastSeenSerial>(), disks.GetValue<T::LastSeenPath>(), staticSlotUsage);
if (!disks.Next())
return false;
diff --git a/ydb/core/mind/bscontroller/scheme.h b/ydb/core/mind/bscontroller/scheme.h
index 3b7a425c95..cd47ac234d 100644
--- a/ydb/core/mind/bscontroller/scheme.h
+++ b/ydb/core/mind/bscontroller/scheme.h
@@ -38,10 +38,11 @@ struct Schema : NIceDb::Schema {
struct ExpectedSerial : Column<14, NScheme::NTypeIds::String> {};
struct LastSeenSerial : Column<15, NScheme::NTypeIds::String> {};
struct LastSeenPath : Column<16, NScheme::NTypeIds::String> {};
+ struct DecommitStatus : Column<17, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::EDecommitStatus; static constexpr Type Default = Type::DECOMMIT_NONE; };
using TKey = TableKey<NodeID, PDiskID>; // order is important
using TColumns = TableColumns<NodeID, PDiskID, Path, Category, Guid, SharedWithOs, ReadCentric, NextVSlotId,
- Status, Timestamp, PDiskConfig, ExpectedSerial, LastSeenSerial, LastSeenPath>;
+ Status, Timestamp, PDiskConfig, ExpectedSerial, LastSeenSerial, LastSeenPath, DecommitStatus>;
};
struct Group : Table<4> {
@@ -84,11 +85,13 @@ struct Schema : NIceDb::Schema {
struct GroupReservePart : Column<15, NScheme::NTypeIds::Uint32> { static constexpr Type Default = 0; }; // parts per million
struct MaxScrubbedDisksAtOnce : Column<16, NScheme::NTypeIds::Uint32> { static constexpr Type Default = Max<ui32>(); }; // no limit
struct PDiskSpaceColorBorder : Column<17, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::TPDiskSpaceColor::E; static constexpr Type Default = NKikimrBlobStorage::TPDiskSpaceColor::GREEN; };
+ struct GroupLayoutSanitizer : Column<18, NScheme::NTypeIds::Bool> { static constexpr Type Default = false; };
using TKey = TableKey<FixedKey>;
using TColumns = TableColumns<FixedKey, NextGroupID, SchemaVersion, NextOperationLogIndex, DefaultMaxSlots,
InstanceId, SelfHealEnable, DonorModeEnable, ScrubPeriodicity, SerialManagementStage, NextStoragePoolId,
- PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder>;
+ PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder,
+ GroupLayoutSanitizer>;
};
struct VSlot : Table<5> {
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index b732d98e93..24c6045625 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -2,6 +2,8 @@
#include "impl.h"
#include "vdisk_status_tracker.h"
#include "config.h"
+#include "group_geometry_info.h"
+#include "group_layout_checker.h"
namespace NKikimr::NBsController {
@@ -111,7 +113,9 @@ namespace NKikimr::NBsController {
void Handle(TEvBlobStorage::TEvVStatusResult::TPtr& ev) {
const auto& record = ev->Get()->Record;
- STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH03, "Reassigner TEvVStatusResult", (GroupId, GroupId), (Response, record));
+ STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH03, "Reassigner TEvVStatusResult", (GroupId, GroupId),
+ (Status, record.GetStatus()), (JoinedGroup, record.GetJoinedGroup()),
+ (Replicated, record.GetReplicated()));
bool diskIsOk = false;
if (record.GetStatus() == NKikimrProto::RACE) {
@@ -169,6 +173,13 @@ namespace NKikimr::NBsController {
if (!record.GetResponse().GetSuccess()) {
STLOG(PRI_WARN, BS_SELFHEAL, BSSH07, "Reassigner ReassignGroupDisk request failed", (GroupId, GroupId),
(VDiskToReplace, VDiskToReplace), (Response, record));
+ } else {
+ TString items = "none";
+ for (const auto& item : record.GetResponse().GetStatus(0).GetReassignedItem()) {
+     items = TStringBuilder() << (items != "none" ? items + ", " : TString()) << VDiskIDFromVDiskID(item.GetVDiskId()) << ": "
+         << TVSlotId(item.GetFrom()) << " -> " << TVSlotId(item.GetTo());
+ }
+ STLOG(PRI_INFO, BS_SELFHEAL, BSSH09, "Reassigner succeeded", (GroupId, GroupId), (Items, items));
}
Finish(record.GetResponse().GetSuccess());
}
@@ -204,23 +215,36 @@ namespace NKikimr::NBsController {
})
};
- class TSelfHealActor : public TActorBootstrapped<TSelfHealActor> {
+ class TBlobStorageController::TSelfHealActor : public TActorBootstrapped<TSelfHealActor> {
static constexpr TDuration MinRetryTimeout = TDuration::Seconds(1);
static constexpr TDuration MaxRetryTimeout = TDuration::Seconds(60);
- struct TGroupRecord {
+ struct TWithFaultyDisks {};
+ struct TWithInvalidLayout {};
+
+ struct TGroupRecord
+ : TIntrusiveListItem<TGroupRecord, TWithFaultyDisks>
+ , TIntrusiveListItem<TGroupRecord, TWithInvalidLayout>
+ {
+ const TGroupId GroupId;
TEvControllerUpdateSelfHealInfo::TGroupContent Content;
TActorId ReassignerActorId; // reassigner in flight
TDuration RetryTimeout = MinRetryTimeout;
TInstant NextRetryTimestamp = TInstant::Zero();
THashMap<TVDiskID, TVDiskStatusTracker> VDiskStatus;
+ bool LayoutValid = false;
+
+ TGroupRecord(TGroupId groupId) : GroupId(groupId) {}
};
const ui64 TabletId;
TActorId ControllerId;
THashMap<TGroupId, TGroupRecord> Groups;
- TSet<TGroupId> GroupsWithFaultyDisks;
+ TIntrusiveList<TGroupRecord, TWithFaultyDisks> GroupsWithFaultyDisks;
+ TIntrusiveList<TGroupRecord, TWithInvalidLayout> GroupsWithInvalidLayout;
std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups;
+ bool GroupLayoutSanitizer = false;
+ THostRecordMap HostRecords;
public:
TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups)
@@ -236,11 +260,17 @@ namespace NKikimr::NBsController {
void Handle(TEvControllerUpdateSelfHealInfo::TPtr& ev) {
const TInstant now = TActivationContext::Now();
+ if (const auto& setting = ev->Get()->GroupLayoutSanitizer) {
+ GroupLayoutSanitizer = *setting;
+ }
for (const auto& [groupId, data] : ev->Get()->GroupsToUpdate) {
if (data) {
- auto& g = Groups[groupId];
+ const auto [it, inserted] = Groups.try_emplace(groupId, groupId);
+ auto& g = it->second;
bool hasFaultyDisks = false;
g.Content = std::move(*data);
+ g.LayoutValid = false;
+ GroupsWithInvalidLayout.PushBack(&g);
for (const auto& [vdiskId, vdisk] : g.Content.VDisks) {
g.VDiskStatus[vdiskId].Update(vdisk.VDiskStatus, now);
hasFaultyDisks |= vdisk.Faulty;
@@ -253,9 +283,9 @@ namespace NKikimr::NBsController {
}
}
if (hasFaultyDisks) {
- GroupsWithFaultyDisks.insert(groupId);
+ GroupsWithFaultyDisks.PushBack(&g);
} else {
- GroupsWithFaultyDisks.erase(groupId);
+ GroupsWithFaultyDisks.Remove(&g);
}
} else {
// find the group to delete
@@ -272,7 +302,6 @@ namespace NKikimr::NBsController {
}
// remove the group
- GroupsWithFaultyDisks.erase(groupId);
Groups.erase(it);
}
}
@@ -293,29 +322,40 @@ namespace NKikimr::NBsController {
ui64 counter = 0;
- for (const TGroupId groupId : GroupsWithFaultyDisks) {
- // find the group to process
- const auto it = Groups.find(groupId);
- Y_VERIFY(it != Groups.end());
- TGroupRecord& group = it->second;
-
+ for (TGroupRecord& group : GroupsWithFaultyDisks) {
if (group.ReassignerActorId || now < group.NextRetryTimestamp) {
continue; // we are already running reassigner for this group
}
// check if it is possible to move anything out
if (const auto v = FindVDiskToReplace(group.VDiskStatus, group.Content, now)) {
- group.ReassignerActorId = Register(new TReassignerActor(ControllerId, groupId, group.Content, *v));
+ group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v));
} else {
++counter; // this group can't be reassigned right now
}
}
+ if (GroupLayoutSanitizer) {
+ for (auto it = GroupsWithInvalidLayout.begin(); it != GroupsWithInvalidLayout.end(); ) {
+ TGroupRecord& group = *it++;
+ Y_VERIFY(!group.LayoutValid);
+ if (group.ReassignerActorId || now < group.NextRetryTimestamp) {
+ // nothing to do
+ } else if (const auto v = FindVDiskToReplaceByLayout(group, now)) {
+ group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v));
+ } else if (group.LayoutValid) {
+ GroupsWithInvalidLayout.Remove(&group);
+ } else {
+ ++counter;
+ }
+ }
+ }
+
UnreassignableGroups->store(counter);
}
std::optional<TVDiskID> FindVDiskToReplace(const THashMap<TVDiskID, TVDiskStatusTracker>& tracker,
- const TEvControllerUpdateSelfHealInfo::TGroupContent& content, const TInstant now) {
+ const TEvControllerUpdateSelfHealInfo::TGroupContent& content, TInstant now) {
auto status = [&](const TVDiskID& id) {
try {
return tracker.at(id).GetStatus(now);
@@ -362,6 +402,41 @@ namespace NKikimr::NBsController {
}
}
+ std::optional<TVDiskID> FindVDiskToReplaceByLayout(TGroupRecord& group, TInstant now) {
+ THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout;
+ for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
+ Y_VERIFY(HostRecords);
+ if (!vdisk.Decommitted) {
+ layout.emplace(vdiskId, std::make_pair(HostRecords->GetLocation(vdisk.Location.NodeId),
+ vdisk.Location.ComprisingPDiskId()));
+ }
+ }
+ const TLayoutCheckResult checkResult = CheckGroupLayout(*group.Content.Geometry, layout);
+ if (checkResult) { // group is valid
+ group.LayoutValid = true;
+ return std::nullopt;
+ }
+
+ THashSet<TVDiskIdShort> badDisks;
+ for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
+ const auto it = group.VDiskStatus.find(vdiskId);
+ if (it == group.VDiskStatus.end() || it->second.GetStatus(now) != NKikimrBlobStorage::EVDiskStatus::READY || vdisk.Bad) {
+ badDisks.insert(vdiskId);
+ }
+ }
+ if (badDisks.empty()) {
+ return TVDiskID(group.GroupId, group.Content.Generation, checkResult.Candidates.front());
+ } else if (badDisks.size() == 1) {
+ for (const auto& vdiskId : checkResult.Candidates) {
+ if (badDisks.contains(vdiskId)) {
+ return TVDiskID(group.GroupId, group.Content.Generation, vdiskId);
+ }
+ }
+ }
+
+ return std::nullopt;
+ }
+
void HandleWakeup() {
CheckGroups();
Schedule(TDuration::Seconds(10), new TEvents::TEvWakeup);
@@ -439,9 +514,8 @@ namespace NKikimr::NBsController {
TABLE_CLASS("table-sortable table") {
TABLEHEAD() {
ui32 numCols = 0;
- for (const auto& id : GroupsWithFaultyDisks) {
- const auto& info = Groups.at(id);
- numCols = Max<ui32>(numCols, info.Content.VDisks.size());
+ for (const TGroupRecord& group : GroupsWithFaultyDisks) {
+ numCols = Max<ui32>(numCols, group.Content.VDisks.size());
}
TABLER() {
@@ -452,20 +526,19 @@ namespace NKikimr::NBsController {
}
}
TABLEBODY() {
- for (const auto& id : GroupsWithFaultyDisks) {
- const auto& info = Groups.at(id);
+ for (const TGroupRecord& group : GroupsWithFaultyDisks) {
TABLER() {
out << "<td rowspan='2'><a href='?TabletID=" << TabletId
- << "&page=GroupDetail&GroupId=" << id << "'>"
- << id << "</a>:" << info.Content.Generation << "</td>";
+ << "&page=GroupDetail&GroupId=" << group.GroupId << "'>"
+ << group.GroupId << "</a>:" << group.Content.Generation << "</td>";
- for (const auto& [vdiskId, vdisk] : info.Content.VDisks) {
+ for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
TABLED() {
out << vdiskId.ToString();
out << "<br/>";
out << vdisk.VDiskStatus;
out << "<br/><strong>";
- if (const auto it = info.VDiskStatus.find(vdiskId); it != info.VDiskStatus.end()) {
+ if (const auto it = group.VDiskStatus.find(vdiskId); it != group.VDiskStatus.end()) {
if (const auto& status = it->second.GetStatus(now)) {
out << *status;
} else {
@@ -479,7 +552,7 @@ namespace NKikimr::NBsController {
}
}
TABLER() {
- for (const auto& [vdiskId, vdisk] : info.Content.VDisks) {
+ for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
TABLED() {
const auto& l = vdisk.Location;
if (vdisk.Faulty) {
@@ -506,17 +579,22 @@ namespace NKikimr::NBsController {
}
}
+ void Handle(TEvPrivate::TEvUpdateHostRecords::TPtr ev) {
+ HostRecords = std::move(ev->Get()->HostRecords);
+ }
+
STRICT_STFUNC(StateFunc, {
cFunc(TEvents::TSystem::Poison, PassAway);
hFunc(TEvControllerUpdateSelfHealInfo, Handle);
hFunc(NMon::TEvRemoteHttpInfo, Handle);
hFunc(TEvReassignerDone, Handle);
cFunc(TEvents::TSystem::Wakeup, HandleWakeup);
+ hFunc(TEvPrivate::TEvUpdateHostRecords, Handle);
})
};
- IActor *CreateSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups) {
- return new TSelfHealActor(tabletId, std::move(unreassignableGroups));
+ IActor *TBlobStorageController::CreateSelfHealActor() {
+ return new TSelfHealActor(TabletID(), SelfHealUnreassignableGroups);
}
void TBlobStorageController::InitializeSelfHealState() {
@@ -525,10 +603,13 @@ namespace NKikimr::NBsController {
ev->GroupsToUpdate.emplace(groupId, TEvControllerUpdateSelfHealInfo::TGroupContent());
}
FillInSelfHealGroups(*ev, nullptr);
+ ev->GroupLayoutSanitizer = GroupLayoutSanitizer;
Send(SelfHealId, ev.Release());
}
void TBlobStorageController::FillInSelfHealGroups(TEvControllerUpdateSelfHealInfo& msg, TConfigState *state) {
+ THashMap<TBoxStoragePoolId, std::shared_ptr<TGroupGeometryInfo>> geomCache;
+
for (auto& [groupId, group] : msg.GroupsToUpdate) {
if (!group) {
continue;
@@ -540,11 +621,24 @@ namespace NKikimr::NBsController {
group->Generation = p->Generation;
group->Type = TBlobStorageGroupType(p->ErasureSpecies);
+ if (auto it = geomCache.find(p->StoragePoolId); it != geomCache.end()) {
+ group->Geometry = it->second;
+ } else {
+ const TMap<TBoxStoragePoolId, TStoragePoolInfo>& storagePools = state
+ ? state->StoragePools.Get()
+ : StoragePools;
+ const auto spIt = storagePools.find(p->StoragePoolId);
+ Y_VERIFY(spIt != storagePools.end());
+ group->Geometry = std::make_unique<TGroupGeometryInfo>(group->Type, spIt->second.GetGroupGeometry());
+ geomCache.emplace(p->StoragePoolId, group->Geometry);
+ }
+
for (const TVSlotInfo *slot : p->VDisksInGroup) {
group->VDisks[slot->GetVDiskId()] = {
slot->VSlotId,
slot->PDisk->ShouldBeSettledBySelfHeal(),
slot->PDisk->BadInTermsOfSelfHeal(),
+ slot->PDisk->Decommitted(),
slot->GetStatus(),
};
}
diff --git a/ydb/core/mind/bscontroller/self_heal.h b/ydb/core/mind/bscontroller/self_heal.h
index 287f05d467..b2740f4800 100644
--- a/ydb/core/mind/bscontroller/self_heal.h
+++ b/ydb/core/mind/bscontroller/self_heal.h
@@ -6,29 +6,26 @@
namespace NKikimr::NBsController {
+ class TGroupGeometryInfo;
+
struct TEvControllerUpdateSelfHealInfo : TEventLocal<TEvControllerUpdateSelfHealInfo, TEvBlobStorage::EvControllerUpdateSelfHealInfo> {
struct TGroupContent {
struct TVDiskInfo {
TVSlotId Location;
bool Faulty;
bool Bad;
+ bool Decommitted;
NKikimrBlobStorage::EVDiskStatus VDiskStatus;
};
ui32 Generation;
TBlobStorageGroupType Type;
TMap<TVDiskID, TVDiskInfo> VDisks;
+ std::shared_ptr<TGroupGeometryInfo> Geometry;
};
THashMap<TGroupId, std::optional<TGroupContent>> GroupsToUpdate; // groups with faulty disks that have changed or got faulty PDisks for the first time
TVector<std::pair<TVDiskID, NKikimrBlobStorage::EVDiskStatus>> VDiskStatusUpdate;
+ std::optional<bool> GroupLayoutSanitizer;
};
- // Self-heal actor's main purpose is to monitor FAULTY pdisks and to slightly move groups out of them; every move
- // should not render group unusable, also it should not exceed its fail model. It also takes into account replication
- // broker features such as only one vslot over PDisk is being replicated at a moment.
- //
- // It interacts with BS_CONTROLLER and group observer (which provides information about group state on a per-vdisk
- // basis). BS_CONTROLLER reports faulty PDisks and all involved groups in a push notification manner.
- IActor *CreateSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups);
-
} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/sys_view.cpp b/ydb/core/mind/bscontroller/sys_view.cpp
index ba9b50cde0..8dffa53468 100644
--- a/ydb/core/mind/bscontroller/sys_view.cpp
+++ b/ydb/core/mind/bscontroller/sys_view.cpp
@@ -289,8 +289,17 @@ public:
const auto readCentric = pdisk.HasReadCentric() ? MakeMaybe(pdisk.GetReadCentric()) : Nothing();
if (filter.MatchPDisk(pdisk.GetCategory(), sharedWithOs, readCentric)) {
const TNodeLocation& location = HostRecords->GetLocation(pdiskId.NodeId);
- const bool ok = mapper.RegisterPDisk(pdiskId, location, true, pdisk.GetNumActiveSlots(),
- pdisk.GetExpectedSlotCount(), nullptr, 0, 0, true);
+ const bool ok = mapper.RegisterPDisk({
+ .PDiskId = pdiskId,
+ .Location = location,
+ .Usable = true,
+ .NumSlots = pdisk.GetNumActiveSlots(),
+ .MaxSlots = pdisk.GetExpectedSlotCount(),
+ .Groups = {},
+ .SpaceAvailable = 0,
+ .Operational = true,
+ .Decommitted = false,
+ });
Y_VERIFY(ok);
break;
}
@@ -301,7 +310,7 @@ public:
TGroupMapper::TGroupDefinition group;
TString error;
std::deque<ui64> groupSizes;
- while (mapper.AllocateGroup(groupSizes.size(), group, nullptr, 0, {}, 0, false, error)) {
+ while (mapper.AllocateGroup(groupSizes.size(), group, {}, {}, 0, false, error)) {
std::vector<TGroupDiskInfo> disks;
std::deque<NKikimrBlobStorage::TPDiskMetrics> pdiskMetrics;
std::deque<NKikimrBlobStorage::TVDiskMetrics> vdiskMetrics;
@@ -425,6 +434,7 @@ void CopyInfo(NKikimrSysView::TPDiskInfo* info, const THolder<TBlobStorageContro
}
info->SetExpectedSlotCount(pDiskInfo->ExpectedSlotCount);
info->SetNumActiveSlots(pDiskInfo->NumActiveSlots + pDiskInfo->StaticSlotUsage);
+ info->SetDecommitStatus(NKikimrBlobStorage::EDecommitStatus_Name(pDiskInfo->DecommitStatus));
}
void SerializeVSlotInfo(NKikimrSysView::TVSlotInfo *pb, const TVDiskID& vdiskId, const NKikimrBlobStorage::TVDiskMetrics& m,
@@ -518,7 +528,6 @@ void CopyInfo(TDstMap& dst, TDeletedSet& deleted, const TSrcMap& src, TChangedSe
deleted.insert(key);
}
}
- changed.clear();
}
void TBlobStorageController::UpdateSystemViews() {
@@ -527,7 +536,7 @@ void TBlobStorageController::UpdateSystemViews() {
}
if (!SysViewChangedPDisks.empty() || !SysViewChangedVSlots.empty() || !SysViewChangedGroups.empty() ||
- !SysViewChangedStoragePools.empty()) {
+ !SysViewChangedStoragePools.empty() || SysViewChangedSettings) {
auto update = MakeHolder<TEvControllerUpdateSystemViews>();
update->HostRecords = HostRecords;
update->GroupReserveMin = GroupReserveMin;
@@ -538,7 +547,6 @@ void TBlobStorageController::UpdateSystemViews() {
CopyInfo(state.VSlots, update->DeletedVSlots, VSlots, SysViewChangedVSlots);
CopyInfo(state.Groups, update->DeletedGroups, GroupMap, SysViewChangedGroups);
CopyInfo(state.StoragePools, update->DeletedStoragePools, StoragePools, SysViewChangedStoragePools);
- SysViewChangedSettings = false;
// process static slots and static groups
for (const auto& [pdiskId, pdisk] : StaticPDisks) {
@@ -558,6 +566,7 @@ void TBlobStorageController::UpdateSystemViews() {
}
}
pb->SetStatusV2(NKikimrBlobStorage::EDriveStatus_Name(NKikimrBlobStorage::EDriveStatus::ACTIVE));
+ pb->SetDecommitStatus(NKikimrBlobStorage::EDecommitStatus_Name(NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE));
pb->SetExpectedSlotCount(pdisk.ExpectedSlotCount ? pdisk.ExpectedSlotCount : pdisk.StaticSlotUsage);
pb->SetNumActiveSlots(pdisk.StaticSlotUsage);
}
@@ -603,6 +612,12 @@ void TBlobStorageController::UpdateSystemViews() {
CalculateGroupUsageStats(pb, disks, (TBlobStorageGroupType::EErasureSpecies)group.GetErasureSpecies());
}
+ SysViewChangedPDisks.clear();
+ SysViewChangedVSlots.clear();
+ SysViewChangedGroups.clear();
+ SysViewChangedStoragePools.clear();
+ SysViewChangedSettings = false;
+
Send(SystemViewsCollectorId, update.Release());
}
diff --git a/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp b/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp
index 5c2d0aad20..d4205abea8 100644
--- a/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp
+++ b/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp
@@ -1,6 +1,7 @@
#include <library/cpp/testing/unittest/registar.h>
#include <ydb/core/util/testactorsys.h>
#include <ydb/core/mind/bscontroller/self_heal.h>
+#include <ydb/core/mind/bscontroller/impl.h>
using namespace NActors;
using namespace NKikimr;
@@ -13,8 +14,8 @@ void RunTestCase(TCallback&& callback) {
TTestActorSystem runtime(1);
runtime.Start();
const TActorId& parentId = runtime.AllocateEdgeActor(1);
- std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups = std::make_shared<std::atomic_uint64_t>();
- const TActorId& selfHealId = runtime.Register(CreateSelfHealActor(1, UnreassignableGroups), parentId, {}, {}, 1);
+ TBlobStorageController Controller({}, new TTabletStorageInfo(1, TTabletTypes::FLAT_BS_CONTROLLER));
+ const TActorId& selfHealId = runtime.Register(Controller.CreateSelfHealActor(), parentId, {}, {}, 1);
callback(selfHealId, parentId, runtime);
runtime.Stop();
}
diff --git a/ydb/core/mind/bscontroller/ya.make b/ydb/core/mind/bscontroller/ya.make
index 275a4dc238..c652ec110e 100644
--- a/ydb/core/mind/bscontroller/ya.make
+++ b/ydb/core/mind/bscontroller/ya.make
@@ -25,6 +25,8 @@ SRCS(
get_group.cpp
grouper.cpp
grouper.h
+ group_layout_checker.cpp
+ group_layout_checker.h
group_mapper.cpp
group_mapper.h
group_reconfigure_wipe.cpp
diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto
index 10d0a29e6c..bc7f54aa6e 100644
--- a/ydb/core/protos/blobstorage_config.proto
+++ b/ydb/core/protos/blobstorage_config.proto
@@ -186,11 +186,18 @@ enum EDriveStatus {
ACTIVE = 1; // working as expected
INACTIVE = 2; // new groups are not created over this drive, but existing ones continue to work as expected
BROKEN = 3; // drive is not working, groups are automatically moved out of this drive upon reception of this status
- SPARE = 4; // spare drive -- groups are created only when being moved from BROKEN drives
+ reserved 4;
FAULTY = 5; // drive is expected to become BROKEN soon, new groups are not created, old groups are asynchronously moved out from this drive
TO_BE_REMOVED = 6; // same as INACTIVE, but drive is counted in fault model as not working
}
+enum EDecommitStatus {
+ DECOMMIT_UNSET = 0; // unset status -- missing optional field
+ DECOMMIT_NONE = 1; // no decommission
+ DECOMMIT_PENDING = 2; // drive is going to be removed soon, but the SelfHeal logic will not settle it automatically
+ DECOMMIT_IMMINENT = 3; // drive is going to be settled automatically
+}
+
message TGroupStatus {
enum E {
UNKNOWN = 0; // group status is unknown (default value)
@@ -226,6 +233,7 @@ message TUpdateDriveStatus {
uint32 PDiskId = 4; // may be set instead of path to identify PDisk
string Serial = 5; // may be set instead of path and PDiskId to identify PDisk
uint64 StatusChangeTimestamp = 6; // used only in the ReadDriveStatus response
+ EDecommitStatus DecommitStatus = 7;
}
message TReadDriveStatus {
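For illustration only (not part of the patch), a drive could be marked for decommission through the existing configuration request path using the new field. This sketch assumes the TConfigRequest/TCommand wrapper messages defined elsewhere in this file and the generated Arcadia-style accessors; the serial number is hypothetical:

    NKikimrBlobStorage::TConfigRequest request;
    auto *cmd = request.AddCommand()->MutableUpdateDriveStatus();
    cmd->SetSerial("SERIAL-1234"); // a PDiskId or path could identify the drive instead
    cmd->SetDecommitStatus(NKikimrBlobStorage::EDecommitStatus::DECOMMIT_PENDING);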
@@ -426,6 +434,7 @@ message TUpdateSettings {
repeated uint32 GroupReservePartPPM = 7;
repeated uint32 MaxScrubbedDisksAtOnce = 8;
repeated NKikimrBlobStorage.TPDiskSpaceColor.E PDiskSpaceColorBorder = 9;
+ repeated bool EnableGroupLayoutSanitizer = 10;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
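Along the same lines, a hedged sketch of enabling the new sanitizer via the UpdateSettings command (assumes the generated Add* accessor for the repeated field; illustrative only, not part of the patch):

    NKikimrBlobStorage::TConfigRequest request;
    request.AddCommand()->MutableUpdateSettings()->AddEnableGroupLayoutSanitizer(true);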
@@ -526,6 +535,7 @@ message TBaseConfig {
uint32 ExpectedSlotCount = 13;
NKikimrBlobStorage.TPDiskMetrics PDiskMetrics = 14;
uint64 DriveStatusChangeTimestamp = 15; // TInstant::GetValue()
+ EDecommitStatus DecommitStatus = 16;
}
message TVSlot {
message TDonorDisk {
diff --git a/ydb/core/protos/out/out.cpp b/ydb/core/protos/out/out.cpp
index 28c49a5c36..88db8ad5f8 100644
--- a/ydb/core/protos/out/out.cpp
+++ b/ydb/core/protos/out/out.cpp
@@ -69,6 +69,10 @@ Y_DECLARE_OUT_SPEC(, NKikimrBlobStorage::EDriveStatus, stream, value) {
stream << NKikimrBlobStorage::EDriveStatus_Name(value);
}
+Y_DECLARE_OUT_SPEC(, NKikimrBlobStorage::EDecommitStatus, stream, value) {
+ stream << NKikimrBlobStorage::EDecommitStatus_Name(value);
+}
+
Y_DECLARE_OUT_SPEC(, NKikimrBlobStorage::TGroupStatus::E, stream, value) {
stream << NKikimrBlobStorage::TGroupStatus::E_Name(value);
}
diff --git a/ydb/core/protos/sys_view.proto b/ydb/core/protos/sys_view.proto
index 44abe03515..4d495d6b60 100644
--- a/ydb/core/protos/sys_view.proto
+++ b/ydb/core/protos/sys_view.proto
@@ -166,6 +166,7 @@ message TPDiskInfo {
optional uint32 ExpectedSlotCount = 15;
optional uint32 NumActiveSlots = 16;
optional uint64 Category = 17;
+ optional string DecommitStatus = 18;
// metrics ?
// physical location ?
// config ?
diff --git a/ydb/core/sys_view/common/schema.h b/ydb/core/sys_view/common/schema.h
index f7a4f191e1..29ba8b9b0f 100644
--- a/ydb/core/sys_view/common/schema.h
+++ b/ydb/core/sys_view/common/schema.h
@@ -191,6 +191,7 @@ struct Schema : NIceDb::Schema {
struct StatusChangeTimestamp : Column<14, NScheme::NTypeIds::Timestamp> {};
struct ExpectedSlotCount : Column<15, NScheme::NTypeIds::Uint32> {};
struct NumActiveSlots : Column<16, NScheme::NTypeIds::Uint32> {};
+ struct DecommitStatus : Column<17, NScheme::NTypeIds::String> {};
using TKey = TableKey<NodeId, PDiskId>;
using TColumns = TableColumns<
@@ -208,7 +209,8 @@ struct Schema : NIceDb::Schema {
Status,
StatusChangeTimestamp,
ExpectedSlotCount,
- NumActiveSlots>;
+ NumActiveSlots,
+ DecommitStatus>;
};
struct VSlots : Table<5> {
diff --git a/ydb/core/sys_view/storage/pdisks.cpp b/ydb/core/sys_view/storage/pdisks.cpp
index 47f84ae9aa..7b99991c52 100644
--- a/ydb/core/sys_view/storage/pdisks.cpp
+++ b/ydb/core/sys_view/storage/pdisks.cpp
@@ -39,6 +39,7 @@ public:
{T::StatusChangeTimestamp::ColumnId, {E::kInfoFieldNumber, V::kStatusChangeTimestampFieldNumber}},
{T::ExpectedSlotCount::ColumnId, {E::kInfoFieldNumber, V::kExpectedSlotCountFieldNumber}},
{T::NumActiveSlots::ColumnId, {E::kInfoFieldNumber, V::kNumActiveSlotsFieldNumber}},
+ {T::DecommitStatus::ColumnId, {E::kInfoFieldNumber, V::kDecommitStatusFieldNumber}},
};
return fieldMap;
}
diff --git a/ydb/core/sys_view/ut_kqp.cpp b/ydb/core/sys_view/ut_kqp.cpp
index c5f6c5470a..56eb5af269 100644
--- a/ydb/core/sys_view/ut_kqp.cpp
+++ b/ydb/core/sys_view/ut_kqp.cpp
@@ -884,8 +884,10 @@ Y_UNIT_TEST_SUITE(SystemView) {
TotalSize,
Type,
ExpectedSlotCount,
- NumActiveSlots
- FROM `/Root/.sys/ds_pdisks`;
+ NumActiveSlots,
+ DecommitStatus
+ FROM `/Root/.sys/ds_pdisks`
+ WHERE BoxId IS NOT NULL;
)").GetValueSync();
UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString());
@@ -900,7 +902,7 @@ Y_UNIT_TEST_SUITE(SystemView) {
}
}
- TYsonFieldChecker check(ysonString, 15);
+ TYsonFieldChecker check(ysonString, 16);
check.Uint64(0u); // AvailableSize
check.Uint64(999u); // BoxId
@@ -917,6 +919,7 @@ Y_UNIT_TEST_SUITE(SystemView) {
check.String("ROT"); // Type
check.Uint64(16); // ExpectedSlotCount
check.Uint64(2); // NumActiveSlots
+ check.String("DECOMMIT_NONE"); // DecommitStatus
}
Y_UNIT_TEST(VSlotsFields) {
diff --git a/ydb/core/util/testactorsys.cpp b/ydb/core/util/testactorsys.cpp
index 4a270e4f8c..ead159d35e 100644
--- a/ydb/core/util/testactorsys.cpp
+++ b/ydb/core/util/testactorsys.cpp
@@ -132,17 +132,25 @@ TActorId TTestActorSystem::CreateTestBootstrapper(TTabletStorageInfo *info, std:
return Register(CreateBootstrapper(info, bi.Get()), nodeId);
}
-void TTestActorSystem::SetupTabletRuntime(bool isMirror3dc, ui32 stateStorageNodeId, ui32 targetNodeId) {
- auto setup = MakeIntrusive<TTableNameserverSetup>();
- ui32 nodeCountInDC = (MaxNodeId + 2) / 3;
- for (ui32 nodeId : GetNodes()) {
- const TString name = Sprintf("127.0.0.%u", nodeId);
- ui32 dcNum = isMirror3dc ? ((nodeId + nodeCountInDC - 1) / nodeCountInDC) : 1;
+void TTestActorSystem::SetupTabletRuntime(ui32 numDataCenters, ui32 stateStorageNodeId, ui32 targetNodeId) {
+ const ui32 nodeCountInDC = (MaxNodeId + numDataCenters - 1) / numDataCenters;
+ auto locationGenerator = [&](ui32 nodeId) {
+ const ui32 dcNum = (nodeId + nodeCountInDC - 1) / nodeCountInDC;
NActorsInterconnect::TNodeLocation location;
location.SetDataCenter(ToString(dcNum));
location.SetRack(ToString(nodeId));
location.SetUnit(ToString(nodeId));
- setup->StaticNodeTable[nodeId] = {name, name, name, 19001, TNodeLocation(location)};
+ return TNodeLocation(location);
+ };
+ SetupTabletRuntime(locationGenerator, stateStorageNodeId, targetNodeId);
+}
+
+void TTestActorSystem::SetupTabletRuntime(const std::function<TNodeLocation(ui32)>& locationGenerator,
+ ui32 stateStorageNodeId, ui32 targetNodeId) {
+ auto setup = MakeIntrusive<TTableNameserverSetup>();
+ for (ui32 nodeId : GetNodes()) {
+ const TString name = Sprintf("127.0.0.%u", nodeId);
+ setup->StaticNodeTable[nodeId] = {name, name, name, 19001, locationGenerator(nodeId)};
}
for (ui32 nodeId : GetNodes()) {
diff --git a/ydb/core/util/testactorsys.h b/ydb/core/util/testactorsys.h
index 722d5a8163..5037f7ab71 100644
--- a/ydb/core/util/testactorsys.h
+++ b/ydb/core/util/testactorsys.h
@@ -660,7 +660,9 @@ public:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// tablet-related utility functions
- void SetupTabletRuntime(bool isMirror3dc = false, ui32 stateStorageNodeId = 0, ui32 targetNodeId = 0);
+ void SetupTabletRuntime(ui32 numDataCenters = 1, ui32 stateStorageNodeId = 0, ui32 targetNodeId = 0);
+ void SetupTabletRuntime(const std::function<TNodeLocation(ui32)>& locationGenerator, ui32 stateStorageNodeId = 0,
+ ui32 targetNodeId = 0);
static NTabletPipe::TClientConfig GetPipeConfigWithRetries();
void SendToPipe(ui64 tabletId, const TActorId& sender, IEventBase* payload, ui64 cookie, const NKikimr::NTabletPipe::TClientConfig& pipeConfig);
static TTabletStorageInfo *CreateTestTabletInfo(ui64 tabletId, TTabletTypes::EType tabletType, TBlobStorageGroupType::EErasureSpecies erasure, ui32 groupId);
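A short usage sketch of the new location-generator overload (hypothetical test snippet, not part of the patch); it sets the same DataCenter/Rack/Unit fields as the default generator above, here spreading nine nodes round-robin across three data centers:

    TTestActorSystem runtime(9);
    runtime.Start();
    runtime.SetupTabletRuntime([](ui32 nodeId) {
        NActorsInterconnect::TNodeLocation proto;
        proto.SetDataCenter(ToString((nodeId - 1) % 3 + 1)); // nodes 1..9 -> DCs 1..3
        proto.SetRack(ToString(nodeId));
        proto.SetUnit(ToString(nodeId));
        return TNodeLocation(proto);
    });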
diff --git a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema
index 480b1d419f..928757d025 100644
--- a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema
+++ b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema
@@ -12,6 +12,11 @@
"ColumnType": "Uint32"
},
{
+ "ColumnId": 18,
+ "ColumnName": "GroupLayoutSanitizer",
+ "ColumnType": "Bool"
+ },
+ {
"ColumnId": 1,
"ColumnName": "FixedKey",
"ColumnType": "Bool"
@@ -92,6 +97,7 @@
"0": {
"Columns": [
17,
+ 18,
1,
2,
4,
@@ -174,6 +180,11 @@
],
"ColumnsAdded": [
{
+ "ColumnId": 17,
+ "ColumnName": "DecommitStatus",
+ "ColumnType": "Uint32"
+ },
+ {
"ColumnId": 1,
"ColumnName": "NodeID",
"ColumnType": "Uint32"
@@ -248,6 +259,7 @@
"ColumnFamilies": {
"0": {
"Columns": [
+ 17,
1,
2,
3,