author | Alexander Rutkovsky <alexvru@mail.ru> | 2022-06-06 14:52:24 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-06-06 14:52:24 +0300
commit | 1c9a9bebfd56f7648a7879824fa93adeab288978 (patch)
tree | 1b28da8263e45f4beb6ce7f6b21f66db48b5887e
parent | 3ad62aa372aacec6af48bb3e8c2e82a768022f8b (diff)
download | ydb-1c9a9bebfd56f7648a7879824fa93adeab288978.tar.gz
PR from branch users/alexvru/merge/22-2/KIKIMR-14580
PR from branch users/alexvru/bsdc/KIKIMR-12959/fix-bug KIKIMR-12959
REVIEW: 2492305
Fix some problems and improve code KIKIMR-14580
REVIEW: 2509145
Improve code by adding TEntityId KIKIMR-14580
REVIEW: 2500367
Add group layout sanitizer feature KIKIMR-14580
REVIEW: 2443980
Fix group mapper bug KIKIMR-14580
REVIEW: 2430098
Refactor group mapper to support partially aligned groups according to majority principle KIKIMR-14580
REVIEW: 2425466
Separate DecommitStatus for drive KIKIMR-14580
REVIEW: 2421023
Support DECOMMIT_* status in BS_CONTROLLER KIKIMR-14580
REVIEW: 2416336
REVIEW: 2515731
x-ydb-stable-ref: 2c2ad79ee0c925ddb5845f5543245862a930a68a
39 files changed, 1814 insertions, 737 deletions
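The "Separate DecommitStatus for drive" and "Support DECOMMIT_* status in BS_CONTROLLER" items above introduce a per-PDisk decommit status that is driven through the regular BS_CONTROLLER config pipeline. A minimal sketch of marking a drive for decommission, modeled on the `UpdateDriveStatus` helper this commit adds to ydb/core/blobstorage/ut_blobstorage/lib/env.h (the `TEnvironmentSetup` harness and its `Invoke()` member are assumed from that file):

```cpp
// Sketch: mark a PDisk as pending decommission via a BS_CONTROLLER config request.
// Assumes the TEnvironmentSetup unit-test harness added/extended in this commit.
#include <ydb/core/blobstorage/ut_blobstorage/lib/env.h>

void MarkDrivePendingDecommit(TEnvironmentSetup& env, ui32 nodeId, ui32 pdiskId) {
    NKikimrBlobStorage::TConfigRequest request;
    auto *cmd = request.AddCommand()->MutableUpdateDriveStatus();
    cmd->MutableHostKey()->SetNodeId(nodeId);
    cmd->SetPDiskId(pdiskId);
    // Per the new handling in cmds_drive_status.cpp, EDriveStatus::UNKNOWN and
    // EDecommitStatus::DECOMMIT_UNSET mean "leave this field unchanged", so only
    // the decommit status is modified here.
    cmd->SetStatus(NKikimrBlobStorage::EDriveStatus::UNKNOWN);
    cmd->SetDecommitStatus(NKikimrBlobStorage::EDecommitStatus::DECOMMIT_PENDING);

    auto response = env.Invoke(request);
    UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
}
```

In the new decommit_3dc.cpp test, drives are first set to DECOMMIT_PENDING to mark them for vacation, and flipping them to DECOMMIT_IMMINENT makes self-heal move their VSlots to other disks; DECOMMIT_NONE resets the drive back to normal operation.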
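The "Add group layout sanitizer feature" item is toggled through BS_CONTROLLER settings. A sketch of turning it on, mirroring the `UpdateSettings` helper added to env.h and the `TUpdateSettings` handling added to config_cmd.cpp (same harness assumptions as the sketch above):

```cpp
// Sketch: enable the group layout sanitizer (together with self-heal) via UpdateSettings.
// Field names follow the TUpdateSettings handling introduced in config_cmd.cpp.
void EnableLayoutSanitizer(TEnvironmentSetup& env) {
    NKikimrBlobStorage::TConfigRequest request;
    auto *us = request.AddCommand()->MutableUpdateSettings();
    us->AddEnableSelfHeal(true);              // the sanitizer flag is forwarded to the self-heal actor
    us->AddEnableGroupLayoutSanitizer(true);  // persisted via the GroupLayoutSanitizer schema flag
    auto response = env.Invoke(request);
    UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
}
```

The new sanitize_groups.cpp test exercises exactly this path: it shuffles node locations, enables the sanitizer with `env.UpdateSettings(true, false, true)`, and verifies that previously misplaced groups converge back to a correct mirror-3dc layout.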
diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp index 054d5be665..7a7cb76ed6 100644 --- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp +++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.cpp @@ -299,7 +299,7 @@ TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype) {} TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, - ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain) + ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain, bool finalize) : GType(gtype) { FailRealms = {numFailRealms, { @@ -307,6 +307,9 @@ TBlobStorageGroupInfo::TTopology::TTopology(TBlobStorageGroupType gtype, ui32 nu {numVDisksPerFailDomain, TVDiskInfo{}} }} }}; + if (finalize) { + FinalizeConstruction(); + } } TBlobStorageGroupInfo::TTopology::~TTopology() = default; @@ -400,10 +403,6 @@ ui32 TBlobStorageGroupInfo::TTopology::GetOrderNumber(const TVDiskIdShort &vdisk return FailRealms[vdisk.FailRealm].FailDomains[vdisk.FailDomain].VDisks[vdisk.VDisk].OrderNumber; } -ui32 TBlobStorageGroupInfo::TTopology::GetNumVDisksPerFailDomain() const { - return FailRealms[0].FailDomains[0].VDisks.size(); -} - void TBlobStorageGroupInfo::TTopology::PickSubgroup(ui32 hash, TBlobStorageGroupInfo::TOrderNums &orderNums) const { return BlobMapper->PickSubgroup(hash, orderNums); } diff --git a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h index e2d1445be4..2b89538bc8 100644 --- a/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h +++ b/ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h @@ -191,7 +191,8 @@ public: TVector<TVDiskIdShort> VDiskIdForOrderNumber; TTopology(TBlobStorageGroupType gtype); - TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain); + TTopology(TBlobStorageGroupType gtype, ui32 numFailRealms, ui32 numFailDomainsPerFailRealm, ui32 numVDisksPerFailDomain, + bool finalize = false); TTopology(const TTopology&) = delete; TTopology &operator =(const TTopology&) = delete; TTopology(TTopology&&) = default; @@ -221,7 +222,9 @@ public: // get the total number of VDisks in the blobstorage group ui32 GetTotalVDisksNum() const { return TotalVDisks; } // get number of VDisks per fail domain - ui32 GetNumVDisksPerFailDomain() const; + ui32 GetNumVDisksPerFailDomain() const { return FailRealms[0].FailDomains[0].VDisks.size(); } + // get number of fail domains per fail realm + ui32 GetNumFailDomainsPerFailRealm() const { return FailRealms[0].FailDomains.size(); } // get quorum checker const IQuorumChecker& GetQuorumChecker() const { return *QuorumChecker; } diff --git a/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp b/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp index 1e84f5d8f6..86cc23e1e9 100644 --- a/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp +++ b/ydb/core/blobstorage/pdisk/mock/pdisk_mock.cpp @@ -1,4 +1,5 @@ #include "pdisk_mock.h" +#include <ydb/core/blobstorage/base/blobstorage_events.h> #include <ydb/core/util/stlog.h> #include <ydb/core/util/interval_set.h> @@ -238,7 +239,7 @@ TPDiskMockState::TPtr TPDiskMockState::Snapshot() { return res; } -class TPDiskMockActor : public TActor<TPDiskMockActor> { +class TPDiskMockActor : public TActorBootstrapped<TPDiskMockActor> { enum { EvResume = EventSpaceBegin(TEvents::ES_PRIVATE), }; @@ -251,8 +252,7 @@ class TPDiskMockActor : public TActor<TPDiskMockActor> { 
public: TPDiskMockActor(TPDiskMockState::TPtr state) - : TActor(&TThis::StateFunc) - , State(std::move(state)) // to keep ownership + : State(std::move(state)) // to keep ownership , Impl(*State->Impl) , Prefix(TStringBuilder() << "PDiskMock[" << Impl.NodeId << ":" << Impl.PDiskId << "] ") { @@ -263,6 +263,30 @@ public: } } + void Bootstrap() { + Become(&TThis::StateFunc); + ReportMetrics(); + } + + void ReportMetrics() { + ui32 usedChunks = 0; + for (const auto& [ownerId, owner] : Impl.Owners) { + usedChunks += owner.CommittedChunks.size() + owner.ReservedChunks.size(); + } + Y_VERIFY(usedChunks <= Impl.TotalChunks); + + auto ev = std::make_unique<TEvBlobStorage::TEvControllerUpdateDiskStatus>(); + auto& record = ev->Record; + auto *p = record.AddPDisksMetrics(); + p->SetPDiskId(Impl.PDiskId); + p->SetAvailableSize((Impl.TotalChunks - usedChunks) * Impl.ChunkSize); + p->SetTotalSize(Impl.TotalChunks * Impl.ChunkSize); + p->SetState(NKikimrBlobStorage::TPDiskState::Normal); + Send(MakeBlobStorageNodeWardenID(SelfId().NodeId()), ev.release()); + + Schedule(TDuration::Seconds(5), new TEvents::TEvWakeup); + } + void Handle(NPDisk::TEvYardInit::TPtr ev) { // report message and validate PDisk guid auto *msg = ev->Get(); @@ -663,6 +687,7 @@ public: hFunc(NPDisk::TEvSlay, Handle); hFunc(NPDisk::TEvHarakiri, Handle); hFunc(NPDisk::TEvConfigureScheduler, Handle); + cFunc(TEvents::TSystem::Wakeup, ReportMetrics); ) }; diff --git a/ydb/core/blobstorage/ut_blobstorage/decommit_3dc.cpp b/ydb/core/blobstorage/ut_blobstorage/decommit_3dc.cpp new file mode 100644 index 0000000000..811d791db6 --- /dev/null +++ b/ydb/core/blobstorage/ut_blobstorage/decommit_3dc.cpp @@ -0,0 +1,89 @@ +#include <ydb/core/blobstorage/ut_blobstorage/lib/env.h> + +Y_UNIT_TEST_SUITE(Decommit3dc) { + Y_UNIT_TEST(Test) { + for (ui32 numNodes : {12, 16}) + for (ui32 numGroups : {1, 7}) + for (bool resetToNone : {false, true}) + for (ui32 numDecommitted = 0; numDecommitted <= numNodes / 4; ++numDecommitted) { + Cerr << "numNodes# " << numNodes + << " numGroups# " << numGroups + << " resetToNone# " << (resetToNone ? 
"true" : "false") + << " numDecommitted# " << numDecommitted + << Endl; + + TEnvironmentSetup env{{ + .NodeCount = numNodes, + .Erasure = TBlobStorageGroupType::ErasureMirror3dc, + .NumDataCenters = 4, + }}; + + env.UpdateSettings(true, true); + env.CreateBoxAndPool(1, numGroups); + env.Sim(TDuration::Seconds(30)); + auto config = env.FetchBaseConfig(); + + std::set<ui32> nodesToSettle; + TString datacenterToSettle; + for (const auto& node : config.GetNode()) { + const auto& location = node.GetLocation(); + if (!datacenterToSettle) { + datacenterToSettle = location.GetDataCenter(); + } + if (datacenterToSettle == location.GetDataCenter()) { + nodesToSettle.insert(node.GetNodeId()); + } + } + + std::set<std::pair<ui32, ui32>> pdisksToSettle; + for (const auto& pdisk : config.GetPDisk()) { + if (nodesToSettle.count(pdisk.GetNodeId())) { + pdisksToSettle.emplace(pdisk.GetNodeId(), pdisk.GetPDiskId()); + env.UpdateDriveStatus(pdisk.GetNodeId(), pdisk.GetPDiskId(), {}, + NKikimrBlobStorage::EDecommitStatus::DECOMMIT_PENDING); + } + } + + UNIT_ASSERT_VALUES_EQUAL(pdisksToSettle.size(), numNodes / 4); + std::set<std::pair<ui32, ui32>> movedOutPDisks; + auto it = pdisksToSettle.begin(); + for (ui32 i = 0; i < numDecommitted; ++i, ++it) { + UNIT_ASSERT(it != pdisksToSettle.end()); + movedOutPDisks.insert(*it); + env.UpdateDriveStatus(it->first, it->second, {}, NKikimrBlobStorage::EDecommitStatus::DECOMMIT_IMMINENT); + + env.Sim(TDuration::Seconds(60)); + auto config = env.FetchBaseConfig(); + for (const auto& vslot : config.GetVSlot()) { + const auto& vslotId = vslot.GetVSlotId(); + UNIT_ASSERT(!movedOutPDisks.count({vslotId.GetNodeId(), vslotId.GetPDiskId()})); + } + } + if (resetToNone) { + for (const auto& [nodeId, pdiskId] : pdisksToSettle) { + env.UpdateDriveStatus(nodeId, pdiskId, {}, NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE); + } + movedOutPDisks.clear(); + } + for (const auto& [nodeId, pdiskId] : pdisksToSettle) { + Cerr << "nodeId# " << nodeId << " pdiskId# " << pdiskId << Endl; + + env.UpdateDriveStatus(nodeId, pdiskId, NKikimrBlobStorage::EDriveStatus::FAULTY, {}); + movedOutPDisks.emplace(nodeId, pdiskId); + + env.Sim(TDuration::Seconds(90)); + auto config = env.FetchBaseConfig(); + for (const auto& vslot : config.GetVSlot()) { + const auto& vslotId = vslot.GetVSlotId(); + Cerr << vslotId.GetNodeId() << ":" << vslotId.GetPDiskId() << " " << vslot.GetFailRealmIdx() + << " " << vslot.GetFailDomainIdx() << Endl; + UNIT_ASSERT(!movedOutPDisks.count({vslotId.GetNodeId(), vslotId.GetPDiskId()}) || + (numNodes == 12 && numDecommitted == 0)); + } + + env.UpdateDriveStatus(nodeId, pdiskId, NKikimrBlobStorage::EDriveStatus::ACTIVE, {}); + movedOutPDisks.erase({nodeId, pdiskId}); + } + } + } +} diff --git a/ydb/core/blobstorage/ut_blobstorage/lib/env.h b/ydb/core/blobstorage/ut_blobstorage/lib/env.h index d5ab6cdeee..63329eac32 100644 --- a/ydb/core/blobstorage/ut_blobstorage/lib/env.h +++ b/ydb/core/blobstorage/ut_blobstorage/lib/env.h @@ -29,6 +29,8 @@ struct TEnvironmentSetup { const std::function<void(TTestActorSystem&)> PrepareRuntime; const ui32 ControllerNodeId = 1; const bool Cache = false; + const ui32 NumDataCenters = 0; + const std::function<TNodeLocation(ui32)> LocationGenerator; }; const TSettings Settings; @@ -93,6 +95,11 @@ struct TEnvironmentSetup { Cerr << "RandomSeed# " << seed << Endl; } + ui32 GetNumDataCenters() const { + return Settings.NumDataCenters ? Settings.NumDataCenters : + Settings.Erasure.GetErasure() == TBlobStorageGroupType::ErasureMirror3dc ? 
3 : 1; + } + void Initialize() { Runtime = std::make_unique<TTestActorSystem>(Settings.NodeCount); if (Settings.PrepareRuntime) { @@ -102,8 +109,11 @@ struct TEnvironmentSetup { Runtime->Start(); auto *appData = Runtime->GetAppData(); appData->DomainsInfo->AddDomain(TDomainsInfo::TDomain::ConstructEmptyDomain("dom", DomainId).Release()); - Runtime->SetupTabletRuntime(Settings.Erasure.GetErasure() == TBlobStorageGroupType::ErasureMirror3dc, - Settings.ControllerNodeId); + if (Settings.LocationGenerator) { + Runtime->SetupTabletRuntime(Settings.LocationGenerator, Settings.ControllerNodeId); + } else { + Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId); + } SetupStaticStorage(); SetupTablet(); SetupStorage(); @@ -115,8 +125,11 @@ struct TEnvironmentSetup { void StartNode(ui32 nodeId) { Runtime->StartNode(nodeId); - Runtime->SetupTabletRuntime(Settings.Erasure.GetErasure() == TBlobStorageGroupType::ErasureMirror3dc, - Settings.ControllerNodeId, nodeId); + if (Settings.LocationGenerator) { + Runtime->SetupTabletRuntime(Settings.LocationGenerator, Settings.ControllerNodeId, nodeId); + } else { + Runtime->SetupTabletRuntime(GetNumDataCenters(), Settings.ControllerNodeId, nodeId); + } if (nodeId == Settings.ControllerNodeId) { SetupStaticStorage(); SetupTablet(); @@ -549,6 +562,17 @@ struct TEnvironmentSetup { } } + void UpdateSettings(bool selfHeal, bool donorMode, bool groupLayoutSanitizer = false) { + NKikimrBlobStorage::TConfigRequest request; + auto *cmd = request.AddCommand(); + auto *us = cmd->MutableUpdateSettings(); + us->AddEnableSelfHeal(selfHeal); + us->AddEnableDonorMode(donorMode); + us->AddEnableGroupLayoutSanitizer(groupLayoutSanitizer); + auto response = Invoke(request); + UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription()); + } + void EnableDonorMode() { NKikimrBlobStorage::TConfigRequest request; request.AddCommand()->MutableEnableDonorMode()->SetEnable(true); @@ -556,6 +580,19 @@ struct TEnvironmentSetup { UNIT_ASSERT(response.GetSuccess()); } + void UpdateDriveStatus(ui32 nodeId, ui32 pdiskId, NKikimrBlobStorage::EDriveStatus status, + NKikimrBlobStorage::EDecommitStatus decommitStatus) { + NKikimrBlobStorage::TConfigRequest request; + auto *cmd = request.AddCommand(); + auto *ds = cmd->MutableUpdateDriveStatus(); + ds->MutableHostKey()->SetNodeId(nodeId); + ds->SetPDiskId(pdiskId); + ds->SetStatus(status); + ds->SetDecommitStatus(decommitStatus); + auto response = Invoke(request); + UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription()); + } + void SetScrubPeriodicity(TDuration periodicity) { NKikimrBlobStorage::TConfigRequest request; request.AddCommand()->MutableSetScrubPeriodicity()->SetScrubPeriodicity(periodicity.Seconds()); diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp new file mode 100644 index 0000000000..f74e2d4841 --- /dev/null +++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp @@ -0,0 +1,103 @@ +#include <ydb/core/blobstorage/ut_blobstorage/lib/env.h> + +Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { + Y_UNIT_TEST(Test3dc) { + const ui32 numRacks = 15; + std::vector<ui32> nodesPerRack(numRacks); + std::vector<ui32> nodeToRack; + for (ui32 numFilledRacks = 0; numFilledRacks < numRacks; ) { +// const ui32 rackId = RandomNumber(numRacks); + const ui32 rackId = numFilledRacks; + nodeToRack.emplace_back(rackId); + numFilledRacks += !nodesPerRack[rackId]++; + } + const ui32 numDatacenters = 3; + std::vector<ui32> 
rackToDatacenter; + for (ui32 i = 0; i < numRacks; ++i) { + rackToDatacenter.push_back(i % numDatacenters); + } + + std::vector<TNodeLocation> locations; + for (ui32 i = 0; i < nodeToRack.size(); ++i) { + NActorsInterconnect::TNodeLocation proto; + proto.SetDataCenter(ToString(rackToDatacenter[nodeToRack[i]])); + proto.SetRack(ToString(nodeToRack[i])); + proto.SetUnit(ToString(i)); + locations.emplace_back(proto); + } + + TEnvironmentSetup env{{ + .NodeCount = (ui32)nodeToRack.size(), + .Erasure = TBlobStorageGroupType::ErasureMirror3dc, + .LocationGenerator = [&](ui32 nodeId) { return locations[nodeId - 1]; }, + }}; + + auto getGroupsWithIncorrectLayout = [&] { + auto config = env.FetchBaseConfig(); + + std::map<ui32, std::tuple<TString, TString>> nodeIdToLocation; + for (const auto& node : config.GetNode()) { + const auto& location = node.GetLocation(); + nodeIdToLocation.emplace(node.GetNodeId(), std::make_tuple(location.GetDataCenter(), location.GetRack())); + } + + std::map<ui32, std::vector<std::vector<std::tuple<TString, TString>>>> groups; + for (const auto& vslot : config.GetVSlot()) { + auto& group = groups[vslot.GetGroupId()]; + if (group.empty()) { + group.resize(3, {3, {"", ""}}); + } + group[vslot.GetFailRealmIdx()][vslot.GetFailDomainIdx()] = nodeIdToLocation[vslot.GetVSlotId().GetNodeId()]; + } + + std::set<ui32> badGroups; + + for (auto& [groupId, group] : groups) { + std::set<TString> usedRealms; + + for (const auto& row : group) { + TString realm; + std::set<TString> usedRacks; + + for (const auto& [dc, rack] : row) { + Y_VERIFY(dc && rack); + + if (!usedRacks.insert(rack).second) { + badGroups.insert(groupId); + } + + if (!realm) { + if (!usedRealms.insert(dc).second) { + badGroups.insert(groupId); + } + realm = dc; + } else if (realm != dc) { + badGroups.insert(groupId); + } + } + } + } + + return badGroups; + }; + + const ui32 disksPerNode = 1; + const ui32 slotsPerDisk = 3; + env.CreateBoxAndPool(disksPerNode, nodeToRack.size() * disksPerNode * slotsPerDisk / 9); + env.Sim(TDuration::Seconds(30)); + auto before = getGroupsWithIncorrectLayout(); + Cerr << "bad groups before shuffling# " << FormatList(before) << Endl; + UNIT_ASSERT(before.empty()); + env.Cleanup(); + std::random_shuffle(locations.begin(), locations.end()); + env.Initialize(); + env.Sim(TDuration::Seconds(100)); + auto after = getGroupsWithIncorrectLayout(); + Cerr << "bad groups just after shuffling# " << FormatList(after) << Endl; + env.UpdateSettings(true, false, true); + env.Sim(TDuration::Minutes(15)); + auto corrected = getGroupsWithIncorrectLayout(); + Cerr << "bad groups after shuffling and fixing# " << FormatList(corrected) << Endl; + UNIT_ASSERT(corrected.empty()); + } +} diff --git a/ydb/core/blobstorage/ut_blobstorage/ya.make b/ydb/core/blobstorage/ut_blobstorage/ya.make index efe91d91cd..9b33efe1a9 100644 --- a/ydb/core/blobstorage/ut_blobstorage/ya.make +++ b/ydb/core/blobstorage/ut_blobstorage/ya.make @@ -11,6 +11,7 @@ TIMEOUT(600) SRCS( block_race.cpp counting_events.cpp + decommit_3dc.cpp defrag.cpp donor.cpp encryption.cpp @@ -18,6 +19,7 @@ SRCS( incorrect_queries.cpp main.cpp mirror3of4.cpp + sanitize_groups.cpp space_check.cpp sync.cpp replication.cpp diff --git a/ydb/core/cms/sentinel.cpp b/ydb/core/cms/sentinel.cpp index 638b78fe09..1823586d28 100644 --- a/ydb/core/cms/sentinel.cpp +++ b/ydb/core/cms/sentinel.cpp @@ -145,7 +145,6 @@ bool TPDiskStatus::IsNewStatusGood() const { switch (Compute(Current, unused)) { case EPDiskStatus::INACTIVE: case EPDiskStatus::ACTIVE: - case 
EPDiskStatus::SPARE: return true; case EPDiskStatus::UNKNOWN: diff --git a/ydb/core/cms/sentinel_ut.cpp b/ydb/core/cms/sentinel_ut.cpp index 62c0fdf309..f916cf6005 100644 --- a/ydb/core/cms/sentinel_ut.cpp +++ b/ydb/core/cms/sentinel_ut.cpp @@ -48,8 +48,8 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) { EPDiskStatus::ACTIVE, EPDiskStatus::INACTIVE, EPDiskStatus::BROKEN, - EPDiskStatus::SPARE, EPDiskStatus::FAULTY, + EPDiskStatus::TO_BE_REMOVED, }; for (const EPDiskStatus status : AllStatuses) { diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp index 779381fd51..bd9f83d86d 100644 --- a/ydb/core/mind/bscontroller/bsc.cpp +++ b/ydb/core/mind/bscontroller/bsc.cpp @@ -116,7 +116,7 @@ void TBlobStorageController::OnActivateExecutor(const TActorContext&) { } // create self-heal actor - SelfHealId = Register(CreateSelfHealActor(TabletID(), SelfHealUnreassignableGroups)); + SelfHealId = Register(CreateSelfHealActor()); // create stat processor StatProcessorActorId = Register(CreateStatProcessorActor()); @@ -152,6 +152,7 @@ void TBlobStorageController::Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev) { const bool initial = !HostRecords; HostRecords = std::make_shared<THostRecordMap::element_type>(ev->Get()); Schedule(TDuration::Minutes(5), new TEvPrivate::TEvHostRecordsTimeToLiveExceeded); + Send(SelfHealId, new TEvPrivate::TEvUpdateHostRecords(HostRecords)); if (initial) { Execute(CreateTxInitScheme()); } diff --git a/ydb/core/mind/bscontroller/cmds_drive_status.cpp b/ydb/core/mind/bscontroller/cmds_drive_status.cpp index 8abaaa5979..25fe500b2c 100644 --- a/ydb/core/mind/bscontroller/cmds_drive_status.cpp +++ b/ydb/core/mind/bscontroller/cmds_drive_status.cpp @@ -24,16 +24,20 @@ namespace NKikimr::NBsController { } TPDiskInfo *pdisk = PDisks.FindForUpdate(pdiskId); - if (cmd.GetStatus() != pdisk->Status) { - const bool wasGoodExpectedStatus = pdisk->HasGoodExpectedStatus(); - pdisk->Status = cmd.GetStatus(); + const bool wasGoodExpectedStatus = pdisk->HasGoodExpectedStatus(); + if (const auto s = cmd.GetStatus(); s != NKikimrBlobStorage::EDriveStatus::UNKNOWN && s != pdisk->Status) { + pdisk->Status = s; pdisk->StatusTimestamp = Timestamp; - if (pdisk->HasGoodExpectedStatus() != wasGoodExpectedStatus) { - for (const auto& [id, slot] : pdisk->VSlotsOnPDisk) { - if (slot->Group) { - TGroupInfo *group = Groups.FindForUpdate(slot->Group->ID); - group->CalculateGroupStatus(); - } + } + if (const auto ds = cmd.GetDecommitStatus(); ds != NKikimrBlobStorage::EDecommitStatus::DECOMMIT_UNSET && + ds != pdisk->DecommitStatus) { + pdisk->DecommitStatus = ds; + } + if (wasGoodExpectedStatus != pdisk->HasGoodExpectedStatus()) { + for (const auto& [id, slot] : pdisk->VSlotsOnPDisk) { + if (slot->Group) { + TGroupInfo *group = Groups.FindForUpdate(slot->Group->ID); + group->CalculateGroupStatus(); } } } @@ -44,7 +48,8 @@ namespace NKikimr::NBsController { (IcPort, host.GetIcPort()), (NodeId, host.GetNodeId()), (Path, cmd.GetPath()), - (Status, cmd.GetStatus())); + (Status, cmd.GetStatus()), + (DecommitStatus, cmd.GetDecommitStatus())); } void TBlobStorageController::TConfigState::ExecuteStep(const NKikimrBlobStorage::TReadDriveStatus& cmd, TStatus& status) { diff --git a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp index 51bacc0571..0ca115ec3e 100644 --- a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp +++ b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp @@ -465,6 +465,7 @@ namespace NKikimr::NBsController { 
x->SetNumStaticSlots(pdisk.StaticSlotUsage); x->SetDriveStatus(NKikimrBlobStorage::EDriveStatus::ACTIVE); x->SetExpectedSlotCount(pdisk.ExpectedSlotCount); + x->SetDecommitStatus(NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE); if (pdisk.PDiskMetrics) { x->MutablePDiskMetrics()->CopyFrom(*pdisk.PDiskMetrics); x->MutablePDiskMetrics()->ClearPDiskId(); diff --git a/ydb/core/mind/bscontroller/config.cpp b/ydb/core/mind/bscontroller/config.cpp index 47b522f24d..b43b155504 100644 --- a/ydb/core/mind/bscontroller/config.cpp +++ b/ydb/core/mind/bscontroller/config.cpp @@ -794,6 +794,7 @@ namespace NKikimr::NBsController { pb->SetDriveStatus(pdisk.Status); pb->SetExpectedSlotCount(pdisk.ExpectedSlotCount); pb->SetDriveStatusChangeTimestamp(pdisk.StatusTimestamp.GetValue()); + pb->SetDecommitStatus(pdisk.DecommitStatus); pb->MutablePDiskMetrics()->CopyFrom(pdisk.Metrics); pb->MutablePDiskMetrics()->ClearPDiskId(); } diff --git a/ydb/core/mind/bscontroller/config_cmd.cpp b/ydb/core/mind/bscontroller/config_cmd.cpp index 2fc925a9cd..f4bb7d5aa1 100644 --- a/ydb/core/mind/bscontroller/config_cmd.cpp +++ b/ydb/core/mind/bscontroller/config_cmd.cpp @@ -126,6 +126,13 @@ namespace NKikimr::NBsController { Self->PDiskSpaceColorBorder = static_cast<T::PDiskSpaceColorBorder::Type>(value); db.Table<T>().Key(true).Update<T::PDiskSpaceColorBorder>(Self->PDiskSpaceColorBorder); } + for (bool value : settings.GetEnableGroupLayoutSanitizer()) { + Self->GroupLayoutSanitizer = value; + db.Table<T>().Key(true).Update<T::GroupLayoutSanitizer>(Self->GroupLayoutSanitizer); + auto ev = std::make_unique<TEvControllerUpdateSelfHealInfo>(); + ev->GroupLayoutSanitizer = Self->GroupLayoutSanitizer; + Self->Send(Self->SelfHealId, ev.release()); + } return true; } diff --git a/ydb/core/mind/bscontroller/config_fit_groups.cpp b/ydb/core/mind/bscontroller/config_fit_groups.cpp index cb8801b0f5..dd1513b354 100644 --- a/ydb/core/mind/bscontroller/config_fit_groups.cpp +++ b/ydb/core/mind/bscontroller/config_fit_groups.cpp @@ -51,7 +51,7 @@ namespace NKikimr { for (ui64 reserve = 0; reserve < min || (reserve - min) * 1000000 / Max<ui64>(1, total) < part; ++reserve, ++total) { TGroupMapper::TGroupDefinition group; try { - AllocateGroup(0, group, nullptr, 0, {}, 0, false); + AllocateGroup(0, group, {}, {}, 0, false); } catch (const TExFitGroupError&) { throw TExError() << "group reserve constraint hit"; } @@ -92,7 +92,7 @@ namespace NKikimr { requiredSpace = ExpectedSlotSize.front(); ExpectedSlotSize.pop_front(); } - AllocateGroup(groupId, group, nullptr, 0, {}, requiredSpace, false); + AllocateGroup(groupId, group, {}, {}, requiredSpace, false); // scan all comprising PDisks for PDiskCategory TMaybe<TPDiskCategory> desiredPDiskCategory; @@ -171,6 +171,7 @@ namespace NKikimr { // mapping for audit log TMap<TVDiskIdShort, TVSlotId> replacedSlots; TStackVec<std::pair<TVSlotId, bool>, 32> replaceQueue; + THashMap<TVDiskIdShort, TPDiskId> replacedDisks; i64 requiredSpace = Min<i64>(); //////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -197,9 +198,6 @@ namespace NKikimr { replace = true; break; - case NKikimrBlobStorage::EDriveStatus::SPARE: - break; - case NKikimrBlobStorage::EDriveStatus::FAULTY: case NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED: // groups are moved out asynchronously @@ -217,6 +215,7 @@ namespace NKikimr { g[vslot->RingIdx][vslot->FailDomainIdx][vslot->VDiskIdx] = targetPDiskId; replacedSlots.emplace(TVDiskIdShort(vslot->RingIdx, vslot->FailDomainIdx, 
vslot->VDiskIdx), vslot->VSlotId); replaceQueue.emplace_back(vslot->VSlotId, State.SuppressDonorMode.count(vslot->VSlotId)); + replacedDisks.emplace(vslot->GetShortVDiskId(), vslot->VSlotId.ComprisingPDiskId()); } else { preservedSlots.emplace(vslot->GetVDiskId(), vslot->VSlotId); auto& m = vslot->Metrics; @@ -243,10 +242,6 @@ namespace NKikimr { } } if (hasMissingSlots || !IgnoreGroupSanityChecks) { - TStackVec<TPDiskId, 32> replacedDiskIds; - for (const auto& [vslotId, suppressDonorMode] : replaceQueue) { - replacedDiskIds.push_back(vslotId.ComprisingPDiskId()); - } TGroupMapper::TForbiddenPDisks forbid; for (const auto& vslot : groupInfo->VDisksInGroup) { for (const auto& [vslotId, vdiskId] : vslot->Donors) { @@ -255,8 +250,7 @@ namespace NKikimr { } } } - AllocateGroup(groupId, group, replacedDiskIds.data(), replacedDiskIds.size(), std::move(forbid), - requiredSpace, AllowUnusableDisks); + AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, AllowUnusableDisks); if (!IgnoreVSlotQuotaCheck) { adjustSpaceAvailable = true; for (const auto& [pos, vslotId] : replacedSlots) { @@ -361,9 +355,9 @@ namespace NKikimr { } private: - void AllocateGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace, - bool addExistingDisks) { + void AllocateGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group, + const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid, + i64 requiredSpace, bool addExistingDisks) { if (!Mapper) { Mapper.emplace(Geometry, StoragePool.RandomizeGroupMapping); PopulateGroupMapper(); @@ -382,8 +376,7 @@ namespace NKikimr { } } } - Geometry.AllocateGroup(*Mapper, groupId, group, replacedDiskIds, numReplacedDisks, std::move(forbid), - requiredSpace); + Geometry.AllocateGroup(*Mapper, groupId, group, replacedDisks, std::move(forbid), requiredSpace); for (const TPDiskId pdiskId : removeQ) { Mapper->UnregisterPDisk(pdiskId); } @@ -451,7 +444,7 @@ namespace NKikimr { } } - if (info.Status != NKikimrBlobStorage::EDriveStatus::ACTIVE) { + if (!info.AcceptsNewSlots()) { usable = false; } @@ -460,8 +453,17 @@ namespace NKikimr { } // register PDisk in the mapper - return Mapper->RegisterPDisk(id, State.HostRecords->GetLocation(id.NodeId), usable, numSlots, - info.ExpectedSlotCount, groups.data(), groups.size(), availableSpace, info.Operational); + return Mapper->RegisterPDisk({ + .PDiskId = id, + .Location = State.HostRecords->GetLocation(id.NodeId), + .Usable = usable, + .NumSlots = numSlots, + .MaxSlots = info.ExpectedSlotCount, + .Groups = std::move(groups), + .SpaceAvailable = availableSpace, + .Operational = info.Operational, + .Decommitted = info.Decommitted(), + }); } std::map<TVDiskIdShort, TVSlotInfo*> CreateVSlotsForGroup(TGroupInfo *groupInfo, diff --git a/ydb/core/mind/bscontroller/config_fit_pdisks.cpp b/ydb/core/mind/bscontroller/config_fit_pdisks.cpp index 3112a24317..6baddf3bba 100644 --- a/ydb/core/mind/bscontroller/config_fit_pdisks.cpp +++ b/ydb/core/mind/bscontroller/config_fit_pdisks.cpp @@ -89,7 +89,8 @@ namespace NKikimr { state.PDisks.ConstructInplaceNewEntry(pdiskId, *hostId, TString(), category.GetRaw(), *driveInfo->Guid, false, false, 1000, driveInfo->PDiskConfig.GetOrElse(TString()), driveInfo->BoxId, DefaultMaxSlots, NKikimrBlobStorage::EDriveStatus::ACTIVE, - TInstant::Zero(), serial.Serial, TString(), fsPath, staticSlotUsage); + TInstant::Zero(), 
NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE, serial.Serial, + TString(), fsPath, staticSlotUsage); const TPDiskLocation location(nodeId, serial.Serial); state.PDiskLocationMap.emplace(location, pdiskId); @@ -234,8 +235,9 @@ namespace NKikimr { state.PDisks.ConstructInplaceNewEntry(pdiskId, hostId, path, category.GetRaw(), guid, driveInfo.SharedWithOs, driveInfo.ReadCentric, 1000, driveInfo.PDiskConfig.GetOrElse(TString()), boxId, DefaultMaxSlots, - NKikimrBlobStorage::EDriveStatus::ACTIVE, TInstant::Zero(), currentSerial, currentSerial, - TString(), staticSlotUsage); + NKikimrBlobStorage::EDriveStatus::ACTIVE, TInstant::Zero(), + NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE, + currentSerial, currentSerial, TString(), staticSlotUsage); // insert PDisk into location map state.PDiskLocationMap.emplace(location, pdiskId); diff --git a/ydb/core/mind/bscontroller/group_geometry_info.h b/ydb/core/mind/bscontroller/group_geometry_info.h index 10e5daedba..a74698f1f8 100644 --- a/ydb/core/mind/bscontroller/group_geometry_info.h +++ b/ydb/core/mind/bscontroller/group_geometry_info.h @@ -69,12 +69,11 @@ namespace NKikimr::NBsController { ui32 GetDomainLevelEnd() const { return DomainLevelEnd; } void AllocateGroup(TGroupMapper &mapper, TGroupId groupId, TGroupMapper::TGroupDefinition &group, - const TPDiskId replacedDiskIds[], size_t numReplacedDisks, TGroupMapper::TForbiddenPDisks forbid, + const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace) const { TString error; for (const bool requireOperational : {true, false}) { - if (mapper.AllocateGroup(groupId, group, replacedDiskIds, numReplacedDisks, forbid, - requiredSpace, requireOperational, error)) { + if (mapper.AllocateGroup(groupId, group, replacedDisks, forbid, requiredSpace, requireOperational, error)) { return; } } @@ -115,6 +114,10 @@ namespace NKikimr::NBsController { return true; } + TBlobStorageGroupType GetType() const { + return Type; + } + TBlobStorageGroupType::EErasureSpecies GetErasure() const { return Type.GetErasure(); } diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp new file mode 100644 index 0000000000..8ab76e3e4f --- /dev/null +++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp @@ -0,0 +1,47 @@ +#include "group_layout_checker.h" +#include "group_geometry_info.h" + +namespace NKikimr::NBsController { + + TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout) { + using namespace NLayoutChecker; + + if (layout.empty()) { + return {}; + } + + TBlobStorageGroupInfo::TTopology topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), + geom.GetNumVDisksPerFailDomain(), true); + TGroupLayout group(topology); + TDomainMapper mapper; + THashMap<TVDiskIdShort, TPDiskLayoutPosition> map; + for (const auto& [vdiskId, p] : layout) { + const auto& [location, pdiskId] = p; + TPDiskLayoutPosition pos(mapper, location, pdiskId, geom); + group.AddDisk(pos, topology.GetOrderNumber(vdiskId)); + map.emplace(vdiskId, pos); + } + + std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard; + for (const auto& [vdiskId, pos] : map) { + scoreboard.emplace_back(group.GetCandidateScore(pos, topology.GetOrderNumber(vdiskId)), vdiskId); + } + + auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; }; + std::sort(scoreboard.begin(), scoreboard.end(), comp1); + + auto comp = 
[](const auto& x, const auto& y) { return x.first.BetterThan(y.first); }; + std::sort(scoreboard.begin(), scoreboard.end(), comp); + TLayoutCheckResult res; + const auto reference = scoreboard.back().first; + if (!reference.SameAs({})) { // not perfectly correct layout + for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) { + res.Candidates.push_back(scoreboard.back().second); + } + } + return res; + } + +} // NKikimr::NBsController + +Y_DECLARE_OUT_SPEC(, NKikimr::NBsController::NLayoutChecker::TEntityId, stream, value) { value.Output(stream); } diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h new file mode 100644 index 0000000000..407f0b7c7f --- /dev/null +++ b/ydb/core/mind/bscontroller/group_layout_checker.h @@ -0,0 +1,233 @@ +#pragma once + +#include "defs.h" +#include "types.h" +#include "group_geometry_info.h" + +namespace NKikimr::NBsController { + + namespace NLayoutChecker { + + struct TEntityId { + ui32 Value = ::Max<ui32>(); + + public: + bool operator ==(const TEntityId& other) const { return Value == other.Value; } + bool operator !=(const TEntityId& other) const { return Value != other.Value; } + bool operator < (const TEntityId& other) const { return Value < other.Value; } + bool operator <=(const TEntityId& other) const { return Value <= other.Value; } + bool operator > (const TEntityId& other) const { return Value > other.Value; } + bool operator >=(const TEntityId& other) const { return Value >= other.Value; } + + size_t Index() const { + return Value; + } + + size_t Hash() const { + return THash<ui32>()(Value); + } + + static constexpr TEntityId Min() { return {.Value = 0}; }; + static constexpr TEntityId Max() { return {.Value = ::Max<ui32>()}; }; + + TString ToString() const { return TStringBuilder() << Value; } + void Output(IOutputStream& s) const { s << Value; } + + private: + friend class TDomainMapper; + + static TEntityId SequentialValue(size_t index) { + return TEntityId{static_cast<ui32>(index)}; + } + }; + + } // NLayoutChecker + +} // NKikimr::NBsController + +template<> +struct THash<NKikimr::NBsController::NLayoutChecker::TEntityId> { + template<typename T> + size_t operator()(const T& id) const { return id.Hash(); } +}; + +namespace NKikimr::NBsController { + + namespace NLayoutChecker { + + class TDomainMapper { + std::unordered_map<TString, TEntityId> FailDomainId; + + public: + TEntityId operator ()(TString item) { + return FailDomainId.emplace(std::move(item), TEntityId::SequentialValue(FailDomainId.size())).first->second; + } + + size_t GetIdCount() const { + return FailDomainId.size(); + } + }; + + struct TPDiskLayoutPosition { + TEntityId RealmGroup; + TEntityId Realm; + TEntityId Domain; + + TPDiskLayoutPosition() = default; + + TPDiskLayoutPosition(TEntityId realmGroup, TEntityId realm, TEntityId domain) + : RealmGroup(realmGroup) + , Realm(realm) + , Domain(domain) + {} + + TPDiskLayoutPosition(TDomainMapper& mapper, const TNodeLocation& location, TPDiskId pdiskId, const TGroupGeometryInfo& geom) { + TStringStream realmGroup, realm, domain; + const std::pair<int, TStringStream*> levels[] = { + {geom.GetRealmLevelBegin(), &realmGroup}, + {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelBegin()), &realm}, + {Max(geom.GetRealmLevelEnd(), geom.GetDomainLevelEnd()), &domain} + }; + auto addLevel = [&](int key, const TString& value) { + for (const auto& [reference, stream] : levels) { + if (key < reference) { + Save(stream, 
std::make_tuple(key, value)); + } + } + }; + for (const auto& [key, value] : location.GetItems()) { + addLevel(key, value); + } + addLevel(255, pdiskId.ToString()); // ephemeral level to distinguish between PDisks on the same node + RealmGroup = mapper(realmGroup.Str()); + Realm = mapper(realm.Str()); + Domain = mapper(domain.Str()); + } + + TString ToString() const { + return TStringBuilder() << "{" << RealmGroup << "." << Realm << "." << Domain << "}"; + } + + auto AsTuple() const { + return std::tie(RealmGroup, Realm, Domain); + } + + friend bool operator ==(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) { + return x.AsTuple() == y.AsTuple(); + } + + friend bool operator <(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) { + return x.AsTuple() < y.AsTuple(); + } + }; + + struct TScore { + ui32 RealmInterlace = 0; + ui32 DomainInterlace = 0; + ui32 RealmGroupScatter = 0; + ui32 RealmScatter = 0; + ui32 DomainScatter = 0; + + auto AsTuple() const { + return std::make_tuple(RealmInterlace, DomainInterlace, RealmGroupScatter, RealmScatter, DomainScatter); + } + + bool BetterThan(const TScore& other) const { + return AsTuple() < other.AsTuple(); + } + + bool SameAs(const TScore& other) const { + return AsTuple() == other.AsTuple(); + } + + static TScore Max() { + return {::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>(), ::Max<ui32>()}; + } + + TString ToString() const { + return TStringBuilder() << "{RealmInterlace# " << RealmInterlace + << " DomainInterlace# " << DomainInterlace + << " RealmGroupScatter# " << RealmGroupScatter + << " RealmScatter# " << RealmScatter + << " DomainScatter# " << DomainScatter + << "}"; + } + }; + + struct TGroupLayout { + const TBlobStorageGroupInfo::TTopology& Topology; + + ui32 NumDisks = 0; + THashMap<TEntityId, ui32> NumDisksPerRealmGroup; + + TStackVec<ui32, 4> NumDisksInRealm; + TStackVec<THashMap<TEntityId, ui32>, 4> NumDisksPerRealm; + THashMap<TEntityId, ui32> NumDisksPerRealmTotal; + + TStackVec<ui32, 32> NumDisksInDomain; + TStackVec<THashMap<TEntityId, ui32>, 32> NumDisksPerDomain; + THashMap<TEntityId, ui32> NumDisksPerDomainTotal; + + TGroupLayout(const TBlobStorageGroupInfo::TTopology& topology) + : Topology(topology) + , NumDisksInRealm(Topology.GetTotalFailRealmsNum()) + , NumDisksPerRealm(Topology.GetTotalFailRealmsNum()) + , NumDisksInDomain(Topology.GetTotalFailDomainsNum()) + , NumDisksPerDomain(Topology.GetTotalFailDomainsNum()) + {} + + void UpdateDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber, ui32 value) { + NumDisks += value; + NumDisksPerRealmGroup[pos.RealmGroup] += value; + const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber); + NumDisksInRealm[vdisk.FailRealm] += value; + NumDisksPerRealm[vdisk.FailRealm][pos.Realm] += value; + NumDisksPerRealmTotal[pos.Realm] += value; + const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk); + NumDisksInDomain[domainIdx] += value; + NumDisksPerDomain[domainIdx][pos.Domain] += value; + NumDisksPerDomainTotal[pos.Domain] += value; + } + + void AddDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber) { + UpdateDisk(pos, orderNumber, 1); + } + + void RemoveDisk(const TPDiskLayoutPosition& pos, ui32 orderNumber) { + UpdateDisk(pos, orderNumber, Max<ui32>()); + } + + TScore GetCandidateScore(const TPDiskLayoutPosition& pos, ui32 orderNumber) { + const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber); + const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk); + + return { + .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - 
NumDisksPerRealm[vdisk.FailRealm][pos.Realm], + .DomainInterlace = NumDisksPerDomainTotal[pos.Domain] - NumDisksPerDomain[domainIdx][pos.Domain], + .RealmGroupScatter = NumDisks - NumDisksPerRealmGroup[pos.RealmGroup], + .RealmScatter = NumDisksInRealm[vdisk.FailRealm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm], + .DomainScatter = NumDisksInDomain[domainIdx] - NumDisksPerDomain[domainIdx][pos.Domain], + }; + } + + TScore GetExcludedDiskScore(const TPDiskLayoutPosition& pos, ui32 orderNumber) { + RemoveDisk(pos, orderNumber); + const TScore score = GetCandidateScore(pos, orderNumber); + AddDisk(pos, orderNumber); + return score; + } + }; + + } // NLayoutChecker + + struct TLayoutCheckResult { + std::vector<TVDiskIdShort> Candidates; + + explicit operator bool() const { // checks whether fail model is correct + return Candidates.empty(); + } + }; + + TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout); + +} // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp index 84b5ac6d7a..5619534fb3 100644 --- a/ydb/core/mind/bscontroller/group_mapper.cpp +++ b/ydb/core/mind/bscontroller/group_mapper.cpp @@ -1,105 +1,29 @@ #include "group_mapper.h" #include "group_geometry_info.h" +#include "group_layout_checker.h" namespace NKikimr::NBsController { - class TGroupMapper::TImpl : TNonCopyable { - class TDomainMapper { - std::unordered_map<TString, ui32> FailDomainId; - - public: - ui32 operator ()(TString item) { - return FailDomainId.emplace(std::move(item), FailDomainId.size() + 1).first->second; - } - }; - - struct TPDiskLayoutPosition { - ui32 RealmGroup = 0; - ui32 RealmInGroup = 0; - ui32 DomainGroup = 0; - ui32 DomainInGroup = 0; - - TPDiskLayoutPosition() = default; - - TPDiskLayoutPosition(TDomainMapper& mapper, const TNodeLocation& location, TPDiskId pdiskId, const TGroupGeometryInfo& geom) { - TStringStream realmGroup, realmInGroup, domainGroup, domainInGroup; - const std::pair<int, TStringStream*> levels[] = { - {geom.GetRealmLevelBegin(), &realmGroup}, - {geom.GetRealmLevelEnd(), &realmInGroup}, - {geom.GetDomainLevelBegin(), &domainGroup}, - {geom.GetDomainLevelEnd(), &domainInGroup} - }; - auto addLevel = [&](int key, const TString& value) { - for (const auto& [reference, stream] : levels) { - if (key < reference) { - Save(stream, std::make_tuple(key, value)); - } - } - }; - for (const auto& [key, value] : location.GetItems()) { - addLevel(key, value); - } - addLevel(255, pdiskId.ToString()); // ephemeral level to distinguish between PDisks on the same node - RealmGroup = mapper(realmGroup.Str()); - RealmInGroup = mapper(realmInGroup.Str()); - DomainGroup = mapper(domainGroup.Str()); - DomainInGroup = mapper(domainInGroup.Str()); - } - - auto AsTuple() const { - return std::tie(RealmGroup, RealmInGroup, DomainGroup, DomainInGroup); - } - - friend bool operator <(const TPDiskLayoutPosition& x, const TPDiskLayoutPosition& y) { - return x.AsTuple() < y.AsTuple(); - } + using namespace NLayoutChecker; - size_t GetDiffIndex(const TPDiskLayoutPosition& other) const { - return RealmGroup != other.RealmGroup ? 0 : - RealmInGroup != other.RealmInGroup ? 1 : - DomainGroup != other.DomainGroup ? 2 : - DomainInGroup != other.DomainInGroup ? 
3 : 4; - } - }; - - struct TFailDomainInfo; - - struct TPDiskInfo { - const TNodeLocation Location; - i64 SpaceAvailable; - bool Usable; - ui32 NumSlots; - const ui32 MaxSlots; + class TGroupMapper::TImpl : TNonCopyable { + struct TPDiskInfo : TPDiskRecord { TPDiskLayoutPosition Position; - const TPDiskId PDiskId; - TStackVec<ui32, 32> Groups; - const bool Operational; - TFailDomainInfo *FailDomain; - bool Matching = false; - - TPDiskInfo(TNodeLocation location, bool usable, ui32 numSlots, ui32 maxSlots, TPDiskLayoutPosition position, - const TPDiskId& pdiskId, const ui32 groupIds[], size_t numGroups, i64 spaceAvailable, - bool operational, TFailDomainInfo *failDomain) - : Location(std::move(location)) - , SpaceAvailable(spaceAvailable) - , Usable(usable) - , NumSlots(numSlots) - , MaxSlots(maxSlots) + bool Matching; + ui32 NumDomainMatchingDisks; + ui32 SkipToNextRealmGroup; + ui32 SkipToNextRealm; + ui32 SkipToNextDomain; + + TPDiskInfo(const TPDiskRecord& pdisk, TPDiskLayoutPosition position) + : TPDiskRecord(pdisk) , Position(std::move(position)) - , PDiskId(pdiskId) - , Groups(groupIds, groupIds + numGroups) - , Operational(operational) - , FailDomain(failDomain) { std::sort(Groups.begin(), Groups.end()); } - TString ToString() const { - return Location.ToString(); - } - bool IsUsable() const { - return Usable && NumSlots < MaxSlots; + return Usable && !Decommitted && NumSlots < MaxSlots; } void InsertGroup(ui32 groupId) { @@ -120,222 +44,428 @@ namespace NKikimr::NBsController { }; using TPDisks = THashMap<TPDiskId, TPDiskInfo>; + using TPDiskByPosition = std::vector<std::pair<TPDiskLayoutPosition, TPDiskInfo*>>; - struct TFailDomainInfo : std::vector<TPDisks::value_type*> { - bool Matching; - ui32 NumMatchingDisks; - }; - - struct TFailDomainGroup : std::unordered_map<ui32, TFailDomainInfo> { - bool Matching; - }; - - struct TFailRealmInfo : std::unordered_map<ui32, TFailDomainGroup> { - bool Matching; - }; - - struct TFailRealmGroup : std::unordered_map<ui32, TFailRealmInfo> { - bool Matching; - }; + struct TComparePDiskByPosition { + bool operator ()(const TPDiskByPosition::value_type& x, const TPDiskLayoutPosition& y) const { + return x.first < y; + } - struct TBox : std::unordered_map<ui32, TFailRealmGroup> { - auto& operator ()(const TPDiskLayoutPosition& p) { - return (*this)[p.RealmGroup][p.RealmInGroup][p.DomainGroup][p.DomainInGroup]; + bool operator ()(const TPDiskLayoutPosition& x, const TPDiskByPosition::value_type& y) const { + return x < y.first; } }; - struct TAllocateContext { - const ui32 NumFailRealms; - const ui32 NumFailDomainsPerFailRealm; - ui32 RealmGroup = 0; // the realm group we are forced to use (if forced, of course) - TStackVec<ui32, 8> RealmInGroup; // indexed by realm number - TStackVec<ui32, 8> DomainGroup; // indexed by realm number - TStackVec<ui32, 32> DomainInGroup; // indexed by realm/domain + using TGroup = std::vector<TPDiskInfo*>; + + struct TAllocator { + TImpl& Self; + const TBlobStorageGroupInfo::TTopology Topology; THashSet<TPDiskId> OldGroupContent; // set of all existing disks in the group, inclusing ones which are replaced - THashSet<TPDiskId> NewGroupContent; // newly generated group content const i64 RequiredSpace; const bool RequireOperational; - TForbiddenPDisks Forbid; - - TAllocateContext(const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational, - TForbiddenPDisks forbid) - : NumFailRealms(geom.GetNumFailRealms()) - , NumFailDomainsPerFailRealm(geom.GetNumFailDomainsPerFailRealm()) - , RealmInGroup(NumFailRealms, 
0) - , DomainGroup(NumFailRealms, 0) - , DomainInGroup(NumFailRealms * NumFailDomainsPerFailRealm, 0) + TForbiddenPDisks ForbiddenDisks; + THashMap<ui32, unsigned> LocalityFactor; + TGroupLayout GroupLayout; + std::optional<TScore> WorstScore; + + TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational, + TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks) + : Self(self) + , Topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), geom.GetNumVDisksPerFailDomain(), true) , RequiredSpace(requiredSpace) , RequireOperational(requireOperational) - , Forbid(std::move(forbid)) - {} - - bool ProcessExistingGroup(const TGroupDefinition& group, const TPDisks& pdisks, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TString& error) { - OldGroupContent = {replacedDiskIds, replacedDiskIds + numReplacedDisks}; - - for (ui32 failRealmIdx = 0, domainThroughIdx = 0; failRealmIdx < group.size(); ++failRealmIdx) { - const auto& realm = group[failRealmIdx]; - for (ui32 failDomainIdx = 0; failDomainIdx < realm.size(); ++failDomainIdx, ++domainThroughIdx) { - const auto& domain = realm[failDomainIdx]; - for (const TPDiskId pdiskId : domain) { - if (pdiskId != TPDiskId()) { - // add to used pdisk set - const bool inserted = OldGroupContent.insert(pdiskId).second; - Y_VERIFY(inserted); - - // find existing pdisk - auto it = pdisks.find(pdiskId); - if (it == pdisks.end()) { - error = TStringBuilder() << "existing group contains missing PDisks"; - return false; - } - const TPDiskInfo& pdisk = it->second; - - // register the disk in context - if (!AddDisk(pdisk, failRealmIdx, domainThroughIdx, error)) { - return false; - } + , ForbiddenDisks(std::move(forbiddenDisks)) + , GroupLayout(Topology) + { + for (const auto& [vdiskId, pdiskId] : replacedDisks) { + OldGroupContent.insert(pdiskId); + } + } + + TGroup ProcessExistingGroup(const TGroupDefinition& group, TString& error) { + TGroup res(Topology.GetTotalVDisksNum()); + + struct TExError { TString error; }; + + try { + Traverse(group, [&](TVDiskIdShort vdisk, TPDiskId pdiskId) { + if (pdiskId != TPDiskId()) { + const ui32 orderNumber = Topology.GetOrderNumber(vdisk); + + const auto it = Self.PDisks.find(pdiskId); + if (it == Self.PDisks.end()) { + throw TExError{TStringBuilder() << "existing group contains missing PDiskId# " << pdiskId}; + } + TPDiskInfo& pdisk = it->second; + res[orderNumber] = &pdisk; + + const auto [_, inserted] = OldGroupContent.insert(pdiskId); + if (!inserted) { + throw TExError{TStringBuilder() << "group contains duplicate PDiskId# " << pdiskId}; + } + + if (!pdisk.Decommitted) { + AddUsedDisk(pdisk); + GroupLayout.AddDisk(pdisk.Position, orderNumber); } } - } + }); + } catch (const TExError& e) { + error = e.error; + return {}; } - return true; + + return res; } - bool AddDisk(const TPDiskInfo& pdisk, ui32 failRealmIdx, ui32 domainThroughIdx, TString& error) { - auto update = [](ui32& place, ui32 value) { - const ui32 prev = std::exchange(place, value); - return !prev || prev == value; - }; - if (!update(RealmGroup, pdisk.Position.RealmGroup)) { - error = "group contains PDisks from different realm groups"; - } else if (!update(RealmInGroup[failRealmIdx], pdisk.Position.RealmInGroup)) { - error = "group contains PDisks from different realms within same realm"; - } else if (!update(DomainGroup[failRealmIdx], pdisk.Position.DomainGroup)) { - error = "group contains PDisks from different domain groups within same realm"; - 
} else if (!update(DomainInGroup[domainThroughIdx], pdisk.Position.DomainInGroup)) { - error = "group contains PDisks from different domain groups within same realm"; - } else if (!NewGroupContent.insert(pdisk.PDiskId).second) { - error = "group contains duplicate PDisks"; - } else { - return true; + void Decompose(const TGroup& in, TGroupDefinition& out) { + for (ui32 i = 0; i < in.size(); ++i) { + const TVDiskIdShort vdisk = Topology.GetVDiskId(i); + out[vdisk.FailRealm][vdisk.FailDomain][vdisk.VDisk] = in[i]->PDiskId; } - return false; } bool DiskIsUsable(const TPDiskInfo& pdisk) const { if (!pdisk.IsUsable()) { return false; // disk is not usable in this case } - if (OldGroupContent.count(pdisk.PDiskId) || NewGroupContent.count(pdisk.PDiskId) || Forbid.count(pdisk.PDiskId)) { + if (OldGroupContent.contains(pdisk.PDiskId) || ForbiddenDisks.contains(pdisk.PDiskId)) { return false; // can't allow duplicate disks } if (RequireOperational && !pdisk.Operational) { return false; } - return pdisk.SpaceAvailable >= RequiredSpace; + if (pdisk.SpaceAvailable < RequiredSpace) { + return false; + } + return true; } - }; - class THelper { - TImpl& Self; - TAllocateContext& Ctx; - std::unordered_map<ui32, unsigned> LocalityFactor; + TPDiskByPosition SetupMatchingDisks(ui32 maxScore) { + TPDiskByPosition res; + res.reserve(Self.PDiskByPosition.size()); - public: - THelper(TImpl& self, TAllocateContext& ctx) - : Self(self) - , Ctx(ctx) - { - for (const TPDiskId& pdiskId : Ctx.NewGroupContent) { - if (const auto it = Self.PDisks.find(pdiskId); it != Self.PDisks.end()) { - for (ui32 groupId : it->second.Groups) { - ++LocalityFactor[groupId]; + ui32 realmGroupBegin = 0; + ui32 realmBegin = 0; + ui32 domainBegin = 0; + TPDiskLayoutPosition prev; + + std::vector<ui32> numMatchingDisksInDomain(Self.DomainMapper.GetIdCount(), 0); + for (const auto& [position, pdisk] : Self.PDiskByPosition) { + pdisk->Matching = pdisk->GetPickerScore() <= maxScore && DiskIsUsable(*pdisk); + if (pdisk->Matching) { + if (position.RealmGroup != prev.RealmGroup) { + for (; realmGroupBegin < res.size(); ++realmGroupBegin) { + res[realmGroupBegin].second->SkipToNextRealmGroup = res.size() - realmGroupBegin; + } } - } else { - Y_FAIL(); - } - } - if (const ui32 id = Ctx.RealmGroup) { - auto& realmGroup = Self.Box.at(id); - Y_VERIFY_DEBUG(realmGroup.Matching); - realmGroup.Matching = false; - for (size_t i = 0; i < Ctx.NumFailRealms; ++i) { - if (const ui32 id = Ctx.RealmInGroup[i]) { - auto& realm = realmGroup.at(id); - Y_VERIFY_DEBUG(realm.Matching); - realm.Matching = false; - if (const ui32 id = Ctx.DomainGroup[i]) { - auto& domainGroup = realm.at(id); - Y_VERIFY_DEBUG(domainGroup.Matching); - domainGroup.Matching = false; - for (size_t j = 0; j < Ctx.NumFailDomainsPerFailRealm; ++j) { - const size_t domainThroughIdx = i * Ctx.NumFailDomainsPerFailRealm + j; - if (const ui32 id = Ctx.DomainInGroup[domainThroughIdx]) { - auto& domain = domainGroup.at(id); - Y_VERIFY_DEBUG(domain.Matching); - domain.Matching = false; - } - } + if (position.Realm != prev.Realm) { + for (; realmBegin < res.size(); ++realmBegin) { + res[realmBegin].second->SkipToNextRealm = res.size() - realmBegin; } } + if (position.Domain != prev.Domain) { + for (; domainBegin < res.size(); ++domainBegin) { + res[domainBegin].second->SkipToNextDomain = res.size() - domainBegin; + } + } + prev = position; + + res.emplace_back(position, pdisk); + ++numMatchingDisksInDomain[position.Domain.Index()]; } } + for (; realmGroupBegin < res.size(); ++realmGroupBegin) { + 
res[realmGroupBegin].second->SkipToNextRealmGroup = res.size() - realmGroupBegin; + } + for (; realmBegin < res.size(); ++realmBegin) { + res[realmBegin].second->SkipToNextRealm = res.size() - realmBegin; + } + for (; domainBegin < res.size(); ++domainBegin) { + res[domainBegin].second->SkipToNextDomain = res.size() - domainBegin; + } + for (const auto& [position, pdisk] : res) { + pdisk->NumDomainMatchingDisks = numMatchingDisksInDomain[position.Domain.Index()]; + } + + return std::move(res); } - TPDiskId AddBestDisk(ui32 realmIdx, ui32 domainThroughIdx) { - TPDiskInfo *pdisk = nullptr; - auto descend = [&](auto& level, ui32& id, ui32 TPDiskLayoutPosition::*pos) -> auto& { - if (!id) { - pdisk = pdisk ? pdisk : FindBestDisk(level); - id = pdisk->Position.*pos; - level[id].Matching = false; // do not allow this level for further selection - } else if (pdisk) { - Y_VERIFY(id == pdisk->Position.*pos); - } - return level[id]; + struct TUndoLog { + struct TItem { + ui32 Index; + TPDiskInfo *PDisk; }; - auto& realmGroup = descend(Self.Box, Ctx.RealmGroup, &TPDiskLayoutPosition::RealmGroup); - auto& realm = descend(realmGroup, Ctx.RealmInGroup[realmIdx], &TPDiskLayoutPosition::RealmInGroup); - auto& domainGroup = descend(realm, Ctx.DomainGroup[realmIdx], &TPDiskLayoutPosition::DomainGroup); - auto& domain = descend(domainGroup, Ctx.DomainInGroup[domainThroughIdx], &TPDiskLayoutPosition::DomainInGroup); - pdisk = pdisk ? pdisk : FindBestDisk(domain); - TString error; - const bool success = Ctx.AddDisk(*pdisk, realmIdx, domainThroughIdx, error); - Y_VERIFY(success, "AddDisk: %s", error.data()); - pdisk->Matching = false; // disable this disk for further selection + std::vector<TItem> Items; + + void Log(ui32 index, TPDiskInfo *pdisk) { + Items.push_back({index, pdisk}); + } + size_t GetPosition() const { + return Items.size(); + } + }; + + void AddDiskViaUndoLog(TUndoLog& undo, TGroup& group, ui32 index, TPDiskInfo *pdisk) { + undo.Log(index, pdisk); + group[index] = pdisk; AddUsedDisk(*pdisk); + GroupLayout.AddDisk(pdisk->Position, index); + WorstScore.reset(); // invalidate score + } - return pdisk->PDiskId; + void Revert(TUndoLog& undo, TGroup& group, size_t until) { + for (; undo.Items.size() > until; undo.Items.pop_back()) { + const auto& item = undo.Items.back(); + group[item.Index] = nullptr; + RemoveUsedDisk(*item.PDisk); + GroupLayout.RemoveDisk(item.PDisk->Position, item.Index); + WorstScore.reset(); // invalidate score + } } - private: + bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group) { + // determine PDisks that fit our requirements (including score) + auto v = SetupMatchingDisks(maxScore); + + // find which entities we need to allocate -- whole group, some realms, maybe some domains within specific realms? 
+ bool isEmptyGroup = true; + std::vector<bool> isEmptyRealm(Topology.GetTotalFailRealmsNum(), true); + std::vector<bool> isEmptyDomain(Topology.GetTotalFailDomainsNum(), true); + for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) { + if (group[orderNumber]) { + const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber); + isEmptyGroup = false; + isEmptyRealm[vdisk.FailRealm] = false; + const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk); + isEmptyDomain[domainIdx] = false; + } + } + + auto allocate = [&](auto what, ui32 index) { + TDynBitMap forbiddenEntities; + forbiddenEntities.Reserve(Self.DomainMapper.GetIdCount()); + if (!AllocateWholeEntity(what, group, undo, index, {v.begin(), v.end()}, forbiddenEntities)) { + Revert(undo, group, 0); + return false; + } + return true; + }; + + if (isEmptyGroup) { + return allocate(TAllocateWholeGroup(), 0); + } + + const ui32 numFailDomainsPerFailRealm = Topology.GetNumFailDomainsPerFailRealm(); + const ui32 numVDisksPerFailDomain = Topology.GetNumVDisksPerFailDomain(); + ui32 domainOrderNumber = 0; + ui32 orderNumber = 0; + + // scan all fail realms and allocate missing realms or their parts + for (ui32 failRealmIdx = 0; failRealmIdx < isEmptyRealm.size(); ++failRealmIdx) { + if (isEmptyRealm[failRealmIdx]) { + // we have an empty realm -- we have to allocate it fully + if (!allocate(TAllocateWholeRealm(), failRealmIdx)) { + return false; + } + // skip to next realm + domainOrderNumber += numFailDomainsPerFailRealm; + orderNumber += numVDisksPerFailDomain * numFailDomainsPerFailRealm; + continue; + } + + // scan through domains of this realm, find unallocated ones + for (ui32 failDomainIdx = 0; failDomainIdx < numFailDomainsPerFailRealm; ++failDomainIdx, ++domainOrderNumber) { + if (isEmptyDomain[domainOrderNumber]) { + // try to allocate full domain + if (!allocate(TAllocateWholeDomain(), domainOrderNumber)) { + return false; + } + // skip to next domain + orderNumber += numVDisksPerFailDomain; + continue; + } + + // scan individual disks of the domain and fill gaps + for (ui32 vdiskIdx = 0; vdiskIdx < numVDisksPerFailDomain; ++vdiskIdx, ++orderNumber) { + if (!group[orderNumber] && !allocate(TAllocateDisk(), orderNumber)) { + return false; + } + } + } + } + + Y_VERIFY(domainOrderNumber == Topology.GetTotalFailDomainsNum()); + Y_VERIFY(orderNumber == Topology.GetTotalVDisksNum()); + + return true; + } + + using TAllocateResult = TPDiskLayoutPosition*; + + struct TAllocateDisk {}; + + struct TAllocateWholeDomain { + static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetNumVDisksPerFailDomain; + using TNestedEntity = TAllocateDisk; + + static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) { + scope = x.Domain; + return {x, x}; + } + }; + + struct TAllocateWholeRealm { + static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetNumFailDomainsPerFailRealm; + using TNestedEntity = TAllocateWholeDomain; + + static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const TPDiskLayoutPosition& x, TEntityId& scope) { + scope = x.Realm; + return {{x.RealmGroup, x.Realm, TEntityId::Min()}, {x.RealmGroup, x.Realm, TEntityId::Max()}}; + } + }; + + struct TAllocateWholeGroup { + static constexpr auto GetEntityCount = &TBlobStorageGroupInfo::TTopology::GetTotalFailRealmsNum; + using TNestedEntity = TAllocateWholeRealm; + + static std::pair<TPDiskLayoutPosition, TPDiskLayoutPosition> MakeRange(const 
TPDiskLayoutPosition& x, TEntityId& scope) { + scope = x.RealmGroup; + return {{x.RealmGroup, TEntityId::Min(), TEntityId::Min()}, {x.RealmGroup, TEntityId::Max(), TEntityId::Max()}}; + } + }; + + using TDiskRange = std::pair<TPDiskByPosition::const_iterator, TPDiskByPosition::const_iterator>; + template<typename T> - TPDiskInfo *FindBestDisk(T& level) { - TPDiskInfo *res = nullptr; - for (auto& [id, item] : level) { - if (item.Matching) { - TPDiskInfo *candidate = FindBestDisk(item); - Y_VERIFY(candidate); - res = !res || DiskIsBetter(*candidate, *res) ? candidate : res; + TAllocateResult AllocateWholeEntity(T, TGroup& group, TUndoLog& undo, ui32 parentEntityIndex, TDiskRange range, + TDynBitMap& forbiddenEntities) { + // number of enclosed child entities within this one + const ui32 entityCount = (Topology.*T::GetEntityCount)(); + Y_VERIFY(entityCount); + parentEntityIndex *= entityCount; + // remember current undo stack size + const size_t undoPosition = undo.GetPosition(); + + for (;;) { + auto [from, to] = range; + TPDiskLayoutPosition *prefix; + TEntityId scope; + + for (ui32 index = 0;; ++index) { + // allocate nested entity + prefix = AllocateWholeEntity(typename T::TNestedEntity(), group, undo, parentEntityIndex + index, + {from, to}, forbiddenEntities); + + if (prefix) { + if (!index) { + // reduce range to specific realm/domain entity + auto [min, max] = T::MakeRange(*prefix, scope); + from = std::lower_bound(from, to, min, TComparePDiskByPosition()); + to = std::upper_bound(from, to, max, TComparePDiskByPosition()); + } + if (index + 1 == entityCount) { + // disable filled entity from further selection if it was really allocated + forbiddenEntities.Set(scope.Index()); + return prefix; + } + } else if (index) { + // disable just checked entity (to prevent its selection again) + forbiddenEntities.Set(scope.Index()); + // try another entity at this level + Revert(undo, group, undoPosition); + // break the loop and retry + break; + } else { + // no chance to allocate new entity, exit + return {}; + } } } - Y_VERIFY(res); - return res; } - TPDiskInfo *FindBestDisk(TFailDomainInfo& level) { - TPDiskInfo *res = nullptr; - for (TPDisks::value_type *it : level) { - auto& [pdiskId, pdisk] = *it; - if (pdisk.Matching) { - res = !res || DiskIsBetter(pdisk, *res) ? 
&pdisk : res; + TAllocateResult AllocateWholeEntity(TAllocateDisk, TGroup& group, TUndoLog& undo, ui32 index, TDiskRange range, + TDynBitMap& forbiddenEntities) { + TPDiskInfo *pdisk = group[index]; + Y_VERIFY(!pdisk); + auto process = [this, &pdisk](TPDiskInfo *candidate) { + if (!pdisk || DiskIsBetter(*candidate, *pdisk)) { + pdisk = candidate; + } + }; + FindMatchingDiskBasedOnScore(process, group, index, range, forbiddenEntities); + if (pdisk) { + AddDiskViaUndoLog(undo, group, index, pdisk); + pdisk->Matching = false; + return &pdisk->Position; + } else { + return {}; + } + } + + TScore CalculateWorstScoreWithCache(const TGroup& group) { + if (!WorstScore) { + // find the worst disk from a position of layout correctness and use it as a milestone for other + // disks -- they can't be misplaced worse + TScore worstScore; + for (ui32 i = 0; i < Topology.GetTotalVDisksNum(); ++i) { + if (TPDiskInfo *pdisk = group[i]; pdisk && !pdisk->Decommitted) { + // calculate score for this pdisk, removing it from the set first -- to prevent counting itself + const TScore score = GroupLayout.GetExcludedDiskScore(pdisk->Position, i); + if (worstScore.BetterThan(score)) { + worstScore = score; + } + } } + WorstScore = worstScore; + } + return *WorstScore; + } + + template<typename TCallback> + void FindMatchingDiskBasedOnScore( + TCallback&& cb, // callback to be invoked for every matching candidate + const TGroup& group, // group with peer disks + ui32 orderNumber, // order number of disk being allocated + TDiskRange range, // range of PDisk candidates to scan + TDynBitMap& forbiddenEntities) { // a set of forbidden TEntityId's prevented from allocation + // first, find the best score for current group layout -- we can't make failure model inconsistency + // any worse than it already is + TScore bestScore = CalculateWorstScoreWithCache(group); + + std::vector<TPDiskInfo*> candidates; + + // scan the candidate range + while (range.first != range.second) { + const auto& [position, pdisk] = *range.first++; + + // skip inappropriate disks, whole realm groups, realms and domains + if (!pdisk->Matching) { + // just do nothing, skip this candidate disk + } else if (forbiddenEntities[position.RealmGroup.Index()]) { + range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealmGroup - 1); + } else if (forbiddenEntities[position.Realm.Index()]) { + range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextRealm - 1); + } else if (forbiddenEntities[position.Domain.Index()]) { + range.first += Min<ui32>(std::distance(range.first, range.second), pdisk->SkipToNextDomain - 1); + } else { + const TScore score = GroupLayout.GetCandidateScore(position, orderNumber); + if (score.BetterThan(bestScore)) { + candidates.clear(); + bestScore = score; + } + if (score.SameAs(bestScore)) { + candidates.push_back(pdisk); + } + } + } + + for (TPDiskInfo *pdisk : candidates) { + cb(pdisk); } - Y_VERIFY(res); - return res; } bool DiskIsBetter(TPDiskInfo& pretender, TPDiskInfo& king) const { @@ -344,10 +474,8 @@ namespace NKikimr::NBsController { } else if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) { return true; } else { - const TFailDomainInfo& pretenderDomain = Self.Box(pretender.Position); - const TFailDomainInfo& kingDomain = Self.Box(king.Position); - if (pretenderDomain.NumMatchingDisks != kingDomain.NumMatchingDisks) { - return pretenderDomain.NumMatchingDisks > kingDomain.NumMatchingDisks; + if (pretender.NumDomainMatchingDisks != 
king.NumDomainMatchingDisks) { + return pretender.NumDomainMatchingDisks > king.NumDomainMatchingDisks; } return pretender.PDiskId < king.PDiskId; } @@ -369,6 +497,14 @@ namespace NKikimr::NBsController { } } + void RemoveUsedDisk(const TPDiskInfo& pdisk) { + for (ui32 groupId : pdisk.Groups) { + if (!--LocalityFactor[groupId]) { + LocalityFactor.erase(groupId); + } + } + } + unsigned GetLocalityFactor(const TPDiskInfo& pdisk) const { unsigned res = 0; for (ui32 groupId : pdisk.Groups) { @@ -388,7 +524,8 @@ namespace NKikimr::NBsController { const bool Randomize; TDomainMapper DomainMapper; TPDisks PDisks; - TBox Box; + TPDiskByPosition PDiskByPosition; + bool Dirty = false; public: TImpl(TGroupGeometryInfo geom, bool randomize) @@ -396,18 +533,17 @@ namespace NKikimr::NBsController { , Randomize(randomize) {} - bool RegisterPDisk(TPDiskId pdiskId, TNodeLocation location, bool usable, ui32 numSlots, ui32 maxSlots, - const ui32 groupIds[], size_t numGroups, i64 spaceAvailable, bool operational) { + bool RegisterPDisk(const TPDiskRecord& pdisk) { // calculate disk position - const TPDiskLayoutPosition p(DomainMapper, location, pdiskId, Geom); + const TPDiskLayoutPosition p(DomainMapper, pdisk.Location, pdisk.PDiskId, Geom); // insert PDisk into specific map TPDisks::iterator it; bool inserted; - std::tie(it, inserted) = PDisks.try_emplace(pdiskId, std::move(location), usable, numSlots, maxSlots, - p, pdiskId, groupIds, numGroups, spaceAvailable, operational, &Box(p)); + std::tie(it, inserted) = PDisks.try_emplace(pdisk.PDiskId, pdisk, p); if (inserted) { - it->second.FailDomain->push_back(&*it); + PDiskByPosition.emplace_back(it->second.Position, &it->second); + Dirty = true; } return inserted; @@ -416,16 +552,9 @@ namespace NKikimr::NBsController { void UnregisterPDisk(TPDiskId pdiskId) { const auto it = PDisks.find(pdiskId); Y_VERIFY(it != PDisks.end()); - TFailDomainInfo& fdom = *it->second.FailDomain; - bool erased = false; - for (auto x = fdom.begin(); x != fdom.end(); ++x) { - if (*x == &*it) { - fdom.erase(x); - erased = true; - break; - } - } - Y_VERIFY(erased); + auto x = std::remove(PDiskByPosition.begin(), PDiskByPosition.end(), std::make_pair(it->second.Position, &it->second)); + Y_VERIFY(x + 1 == PDiskByPosition.end()); + PDiskByPosition.pop_back(); PDisks.erase(it); } @@ -435,332 +564,142 @@ namespace NKikimr::NBsController { it->second.SpaceAvailable += increment; } - TString FormatPDisks(const TAllocateContext& ctx) const { + TString FormatPDisks(const TAllocator& allocator) const { TStringStream s; s << "PDisks# "; - bool first1 = true; - for (const auto& [id, realmGroup] : Box) { - s << (std::exchange(first1, false) ? "" : " | ") << "<"; - bool first2 = true; - for (const auto& [id, realm] : realmGroup) { - s << (std::exchange(first2, false) ? "" : " ") << "{"; - bool first3 = true; - for (const auto& [id, domainGroup] : realm) { - s << (std::exchange(first3, false) ? "" : " | ") << "["; - bool first4 = true; - for (const auto& [id, domain] : domainGroup) { - if (!domain.empty()) { - s << (std::exchange(first4, false) ? "" : " ") << "("; - std::vector<TPDisks::value_type*> v(domain); - auto comp = [](const auto *x, const auto *y) { return x->first < y->first; }; - std::sort(v.begin(), v.end(), comp); - bool first5 = true; - for (const TPDisks::value_type *it : v) { - s << (std::exchange(first5, false) ? 
"" : " ") - << it->first.NodeId << ":" << it->first.PDiskId; - if (ctx.OldGroupContent.count(it->first)) { - s << "*"; - } - const char *minus = "-"; - if (ctx.Forbid.count(it->second.PDiskId)) { - s << std::exchange(minus, "") << "f"; - } - if (!it->second.Usable) { - s << std::exchange(minus, "") << "u"; - } - if (it->second.NumSlots >= it->second.MaxSlots) { - s << std::exchange(minus, "") << "m"; - } - if (it->second.NumSlots >= it->second.MaxSlots) { - s << std::exchange(minus, "") << "s"; - } - if (it->second.SpaceAvailable < ctx.RequiredSpace) { - s << std::exchange(minus, "") << "v"; - } - if (!it->second.Operational) { - s << std::exchange(minus, "") << "o"; - } - if (ctx.DiskIsUsable(it->second)) { - s << "+"; - } - } - s << ")"; - } - } - s << "]"; + if (!PDiskByPosition.empty()) { + s << "{[("; + TPDiskLayoutPosition prevPosition = PDiskByPosition.front().first; + const char *space = ""; + for (const auto& [position, pdisk] : PDiskByPosition) { + if (prevPosition != position) { + s << (prevPosition.Domain != position.Domain ? ")" : "") + << (prevPosition.Realm != position.Realm ? "]" : "") + << (prevPosition.RealmGroup != position.RealmGroup ? "} {" : "") + << (prevPosition.Realm != position.Realm ? "[" : "") + << (prevPosition.Domain != position.Domain ? "(" : ""); + space = ""; + } + + s << std::exchange(space, " ") << pdisk->PDiskId; + + if (allocator.OldGroupContent.contains(pdisk->PDiskId)) { + s << "*"; + } + const char *minus = "-"; + if (allocator.ForbiddenDisks.contains(pdisk->PDiskId)) { + s << std::exchange(minus, "") << "f"; + } + if (!pdisk->Usable) { + s << std::exchange(minus, "") << "u"; } - s << "}"; + if (pdisk->Decommitted) { + s << std::exchange(minus, "") << "d"; + } + if (pdisk->NumSlots >= pdisk->MaxSlots) { + s << std::exchange(minus, "") << "s[" << pdisk->NumSlots << "/" << pdisk->MaxSlots << "]"; + } + if (pdisk->SpaceAvailable < allocator.RequiredSpace) { + s << std::exchange(minus, "") << "v"; + } + if (!pdisk->Operational) { + s << std::exchange(minus, "") << "o"; + } + if (allocator.DiskIsUsable(*pdisk)) { + s << "+"; + } + + prevPosition = position; } - s << ">"; + s << ")]}"; + } else { + s << "<empty>"; } return s.Str(); } - bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, + bool AllocateGroup(ui32 groupId, TGroupDefinition& groupDefinition, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, + TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { - // fill in the allocation context - TAllocateContext ctx(Geom, requiredSpace, requireOperational, std::move(forbid)); - if (!ctx.ProcessExistingGroup(group, PDisks, replacedDiskIds, numReplacedDisks, error)) { - return false; + if (Dirty) { + std::sort(PDiskByPosition.begin(), PDiskByPosition.end()); + Dirty = false; } // create group of required size, if it is not created yet - if (!Geom.ResizeGroup(group)) { + if (!Geom.ResizeGroup(groupDefinition)) { error = "incorrect existing group"; return false; } - // if the group is already created, check for missing entities - bool hasMissingEntities = false; - for (const auto& realm : group) { - for (const auto& domain : realm) { - for (const TPDiskId& pdiskId : domain) { - if (pdiskId == TPDiskId()) { - hasMissingEntities = true; - break; - } - } - if (hasMissingEntities) { - break; - } - } - if (hasMissingEntities) { - break; - } - } - if (!hasMissingEntities) { - return true; // 
group is okay - } - - // adjust number of slots - for (TPDiskId pdiskId : ctx.OldGroupContent) { - --PDisks.at(pdiskId).NumSlots; - } - for (size_t i = 0; i < numReplacedDisks; ++i) { - PDisks.at(replacedDiskIds[i]).EraseGroup(groupId); - } - - // check if we can map a group only with PDisks that have NumSlots <= nums - if (!FindMatchingDisks(ctx)) { - // undo changes to the mapper content - for (TPDiskId pdiskId : ctx.OldGroupContent) { - ++PDisks.at(pdiskId).NumSlots; - } - for (size_t i = 0; i < numReplacedDisks; ++i) { - PDisks.at(replacedDiskIds[i]).InsertGroup(groupId); - } - error = "no group options " + FormatPDisks(ctx); + // fill in the allocation context + TAllocator allocator(*this, Geom, requiredSpace, requireOperational, std::move(forbid), replacedDisks); + TGroup group = allocator.ProcessExistingGroup(groupDefinition, error); + if (group.empty()) { return false; } - - // fill in missing PDisk entities - THelper helper(*this, ctx); - for (ui32 realmIdx = 0, domainThroughIdx = 0; realmIdx < ctx.NumFailRealms; ++realmIdx) { - for (ui32 domainIdx = 0; domainIdx < ctx.NumFailDomainsPerFailRealm; ++domainIdx, ++domainThroughIdx) { - for (TPDiskId& pdiskId : group[realmIdx][domainIdx]) { - if (pdiskId == TPDiskId()) { - pdiskId = helper.AddBestDisk(realmIdx, domainThroughIdx); - } - } - } - } - - // adjust number of slots - for (TPDiskId pdiskId : ctx.NewGroupContent) { - if (const auto it = PDisks.find(pdiskId); it != PDisks.end()) { - ++it->second.NumSlots; - it->second.InsertGroup(groupId); - } else { - Y_FAIL(); + bool ok = true; + for (TPDiskInfo *pdisk : group) { + if (!pdisk) { + ok = false; + break; } } - - return true; - } - - class TScorePicker { - static constexpr size_t MaxStaticItems = 16; - ui32 NumFake = 0; - TStackVec<ui32, MaxStaticItems> StaticHist; - std::map<ui32, ui32> ScoreHist; - - public: - static constexpr ui32 FakeScore = Max<ui32>(); - - public: - void AddFake() { - ++NumFake; + if (ok) { + return true; } - void Add(ui32 score) { - if (score < MaxStaticItems) { - if (StaticHist.size() <= score) { - StaticHist.resize(score + 1); - } - ++StaticHist[score]; - } else { - ++ScoreHist[score]; + // calculate score table + std::vector<ui32> scores; + for (const auto& [pdiskId, pdisk] : PDisks) { + if (allocator.DiskIsUsable(pdisk)) { + scores.push_back(pdisk.GetPickerScore()); } } - - std::optional<ui32> CalculateMaxScore(ui32 threshold) const { - if (threshold <= NumFake) { - return FakeScore; + std::sort(scores.begin(), scores.end()); + scores.erase(std::unique(scores.begin(), scores.end()), scores.end()); + + // bisect scores to find optimal working one + std::optional<TGroup> result; + ui32 begin = 0, end = scores.size(); + while (begin < end) { + const ui32 mid = begin + (end - begin) / 2; + TAllocator::TUndoLog undo; + if (allocator.FillInGroup(scores[mid], undo, group)) { + result = group; + allocator.Revert(undo, group, 0); + end = mid; } else { - threshold -= NumFake; - } - for (size_t score = 0; score < StaticHist.size(); ++score) { - if (threshold <= StaticHist[score]) { - return score; - } else { - threshold -= StaticHist[score]; - } + begin = mid + 1; } - for (const auto& [score, num] : ScoreHist) { - if (threshold <= num) { - return score; - } else { - threshold -= num; - } - } - Y_VERIFY_DEBUG(threshold); - return std::nullopt; } - void Cascade(TScorePicker& parent, ui32 threshold) { - if (std::optional<ui32> maxScore = CalculateMaxScore(threshold)) { - if (*maxScore != FakeScore) { - parent.Add(*maxScore); - } else { - parent.AddFake(); - } + if 
(result) { + for (const auto& [vdiskId, pdiskId] : replacedDisks) { + const auto it = PDisks.find(pdiskId); + Y_VERIFY(it != PDisks.end()); + TPDiskInfo& pdisk = it->second; + --pdisk.NumSlots; + pdisk.EraseGroup(groupId); } - } - }; - - bool FindMatchingDisks(const TAllocateContext& ctx) { - struct TMatchingDisk { - TPDiskInfo *PDisk; - bool IsInGroup; - ui32 Score = 0; - - TPDiskInfo *operator ->() const { return PDisk; } - bool operator <(const TMatchingDisk& other) const { return PDisk->Position < other->Position; } - }; - - TScorePicker realmGroupPicker; - std::vector<TMatchingDisk> matchingDisks; - for (auto& [id, realmGroup] : Box) { - TScorePicker realmPicker; - realmGroup.Matching = false; - for (auto& [id, realm] : realmGroup) { - TScorePicker domainGroupPicker; - realm.Matching = false; - for (auto& [id, domainGroup] : realm) { - TScorePicker domainPicker; - domainGroup.Matching = false; - for (auto& [id, domain] : domainGroup) { - TScorePicker diskPicker; - domain.Matching = false; - domain.NumMatchingDisks = 0; - for (TPDisks::value_type *it : domain) { - auto& [pdiskId, pdisk] = *it; - pdisk.Matching = ctx.DiskIsUsable(pdisk); - if (pdisk.Matching) { - const ui32 score = pdisk.GetPickerScore(); - matchingDisks.push_back(TMatchingDisk{&pdisk, false, score}); - diskPicker.Add(score); - } else if (ctx.NewGroupContent.count(pdiskId)) { - // we create a fake record to keep correct count of fail realms and domains, but - // this particular one never gets selected - matchingDisks.push_back(TMatchingDisk{&pdisk, true}); - diskPicker.AddFake(); - } - } - diskPicker.Cascade(domainPicker, Geom.GetNumVDisksPerFailDomain()); - } - domainPicker.Cascade(domainGroupPicker, Geom.GetNumFailDomainsPerFailRealm()); + ui32 numZero = 0; + for (ui32 i = 0; i < allocator.Topology.GetTotalVDisksNum(); ++i) { + if (!group[i]) { + ++numZero; + TPDiskInfo *pdisk = result->at(i); + ++pdisk->NumSlots; + pdisk->InsertGroup(groupId); } - domainGroupPicker.Cascade(realmPicker, 1); } - realmPicker.Cascade(realmGroupPicker, Geom.GetNumFailRealms()); - } - - bool boxMatching = false; - - if (const std::optional<ui32> maxScore = realmGroupPicker.CalculateMaxScore(1)) { - Y_VERIFY_DEBUG(*maxScore != TScorePicker::FakeScore); - - // remove all mismatched candidate disks and sort them according to their position - auto remove = [maxScore = *maxScore](const TMatchingDisk& p) { - p->Matching = p->Matching && p.Score <= maxScore; - return !p->Matching && !p.IsInGroup; - }; - const auto begin = matchingDisks.begin(); - const auto end = matchingDisks.end(); - matchingDisks.erase(std::remove_if(begin, end, remove), end); - std::sort(matchingDisks.begin(), matchingDisks.end()); - - std::optional<TPDiskLayoutPosition> prev; - - ui32 numMatchingRealmGroupsInBox = 0; - ui32 numMatchingRealmsInRealmGroup = 0; - ui32 numMatchingDomainGroupsInRealm = 0; - ui32 numMatchingDomainsInDomainGroup = 0; - ui32 numMatchingDisksInDomain = 0; - - TFailRealmGroup *realmGroup = nullptr; - TFailRealmInfo *realm = nullptr; - TFailDomainGroup *domainGroup = nullptr; - TFailDomainInfo *domain = nullptr; - - for (const auto& disk : matchingDisks) { - const auto& cur = disk->Position; - switch (prev ? 
prev->GetDiffIndex(cur) : 0) { - case 0: - numMatchingRealmsInRealmGroup = 0; - realmGroup = &Box[cur.RealmGroup]; - [[fallthrough]]; - case 1: - numMatchingDomainGroupsInRealm = 0; - realm = &(*realmGroup)[cur.RealmInGroup]; - [[fallthrough]]; - case 2: - numMatchingDomainsInDomainGroup = 0; - domainGroup = &(*realm)[cur.DomainGroup]; - [[fallthrough]]; - case 3: - numMatchingDisksInDomain = 0; - domain = &(*domainGroup)[cur.DomainInGroup]; - prev = cur; - [[fallthrough]]; - case 4: - break; - } - - ++domain->NumMatchingDisks; - const std::tuple<ui32&, ui32, bool&> items[] = { - {numMatchingDisksInDomain, Geom.GetNumVDisksPerFailDomain(), domain->Matching }, - {numMatchingDomainsInDomainGroup, Geom.GetNumFailDomainsPerFailRealm(), domainGroup->Matching}, - {numMatchingDomainGroupsInRealm, 1, realm->Matching }, - {numMatchingRealmsInRealmGroup, Geom.GetNumFailRealms(), realmGroup->Matching }, - {numMatchingRealmGroupsInBox, 1, boxMatching }, - }; - for (const auto& item : items) { - if (++std::get<0>(item) == std::get<1>(item)) { - std::get<2>(item) = true; - } else { - break; - } - } - } - Y_VERIFY(boxMatching); + Y_VERIFY(numZero == allocator.Topology.GetTotalVDisksNum() || numZero == replacedDisks.size()); + allocator.Decompose(*result, groupDefinition); + return true; + } else { + error = "no group options " + FormatPDisks(allocator); + return false; } - - return boxMatching; } }; @@ -770,10 +709,8 @@ namespace NKikimr::NBsController { TGroupMapper::~TGroupMapper() = default; - bool TGroupMapper::RegisterPDisk(TPDiskId pdiskId, TNodeLocation location, bool usable, ui32 numSlots, ui32 maxSlots, - const ui32 groupIds[], size_t numGroups, i64 spaceAvailable, bool operational) { - return Impl->RegisterPDisk(pdiskId, std::move(location), usable, numSlots, maxSlots, groupIds, numGroups, - spaceAvailable, operational); + bool TGroupMapper::RegisterPDisk(const TPDiskRecord& pdisk) { + return Impl->RegisterPDisk(pdisk); } void TGroupMapper::UnregisterPDisk(TPDiskId pdiskId) { @@ -784,10 +721,9 @@ namespace NKikimr::NBsController { return Impl->AdjustSpaceAvailable(pdiskId, increment); } - bool TGroupMapper::AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { - return Impl->AllocateGroup(groupId, group, replacedDiskIds, numReplacedDisks, std::move(forbid), - requiredSpace, requireOperational, error); + bool TGroupMapper::AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, + TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { + return Impl->AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, requireOperational, error); } } // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/group_mapper.h b/ydb/core/mind/bscontroller/group_mapper.h index f66ba3171c..991a636bf3 100644 --- a/ydb/core/mind/bscontroller/group_mapper.h +++ b/ydb/core/mind/bscontroller/group_mapper.h @@ -18,13 +18,37 @@ namespace NKikimr { using TGroupDefinition = TVector<TVector<TVector<TPDiskId>>>; // Realm/Domain/Disk using TForbiddenPDisks = std::unordered_set<TPDiskId, THash<TPDiskId>>; + template<typename T> + static void Traverse(const TGroupDefinition& group, T&& callback) { + for (ui32 failRealmIdx = 0; failRealmIdx != group.size(); ++failRealmIdx) { + const auto& realm = group[failRealmIdx]; + for (ui32 failDomainIdx = 0; failDomainIdx != 
realm.size(); ++failDomainIdx) { + const auto& domain = realm[failDomainIdx]; + for (ui32 vdiskIdx = 0; vdiskIdx != domain.size(); ++vdiskIdx) { + callback(TVDiskIdShort(failRealmIdx, failDomainIdx, vdiskIdx), domain[vdiskIdx]); + } + } + } + } + + struct TPDiskRecord { + const TPDiskId PDiskId; + const TNodeLocation Location; + const bool Usable; + ui32 NumSlots; + const ui32 MaxSlots; + TStackVec<ui32, 16> Groups; + i64 SpaceAvailable; + const bool Operational; + const bool Decommitted; + }; + public: TGroupMapper(TGroupGeometryInfo geom, bool randomize = false); ~TGroupMapper(); // Register PDisk inside mapper to use it in subsequent map operations - bool RegisterPDisk(TPDiskId pdiskId, TNodeLocation location, bool usable, ui32 numSlots, ui32 maxSlots, - const ui32 groupIds[], size_t numGroups, i64 spaceAvailable, bool operational); + bool RegisterPDisk(const TPDiskRecord& pdisk); // Remove PDisk from the table. void UnregisterPDisk(TPDiskId pdiskId); @@ -52,9 +76,8 @@ namespace NKikimr { // failRealmBeginDxLevel, failRealmEndDxLevel, and then by finding possible options to meet requirements // (1) and (2). That is, prefix gives us unique domains in which we can find realms to operate, while // prefix+infix part gives us distinct fail realms we can use while generating groups. - bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const TPDiskId replacedDiskIds[], - size_t numReplacedDisks, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, - TString& error); + bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks, + TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error); }; } // NBsController diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp index 0dcb5be086..1be3878296 100644 --- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp +++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp @@ -2,6 +2,7 @@ #include "group_geometry_info.h" #include "group_mapper.h" +#include "group_layout_checker.h" #include "ut_helpers.h" using namespace NKikimr; @@ -136,17 +137,44 @@ public: } } + ui32 GetDataCenter(TPDiskId pdiskId) const { + const auto it = PDisks.find(pdiskId); + UNIT_ASSERT(it != PDisks.end()); + return it->second.DataCenterId; + } + + TNodeLocation GetLocation(TPDiskId pdiskId) const { + const auto it = PDisks.find(pdiskId); + UNIT_ASSERT(it != PDisks.end()); + return it->second.GetLocation(); + } + + std::vector<std::tuple<ui32, ui32, ui32, ui32>> ExportLayout() const { + std::vector<std::tuple<ui32, ui32, ui32, ui32>> res; + for (const auto& [pdiskId, pdisk] : PDisks) { + res.emplace_back(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId, pdisk.BodyId); + } + return res; + } + + void ImportLayout(const std::vector<std::tuple<ui32, ui32, ui32, ui32>>& v) { + size_t index = 0; + for (auto& [pdiskId, pdisk] : PDisks) { + UNIT_ASSERT(index != v.size()); + std::tie(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId, pdisk.BodyId) = v[index]; + ++index; + } + UNIT_ASSERT(index == v.size()); + } + ui32 AllocateGroup(TGroupMapper& mapper, TGroupMapper::TGroupDefinition& group, bool allowFailure = false) { ui32 groupId = NextGroupId++; TString error; - bool success = mapper.AllocateGroup(groupId, group, nullptr, 0, {}, 0, false, error); + bool success = mapper.AllocateGroup(groupId, group, {}, {}, 0, false, error); if (!success && allowFailure) { return 0; } - if (!success) { - Ctest << "error# " << error << Endl; - } - 
UNIT_ASSERT(success); + UNIT_ASSERT_C(success, error); TGroupRecord& record = Groups[groupId]; record.Group = group; for (const auto& realm : group) { @@ -161,7 +189,7 @@ public: } TGroupMapper::TGroupDefinition ReallocateGroup(TGroupMapper& mapper, ui32 groupId, const TSet<TPDiskId>& unusableDisks, - bool makeThemForbidden = false, bool requireOperational = false, bool requireError = false) { + bool makeThemForbidden = false, bool requireOperational = false, bool allowError = false) { TGroupRecord& group = Groups.at(groupId); TGroupMapper::TForbiddenPDisks forbid(unusableDisks.begin(), unusableDisks.end()); @@ -170,13 +198,14 @@ public: } // remove unusable disks from the set - std::vector<TPDiskId> replaced; - for (auto& realm : group.Group) { - for (auto& domain : realm) { - for (auto& pdisk : domain) { + THashMap<TVDiskIdShort, TPDiskId> replacedDisks; + for (ui32 i = 0; i < group.Group.size(); ++i) { + for (ui32 j = 0; j < group.Group[i].size(); ++j) { + for (ui32 k = 0; k < group.Group[i][j].size(); ++k) { + auto& pdisk = group.Group[i][j][k]; --PDisks.at(pdisk).NumSlots; if (unusableDisks.count(pdisk)) { - replaced.push_back(std::exchange(pdisk, {})); + replacedDisks.emplace(TVDiskIdShort(i, j, k), std::exchange(pdisk, {})); } } } @@ -185,15 +214,24 @@ public: Ctest << "groupId# " << groupId << " reallocating group# " << FormatGroup(group.Group) << Endl; TString error; - bool success = mapper.AllocateGroup(groupId, group.Group, replaced.data(), replaced.size(), std::move(forbid), - 0, requireOperational, error); + bool success = mapper.AllocateGroup(groupId, group.Group, replacedDisks, std::move(forbid), 0, + requireOperational, error); if (!success) { - if (requireError) { + Ctest << "error# " << error << Endl; + if (allowError) { + // revert group to its original state + for (const auto& [vdiskId, pdiskId] : replacedDisks) { + group.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk] = pdiskId; + } + for (auto& realm : group.Group) { + for (auto& domain : realm) { + for (auto& pdisk : domain) { + ++PDisks.at(pdisk).NumSlots; + } + } + } return {}; } - Ctest << "error# " << error << Endl; - } else { - UNIT_ASSERT(!requireError); } UNIT_ASSERT(success); @@ -210,6 +248,23 @@ public: return group.Group; } + void SetGroup(ui32 groupId, const TGroupMapper::TGroupDefinition& group) { + auto& g = Groups[groupId]; + for (const TPDiskId& pdiskId : g.PDisks) { + --PDisks.at(pdiskId).NumSlots; + } + g.Group = group; + g.PDisks.clear(); + for (const auto& realm : g.Group) { + for (const auto& domain : realm) { + for (const auto& pdisk : domain) { + g.PDisks.push_back(pdisk); + ++PDisks.at(pdisk).NumSlots; + } + } + } + } + TString FormatGroup(const TGroupMapper::TGroupDefinition& group) { TStringStream str; str << "["; @@ -234,23 +289,27 @@ public: return str.Str(); } - void CheckGroupErasure(const TGroupMapper::TGroupDefinition& group) { + void CheckGroupErasure(const TGroupMapper::TGroupDefinition& group, ui32 decommittedDataCenter = 0) { TSet<ui32> dataCenters; for (const auto& realm : group) { TMaybe<ui32> dataCenter; - TSet<std::tuple<ui32, ui32>> domains; + TSet<std::tuple<ui32, ui32, ui32>> domains; for (const auto& domain : realm) { - TMaybe<std::tuple<ui32, ui32>> currentDom; + TMaybe<std::tuple<ui32, ui32, ui32>> currentDom; for (const auto& pdisk : domain) { const TPDiskRecord& record = PDisks.at(pdisk); - if (dataCenter) { - UNIT_ASSERT_VALUES_EQUAL(*dataCenter, record.DataCenterId); - } else { - dataCenter = record.DataCenterId; - const bool inserted = 
dataCenters.insert(*dataCenter).second; - UNIT_ASSERT(inserted); + if (record.DataCenterId != decommittedDataCenter) { // ignore entries from decommitted data center + if (dataCenter) { + if (*dataCenter != decommittedDataCenter && record.DataCenterId != decommittedDataCenter) { + UNIT_ASSERT_VALUES_EQUAL(*dataCenter, record.DataCenterId); + } + } else { + dataCenter = record.DataCenterId; + const bool inserted = dataCenters.insert(*dataCenter).second; + UNIT_ASSERT(inserted); + } } - std::tuple<ui32, ui32> dom = {record.RoomId, record.RackId}; + auto dom = std::make_tuple(record.DataCenterId, record.RoomId, record.RackId); if (currentDom) { // check that all disks from the same domain reside in the same domain :) UNIT_ASSERT_EQUAL(dom, *currentDom); @@ -297,7 +356,7 @@ public: } void PopulateGroupMapper(TGroupMapper& mapper, ui32 maxSlots = 16, TSet<TPDiskId> unusableDisks = {}, - TSet<TPDiskId> nonoperationalDisks = {}) { + TSet<TPDiskId> nonoperationalDisks = {}, std::optional<ui32> decommittedDataCenter = std::nullopt) { std::map<TPDiskId, std::vector<ui32>> groupDisks; for (const auto& [groupId, group] : Groups) { for (TPDiskId pdiskId : group.PDisks) { @@ -306,8 +365,88 @@ public: } for (const auto& pair : PDisks) { auto& g = groupDisks[pair.first]; - mapper.RegisterPDisk(pair.first, pair.second.GetLocation(), !unusableDisks.count(pair.first), - pair.second.NumSlots, maxSlots, g.data(), g.size(), 0, nonoperationalDisks.count(pair.first)); + mapper.RegisterPDisk({ + .PDiskId = pair.first, + .Location = pair.second.GetLocation(), + .Usable = !unusableDisks.count(pair.first), + .NumSlots = pair.second.NumSlots, + .MaxSlots = maxSlots, + .Groups{g.begin(), g.end()}, + .SpaceAvailable = 0, + .Operational = !nonoperationalDisks.contains(pair.first), + .Decommitted = decommittedDataCenter == pair.second.DataCenterId, + }); + } + } + + void DumpGroup(const TGroupMapper::TGroupDefinition& group) { + std::set<std::tuple<ui32, ui32, ui32>> locations; + for (const auto& [pdiskId, pdisk] : PDisks) { + locations.emplace(pdisk.DataCenterId, pdisk.RoomId, pdisk.RackId); + } + + std::unordered_map<ui32, ui32> dataCenterToColumn; + std::unordered_map<ui32, std::unordered_map<std::tuple<ui32, ui32>, ui32>> rackToColumn; + for (const auto& x : locations) { + const ui32 dataCenterId = std::get<0>(x); + const ui32 roomId = std::get<1>(x); + const ui32 rackId = std::get<2>(x); + dataCenterToColumn.try_emplace(dataCenterId, dataCenterToColumn.size()); + auto& rtc = rackToColumn[dataCenterId]; + rtc.try_emplace(std::make_tuple(roomId, rackId), rtc.size()); + } + + std::vector<std::vector<TString>> cells(dataCenterToColumn.size()); + for (const auto& [dataCenterId, racks] : rackToColumn) { + cells[dataCenterToColumn[dataCenterId]].resize(racks.size()); + } + + ui32 maxCellWidth = 0; + for (ui32 failRealmIdx = 0; failRealmIdx < group.size(); ++failRealmIdx) { + for (ui32 failDomainIdx = 0; failDomainIdx < group[failRealmIdx].size(); ++failDomainIdx) { + for (const TPDiskId& pdiskId : group[failRealmIdx][failDomainIdx]) { + if (pdiskId != TPDiskId()) { + const auto it = PDisks.find(pdiskId); + UNIT_ASSERT(it != PDisks.end()); + const TPDiskRecord& pdisk = it->second; + auto& cell = cells[dataCenterToColumn[pdisk.DataCenterId]] + [rackToColumn[pdisk.DataCenterId][{pdisk.RoomId, pdisk.RackId}]]; + if (cell) { + cell += ", "; + } + cell += TStringBuilder() << failRealmIdx << "/" << failDomainIdx; + maxCellWidth = Max<ui32>(maxCellWidth, cell.size()); + } + } + } + } + + if (!maxCellWidth) { + ++maxCellWidth; + } + 
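
TImpl::AllocateGroup earlier in this diff collects the distinct GetPickerScore() values of usable PDisks and bisects that sorted table for the smallest score at which FillInGroup succeeds, relying on the fact that allowing a larger score can only make allocation easier. A minimal sketch of that monotone-predicate search, with FindSmallestWorkingScore and tryFill as hypothetical stand-ins for the real FillInGroup/Revert cycle:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <optional>
    #include <vector>

    std::optional<uint32_t> FindSmallestWorkingScore(std::vector<uint32_t> scores,
                                                     const std::function<bool(uint32_t)>& tryFill) {
        std::sort(scores.begin(), scores.end());
        scores.erase(std::unique(scores.begin(), scores.end()), scores.end());

        std::optional<uint32_t> result;
        size_t begin = 0, end = scores.size();
        while (begin < end) {
            const size_t mid = begin + (end - begin) / 2;
            if (tryFill(scores[mid])) {  // success: remember it and try a smaller threshold
                result = scores[mid];
                end = mid;
            } else {                     // failure: only larger thresholds can possibly work
                begin = mid + 1;
            }
        }
        return result;
    }
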
+ for (ui32 row = 0;; ++row) { + bool done = true; + TStringBuilder s; + for (ui32 column = 0; column < cells.size(); ++column) { + if (row >= cells[column].size()) { + s << TString(maxCellWidth, ' '); + } else if (const auto& cell = cells[column][row]) { + s << cell << TString(maxCellWidth - cell.size(), ' '); + done = false; + } else { + s << TString(maxCellWidth, 'X'); + done = false; + } + if (column != cells.size() - 1) { + s << ' '; + } + } + if (done) { + break; + } else { + Ctest << s << Endl; + } } } }; @@ -468,6 +607,50 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { } } + Y_UNIT_TEST(NonUniformClusterMirror3dcWithUnusableDomain) { + std::vector<std::vector<ui32>> disposition{ + { // datacenter1 + 4, 4, 4, 4, 4, 2, 2, 4, 2, 5, 5, 5, + }, + { // datacenter2 + 2, 2, 2, 2, 2, 2, 1, 1, 2, 4, 8, 8, 9, + }, + { // datacenter3 + 4, 4, 1, 3, 4, 4, 2, 6, 9, 8, + }, + { // datacenter4 + 1, + }, + }; + TTestContext context(disposition, 4); + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 9); + for (ui32 i = 0; i < context.GetTotalDisks() - 4; ++i) { + Ctest << i << "/" << (context.GetTotalDisks() - 4) << Endl; + TGroupMapper::TGroupDefinition group; + context.AllocateGroup(mapper, group); + context.CheckGroupErasure(group); + + TVector<ui32> slots = context.GetSlots(); + UNIT_ASSERT(slots); + ui32 min = Max<ui32>(); + ui32 max = 0; + for (const ui32 x : slots) { + if (x) { + min = Min(min, x); + max = Max(max, x); + } + } + UNIT_ASSERT_C(max - min <= 1, Sprintf("min# %" PRIu32 " max# %" PRIu32, min, max)); + } + TVector<ui32> slots = context.GetSlots(); + for (ui32 numSlots : slots) { + if (numSlots) { + UNIT_ASSERT_VALUES_EQUAL(9, numSlots); + } + } + } + Y_UNIT_TEST(MakeDisksUnusable) { TTestContext context(1, 1, 10, 1, 1); TVector<ui32> groupIds; @@ -538,12 +721,15 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { nonoperationalDisks.insert(pdiskId); }); context.PopulateGroupMapper(mapper, 10, unusableDisks, nonoperationalDisks); + ui32 hasEmpty = false; for (ui32 groupId : groupIds) { - auto group = context.ReallocateGroup(mapper, groupId, unusableDisks, true, true); - group = context.ReallocateGroup(mapper, groupId, unusableDisks); + auto tmp = context.ReallocateGroup(mapper, groupId, unusableDisks, false, true, true); + hasEmpty |= tmp.empty(); + auto group = context.ReallocateGroup(mapper, groupId, unusableDisks); Ctest << "groupId# " << groupId << " new content# " << context.FormatGroup(group) << Endl; context.CheckGroupErasure(group); } + UNIT_ASSERT(hasEmpty); } } @@ -630,4 +816,179 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { } } + Y_UNIT_TEST(ReassignGroupTest3dc) { + for (ui32 i = 0; i < 10000; ++i) { + Ctest << "iteration# " << i << Endl; + + const ui32 numDataCenters = 5; + const ui32 numRacks = 5; + TTestContext context(numDataCenters, 1, numRacks, 1, 1); + + TGroupMapper::TGroupDefinition group; + ui32 groupId; + { + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1); + groupId = context.AllocateGroup(mapper, group); + Ctest << "group after allocation:" << Endl; + context.DumpGroup(group); + } + + ui32 decommittedDataCenter = RandomNumber<ui32>(numDataCenters + 1); + Ctest << "decommittedDataCenter# " << decommittedDataCenter << Endl; + { + // randomly move some of disks from decommitted datacenter + TSet<TPDiskId> unusableDisks; + for (auto& realm : group) { + for (auto& domain : realm) { + for (auto& pdisk : domain) { + 
if (context.GetDataCenter(pdisk) == decommittedDataCenter && RandomNumber(2u)) { + unusableDisks.insert(pdisk); + } + } + } + } + + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1, {}, {}, decommittedDataCenter); + group = context.ReallocateGroup(mapper, groupId, unusableDisks); + Ctest << "group after data center decommission:" << Endl; + context.DumpGroup(group); + } + + TSet<TPDiskId> unusableDisks; + ui32 unusableDataCenter = RandomNumber<ui32>(numDataCenters + 1); + Ctest << "unusableDataCenter# " << unusableDataCenter << Endl; + if (unusableDataCenter) { + context.IteratePDisks([&](const auto& pdiskId, const auto& record) { + if (record.DataCenterId == unusableDataCenter) { + unusableDisks.insert(pdiskId); + } + }); + } + + for (ui32 i = 0; i < 2; ++i) { + if (const ui32 unusableDataCenter = RandomNumber<ui32>(numDataCenters + 1)) { + const ui32 unusableRack = 1 + RandomNumber<ui32>(numRacks); + context.IteratePDisks([&](const auto& pdiskId, const auto& record) { + if (record.DataCenterId == unusableDataCenter && record.RackId == unusableRack) { + unusableDisks.insert(pdiskId); + } + }); + } + } + + { + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1); + auto group = context.ReallocateGroup(mapper, groupId, unusableDisks); + Ctest << "group after reallocation:" << Endl; + context.DumpGroup(group); + context.CheckGroupErasure(group, decommittedDataCenter); + } + + Ctest << Endl; + } + } + + Y_UNIT_TEST(SanitizeGroupTest3dc) { + const ui32 numDataCenters = 3; + const ui32 numRacks = 5; + TTestContext context(numDataCenters, 1, numRacks, 1, 1); + TGroupMapper::TGroupDefinition group; + ui32 groupId; + { + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1); + groupId = context.AllocateGroup(mapper, group); + Ctest << "group after allocation:" << Endl; + context.DumpGroup(group); + } + + auto checkLayout = [&](const auto& group) { + TGroupGeometryInfo geom = TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc); + THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout; + for (ui32 i = 0; i < group.size(); ++i) { + for (ui32 j = 0; j < group[i].size(); ++j) { + for (ui32 k = 0; k < group[i][j].size(); ++k) { + layout.emplace(TVDiskIdShort(i, j, k), std::make_pair(context.GetLocation(group[i][j][k]), + group[i][j][k])); + } + } + } + return CheckGroupLayout(geom, layout); + }; + + UNIT_ASSERT(checkLayout(group)); + + for (ui32 n = 0; n < 1000; ++n) { + Ctest << Endl << "iteration# " << n << Endl; + + auto layout = context.ExportLayout(); + std::random_shuffle(layout.begin(), layout.end()); + context.ImportLayout(layout); + + Ctest << "group after layout shuffling:" << Endl; + context.DumpGroup(group); + + struct TQueueItem { + TGroupMapper::TGroupDefinition Group; + TString Path; + TSet<TGroupMapper::TGroupDefinition> Seen; + TSet<TVDiskIdShort> VDiskItems; + TSet<TPDiskId> PDiskItems; + }; + std::deque<TQueueItem> queue; + for (queue.push_back({.Group = group}); !queue.empty(); ) { + TQueueItem item = std::move(queue.front()); + queue.pop_front(); + const auto [it, inserted] = item.Seen.insert(item.Group); + UNIT_ASSERT(inserted); + UNIT_ASSERT(item.Seen.size() <= 9); + Ctest << "processing path# " << item.Path << Endl; + + auto candidates = checkLayout(item.Group); + if (!candidates) { + for 
(const TVDiskIdShort& vdiskId : candidates.Candidates) { + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.SetGroup(groupId, item.Group); + context.PopulateGroupMapper(mapper, 2); + const TPDiskId& pdiskId = item.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk]; + auto temp = context.ReallocateGroup(mapper, groupId, {pdiskId}, false, false, false); + TString path = TStringBuilder() << item.Path << "/" << (int)vdiskId.FailRealm << ":" + << (int)vdiskId.FailDomain << ":" << (int)vdiskId.VDisk << "@" << pdiskId; + Ctest << "path# " << path << Endl; + context.DumpGroup(temp); + + auto vdiskItems = item.VDiskItems; +// const auto [it1, inserted1] = vdiskItems.insert(vdiskId); +// UNIT_ASSERT_C(inserted1, "Duplicate group cell# " << vdiskId); + + auto pdiskItems = item.PDiskItems; +// const auto [it2, inserted2] = pdiskItems.insert(pdiskId); +// UNIT_ASSERT_C(inserted2, "Duplicate origin PDisk# " << pdiskId); + + queue.push_front({.Group = std::move(temp), .Path = std::move(path), .Seen = item.Seen, + .VDiskItems = std::move(vdiskItems), .PDiskItems = std::move(pdiskItems)}); + } + } + + Ctest << Endl; + } + } + } + + Y_UNIT_TEST(CheckNotToBreakFailModel) { + TTestContext context(4, 1, 3, 1, 1); + TGroupMapper::TGroupDefinition group; + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc)); + context.PopulateGroupMapper(mapper, 1); + ui32 groupId = context.AllocateGroup(mapper, group); + Ctest << "group after allocation:" << Endl; + context.DumpGroup(group); + group = context.ReallocateGroup(mapper, groupId, {group[0][0][0]}, false, false, true); + Ctest << "group after reallocation:" << Endl; + context.DumpGroup(group); + UNIT_ASSERT(group.empty()); + } } diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h index abe31a576b..8910303ce8 100644 --- a/ydb/core/mind/bscontroller/impl.h +++ b/ydb/core/mind/bscontroller/impl.h @@ -78,6 +78,7 @@ public: class TConfigState; class TGroupSelector; class TGroupFitter; + class TSelfHealActor; using TVSlotReadyTimestampQ = std::list<std::pair<TInstant, TVSlotInfo*>>; @@ -294,6 +295,7 @@ public: NKikimrBlobStorage::EDriveStatus Status; TInstant StatusTimestamp; + NKikimrBlobStorage::EDecommitStatus DecommitStatus; TString ExpectedSerial; TString LastSeenSerial; TString LastSeenPath; @@ -313,7 +315,8 @@ public: Table::Timestamp, Table::ExpectedSerial, Table::LastSeenSerial, - Table::LastSeenPath + Table::LastSeenPath, + Table::DecommitStatus > adapter( &TPDiskInfo::Path, &TPDiskInfo::Kind, @@ -326,7 +329,8 @@ public: &TPDiskInfo::StatusTimestamp, &TPDiskInfo::ExpectedSerial, &TPDiskInfo::LastSeenSerial, - &TPDiskInfo::LastSeenPath + &TPDiskInfo::LastSeenPath, + &TPDiskInfo::DecommitStatus ); callback(&adapter); } @@ -343,6 +347,7 @@ public: ui32 defaultMaxSlots, NKikimrBlobStorage::EDriveStatus status, TInstant statusTimestamp, + NKikimrBlobStorage::EDecommitStatus decommitStatus, const TString& expectedSerial, const TString& lastSeenSerial, const TString& lastSeenPath, @@ -358,6 +363,7 @@ public: , BoxId(boxId) , Status(status) , StatusTimestamp(statusTimestamp) + , DecommitStatus(decommitStatus) , ExpectedSerial(expectedSerial) , LastSeenSerial(lastSeenSerial) , LastSeenPath(lastSeenPath) @@ -401,23 +407,40 @@ public: } bool ShouldBeSettledBySelfHeal() const { - switch (Status) { - case NKikimrBlobStorage::EDriveStatus::FAULTY: - case NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED: - return true; - default: - return 
false; - } + return Status == NKikimrBlobStorage::EDriveStatus::FAULTY + || Status == NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED + || DecommitStatus == NKikimrBlobStorage::EDecommitStatus::DECOMMIT_IMMINENT; } bool BadInTermsOfSelfHeal() const { - return ShouldBeSettledBySelfHeal() || Status == NKikimrBlobStorage::EDriveStatus::INACTIVE; + return Status == NKikimrBlobStorage::EDriveStatus::FAULTY + || Status == NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED + || Status == NKikimrBlobStorage::EDriveStatus::INACTIVE; } std::tuple<bool, bool> GetSelfHealStatusTuple() const { return {ShouldBeSettledBySelfHeal(), BadInTermsOfSelfHeal()}; } + bool AcceptsNewSlots() const { + return Status == NKikimrBlobStorage::EDriveStatus::ACTIVE; + } + + bool Decommitted() const { + switch (DecommitStatus) { + case NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE: + return false; + case NKikimrBlobStorage::EDecommitStatus::DECOMMIT_PENDING: + case NKikimrBlobStorage::EDecommitStatus::DECOMMIT_IMMINENT: + return true; + case NKikimrBlobStorage::EDecommitStatus::DECOMMIT_UNSET: + case NKikimrBlobStorage::EDecommitStatus::EDecommitStatus_INT_MIN_SENTINEL_DO_NOT_USE_: + case NKikimrBlobStorage::EDecommitStatus::EDecommitStatus_INT_MAX_SENTINEL_DO_NOT_USE_: + break; + } + Y_FAIL("unexpected EDecommitStatus"); + } + bool HasGoodExpectedStatus() const { switch (Status) { case NKikimrBlobStorage::EDriveStatus::UNKNOWN: @@ -426,7 +449,6 @@ public: case NKikimrBlobStorage::EDriveStatus::ACTIVE: case NKikimrBlobStorage::EDriveStatus::INACTIVE: - case NKikimrBlobStorage::EDriveStatus::SPARE: case NKikimrBlobStorage::EDriveStatus::FAULTY: case NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED: return true; @@ -488,7 +510,7 @@ public: struct TGroupStatus { // status derived from the actual state of VDisks (IsReady() to be exact) NKikimrBlobStorage::TGroupStatus::E OperatingStatus = NKikimrBlobStorage::TGroupStatus::UNKNOWN; - // status derived by adding underlying PDisk status (FAULTY&BROKEN are assumed to be not working ones) + // status derived by adding underlying PDisk status (some of them are assumed to be not working ones) NKikimrBlobStorage::TGroupStatus::E ExpectedStatus = NKikimrBlobStorage::TGroupStatus::UNKNOWN; } Status; @@ -1322,6 +1344,7 @@ private: THashMap<TPDiskId, ui32> StaticPDiskSlotUsage; std::unique_ptr<TStoragePoolStat> StoragePoolStat; bool StopGivingGroups = false; + bool GroupLayoutSanitizer = false; NKikimrBlobStorage::TSerialManagementStage::E SerialManagementStage = NKikimrBlobStorage::TSerialManagementStage::DISCOVER_SERIAL; @@ -1347,6 +1370,7 @@ private: EvVSlotReadyUpdate, EvVSlotNotReadyHistogramUpdate, EvProcessIncomingEvent, + EvUpdateHostRecords, }; struct TEvUpdateSystemViews : public TEventLocal<TEvUpdateSystemViews, EvUpdateSystemViews> {}; @@ -1360,6 +1384,14 @@ private: struct TEvScrub : TEventLocal<TEvScrub, EvScrub> {}; struct TEvVSlotReadyUpdate : TEventLocal<TEvVSlotReadyUpdate, EvVSlotReadyUpdate> {}; struct TEvVSlotNotReadyHistogramUpdate : TEventLocal<TEvVSlotNotReadyHistogramUpdate, EvVSlotNotReadyHistogramUpdate> {}; + + struct TEvUpdateHostRecords : TEventLocal<TEvUpdateHostRecords, EvUpdateHostRecords> { + THostRecordMap HostRecords; + + TEvUpdateHostRecords(THostRecordMap hostRecords) + : HostRecords(std::move(hostRecords)) + {} + }; }; static constexpr TDuration UpdateSystemViewsPeriod = TDuration::Seconds(5); @@ -1546,6 +1578,16 @@ private: void Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev); void HandleHostRecordsTimeToLiveExceeded(); +public: + // Self-heal 
actor's main purpose is to monitor FAULTY pdisks and to slightly move groups out of them; every move + // should not render group unusable, also it should not exceed its fail model. It also takes into account replication + // broker features such as only one vslot over PDisk is being replicated at a moment. + // + // It interacts with BS_CONTROLLER and group observer (which provides information about group state on a per-vdisk + // basis). BS_CONTROLLER reports faulty PDisks and all involved groups in a push notification manner. + IActor *CreateSelfHealActor(); + +private: //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Online state void Handle(TEvBlobStorage::TEvControllerRegisterNode::TPtr &ev); diff --git a/ydb/core/mind/bscontroller/load_everything.cpp b/ydb/core/mind/bscontroller/load_everything.cpp index 8fa32c53f7..8f33e2a1b6 100644 --- a/ydb/core/mind/bscontroller/load_everything.cpp +++ b/ydb/core/mind/bscontroller/load_everything.cpp @@ -83,6 +83,7 @@ public: Self->GroupReservePart = state.GetValue<T::GroupReservePart>(); Self->MaxScrubbedDisksAtOnce = state.GetValue<T::MaxScrubbedDisksAtOnce>(); Self->PDiskSpaceColorBorder = state.GetValue<T::PDiskSpaceColorBorder>(); + Self->GroupLayoutSanitizer = state.GetValue<T::GroupLayoutSanitizer>(); Self->SysViewChangedSettings = true; } } @@ -285,8 +286,8 @@ public: disks.GetValue<T::Guid>(), getOpt(T::SharedWithOs()), getOpt(T::ReadCentric()), disks.GetValueOrDefault<T::NextVSlotId>(), disks.GetValue<T::PDiskConfig>(), boxId, Self->DefaultMaxSlots, disks.GetValue<T::Status>(), disks.GetValue<T::Timestamp>(), - disks.GetValue<T::ExpectedSerial>(), disks.GetValue<T::LastSeenSerial>(), - disks.GetValue<T::LastSeenPath>(), staticSlotUsage); + disks.GetValue<T::DecommitStatus>(), disks.GetValue<T::ExpectedSerial>(), + disks.GetValue<T::LastSeenSerial>(), disks.GetValue<T::LastSeenPath>(), staticSlotUsage); if (!disks.Next()) return false; diff --git a/ydb/core/mind/bscontroller/scheme.h b/ydb/core/mind/bscontroller/scheme.h index 3b7a425c95..cd47ac234d 100644 --- a/ydb/core/mind/bscontroller/scheme.h +++ b/ydb/core/mind/bscontroller/scheme.h @@ -38,10 +38,11 @@ struct Schema : NIceDb::Schema { struct ExpectedSerial : Column<14, NScheme::NTypeIds::String> {}; struct LastSeenSerial : Column<15, NScheme::NTypeIds::String> {}; struct LastSeenPath : Column<16, NScheme::NTypeIds::String> {}; + struct DecommitStatus : Column<17, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::EDecommitStatus; static constexpr Type Default = Type::DECOMMIT_NONE; }; using TKey = TableKey<NodeID, PDiskID>; // order is important using TColumns = TableColumns<NodeID, PDiskID, Path, Category, Guid, SharedWithOs, ReadCentric, NextVSlotId, - Status, Timestamp, PDiskConfig, ExpectedSerial, LastSeenSerial, LastSeenPath>; + Status, Timestamp, PDiskConfig, ExpectedSerial, LastSeenSerial, LastSeenPath, DecommitStatus>; }; struct Group : Table<4> { @@ -84,11 +85,13 @@ struct Schema : NIceDb::Schema { struct GroupReservePart : Column<15, NScheme::NTypeIds::Uint32> { static constexpr Type Default = 0; }; // parts per million struct MaxScrubbedDisksAtOnce : Column<16, NScheme::NTypeIds::Uint32> { static constexpr Type Default = Max<ui32>(); }; // no limit struct PDiskSpaceColorBorder : Column<17, NScheme::NTypeIds::Uint32> { using Type = NKikimrBlobStorage::TPDiskSpaceColor::E; static constexpr Type Default = NKikimrBlobStorage::TPDiskSpaceColor::GREEN; }; + struct GroupLayoutSanitizer : 
Column<18, NScheme::NTypeIds::Bool> { static constexpr Type Default = false; }; using TKey = TableKey<FixedKey>; using TColumns = TableColumns<FixedKey, NextGroupID, SchemaVersion, NextOperationLogIndex, DefaultMaxSlots, InstanceId, SelfHealEnable, DonorModeEnable, ScrubPeriodicity, SerialManagementStage, NextStoragePoolId, - PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder>; + PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder, + GroupLayoutSanitizer>; }; struct VSlot : Table<5> { diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp index b732d98e93..24c6045625 100644 --- a/ydb/core/mind/bscontroller/self_heal.cpp +++ b/ydb/core/mind/bscontroller/self_heal.cpp @@ -2,6 +2,8 @@ #include "impl.h" #include "vdisk_status_tracker.h" #include "config.h" +#include "group_geometry_info.h" +#include "group_layout_checker.h" namespace NKikimr::NBsController { @@ -111,7 +113,9 @@ namespace NKikimr::NBsController { void Handle(TEvBlobStorage::TEvVStatusResult::TPtr& ev) { const auto& record = ev->Get()->Record; - STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH03, "Reassigner TEvVStatusResult", (GroupId, GroupId), (Response, record)); + STLOG(PRI_DEBUG, BS_SELFHEAL, BSSH03, "Reassigner TEvVStatusResult", (GroupId, GroupId), + (Status, record.GetStatus()), (JoinedGroup, record.GetJoinedGroup()), + (Replicated, record.GetReplicated())); bool diskIsOk = false; if (record.GetStatus() == NKikimrProto::RACE) { @@ -169,6 +173,13 @@ namespace NKikimr::NBsController { if (!record.GetResponse().GetSuccess()) { STLOG(PRI_WARN, BS_SELFHEAL, BSSH07, "Reassigner ReassignGroupDisk request failed", (GroupId, GroupId), (VDiskToReplace, VDiskToReplace), (Response, record)); + } else { + TString items = "none"; + for (const auto& item : record.GetResponse().GetStatus(0).GetReassignedItem()) { + items = TStringBuilder() << VDiskIDFromVDiskID(item.GetVDiskId()) << ": " + << TVSlotId(item.GetFrom()) << " -> " << TVSlotId(item.GetTo()); + } + STLOG(PRI_INFO, BS_SELFHEAL, BSSH09, "Reassigner succeeded", (GroupId, GroupId), (Items, items)); } Finish(record.GetResponse().GetSuccess()); } @@ -204,23 +215,36 @@ namespace NKikimr::NBsController { }) }; - class TSelfHealActor : public TActorBootstrapped<TSelfHealActor> { + class TBlobStorageController::TSelfHealActor : public TActorBootstrapped<TSelfHealActor> { static constexpr TDuration MinRetryTimeout = TDuration::Seconds(1); static constexpr TDuration MaxRetryTimeout = TDuration::Seconds(60); - struct TGroupRecord { + struct TWithFaultyDisks {}; + struct TWithInvalidLayout {}; + + struct TGroupRecord + : TIntrusiveListItem<TGroupRecord, TWithFaultyDisks> + , TIntrusiveListItem<TGroupRecord, TWithInvalidLayout> + { + const TGroupId GroupId; TEvControllerUpdateSelfHealInfo::TGroupContent Content; TActorId ReassignerActorId; // reassigner in flight TDuration RetryTimeout = MinRetryTimeout; TInstant NextRetryTimestamp = TInstant::Zero(); THashMap<TVDiskID, TVDiskStatusTracker> VDiskStatus; + bool LayoutValid = false; + + TGroupRecord(TGroupId groupId) : GroupId(groupId) {} }; const ui64 TabletId; TActorId ControllerId; THashMap<TGroupId, TGroupRecord> Groups; - TSet<TGroupId> GroupsWithFaultyDisks; + TIntrusiveList<TGroupRecord, TWithFaultyDisks> GroupsWithFaultyDisks; + TIntrusiveList<TGroupRecord, TWithInvalidLayout> GroupsWithInvalidLayout; std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups; + bool GroupLayoutSanitizer = false; 
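
Each TGroupRecord above is linked into two independent intrusive lists (groups with faulty disks, groups with invalid layout) by inheriting TIntrusiveListItem twice with distinct tag types. A minimal sketch of that two-tag pattern, assuming util/generic/intrlist.h as used elsewhere in the repository; TTagA, TTagB, TRecord and Example are hypothetical names:

    #include <util/generic/intrlist.h>

    struct TTagA {};
    struct TTagB {};

    // One record, two independent list memberships distinguished by the tag type.
    struct TRecord
        : TIntrusiveListItem<TRecord, TTagA>
        , TIntrusiveListItem<TRecord, TTagB>
    {
        unsigned Id = 0;
    };

    void Example() {
        TIntrusiveList<TRecord, TTagA> listA;
        TIntrusiveList<TRecord, TTagB> listB;

        TRecord rec;
        rec.Id = 1;

        listA.PushBack(&rec);   // link into the first list
        listB.PushBack(&rec);   // ...and, independently, into the second one

        listA.Remove(&rec);     // unlinking from one list does not touch the other
        // listB still holds rec; destroying rec unlinks it from every list it is on.
    }

That self-unlinking behaviour appears to be why the deletion path in the self-heal actor only erases the record from Groups instead of removing it from each list explicitly.
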
+ THostRecordMap HostRecords; public: TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups) @@ -236,11 +260,17 @@ namespace NKikimr::NBsController { void Handle(TEvControllerUpdateSelfHealInfo::TPtr& ev) { const TInstant now = TActivationContext::Now(); + if (const auto& setting = ev->Get()->GroupLayoutSanitizer) { + GroupLayoutSanitizer = *setting; + } for (const auto& [groupId, data] : ev->Get()->GroupsToUpdate) { if (data) { - auto& g = Groups[groupId]; + const auto [it, inserted] = Groups.try_emplace(groupId, groupId); + auto& g = it->second; bool hasFaultyDisks = false; g.Content = std::move(*data); + g.LayoutValid = false; + GroupsWithInvalidLayout.PushBack(&g); for (const auto& [vdiskId, vdisk] : g.Content.VDisks) { g.VDiskStatus[vdiskId].Update(vdisk.VDiskStatus, now); hasFaultyDisks |= vdisk.Faulty; @@ -253,9 +283,9 @@ namespace NKikimr::NBsController { } } if (hasFaultyDisks) { - GroupsWithFaultyDisks.insert(groupId); + GroupsWithFaultyDisks.PushBack(&g); } else { - GroupsWithFaultyDisks.erase(groupId); + GroupsWithFaultyDisks.Remove(&g); } } else { // find the group to delete @@ -272,7 +302,6 @@ namespace NKikimr::NBsController { } // remove the group - GroupsWithFaultyDisks.erase(groupId); Groups.erase(it); } } @@ -293,29 +322,40 @@ namespace NKikimr::NBsController { ui64 counter = 0; - for (const TGroupId groupId : GroupsWithFaultyDisks) { - // find the group to process - const auto it = Groups.find(groupId); - Y_VERIFY(it != Groups.end()); - TGroupRecord& group = it->second; - + for (TGroupRecord& group : GroupsWithFaultyDisks) { if (group.ReassignerActorId || now < group.NextRetryTimestamp) { continue; // we are already running reassigner for this group } // check if it is possible to move anything out if (const auto v = FindVDiskToReplace(group.VDiskStatus, group.Content, now)) { - group.ReassignerActorId = Register(new TReassignerActor(ControllerId, groupId, group.Content, *v)); + group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v)); } else { ++counter; // this group can't be reassigned right now } } + if (GroupLayoutSanitizer) { + for (auto it = GroupsWithInvalidLayout.begin(); it != GroupsWithInvalidLayout.end(); ) { + TGroupRecord& group = *it++; + Y_VERIFY(!group.LayoutValid); + if (group.ReassignerActorId || now < group.NextRetryTimestamp) { + // nothing to do + } else if (const auto v = FindVDiskToReplaceByLayout(group, now)) { + group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v)); + } else if (group.LayoutValid) { + GroupsWithInvalidLayout.Remove(&group); + } else { + ++counter; + } + } + } + UnreassignableGroups->store(counter); } std::optional<TVDiskID> FindVDiskToReplace(const THashMap<TVDiskID, TVDiskStatusTracker>& tracker, - const TEvControllerUpdateSelfHealInfo::TGroupContent& content, const TInstant now) { + const TEvControllerUpdateSelfHealInfo::TGroupContent& content, TInstant now) { auto status = [&](const TVDiskID& id) { try { return tracker.at(id).GetStatus(now); @@ -362,6 +402,41 @@ namespace NKikimr::NBsController { } } + std::optional<TVDiskID> FindVDiskToReplaceByLayout(TGroupRecord& group, TInstant now) { + THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout; + for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { + Y_VERIFY(HostRecords); + if (!vdisk.Decommitted) { + layout.emplace(vdiskId, std::make_pair(HostRecords->GetLocation(vdisk.Location.NodeId), + 
vdisk.Location.ComprisingPDiskId())); + } + } + const TLayoutCheckResult checkResult = CheckGroupLayout(*group.Content.Geometry, layout); + if (checkResult) { // group is valid + group.LayoutValid = true; + return std::nullopt; + } + + THashSet<TVDiskIdShort> badDisks; + for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { + const auto it = group.VDiskStatus.find(vdiskId); + if (it == group.VDiskStatus.end() || it->second.GetStatus(now) != NKikimrBlobStorage::EVDiskStatus::READY || vdisk.Bad) { + badDisks.insert(vdiskId); + } + } + if (badDisks.empty()) { + return TVDiskID(group.GroupId, group.Content.Generation, checkResult.Candidates.front()); + } else if (badDisks.size() == 1) { + for (const auto& vdiskId : checkResult.Candidates) { + if (badDisks.contains(vdiskId)) { + return TVDiskID(group.GroupId, group.Content.Generation, vdiskId); + } + } + } + + return std::nullopt; + } + void HandleWakeup() { CheckGroups(); Schedule(TDuration::Seconds(10), new TEvents::TEvWakeup); @@ -439,9 +514,8 @@ namespace NKikimr::NBsController { TABLE_CLASS("table-sortable table") { TABLEHEAD() { ui32 numCols = 0; - for (const auto& id : GroupsWithFaultyDisks) { - const auto& info = Groups.at(id); - numCols = Max<ui32>(numCols, info.Content.VDisks.size()); + for (const TGroupRecord& group : GroupsWithFaultyDisks) { + numCols = Max<ui32>(numCols, group.Content.VDisks.size()); } TABLER() { @@ -452,20 +526,19 @@ namespace NKikimr::NBsController { } } TABLEBODY() { - for (const auto& id : GroupsWithFaultyDisks) { - const auto& info = Groups.at(id); + for (const TGroupRecord& group : GroupsWithFaultyDisks) { TABLER() { out << "<td rowspan='2'><a href='?TabletID=" << TabletId - << "&page=GroupDetail&GroupId=" << id << "'>" - << id << "</a>:" << info.Content.Generation << "</td>"; + << "&page=GroupDetail&GroupId=" << group.GroupId << "'>" + << group.GroupId << "</a>:" << group.Content.Generation << "</td>"; - for (const auto& [vdiskId, vdisk] : info.Content.VDisks) { + for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { TABLED() { out << vdiskId.ToString(); out << "<br/>"; out << vdisk.VDiskStatus; out << "<br/><strong>"; - if (const auto it = info.VDiskStatus.find(vdiskId); it != info.VDiskStatus.end()) { + if (const auto it = group.VDiskStatus.find(vdiskId); it != group.VDiskStatus.end()) { if (const auto& status = it->second.GetStatus(now)) { out << *status; } else { @@ -479,7 +552,7 @@ namespace NKikimr::NBsController { } } TABLER() { - for (const auto& [vdiskId, vdisk] : info.Content.VDisks) { + for (const auto& [vdiskId, vdisk] : group.Content.VDisks) { TABLED() { const auto& l = vdisk.Location; if (vdisk.Faulty) { @@ -506,17 +579,22 @@ namespace NKikimr::NBsController { } } + void Handle(TEvPrivate::TEvUpdateHostRecords::TPtr ev) { + HostRecords = std::move(ev->Get()->HostRecords); + } + STRICT_STFUNC(StateFunc, { cFunc(TEvents::TSystem::Poison, PassAway); hFunc(TEvControllerUpdateSelfHealInfo, Handle); hFunc(NMon::TEvRemoteHttpInfo, Handle); hFunc(TEvReassignerDone, Handle); cFunc(TEvents::TSystem::Wakeup, HandleWakeup); + hFunc(TEvPrivate::TEvUpdateHostRecords, Handle); }) }; - IActor *CreateSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups) { - return new TSelfHealActor(tabletId, std::move(unreassignableGroups)); + IActor *TBlobStorageController::CreateSelfHealActor() { + return new TSelfHealActor(TabletID(), SelfHealUnreassignableGroups); } void TBlobStorageController::InitializeSelfHealState() { @@ -525,10 +603,13 @@ namespace NKikimr::NBsController 
{ ev->GroupsToUpdate.emplace(groupId, TEvControllerUpdateSelfHealInfo::TGroupContent()); } FillInSelfHealGroups(*ev, nullptr); + ev->GroupLayoutSanitizer = GroupLayoutSanitizer; Send(SelfHealId, ev.Release()); } void TBlobStorageController::FillInSelfHealGroups(TEvControllerUpdateSelfHealInfo& msg, TConfigState *state) { + THashMap<TBoxStoragePoolId, std::shared_ptr<TGroupGeometryInfo>> geomCache; + for (auto& [groupId, group] : msg.GroupsToUpdate) { if (!group) { continue; @@ -540,11 +621,24 @@ namespace NKikimr::NBsController { group->Generation = p->Generation; group->Type = TBlobStorageGroupType(p->ErasureSpecies); + if (auto it = geomCache.find(p->StoragePoolId); it != geomCache.end()) { + group->Geometry = it->second; + } else { + const TMap<TBoxStoragePoolId, TStoragePoolInfo>& storagePools = state + ? state->StoragePools.Get() + : StoragePools; + const auto spIt = storagePools.find(p->StoragePoolId); + Y_VERIFY(spIt != storagePools.end()); + group->Geometry = std::make_unique<TGroupGeometryInfo>(group->Type, spIt->second.GetGroupGeometry()); + geomCache.emplace(p->StoragePoolId, group->Geometry); + } + for (const TVSlotInfo *slot : p->VDisksInGroup) { group->VDisks[slot->GetVDiskId()] = { slot->VSlotId, slot->PDisk->ShouldBeSettledBySelfHeal(), slot->PDisk->BadInTermsOfSelfHeal(), + slot->PDisk->Decommitted(), slot->GetStatus(), }; } diff --git a/ydb/core/mind/bscontroller/self_heal.h b/ydb/core/mind/bscontroller/self_heal.h index 287f05d467..b2740f4800 100644 --- a/ydb/core/mind/bscontroller/self_heal.h +++ b/ydb/core/mind/bscontroller/self_heal.h @@ -6,29 +6,26 @@ namespace NKikimr::NBsController { + class TGroupGeometryInfo; + struct TEvControllerUpdateSelfHealInfo : TEventLocal<TEvControllerUpdateSelfHealInfo, TEvBlobStorage::EvControllerUpdateSelfHealInfo> { struct TGroupContent { struct TVDiskInfo { TVSlotId Location; bool Faulty; bool Bad; + bool Decommitted; NKikimrBlobStorage::EVDiskStatus VDiskStatus; }; ui32 Generation; TBlobStorageGroupType Type; TMap<TVDiskID, TVDiskInfo> VDisks; + std::shared_ptr<TGroupGeometryInfo> Geometry; }; THashMap<TGroupId, std::optional<TGroupContent>> GroupsToUpdate; // groups with faulty groups that are changed or got faulty PDisks for the first time TVector<std::pair<TVDiskID, NKikimrBlobStorage::EVDiskStatus>> VDiskStatusUpdate; + std::optional<bool> GroupLayoutSanitizer; }; - // Self-heal actor's main purpose is to monitor FAULTY pdisks and to slightly move groups out of them; every move - // should not render group unusable, also it should not exceed its fail model. It also takes into account replication - // broker features such as only one vslot over PDisk is being replicated at a moment. - // - // It interacts with BS_CONTROLLER and group observer (which provides information about group state on a per-vdisk - // basis). BS_CONTROLLER reports faulty PDisks and all involved groups in a push notification manner. - IActor *CreateSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups); - } // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/sys_view.cpp b/ydb/core/mind/bscontroller/sys_view.cpp index ba9b50cde0..8dffa53468 100644 --- a/ydb/core/mind/bscontroller/sys_view.cpp +++ b/ydb/core/mind/bscontroller/sys_view.cpp @@ -289,8 +289,17 @@ public: const auto readCentric = pdisk.HasReadCentric() ? 
MakeMaybe(pdisk.GetReadCentric()) : Nothing(); if (filter.MatchPDisk(pdisk.GetCategory(), sharedWithOs, readCentric)) { const TNodeLocation& location = HostRecords->GetLocation(pdiskId.NodeId); - const bool ok = mapper.RegisterPDisk(pdiskId, location, true, pdisk.GetNumActiveSlots(), - pdisk.GetExpectedSlotCount(), nullptr, 0, 0, true); + const bool ok = mapper.RegisterPDisk({ + .PDiskId = pdiskId, + .Location = location, + .Usable = true, + .NumSlots = pdisk.GetNumActiveSlots(), + .MaxSlots = pdisk.GetExpectedSlotCount(), + .Groups = {}, + .SpaceAvailable = 0, + .Operational = true, + .Decommitted = false, + }); Y_VERIFY(ok); break; } @@ -301,7 +310,7 @@ public: TGroupMapper::TGroupDefinition group; TString error; std::deque<ui64> groupSizes; - while (mapper.AllocateGroup(groupSizes.size(), group, nullptr, 0, {}, 0, false, error)) { + while (mapper.AllocateGroup(groupSizes.size(), group, {}, {}, 0, false, error)) { std::vector<TGroupDiskInfo> disks; std::deque<NKikimrBlobStorage::TPDiskMetrics> pdiskMetrics; std::deque<NKikimrBlobStorage::TVDiskMetrics> vdiskMetrics; @@ -425,6 +434,7 @@ void CopyInfo(NKikimrSysView::TPDiskInfo* info, const THolder<TBlobStorageContro } info->SetExpectedSlotCount(pDiskInfo->ExpectedSlotCount); info->SetNumActiveSlots(pDiskInfo->NumActiveSlots + pDiskInfo->StaticSlotUsage); + info->SetDecommitStatus(NKikimrBlobStorage::EDecommitStatus_Name(pDiskInfo->DecommitStatus)); } void SerializeVSlotInfo(NKikimrSysView::TVSlotInfo *pb, const TVDiskID& vdiskId, const NKikimrBlobStorage::TVDiskMetrics& m, @@ -518,7 +528,6 @@ void CopyInfo(TDstMap& dst, TDeletedSet& deleted, const TSrcMap& src, TChangedSe deleted.insert(key); } } - changed.clear(); } void TBlobStorageController::UpdateSystemViews() { @@ -527,7 +536,7 @@ void TBlobStorageController::UpdateSystemViews() { } if (!SysViewChangedPDisks.empty() || !SysViewChangedVSlots.empty() || !SysViewChangedGroups.empty() || - !SysViewChangedStoragePools.empty()) { + !SysViewChangedStoragePools.empty() || SysViewChangedSettings) { auto update = MakeHolder<TEvControllerUpdateSystemViews>(); update->HostRecords = HostRecords; update->GroupReserveMin = GroupReserveMin; @@ -538,7 +547,6 @@ void TBlobStorageController::UpdateSystemViews() { CopyInfo(state.VSlots, update->DeletedVSlots, VSlots, SysViewChangedVSlots); CopyInfo(state.Groups, update->DeletedGroups, GroupMap, SysViewChangedGroups); CopyInfo(state.StoragePools, update->DeletedStoragePools, StoragePools, SysViewChangedStoragePools); - SysViewChangedSettings = false; // process static slots and static groups for (const auto& [pdiskId, pdisk] : StaticPDisks) { @@ -558,6 +566,7 @@ void TBlobStorageController::UpdateSystemViews() { } } pb->SetStatusV2(NKikimrBlobStorage::EDriveStatus_Name(NKikimrBlobStorage::EDriveStatus::ACTIVE)); + pb->SetDecommitStatus(NKikimrBlobStorage::EDecommitStatus_Name(NKikimrBlobStorage::EDecommitStatus::DECOMMIT_NONE)); pb->SetExpectedSlotCount(pdisk.ExpectedSlotCount ? 
pdisk.ExpectedSlotCount : pdisk.StaticSlotUsage); pb->SetNumActiveSlots(pdisk.StaticSlotUsage); } @@ -603,6 +612,12 @@ void TBlobStorageController::UpdateSystemViews() { CalculateGroupUsageStats(pb, disks, (TBlobStorageGroupType::EErasureSpecies)group.GetErasureSpecies()); } + SysViewChangedPDisks.clear(); + SysViewChangedVSlots.clear(); + SysViewChangedGroups.clear(); + SysViewChangedStoragePools.clear(); + SysViewChangedSettings = false; + Send(SystemViewsCollectorId, update.Release()); } diff --git a/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp b/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp index 5c2d0aad20..d4205abea8 100644 --- a/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp +++ b/ydb/core/mind/bscontroller/ut_selfheal/self_heal_actor_ut.cpp @@ -1,6 +1,7 @@ #include <library/cpp/testing/unittest/registar.h> #include <ydb/core/util/testactorsys.h> #include <ydb/core/mind/bscontroller/self_heal.h> +#include <ydb/core/mind/bscontroller/impl.h> using namespace NActors; using namespace NKikimr; @@ -13,8 +14,8 @@ void RunTestCase(TCallback&& callback) { TTestActorSystem runtime(1); runtime.Start(); const TActorId& parentId = runtime.AllocateEdgeActor(1); - std::shared_ptr<std::atomic_uint64_t> UnreassignableGroups = std::make_shared<std::atomic_uint64_t>(); - const TActorId& selfHealId = runtime.Register(CreateSelfHealActor(1, UnreassignableGroups), parentId, {}, {}, 1); + TBlobStorageController Controller({}, new TTabletStorageInfo(1, TTabletTypes::FLAT_BS_CONTROLLER)); + const TActorId& selfHealId = runtime.Register(Controller.CreateSelfHealActor(), parentId, {}, {}, 1); callback(selfHealId, parentId, runtime); runtime.Stop(); } diff --git a/ydb/core/mind/bscontroller/ya.make b/ydb/core/mind/bscontroller/ya.make index 275a4dc238..c652ec110e 100644 --- a/ydb/core/mind/bscontroller/ya.make +++ b/ydb/core/mind/bscontroller/ya.make @@ -25,6 +25,8 @@ SRCS( get_group.cpp grouper.cpp grouper.h + group_layout_checker.cpp + group_layout_checker.h group_mapper.cpp group_mapper.h group_reconfigure_wipe.cpp diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto index 10d0a29e6c..bc7f54aa6e 100644 --- a/ydb/core/protos/blobstorage_config.proto +++ b/ydb/core/protos/blobstorage_config.proto @@ -186,11 +186,18 @@ enum EDriveStatus { ACTIVE = 1; // working as expected INACTIVE = 2; // new groups are not created over this drive, but existing ones continue to work as expected BROKEN = 3; // drive is not working, groups are automatically moved out of this drive upon reception of this status - SPARE = 4; // spare drive -- groups are created only when being moved from BROKEN drives + reserved 4; FAULTY = 5; // drive is expected to become BROKEN soon, new groups are not created, old groups are asynchronously moved out from this drive TO_BE_REMOVED = 6; // same as INACTIVE, but drive is counted in fault model as not working } +enum EDecommitStatus { + DECOMMIT_UNSET = 0; // unset status -- missing optional field + DECOMMIT_NONE = 1; // no decomission + DECOMMIT_PENDING = 2; // drive is going to be removed soon, but SelfHeal logic would not remove it automatically + DECOMMIT_IMMINENT = 3; // drive is going to be settled automatically +} + message TGroupStatus { enum E { UNKNOWN = 0; // group status is unknown (default value) @@ -226,6 +233,7 @@ message TUpdateDriveStatus { uint32 PDiskId = 4; // may be set instead of path to identify PDisk string Serial = 5; // may be set instead of path and PDiskId to identify PDisk 
uint64 StatusChangeTimestamp = 6; // used only in return of ReadDriveStatus + EDecommitStatus DecommitStatus = 7; } message TReadDriveStatus { @@ -426,6 +434,7 @@ message TUpdateSettings { repeated uint32 GroupReservePartPPM = 7; repeated uint32 MaxScrubbedDisksAtOnce = 8; repeated NKikimrBlobStorage.TPDiskSpaceColor.E PDiskSpaceColorBorder = 9; + repeated bool EnableGroupLayoutSanitizer = 10; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -526,6 +535,7 @@ message TBaseConfig { uint32 ExpectedSlotCount = 13; NKikimrBlobStorage.TPDiskMetrics PDiskMetrics = 14; uint64 DriveStatusChangeTimestamp = 15; // TInstant::GetValue() + EDecommitStatus DecommitStatus = 16; } message TVSlot { message TDonorDisk { diff --git a/ydb/core/protos/out/out.cpp b/ydb/core/protos/out/out.cpp index 28c49a5c36..88db8ad5f8 100644 --- a/ydb/core/protos/out/out.cpp +++ b/ydb/core/protos/out/out.cpp @@ -69,6 +69,10 @@ Y_DECLARE_OUT_SPEC(, NKikimrBlobStorage::EDriveStatus, stream, value) { stream << NKikimrBlobStorage::EDriveStatus_Name(value); } +Y_DECLARE_OUT_SPEC(, NKikimrBlobStorage::EDecommitStatus, stream, value) { + stream << NKikimrBlobStorage::EDecommitStatus_Name(value); +} + Y_DECLARE_OUT_SPEC(, NKikimrBlobStorage::TGroupStatus::E, stream, value) { stream << NKikimrBlobStorage::TGroupStatus::E_Name(value); } diff --git a/ydb/core/protos/sys_view.proto b/ydb/core/protos/sys_view.proto index 44abe03515..4d495d6b60 100644 --- a/ydb/core/protos/sys_view.proto +++ b/ydb/core/protos/sys_view.proto @@ -166,6 +166,7 @@ message TPDiskInfo { optional uint32 ExpectedSlotCount = 15; optional uint32 NumActiveSlots = 16; optional uint64 Category = 17; + optional string DecommitStatus = 18; // metrics ? // physical location ? // config ? 
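
The PDisks system view above exports the new decommit state as a string column (DecommitStatus = 18), filled from the enum's generated name the way sys_view.cpp calls EDecommitStatus_Name; the updated ds_pdisks KQP test below asserts the literal value "DECOMMIT_NONE". A minimal standalone sketch of that mapping, assuming a plain C++ enum in place of the generated protobuf type (everything here except the DECOMMIT_* names is hypothetical):

#include <cstdio>

// Stand-in for NKikimrBlobStorage::EDecommitStatus; the values mirror the
// enum added to blobstorage_config.proto.
enum class EDecommitStatus {
    DECOMMIT_UNSET = 0,    // optional field was not set
    DECOMMIT_NONE = 1,     // drive is not being decommissioned
    DECOMMIT_PENDING = 2,  // drive will be removed later; SelfHeal does not act on it
    DECOMMIT_IMMINENT = 3, // drive will be settled (data moved off) automatically
};

// Sketch of the name lookup the system view relies on; the real code uses the
// protobuf-generated EDecommitStatus_Name helper.
const char* DecommitStatusName(EDecommitStatus status) {
    switch (status) {
        case EDecommitStatus::DECOMMIT_UNSET:    return "DECOMMIT_UNSET";
        case EDecommitStatus::DECOMMIT_NONE:     return "DECOMMIT_NONE";
        case EDecommitStatus::DECOMMIT_PENDING:  return "DECOMMIT_PENDING";
        case EDecommitStatus::DECOMMIT_IMMINENT: return "DECOMMIT_IMMINENT";
    }
    return "DECOMMIT_UNSET";
}

int main() {
    // Static PDisks carry no decommit state of their own and are reported as
    // DECOMMIT_NONE, matching UpdateSystemViews above.
    std::printf("%s\n", DecommitStatusName(EDecommitStatus::DECOMMIT_NONE));
    return 0;
}
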
diff --git a/ydb/core/sys_view/common/schema.h b/ydb/core/sys_view/common/schema.h index f7a4f191e1..29ba8b9b0f 100644 --- a/ydb/core/sys_view/common/schema.h +++ b/ydb/core/sys_view/common/schema.h @@ -191,6 +191,7 @@ struct Schema : NIceDb::Schema { struct StatusChangeTimestamp : Column<14, NScheme::NTypeIds::Timestamp> {}; struct ExpectedSlotCount : Column<15, NScheme::NTypeIds::Uint32> {}; struct NumActiveSlots : Column<16, NScheme::NTypeIds::Uint32> {}; + struct DecommitStatus : Column<17, NScheme::NTypeIds::String> {}; using TKey = TableKey<NodeId, PDiskId>; using TColumns = TableColumns< @@ -208,7 +209,8 @@ struct Schema : NIceDb::Schema { Status, StatusChangeTimestamp, ExpectedSlotCount, - NumActiveSlots>; + NumActiveSlots, + DecommitStatus>; }; struct VSlots : Table<5> { diff --git a/ydb/core/sys_view/storage/pdisks.cpp b/ydb/core/sys_view/storage/pdisks.cpp index 47f84ae9aa..7b99991c52 100644 --- a/ydb/core/sys_view/storage/pdisks.cpp +++ b/ydb/core/sys_view/storage/pdisks.cpp @@ -39,6 +39,7 @@ public: {T::StatusChangeTimestamp::ColumnId, {E::kInfoFieldNumber, V::kStatusChangeTimestampFieldNumber}}, {T::ExpectedSlotCount::ColumnId, {E::kInfoFieldNumber, V::kExpectedSlotCountFieldNumber}}, {T::NumActiveSlots::ColumnId, {E::kInfoFieldNumber, V::kNumActiveSlotsFieldNumber}}, + {T::DecommitStatus::ColumnId, {E::kInfoFieldNumber, V::kDecommitStatusFieldNumber}}, }; return fieldMap; } diff --git a/ydb/core/sys_view/ut_kqp.cpp b/ydb/core/sys_view/ut_kqp.cpp index c5f6c5470a..56eb5af269 100644 --- a/ydb/core/sys_view/ut_kqp.cpp +++ b/ydb/core/sys_view/ut_kqp.cpp @@ -884,8 +884,10 @@ Y_UNIT_TEST_SUITE(SystemView) { TotalSize, Type, ExpectedSlotCount, - NumActiveSlots - FROM `/Root/.sys/ds_pdisks`; + NumActiveSlots, + DecommitStatus + FROM `/Root/.sys/ds_pdisks` + WHERE BoxId IS NOT NULL; )").GetValueSync(); UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); @@ -900,7 +902,7 @@ Y_UNIT_TEST_SUITE(SystemView) { } } - TYsonFieldChecker check(ysonString, 15); + TYsonFieldChecker check(ysonString, 16); check.Uint64(0u); // AvailableSize check.Uint64(999u); // BoxId @@ -917,6 +919,7 @@ Y_UNIT_TEST_SUITE(SystemView) { check.String("ROT"); // Type check.Uint64(16); // ExpectedSlotCount check.Uint64(2); // NumActiveSlots + check.String("DECOMMIT_NONE"); // DecommitStatus } Y_UNIT_TEST(VSlotsFields) { diff --git a/ydb/core/util/testactorsys.cpp b/ydb/core/util/testactorsys.cpp index 4a270e4f8c..ead159d35e 100644 --- a/ydb/core/util/testactorsys.cpp +++ b/ydb/core/util/testactorsys.cpp @@ -132,17 +132,25 @@ TActorId TTestActorSystem::CreateTestBootstrapper(TTabletStorageInfo *info, std: return Register(CreateBootstrapper(info, bi.Get()), nodeId); } -void TTestActorSystem::SetupTabletRuntime(bool isMirror3dc, ui32 stateStorageNodeId, ui32 targetNodeId) { - auto setup = MakeIntrusive<TTableNameserverSetup>(); - ui32 nodeCountInDC = (MaxNodeId + 2) / 3; - for (ui32 nodeId : GetNodes()) { - const TString name = Sprintf("127.0.0.%u", nodeId); - ui32 dcNum = isMirror3dc ? 
((nodeId + nodeCountInDC - 1) / nodeCountInDC) : 1; +void TTestActorSystem::SetupTabletRuntime(ui32 numDataCenters, ui32 stateStorageNodeId, ui32 targetNodeId) { + const ui32 nodeCountInDC = (MaxNodeId + numDataCenters - 1) / numDataCenters; + auto locationGenerator = [&](ui32 nodeId) { + const ui32 dcNum = (nodeId + nodeCountInDC - 1) / nodeCountInDC; NActorsInterconnect::TNodeLocation location; location.SetDataCenter(ToString(dcNum)); location.SetRack(ToString(nodeId)); location.SetUnit(ToString(nodeId)); - setup->StaticNodeTable[nodeId] = {name, name, name, 19001, TNodeLocation(location)}; + return TNodeLocation(location); + }; + SetupTabletRuntime(locationGenerator, stateStorageNodeId, targetNodeId); +} + +void TTestActorSystem::SetupTabletRuntime(const std::function<TNodeLocation(ui32)>& locationGenerator, + ui32 stateStorageNodeId, ui32 targetNodeId) { + auto setup = MakeIntrusive<TTableNameserverSetup>(); + for (ui32 nodeId : GetNodes()) { + const TString name = Sprintf("127.0.0.%u", nodeId); + setup->StaticNodeTable[nodeId] = {name, name, name, 19001, locationGenerator(nodeId)}; } for (ui32 nodeId : GetNodes()) { diff --git a/ydb/core/util/testactorsys.h b/ydb/core/util/testactorsys.h index 722d5a8163..5037f7ab71 100644 --- a/ydb/core/util/testactorsys.h +++ b/ydb/core/util/testactorsys.h @@ -660,7 +660,9 @@ public: //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // tablet-related utility functions - void SetupTabletRuntime(bool isMirror3dc = false, ui32 stateStorageNodeId = 0, ui32 targetNodeId = 0); + void SetupTabletRuntime(ui32 numDataCenters = 1, ui32 stateStorageNodeId = 0, ui32 targetNodeId = 0); + void SetupTabletRuntime(const std::function<TNodeLocation(ui32)>& locationGenerator, ui32 stateStorageNodeId = 0, + ui32 targetNodeId = 0); static NTabletPipe::TClientConfig GetPipeConfigWithRetries(); void SendToPipe(ui64 tabletId, const TActorId& sender, IEventBase* payload, ui64 cookie, const NKikimr::NTabletPipe::TClientConfig& pipeConfig); static TTabletStorageInfo *CreateTestTabletInfo(ui64 tabletId, TTabletTypes::EType tabletType, TBlobStorageGroupType::EErasureSpecies erasure, ui32 groupId); diff --git a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema index 480b1d419f..928757d025 100644 --- a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema +++ b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_bs_controller_/flat_bs_controller.schema @@ -12,6 +12,11 @@ "ColumnType": "Uint32" }, { + "ColumnId": 18, + "ColumnName": "GroupLayoutSanitizer", + "ColumnType": "Bool" + }, + { "ColumnId": 1, "ColumnName": "FixedKey", "ColumnType": "Bool" @@ -92,6 +97,7 @@ "0": { "Columns": [ 17, + 18, 1, 2, 4, @@ -174,6 +180,11 @@ ], "ColumnsAdded": [ { + "ColumnId": 17, + "ColumnName": "DecommitStatus", + "ColumnType": "Uint32" + }, + { "ColumnId": 1, "ColumnName": "NodeID", "ColumnType": "Uint32" @@ -248,6 +259,7 @@ "ColumnFamilies": { "0": { "Columns": [ + 17, 1, 2, 3, |
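
The group layout sanitizer path added to the self-heal actor follows a deliberately conservative rule in FindVDiskToReplaceByLayout: a group whose layout already passes the check is skipped, a fully healthy group may have any layout-offending replica moved, and a group with exactly one unhealthy replica is touched only if that same replica is among the layout offenders. A compilable sketch of just that decision, with hypothetical plain-C++ stand-ins for the checker verdict and disk identifiers (the real code uses TLayoutCheckResult, TVDiskIdShort and CheckGroupLayout):

#include <cstdio>
#include <optional>
#include <set>
#include <vector>

// Hypothetical stand-in for the layout checker verdict: an empty candidate
// list means the current layout already satisfies the group geometry.
struct TLayoutVerdict {
    std::vector<int> Candidates;            // replicas whose move could fix the layout
    bool Valid() const { return Candidates.empty(); }
};

// Mirrors the selection rule of FindVDiskToReplaceByLayout.
std::optional<int> PickReplicaToMove(const TLayoutVerdict& verdict, const std::set<int>& badReplicas) {
    if (verdict.Valid()) {
        return std::nullopt;                // layout is fine, nothing to sanitize
    }
    if (badReplicas.empty()) {
        return verdict.Candidates.front();  // every replica is healthy, safe to move one
    }
    if (badReplicas.size() == 1) {
        for (int candidate : verdict.Candidates) {
            if (badReplicas.count(candidate)) {
                return candidate;           // the single unhealthy replica is also a layout offender
            }
        }
    }
    return std::nullopt;                    // too risky now, retry on a later CheckGroups pass
}

int main() {
    TLayoutVerdict verdict{{2, 5}};
    std::printf("move replica %d\n", *PickReplicaToMove(verdict, {}));  // prints "move replica 2"
    return 0;
}

Keeping the group untouched whenever more than one replica is unhealthy matches the fail-model caution of the faulty-disk path: the sanitizer only ever moves data when the group can clearly tolerate it, and otherwise counts the group as unreassignable for this pass.
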
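
The test-runtime change above replaces the mirror-3dc boolean with a numDataCenters parameter and a pluggable location generator: nodes are split as evenly as possible into numDataCenters groups, and each node's rack and unit are derived from its id. A small sketch of that assignment arithmetic, assuming simple integer node ids (the real code builds NActorsInterconnect::TNodeLocation entries for the nameserver table):

#include <cstdio>

// Same arithmetic as SetupTabletRuntime: nodes 1..maxNodeId are split into
// numDataCenters groups of ceil(maxNodeId / numDataCenters) nodes each.
unsigned DataCenterOf(unsigned nodeId, unsigned maxNodeId, unsigned numDataCenters) {
    const unsigned nodesPerDC = (maxNodeId + numDataCenters - 1) / numDataCenters;
    return (nodeId + nodesPerDC - 1) / nodesPerDC;
}

int main() {
    // With 9 nodes and 3 data centers, nodes 1-3 land in DC 1, 4-6 in DC 2, 7-9 in DC 3.
    for (unsigned nodeId = 1; nodeId <= 9; ++nodeId) {
        std::printf("node %u -> dc %u\n", nodeId, DataCenterOf(nodeId, 9, 3));
    }
    return 0;
}
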