summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorserg-belyakov <[email protected]>2022-12-27 12:45:42 +0300
committerserg-belyakov <[email protected]>2022-12-27 12:45:42 +0300
commitbbda7c5295e60cbe2620908dc2b1d49385ef0352 (patch)
tree8a92bf93dce4f5eb253c1c8e6fb5e3325b977812
parent726a249dea3cbcaa4b1bf07754b1bb5ee1a6626a (diff)
Add SanitizeGroup function to TGroupMapper;
determine the failRealm->pRealm mapping and add a sanitize function to GroupMapper
-rw-r--r--ydb/core/mind/bscontroller/bsc.cpp1
-rw-r--r--ydb/core/mind/bscontroller/cmds_storage_pool.cpp5
-rw-r--r--ydb/core/mind/bscontroller/config.h2
-rw-r--r--ydb/core/mind/bscontroller/config_cmd.cpp2
-rw-r--r--ydb/core/mind/bscontroller/config_fit_groups.cpp54
-rw-r--r--ydb/core/mind/bscontroller/group_geometry_info.h39
-rw-r--r--ydb/core/mind/bscontroller/group_mapper.cpp469
-rw-r--r--ydb/core/mind/bscontroller/group_mapper.h30
-rw-r--r--ydb/core/mind/bscontroller/group_mapper_ut.cpp257
-rw-r--r--ydb/core/mind/bscontroller/monitoring.cpp15
-rw-r--r--ydb/core/mind/bscontroller/self_heal.cpp69
-rw-r--r--ydb/core/mind/bscontroller/ut_selfheal/env.h2
-rw-r--r--ydb/core/protos/blobstorage_config.proto5
13 files changed, 804 insertions, 146 deletions
diff --git a/ydb/core/mind/bscontroller/bsc.cpp b/ydb/core/mind/bscontroller/bsc.cpp
index c44f3cc71f0..1f1d9f1e1fa 100644
--- a/ydb/core/mind/bscontroller/bsc.cpp
+++ b/ydb/core/mind/bscontroller/bsc.cpp
@@ -349,6 +349,7 @@ ui32 TBlobStorageController::GetEventPriority(IEventHandle *ev) {
case NKikimrBlobStorage::TConfigRequest::TCommand::kAllocateVirtualGroup:
case NKikimrBlobStorage::TConfigRequest::TCommand::kDecommitGroups:
case NKikimrBlobStorage::TConfigRequest::TCommand::kWipeVDisk:
+ case NKikimrBlobStorage::TConfigRequest::TCommand::kSanitizeGroup:
return 2; // read-write commands go with higher priority as they are needed to keep cluster intact
case NKikimrBlobStorage::TConfigRequest::TCommand::kReadHostConfig:
diff --git a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
index ca931601bad..d590081069f 100644
--- a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
+++ b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
@@ -584,4 +584,9 @@ namespace NKikimr::NBsController {
group->CalculateGroupStatus();
}
+ void TBlobStorageController::TConfigState::ExecuteStep(const NKikimrBlobStorage::TSanitizeGroup& cmd, NKikimrBlobStorage::TConfigResponse::TStatus& /*status*/) {
+ ui32 groupId = cmd.GetGroupId();
+ SanitizingRequests.emplace(groupId);
+ }
+
} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/config.h b/ydb/core/mind/bscontroller/config.h
index ca7e5f44d77..0c46340d168 100644
--- a/ydb/core/mind/bscontroller/config.h
+++ b/ydb/core/mind/bscontroller/config.h
@@ -79,6 +79,7 @@ namespace NKikimr {
// volatile reconfiguration state
THashMap<TVSlotId, TPDiskId> ExplicitReconfigureMap;
std::set<TVSlotId> SuppressDonorMode;
+ std::unordered_set<ui32> SanitizingRequests;
// just-created vslots, which are not yet committed to the storage
TSet<TVSlotId> UncommittedVSlots;
@@ -273,6 +274,7 @@ namespace NKikimr {
void ExecuteStep(const NKikimrBlobStorage::TAllocateVirtualGroup& cmd, TStatus& status);
void ExecuteStep(const NKikimrBlobStorage::TDecommitGroups& cmd, TStatus& status);
void ExecuteStep(const NKikimrBlobStorage::TWipeVDisk& cmd, TStatus& status);
+ void ExecuteStep(const NKikimrBlobStorage::TSanitizeGroup& cmd, TStatus& status);
};
} // NBsController
diff --git a/ydb/core/mind/bscontroller/config_cmd.cpp b/ydb/core/mind/bscontroller/config_cmd.cpp
index 96084e3b842..d53fc9400c3 100644
--- a/ydb/core/mind/bscontroller/config_cmd.cpp
+++ b/ydb/core/mind/bscontroller/config_cmd.cpp
@@ -218,6 +218,7 @@ namespace NKikimr::NBsController {
MAP_TIMING(DropDonorDisk, DROP_DONOR_DISK)
MAP_TIMING(ReassignGroupDisk, REASSIGN_GROUP_DISK)
MAP_TIMING(WipeVDisk, REASSIGN_GROUP_DISK)
+ MAP_TIMING(SanitizeGroup, REASSIGN_GROUP_DISK)
default:
break;
@@ -314,6 +315,7 @@ namespace NKikimr::NBsController {
HANDLE_COMMAND(AllocateVirtualGroup)
HANDLE_COMMAND(DecommitGroups)
HANDLE_COMMAND(WipeVDisk)
+ HANDLE_COMMAND(SanitizeGroup)
case NKikimrBlobStorage::TConfigRequest::TCommand::kAddMigrationPlan:
case NKikimrBlobStorage::TConfigRequest::TCommand::kDeleteMigrationPlan:
diff --git a/ydb/core/mind/bscontroller/config_fit_groups.cpp b/ydb/core/mind/bscontroller/config_fit_groups.cpp
index 45d3443b8e5..802a20554b5 100644
--- a/ydb/core/mind/bscontroller/config_fit_groups.cpp
+++ b/ydb/core/mind/bscontroller/config_fit_groups.cpp
@@ -172,6 +172,7 @@ namespace NKikimr {
TStackVec<std::pair<TVSlotId, bool>, 32> replaceQueue;
THashMap<TVDiskIdShort, TPDiskId> replacedDisks;
i64 requiredSpace = Min<i64>();
+ bool sanitizingRequest = (State.SanitizingRequests.find(groupId) != State.SanitizingRequests.end());
////////////////////////////////////////////////////////////////////////////////////////////////////////
// scan through all VSlots and find matching PDisks
@@ -225,6 +226,11 @@ namespace NKikimr {
}
}
+ if (sanitizingRequest) {
+ // resize group definition
+ getGroup();
+ }
+
if (group) {
TGroupInfo *groupInfo = State.Groups.FindForUpdate(groupId);
@@ -249,7 +255,26 @@ namespace NKikimr {
}
}
}
- AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, AllowUnusableDisks);
+ if ((replacedDisks.empty() && sanitizingRequest) || (replacedDisks.size() == 1)) {
+ auto result = SanitizeGroup(groupId, group, std::move(forbid), requiredSpace, AllowUnusableDisks);
+
+ if (replacedDisks.empty()) {
+ // update information about replaced disks
+ for (const TVSlotInfo *vslot : groupInfo->VDisksInGroup) {
+ if (vslot->GetShortVDiskId() == result.first) {
+ auto it = preservedSlots.find(vslot->GetVDiskId());
+ Y_VERIFY(it != preservedSlots.end());
+ preservedSlots.erase(it);
+ replacedSlots.emplace(result.first, vslot->VSlotId);
+ replaceQueue.emplace_back(vslot->VSlotId, State.SuppressDonorMode.count(vslot->VSlotId));
+ replacedDisks.emplace(result.first, vslot->VSlotId.ComprisingPDiskId());
+ break;
+ }
+ }
+ }
+ } else {
+ AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, AllowUnusableDisks);
+ }
if (!IgnoreVSlotQuotaCheck) {
adjustSpaceAvailable = true;
for (const auto& [pos, vslotId] : replacedSlots) {
@@ -381,6 +406,33 @@ namespace NKikimr {
}
}
+ std::pair<TVDiskIdShort, TPDiskId> SanitizeGroup(TGroupId groupId, TGroupMapper::TGroupDefinition& group,
+ TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace, bool addExistingDisks) {
+ if (!Mapper) {
+ Mapper.emplace(Geometry, StoragePool.RandomizeGroupMapping);
+ PopulateGroupMapper();
+ }
+ TStackVec<TPDiskId, 32> removeQ;
+ if (addExistingDisks) {
+ for (const auto& realm : group) {
+ for (const auto& domain : realm) {
+ for (const TPDiskId id : domain) {
+ if (id != TPDiskId()) {
+ if (auto *info = State.PDisks.Find(id); info && RegisterPDisk(id, *info, false)) {
+ removeQ.push_back(id);
+ }
+ }
+ }
+ }
+ }
+ }
+ auto res = Geometry.SanitizeGroup(*Mapper, groupId, group, std::move(forbid), requiredSpace);
+ for (const TPDiskId pdiskId : removeQ) {
+ Mapper->UnregisterPDisk(pdiskId);
+ }
+ return res;
+ }
+
void PopulateGroupMapper() {
const TBoxId boxId = std::get<0>(StoragePoolId);
diff --git a/ydb/core/mind/bscontroller/group_geometry_info.h b/ydb/core/mind/bscontroller/group_geometry_info.h
index a74698f1f84..748f9802fc7 100644
--- a/ydb/core/mind/bscontroller/group_geometry_info.h
+++ b/ydb/core/mind/bscontroller/group_geometry_info.h
@@ -80,6 +80,23 @@ namespace NKikimr::NBsController {
throw TExFitGroupError() << "failed to allocate group: " << error;
}
+ // returns pair of previous VDisk and PDisk id's
+ std::pair<TVDiskIdShort, TPDiskId> SanitizeGroup(TGroupMapper &mapper, TGroupId groupId, TGroupMapper::TGroupDefinition &group,
+ TGroupMapper::TForbiddenPDisks forbid, i64 requiredSpace) const {
+ TString error;
+ auto misplacedVDisks = mapper.FindMisplacedVDisks(group);
+ for (const bool requireOperational : {true, false}) {
+ for (const auto& replacedDisk : misplacedVDisks.Disks) {
+ TPDiskId pdiskId = group[replacedDisk.FailRealm][replacedDisk.FailDomain][replacedDisk.VDisk];
+ if (mapper.TargetMisplacedVDisk(groupId, group, replacedDisk, forbid, requiredSpace,
+ requireOperational, error)) {
+ return {replacedDisk, pdiskId};
+ }
+ }
+ }
+ throw TExFitGroupError() << "failed to sanitize group: " << error;
+ }
+
bool ResizeGroup(TGroupMapper::TGroupDefinition& group) const {
if (!group) {
group.resize(NumFailRealms);
@@ -114,6 +131,28 @@ namespace NKikimr::NBsController {
return true;
}
+ bool CheckGroupSize(const TGroupMapper::TGroupDefinition& group) const {
+ if (!group) {
+ return false;
+ }
+
+ if (group.size() != NumFailRealms) {
+ return false;
+ }
+ for (const auto& realm : group) {
+ if (realm.size() != NumFailDomainsPerFailRealm) {
+ return false;
+ }
+ for (const auto& domain : realm) {
+ if (domain.size() != NumVDisksPerFailDomain) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
TBlobStorageGroupType GetType() const {
return Type;
}
diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp
index 5619534fb3a..3f4dd8f2158 100644
--- a/ydb/core/mind/bscontroller/group_mapper.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper.cpp
@@ -6,6 +6,8 @@ namespace NKikimr::NBsController {
using namespace NLayoutChecker;
+ struct TAllocator;
+
class TGroupMapper::TImpl : TNonCopyable {
struct TPDiskInfo : TPDiskRecord {
TPDiskLayoutPosition Position;
@@ -58,7 +60,12 @@ namespace NKikimr::NBsController {
using TGroup = std::vector<TPDiskInfo*>;
- struct TAllocator {
+ // PDomain/PRealm - TPDiskLayoutPosition, Fail Domain/Fail Realm - VDiskId
+
+ using TPDomainCandidatesRange = std::pair<std::vector<ui32>::const_iterator, std::vector<ui32>::const_iterator>;
+ using TPDiskCandidatesRange = std::pair<std::vector<TPDiskInfo*>::const_iterator, std::vector<TPDiskInfo*>::const_iterator>;
+
+ struct TDiskManager {
TImpl& Self;
const TBlobStorageGroupInfo::TTopology Topology;
THashSet<TPDiskId> OldGroupContent; // set of all existing disks in the group, including ones which are replaced
@@ -69,7 +76,7 @@ namespace NKikimr::NBsController {
TGroupLayout GroupLayout;
std::optional<TScore> WorstScore;
- TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
+ TDiskManager(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks)
: Self(self)
, Topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), geom.GetNumVDisksPerFailDomain(), true)
@@ -227,6 +234,65 @@ namespace NKikimr::NBsController {
}
}
+ bool DiskIsBetter(const TPDiskInfo& pretender, const TPDiskInfo& king) const {
+ if (pretender.NumSlots != king.NumSlots) {
+ return pretender.NumSlots < king.NumSlots;
+ } else if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) {
+ return true;
+ } else {
+ if (pretender.NumDomainMatchingDisks != king.NumDomainMatchingDisks) {
+ return pretender.NumDomainMatchingDisks > king.NumDomainMatchingDisks;
+ }
+ return pretender.PDiskId < king.PDiskId;
+ }
+ }
+
+ bool GivesLocalityBoost(const TPDiskInfo& pretender, const TPDiskInfo& king) const {
+ const ui32 a = GetLocalityFactor(pretender);
+ const ui32 b = GetLocalityFactor(king);
+ return Self.Randomize ? a < b : a > b;
+ }
+
+ bool BetterQuotaMatch(const TPDiskInfo& pretender, const TPDiskInfo& king) const {
+ return pretender.SpaceAvailable < king.SpaceAvailable;
+ }
+
+ void AddUsedDisk(const TPDiskInfo& pdisk) {
+ for (ui32 groupId : pdisk.Groups) {
+ ++LocalityFactor[groupId];
+ }
+ }
+
+ void RemoveUsedDisk(const TPDiskInfo& pdisk) {
+ for (ui32 groupId : pdisk.Groups) {
+ if (!--LocalityFactor[groupId]) {
+ LocalityFactor.erase(groupId);
+ }
+ }
+ }
+
+ unsigned GetLocalityFactor(const TPDiskInfo& pdisk) const {
+ unsigned res = 0;
+ for (ui32 groupId : pdisk.Groups) {
+ res += GetLocalityFactor(groupId);
+ }
+ return res;
+ }
+
+ unsigned GetLocalityFactor(ui32 groupId) const {
+ const auto it = LocalityFactor.find(groupId);
+ return it != LocalityFactor.end() ? it->second : 0;
+ }
+ };
+
+ struct TAllocator : public TDiskManager {
+
+ TAllocator(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
+ TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks)
+ : TDiskManager(self, geom, requiredSpace, requireOperational, forbiddenDisks, replacedDisks)
+ {
+ }
+
bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group) {
// determine PDisks that fit our requirements (including score)
auto v = SetupMatchingDisks(maxScore);
@@ -467,55 +533,270 @@ namespace NKikimr::NBsController {
cb(pdisk);
}
}
+ };
- bool DiskIsBetter(TPDiskInfo& pretender, TPDiskInfo& king) const {
- if (pretender.NumSlots != king.NumSlots) {
- return pretender.NumSlots < king.NumSlots;
- } else if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) {
- return true;
- } else {
- if (pretender.NumDomainMatchingDisks != king.NumDomainMatchingDisks) {
- return pretender.NumDomainMatchingDisks > king.NumDomainMatchingDisks;
+ struct TSanitizer : public TDiskManager {
+ ui32 DesiredRealmGroup;
+ std::vector<ui32> RealmNavigator;
+ // failRealm -> pRealm
+ std::unordered_map<ui32, std::vector<ui32>> DomainCandidates;
+ // pRealm -> {pDomain1, pDomain2, ... }, sorted by number of slots in pDomains
+ std::unordered_map<ui32, std::unordered_map<ui32, std::vector<TPDiskInfo*>>> DiskCandidates;
+ // {pRealm, pDomain} -> {pdisk1, pdisk2, ... }, sorted by DiskIsBetter() relation
+ std::unordered_map<ui32, std::unordered_set<ui32>> BannedDomains;
+ // pRealm -> {pDomain1, pDomain2, ... }
+            // Cannot be candidates — these domains are already placed correctly
+
+ TSanitizer(TImpl& self, const TGroupGeometryInfo& geom, i64 requiredSpace, bool requireOperational,
+ TForbiddenPDisks forbiddenDisks, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks)
+ : TDiskManager(self, geom, requiredSpace, requireOperational, forbiddenDisks, replacedDisks)
+ {
+ }
+
+ bool SetupNavigation(const TGroup& group) {
+ TPDiskByPosition matchingDisks = SetupMatchingDisks(::Max<ui32>());
+ const ui32 totalFailRealmsNum = Topology.GetTotalFailRealmsNum();
+ const ui32 numFailDomainsPerFailRealm = Topology.GetNumFailDomainsPerFailRealm();
+ const ui32 numDisksPerFailRealm = numFailDomainsPerFailRealm * Topology.GetNumVDisksPerFailDomain();
+ RealmNavigator.assign(totalFailRealmsNum, ::Max<ui32>());
+
+ std::map<ui32, ui32> realmGroups;
+
+ // {failRealm, pRealm} -> #number of pdisks from ${pRealm} in ${failRealm}
+ std::vector<std::unordered_map<ui32, ui32>> disksInPRealmByFailRealm(totalFailRealmsNum);
+
+ // pRealm -> #number of pdisks from ${pRealm} in ${group}
+ std::unordered_map<ui32, ui32> disksInPRealm;
+ std::set<ui32> realmCandidates;
+
+                // the list of potentially free pDomains in pRealm, which includes free domains and
+ // domains, currently occupied by group's pdisks
+ std::unordered_map<ui32, std::unordered_set<ui32>> pDomainsInPRealm;
+
+ for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) {
+ if (group[orderNumber]) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+ const ui32 pRealmGroup = group[orderNumber]->Position.RealmGroup.Index();
+ const ui32 pRealm = group[orderNumber]->Position.Realm.Index();
+ const ui32 pDomain = group[orderNumber]->Position.Domain.Index();
+ realmGroups[pRealmGroup]++;
+ disksInPRealmByFailRealm[vdisk.FailRealm][pRealm]++;
+ disksInPRealm[pRealm]++;
+ pDomainsInPRealm[pRealm].insert(pDomain);
}
- return pretender.PDiskId < king.PDiskId;
}
- }
- bool GivesLocalityBoost(TPDiskInfo& pretender, TPDiskInfo& king) const {
- const ui32 a = GetLocalityFactor(pretender);
- const ui32 b = GetLocalityFactor(king);
- return Self.Randomize ? a < b : a > b;
- }
+ DesiredRealmGroup = 0;
+ ui32 bestRealmGroupSize = 0;
+ for (auto it = realmGroups.begin(); it != realmGroups.end(); ++it) {
+ if (it->second > bestRealmGroupSize) {
+ bestRealmGroupSize = it->second;
+ DesiredRealmGroup = it->first;
+ }
+ }
- bool BetterQuotaMatch(TPDiskInfo& pretender, TPDiskInfo& king) const {
- return pretender.SpaceAvailable < king.SpaceAvailable;
+ for (const auto& [position, pdisk] : matchingDisks) {
+ if (position.RealmGroup.Index() == DesiredRealmGroup) {
+ pDomainsInPRealm[position.Realm.Index()].insert(position.Domain.Index());
+ }
+ }
+
+ for (auto& [pRealmIdx, pRealm] : pDomainsInPRealm) {
+ if (pRealm.size() >= numFailDomainsPerFailRealm) {
+ realmCandidates.insert(pRealmIdx);
+ }
+ }
+
+
+ std::vector<std::pair<ui32, ui32>> realmFilling(totalFailRealmsNum);
+ for (ui32 failRealm = 0; failRealm < totalFailRealmsNum; ++failRealm) {
+ ui32 maxFilling = 0;
+ for (const auto& [pRealm, filling] : disksInPRealmByFailRealm[failRealm]) {
+ maxFilling = std::max(maxFilling, filling);
+ }
+ realmFilling[failRealm] = { numFailDomainsPerFailRealm - maxFilling, failRealm };
+ }
+ std::sort(realmFilling.begin(), realmFilling.end());
+
+ for (const auto& [_, failRealm] : realmFilling) {
+ ui32 bestRealm = ::Max<ui32>();
+ ui32 movesRequired = ::Max<ui32>();
+ for (auto it = realmCandidates.begin(); it != realmCandidates.end(); ++it) {
+ ui32 pRealm = *it;
+ ui32 correctAlready = disksInPRealmByFailRealm[failRealm][pRealm];
+ ui32 toMoveIn = numDisksPerFailRealm - correctAlready;
+ ui32 toMoveOut = disksInPRealm[pRealm] - correctAlready;
+ ui32 freeDomains = pDomainsInPRealm[pRealm].size();
+ ui32 newMovesRequired = toMoveIn;
+ if (toMoveOut + freeDomains < toMoveIn) {
+ continue; // not enough free domains to place all the disks
+ }
+ if (newMovesRequired < movesRequired || (newMovesRequired == movesRequired &&
+ freeDomains > pDomainsInPRealm[bestRealm].size())) {
+ bestRealm = pRealm;
+ movesRequired = newMovesRequired;
+ }
+ }
+ if (bestRealm == ::Max<ui32>()) {
+ return false;
+ }
+ RealmNavigator[failRealm] = bestRealm;
+ realmCandidates.erase(realmCandidates.find(bestRealm));
+ }
+
+ UpdateGroup(group);
+ return true;
}
- void AddUsedDisk(const TPDiskInfo& pdisk) {
- for (ui32 groupId : pdisk.Groups) {
- ++LocalityFactor[groupId];
+ void UpdateGroup(const TGroup& group) {
+ BannedDomains.clear();
+ for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) {
+ if (group[orderNumber]) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber);
+ const ui32 pRealm = group[orderNumber]->Position.Realm.Index();
+ const ui32 pDomain = group[orderNumber]->Position.Domain.Index();
+ if (pRealm == RealmNavigator[vdisk.FailRealm]) {
+ BannedDomains[pRealm].insert(pDomain);
+ }
+ }
}
}
- void RemoveUsedDisk(const TPDiskInfo& pdisk) {
- for (ui32 groupId : pdisk.Groups) {
- if (!--LocalityFactor[groupId]) {
- LocalityFactor.erase(groupId);
+ void SetupCandidates(ui32 maxScore) {
+ TPDiskByPosition matchingDisks = SetupMatchingDisks(maxScore);
+ DomainCandidates.clear();
+ DiskCandidates.clear();
+
+ std::unordered_map<ui32, std::unordered_map<ui32, ui32>> slotsInPDomain;
+ // {pRealm, pDomain} -> #summary number of slots in ${pDomain, pRealm}
+
+ for (const auto& [position, pdisk] : matchingDisks) {
+ if (position.RealmGroup.Index() == DesiredRealmGroup) {
+ ui32 pRealm = position.Realm.Index();
+ ui32 pDomain = position.Domain.Index();
+
+ if (BannedDomains[pRealm].count(pDomain) == 0) {
+ DomainCandidates[pRealm].push_back(pDomain);
+ DiskCandidates[pRealm][pDomain].push_back(pdisk);
+ }
+
+ slotsInPDomain[pRealm][pDomain] += pdisk->NumSlots;
+ }
+ }
+ for (auto it = DomainCandidates.begin(); it != DomainCandidates.end(); ++it) {
+ const ui32 pRealmIdx = it->first;
+ // sort domains in realm by the number of free disks
+ const auto& pRealmInfo = slotsInPDomain[pRealmIdx];
+ auto realm = it->second;
+ std::sort(realm.begin(), realm.end(), [&pRealmInfo](const ui32& left, const ui32& right) {
+ return pRealmInfo.at(left) > pRealmInfo.at(right);
+ });
+ it->second = realm;
+
+ auto& diskCandidatesInRealm = DiskCandidates[pRealmIdx];
+ for (auto jt = diskCandidatesInRealm.begin(); jt != diskCandidatesInRealm.end(); ++jt) {
+ auto domain = jt->second;
+ // sort disks in domain by DiskIsBetter metric
+ // DiskIsBetter() is not suitable for std::sort, better ordering required
+ // std::sort(domain.begin(), domain.end(), [this](const TPDiskInfo* left, const TPDiskInfo* right) {
+ // return this->DiskIsBetter(*left, *right);
+ // });
+
+ for (ui32 i = 0; i < domain.size(); ++i) {
+ if (DiskIsBetter(*domain[0], *domain[i])) {
+ std::swap(domain[0], domain[i]);
+ }
+ }
+ jt->second = domain;
}
}
}
- unsigned GetLocalityFactor(const TPDiskInfo& pdisk) const {
- unsigned res = 0;
- for (ui32 groupId : pdisk.Groups) {
- res += GetLocalityFactor(groupId);
+ // if optional is empty, then all disks in group are placed correctly
+ std::pair<TMisplacedVDisks::EFailLevel, std::vector<ui32>> FindMisplacedVDisks(const TGroup& group) {
+ using EFailLevel = TMisplacedVDisks::EFailLevel;
+ std::unordered_map<ui32, std::unordered_set<ui32>> usedPDomains; // pRealm -> { pDomain1, pDomain2, ... }
+ std::set<TPDiskId> usedPDisks;
+ // {pRealm, pDomain} -> { pdisk1, pdisk2, ... }
+
+ EFailLevel failLevel = EFailLevel::ALL_OK;
+ std::vector<ui32> misplacedVDisks;
+ std::unordered_map<ui32, std::unordered_map<ui32, ui32>> domainInterlace;
+ std::map<TPDiskId, ui32> diskInterlace;
+
+ for (ui32 orderNum = 0; orderNum < group.size(); ++orderNum) {
+ if (group[orderNum]) {
+ ui32 pRealm = group[orderNum]->Position.Realm.Index();
+ ui32 pDomain = group[orderNum]->Position.Domain.Index();
+ TPDiskId pdisk = group[orderNum]->PDiskId;
+ domainInterlace[pRealm][pDomain]++;
+ diskInterlace[pdisk]++;
+ }
}
- return res;
+
+ for (ui32 orderNum = 0; orderNum < group.size(); ++orderNum) {
+ if (group[orderNum]) {
+ const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNum);
+ ui32 pRealm = group[orderNum]->Position.Realm.Index();
+ ui32 pDomain = group[orderNum]->Position.Domain.Index();
+ TPDiskId pdisk = group[orderNum]->PDiskId;
+ ui32 desiredPRealm = RealmNavigator[vdisk.FailRealm];
+ if (desiredPRealm != pRealm && (ui32)failLevel <= (ui32)EFailLevel::REALM_FAIL) {
+ if ((ui32)failLevel < (ui32)EFailLevel::REALM_FAIL) {
+ misplacedVDisks.clear();
+ }
+ failLevel = EFailLevel::REALM_FAIL;
+ misplacedVDisks.push_back(orderNum);
+ } else if (domainInterlace[pRealm][pDomain] > 1 && (ui32)failLevel <= (ui32)EFailLevel::DOMAIN_FAIL) {
+ if ((ui32)failLevel < (ui32)EFailLevel::DOMAIN_FAIL) {
+ misplacedVDisks.clear();
+ }
+ failLevel = EFailLevel::DOMAIN_FAIL;
+ misplacedVDisks.push_back(orderNum);
+ } else if (diskInterlace[pdisk] > 1 && (ui32)failLevel <= (ui32)EFailLevel::DISK_FAIL) {
+ failLevel = EFailLevel::DISK_FAIL;
+ misplacedVDisks.push_back(orderNum);
+ }
+ } else {
+ if (failLevel == EFailLevel::EMPTY_SLOT) {
+ misplacedVDisks.clear();
+ failLevel = EFailLevel::INCORRECT_LAYOUT;
+ } else if ((ui32)failLevel < (ui32)EFailLevel::EMPTY_SLOT) {
+ misplacedVDisks = {orderNum};
+ failLevel = EFailLevel::EMPTY_SLOT;
+ }
+ }
+ }
+
+ return {failLevel, misplacedVDisks};
}
- unsigned GetLocalityFactor(ui32 groupId) const {
- const auto it = LocalityFactor.find(groupId);
- return it != LocalityFactor.end() ? it->second : 0;
+ std::optional<TPDiskId> TargetMisplacedVDisk(ui32 maxScore, const TGroup& group, const TVDiskIdShort& vdisk) {
+ for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) {
+ if (!group[orderNumber] && orderNumber != Topology.GetOrderNumber(vdisk)) {
+ return std::nullopt;
+ }
+ }
+
+ UpdateGroup(group);
+ SetupCandidates(maxScore);
+
+ ui32 failRealm = vdisk.FailRealm;
+ ui32 pRealm = RealmNavigator[failRealm];
+
+ const auto& domainCandidates = DomainCandidates[pRealm];
+ TPDomainCandidatesRange pDomainRange = { domainCandidates.begin(), domainCandidates.end() };
+
+ for (; pDomainRange.first != pDomainRange.second;) {
+ ui32 pDomain = *pDomainRange.first++;
+ const auto& diskCandidates = DiskCandidates[pRealm][pDomain];
+
+ if (!diskCandidates.empty()) {
+ return (*diskCandidates.begin())->PDiskId;
+ }
+ }
+
+ return std::nullopt;
}
};
@@ -564,7 +845,7 @@ namespace NKikimr::NBsController {
it->second.SpaceAvailable += increment;
}
- TString FormatPDisks(const TAllocator& allocator) const {
+ TString FormatPDisks(const TDiskManager& diskManager) const {
TStringStream s;
s << "PDisks# ";
@@ -584,11 +865,11 @@ namespace NKikimr::NBsController {
s << std::exchange(space, " ") << pdisk->PDiskId;
- if (allocator.OldGroupContent.contains(pdisk->PDiskId)) {
+ if (diskManager.OldGroupContent.contains(pdisk->PDiskId)) {
s << "*";
}
const char *minus = "-";
- if (allocator.ForbiddenDisks.contains(pdisk->PDiskId)) {
+ if (diskManager.ForbiddenDisks.contains(pdisk->PDiskId)) {
s << std::exchange(minus, "") << "f";
}
if (!pdisk->Usable) {
@@ -600,13 +881,13 @@ namespace NKikimr::NBsController {
if (pdisk->NumSlots >= pdisk->MaxSlots) {
s << std::exchange(minus, "") << "s[" << pdisk->NumSlots << "/" << pdisk->MaxSlots << "]";
}
- if (pdisk->SpaceAvailable < allocator.RequiredSpace) {
+ if (pdisk->SpaceAvailable < diskManager.RequiredSpace) {
s << std::exchange(minus, "") << "v";
}
if (!pdisk->Operational) {
s << std::exchange(minus, "") << "o";
}
- if (allocator.DiskIsUsable(*pdisk)) {
+ if (diskManager.DiskIsUsable(*pdisk)) {
s << "+";
}
@@ -701,6 +982,108 @@ namespace NKikimr::NBsController {
return false;
}
}
+
+ TMisplacedVDisks FindMisplacedVDisks(const TGroupDefinition& groupDefinition) {
+ using EFailLevel = TMisplacedVDisks::EFailLevel;
+ // create group of required size, if it is not created yet
+ if (!Geom.CheckGroupSize(groupDefinition)) {
+ return TMisplacedVDisks(EFailLevel::INCORRECT_LAYOUT, {}, "Incorrect group");
+ }
+
+ TSanitizer sanitizer(*this, Geom, 0, false, {}, {});
+ TString error;
+ TGroup group = sanitizer.ProcessExistingGroup(groupDefinition, error);
+ if (group.empty()) {
+ return TMisplacedVDisks(EFailLevel::INCORRECT_LAYOUT, {}, error);
+ }
+ if (!sanitizer.SetupNavigation(group)) {
+ return TMisplacedVDisks(EFailLevel::INCORRECT_LAYOUT, {}, "Cannot map failRealms to pRealms");
+ }
+
+ sanitizer.SetupCandidates(::Max<ui32>());
+ auto [failLevel, misplacedVDiskNums] = sanitizer.FindMisplacedVDisks(group);
+ std::vector<TVDiskIdShort> misplacedVDisks;
+ for (ui32 orderNum : misplacedVDiskNums) {
+ misplacedVDisks.push_back(sanitizer.Topology.GetVDiskId(orderNum));
+ }
+ return TMisplacedVDisks(failLevel, misplacedVDisks);
+ }
+
+ std::optional<TPDiskId> TargetMisplacedVDisk(ui32 groupId, TGroupDefinition& groupDefinition, TVDiskIdShort vdisk,
+ TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) {
+ if (Dirty) {
+ std::sort(PDiskByPosition.begin(), PDiskByPosition.end());
+ Dirty = false;
+ }
+
+ // create group of required size, if it is not created yet
+ if (!Geom.CheckGroupSize(groupDefinition)) {
+ error = "Incorrect group";
+ return std::nullopt;
+ }
+
+ TSanitizer sanitizer(*this, Geom, requiredSpace, requireOperational, std::move(forbid), {});
+ TGroup group = sanitizer.ProcessExistingGroup(groupDefinition, error);
+ if (group.empty()) {
+ error = "Empty group";
+ return std::nullopt;
+ }
+ if (!sanitizer.SetupNavigation(group)) {
+ error = "Cannot map failRealms to pRealms";
+ return std::nullopt;
+ }
+
+ // calculate score table
+ std::vector<ui32> scores;
+ for (const auto& [pdiskId, pdisk] : PDisks) {
+ if (sanitizer.DiskIsUsable(pdisk)) {
+ scores.push_back(pdisk.GetPickerScore());
+ }
+ }
+ std::sort(scores.begin(), scores.end());
+ scores.erase(std::unique(scores.begin(), scores.end()), scores.end());
+
+ // bisect scores to find optimal working one
+ sanitizer.SetupCandidates(::Max<ui32>());
+
+ std::optional<TPDiskId> result;
+
+ ui32 begin = 0, end = scores.size();
+ while (begin < end) {
+ const ui32 mid = begin + (end - begin) / 2;
+ std::optional<TPDiskId> target;
+ if ((target = sanitizer.TargetMisplacedVDisk(scores[mid], group, vdisk))) {
+ result = target;
+ end = mid;
+ } else {
+ begin = mid + 1;
+ }
+ }
+
+ if (result) {
+ ui32 orderNum = sanitizer.Topology.GetOrderNumber(vdisk);
+ if (group[orderNum]) {
+ TPDiskId pdiskId = group[orderNum]->PDiskId;
+ const auto it = PDisks.find(pdiskId);
+ Y_VERIFY(it != PDisks.end());
+ TPDiskInfo& pdisk = it->second;
+ --pdisk.NumSlots;
+ pdisk.EraseGroup(groupId);
+ }
+ {
+ const auto it = PDisks.find(*result);
+ Y_VERIFY(it != PDisks.end());
+ TPDiskInfo& pdisk = it->second;
+ ++pdisk.NumSlots;
+ pdisk.InsertGroup(groupId);
+ groupDefinition[vdisk.FailRealm][vdisk.FailDomain][vdisk.VDisk] = *result;
+ }
+ return result;
+ }
+
+ error = "Cannot replace vdisk";
+ return std::nullopt;
+ }
};
TGroupMapper::TGroupMapper(TGroupGeometryInfo geom, bool randomize)
@@ -726,4 +1109,12 @@ namespace NKikimr::NBsController {
return Impl->AllocateGroup(groupId, group, replacedDisks, std::move(forbid), requiredSpace, requireOperational, error);
}
+ TGroupMapper::TMisplacedVDisks TGroupMapper::FindMisplacedVDisks(const TGroupDefinition& group) {
+ return Impl->FindMisplacedVDisks(group);
+ }
+
+ std::optional<TPDiskId> TGroupMapper::TargetMisplacedVDisk(ui32 groupId, TGroupMapper::TGroupDefinition& group,
+ TVDiskIdShort vdisk, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) {
+ return Impl->TargetMisplacedVDisk(groupId, group, vdisk, std::move(forbid), requiredSpace, requireOperational, error);
+ }
} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/group_mapper.h b/ydb/core/mind/bscontroller/group_mapper.h
index 991a636bf38..99ffcd433bb 100644
--- a/ydb/core/mind/bscontroller/group_mapper.h
+++ b/ydb/core/mind/bscontroller/group_mapper.h
@@ -78,6 +78,36 @@ namespace NKikimr {
// prefix+infix part gives us distinct fail realms we can use while generating groups.
bool AllocateGroup(ui32 groupId, TGroupDefinition& group, const THashMap<TVDiskIdShort, TPDiskId>& replacedDisks,
TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error);
+
+ struct TMisplacedVDisks {
+ enum EFailLevel : ui32 {
+ ALL_OK,
+ DISK_FAIL,
+ DOMAIN_FAIL,
+ REALM_FAIL,
+ EMPTY_SLOT,
+ INCORRECT_LAYOUT,
+ };
+
+ TMisplacedVDisks(EFailLevel failLevel, std::vector<TVDiskIdShort> disks, TString errorReason = "")
+ : FailLevel(failLevel)
+ , Disks(std::move(disks))
+ , ErrorReason(errorReason)
+ {}
+
+ EFailLevel FailLevel;
+ std::vector<TVDiskIdShort> Disks;
+ TString ErrorReason;
+
+ operator bool() const {
+ return FailLevel != EFailLevel::INCORRECT_LAYOUT;
+ }
+ };
+
+ TMisplacedVDisks FindMisplacedVDisks(const TGroupDefinition& group);
+
+ std::optional<TPDiskId> TargetMisplacedVDisk(ui32 groupId, TGroupDefinition& group, TVDiskIdShort vdisk,
+ TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error);
};
} // NBsController
diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
index 1be38782962..513283d8a2b 100644
--- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp
+++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp
@@ -248,6 +248,83 @@ public:
return group.Group;
}
+ enum class ESanitizeResult {
+ SUCCESS,
+ FAIL,
+ ALREADY,
+ };
+
+ using TSanitizeGroupResult = std::pair<ESanitizeResult, TGroupMapper::TGroupDefinition>;
+ TSanitizeGroupResult SanitizeGroup(TGroupMapper& mapper, ui32 groupId, const TSet<TPDiskId>& unusableDisks,
+ bool makeThemForbidden = false, bool requireOperational = false, bool allowError = false,
+ std::pair<TVDiskIdShort, TPDiskId>* movedDisk = nullptr) {
+ TGroupRecord& group = Groups.at(groupId);
+
+ TGroupMapper::TForbiddenPDisks forbid(unusableDisks.begin(), unusableDisks.end());
+ if (!makeThemForbidden) {
+ forbid.clear();
+ }
+
+ Ctest << "groupId# " << groupId << " sanitizing group# " << FormatGroup(group.Group) << Endl;
+ for (ui32 i = 0; i < group.Group.size(); ++i) {
+ for (ui32 j = 0; j < group.Group[i].size(); ++j) {
+ for (ui32 k = 0; k < group.Group[i][j].size(); ++k) {
+ auto& pdisk = group.Group[i][j][k];
+ --PDisks.at(pdisk).NumSlots;
+ }
+ }
+ }
+
+ TGroupMapper::TMisplacedVDisks result = mapper.FindMisplacedVDisks(group.Group);
+ if (result) {
+ Ctest << "error# " << result.ErrorReason << Endl;
+ if (allowError) {
+ for (auto& realm : group.Group) {
+ for (auto& domain : realm) {
+ for (auto& pdisk : domain) {
+ ++PDisks.at(pdisk).NumSlots;
+ }
+ }
+ }
+ return {ESanitizeResult::FAIL, {}};
+ }
+ }
+
+ ESanitizeResult status = ESanitizeResult::ALREADY;
+ TString error;
+
+ if (!result.Disks.empty()) {
+ status = ESanitizeResult::FAIL;
+ for (auto vdisk : result.Disks) {
+ auto target = mapper.TargetMisplacedVDisk(groupId, group.Group, vdisk,
+ std::move(forbid), 0, requireOperational, error);
+ if (target) {
+ status = ESanitizeResult::SUCCESS;
+ if (movedDisk) {
+ *movedDisk = {vdisk, *target};
+ }
+ break;
+ }
+ }
+ }
+
+ if (status == ESanitizeResult::FAIL) {
+ Ctest << "Sanitation failed! Last error reason: " << error << Endl;
+ }
+
+ group.PDisks.clear();
+ for (const auto& realm : group.Group) {
+ for (const auto& domain : realm) {
+ for (const auto& pdisk : domain) {
+ group.PDisks.push_back(pdisk);
+ ++PDisks.at(pdisk).NumSlots;
+ }
+ }
+ }
+
+ return {status, group.Group};
+ }
+
void SetGroup(ui32 groupId, const TGroupMapper::TGroupDefinition& group) {
auto& g = Groups[groupId];
for (const TPDiskId& pdiskId : g.PDisks) {
@@ -449,6 +526,69 @@ public:
}
}
}
+
+ bool CheckGroupPlacement(const TGroupMapper::TGroupDefinition& group, TGroupGeometryInfo geom, TString& error) {
+ NLayoutChecker::TDomainMapper domainMapper;
+ if (group.size() != geom.GetNumFailRealms()) {
+ error = "Wrong fail realms number";
+ return false;
+ }
+
+ for (ui32 failRealm = 0; failRealm < geom.GetNumFailRealms(); ++failRealm) {
+ if (group[failRealm].size() != geom.GetNumFailDomainsPerFailRealm()) {
+ error = TStringBuilder() << "Wrong fail domains number in failRealm# " << failRealm;
+ return false;
+ }
+ for (ui32 failDomain = 0; failDomain < geom.GetNumFailDomainsPerFailRealm(); ++failDomain) {
+ if (group[failRealm][failDomain].size() != geom.GetNumVDisksPerFailDomain()) {
+ error = TStringBuilder() << "Wrong vdisks number in failRealm# " << failRealm << ", failDomain# " << failDomain;
+ return false;
+ }
+ }
+ }
+
+ std::unordered_set<ui32> usedPRealms;
+ for (ui32 failRealm = 0; failRealm < geom.GetNumFailRealms(); ++failRealm) {
+ const NLayoutChecker::TPDiskLayoutPosition pdisk0(domainMapper, PDisks.at(group[failRealm][0][0]).GetLocation(), group[failRealm][0][0], geom);
+ ui32 pRealm = pdisk0.Realm.Index();
+ if (usedPRealms.count(pRealm)) {
+ error = "same pRealm in different fail realms detected";
+ return false;
+ }
+ usedPRealms.insert(pRealm);
+ std::unordered_set<ui32> usedPDomains;
+ for (ui32 failDomain = 0; failDomain < geom.GetNumFailDomainsPerFailRealm(); ++failDomain) {
+ const NLayoutChecker::TPDiskLayoutPosition pdisk1(domainMapper, PDisks.at(group[failRealm][failDomain][0]).GetLocation(),
+ group[failRealm][failDomain][0], geom);
+ ui32 pDomain = pdisk1.Domain.Index();
+ if (usedPDomains.count(pDomain)) {
+ error = "same pDomain in different fail domains detected";
+ return false;
+ }
+ usedPDomains.insert(pDomain);
+ std::set<TPDiskId> usedPDisks;
+ for (ui32 vdisk = 0; vdisk < geom.GetNumVDisksPerFailDomain(); ++vdisk) {
+ auto pdiskId = group[failRealm][failDomain][vdisk];
+ auto pdisk = NLayoutChecker::TPDiskLayoutPosition(domainMapper, PDisks.at(pdiskId).GetLocation(), pdiskId, geom);
+ if (pdisk.Realm.Index() != pRealm) {
+ error = TStringBuilder() << "different pRealms within one failRealm, vdisk# " << failRealm << ":" << failDomain <<
+ ":" << vdisk << ", expected pRealm " << pRealm << ", got " << pdisk.Realm.Index();
+ return false;
+ }
+ if (pdisk.Domain.Index() != pDomain) {
+ error = "different pDomains within one failDomain";
+ return false;
+ }
+ if (usedPDisks.count(pdiskId)) {
+ error = "same PDisk in different VDisks";
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+ }
};
Y_UNIT_TEST_SUITE(TGroupMapperTest) {
@@ -894,33 +1034,18 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
Y_UNIT_TEST(SanitizeGroupTest3dc) {
const ui32 numDataCenters = 3;
const ui32 numRacks = 5;
- TTestContext context(numDataCenters, 1, numRacks, 1, 1);
- TGroupMapper::TGroupDefinition group;
+ const ui32 numDisks = 3;
+ TTestContext context(numDataCenters, 1, numRacks, 1, numDisks);
+ TGroupMapper::TGroupDefinition groupDef;
ui32 groupId;
{
TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
context.PopulateGroupMapper(mapper, 1);
- groupId = context.AllocateGroup(mapper, group);
+ groupId = context.AllocateGroup(mapper, groupDef);
Ctest << "group after allocation:" << Endl;
- context.DumpGroup(group);
+ context.DumpGroup(groupDef);
}
- auto checkLayout = [&](const auto& group) {
- TGroupGeometryInfo geom = TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc);
- THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout;
- for (ui32 i = 0; i < group.size(); ++i) {
- for (ui32 j = 0; j < group[i].size(); ++j) {
- for (ui32 k = 0; k < group[i][j].size(); ++k) {
- layout.emplace(TVDiskIdShort(i, j, k), std::make_pair(context.GetLocation(group[i][j][k]),
- group[i][j][k]));
- }
- }
- }
- return CheckGroupLayout(geom, layout);
- };
-
- UNIT_ASSERT(checkLayout(group));
-
for (ui32 n = 0; n < 1000; ++n) {
Ctest << Endl << "iteration# " << n << Endl;
@@ -929,51 +1054,61 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) {
context.ImportLayout(layout);
Ctest << "group after layout shuffling:" << Endl;
- context.DumpGroup(group);
-
- struct TQueueItem {
- TGroupMapper::TGroupDefinition Group;
- TString Path;
- TSet<TGroupMapper::TGroupDefinition> Seen;
- TSet<TVDiskIdShort> VDiskItems;
- TSet<TPDiskId> PDiskItems;
- };
- std::deque<TQueueItem> queue;
- for (queue.push_back({.Group = group}); !queue.empty(); ) {
- TQueueItem item = std::move(queue.front());
- queue.pop_front();
- const auto [it, inserted] = item.Seen.insert(item.Group);
+ context.DumpGroup(groupDef);
+
+ ui32 sanitationStep = 0;
+
+ TGroupMapper::TGroupDefinition group = groupDef;
+ TString path = "";
+ TSet<TGroupMapper::TGroupDefinition> seen;
+ TSet<TVDiskIdShort> vdiskItems;
+ TSet<TPDiskId> pdiskItems;
+
+ while (true) {
+ const auto [it, inserted] = seen.insert(group);
UNIT_ASSERT(inserted);
- UNIT_ASSERT(item.Seen.size() <= 9);
- Ctest << "processing path# " << item.Path << Endl;
-
- auto candidates = checkLayout(item.Group);
- if (!candidates) {
- for (const TVDiskIdShort& vdiskId : candidates.Candidates) {
- TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
- context.SetGroup(groupId, item.Group);
- context.PopulateGroupMapper(mapper, 2);
- const TPDiskId& pdiskId = item.Group[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk];
- auto temp = context.ReallocateGroup(mapper, groupId, {pdiskId}, false, false, false);
- TString path = TStringBuilder() << item.Path << "/" << (int)vdiskId.FailRealm << ":"
- << (int)vdiskId.FailDomain << ":" << (int)vdiskId.VDisk << "@" << pdiskId;
- Ctest << "path# " << path << Endl;
- context.DumpGroup(temp);
-
- auto vdiskItems = item.VDiskItems;
-// const auto [it1, inserted1] = vdiskItems.insert(vdiskId);
-// UNIT_ASSERT_C(inserted1, "Duplicate group cell# " << vdiskId);
-
- auto pdiskItems = item.PDiskItems;
-// const auto [it2, inserted2] = pdiskItems.insert(pdiskId);
-// UNIT_ASSERT_C(inserted2, "Duplicate origin PDisk# " << pdiskId);
-
- queue.push_front({.Group = std::move(temp), .Path = std::move(path), .Seen = item.Seen,
- .VDiskItems = std::move(vdiskItems), .PDiskItems = std::move(pdiskItems)});
- }
+ UNIT_ASSERT(seen.size() <= 9);
+ Ctest << "processing path# " << path << Endl;
+
+ TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc));
+
+ context.SetGroup(groupId, group);
+ context.PopulateGroupMapper(mapper, 2);
+
+ std::pair<TVDiskIdShort, TPDiskId> movedDisk;
+ auto [res, tempGroup] = context.SanitizeGroup(mapper, groupId, {}, false, false, false, &movedDisk);
+ Ctest << "Sanitation step# " << sanitationStep++ << ", sanitizer ";
+ switch (res) {
+ case TTestContext::ESanitizeResult::FAIL:
+ Ctest << "FAIL" << Endl;
+ UNIT_FAIL("Sanitizing failed");
+ break;
+ case TTestContext::ESanitizeResult::ALREADY:
+ Ctest << "ALREADY" << Endl;
+ break;
+ case TTestContext::ESanitizeResult::SUCCESS:
+ Ctest << "SUCCESS" << Endl;
+ break;
+ }
+
+ path = TStringBuilder() << path << "/" << (int)movedDisk.first.FailRealm << ":"
+ << (int)movedDisk.first.FailDomain << ":" << (int)movedDisk.first.VDisk << "@" << movedDisk.second;
+ Ctest << "path# " << path << Endl;
+ context.DumpGroup(tempGroup);
+ if (res == TTestContext::ESanitizeResult::ALREADY) {
+ TString error;
+ UNIT_ASSERT_C(context.CheckGroupPlacement(group, TTestContext::CreateGroupGeometry(TBlobStorageGroupType::ErasureMirror3dc), error), error);
+ break;
}
Ctest << Endl;
+ group = tempGroup;
+
+ const auto [it1, inserted1] = vdiskItems.insert(movedDisk.first);
+ UNIT_ASSERT_C(inserted1, "Duplicate group cell# " << movedDisk.first);
+
+ const auto [it2, inserted2] = pdiskItems.insert(movedDisk.second);
+ UNIT_ASSERT_C(inserted2, "Duplicate origin PDisk# " << movedDisk.second);
}
}
}
diff --git a/ydb/core/mind/bscontroller/monitoring.cpp b/ydb/core/mind/bscontroller/monitoring.cpp
index 8864a53dc4d..60645b190a5 100644
--- a/ydb/core/mind/bscontroller/monitoring.cpp
+++ b/ydb/core/mind/bscontroller/monitoring.cpp
@@ -445,6 +445,21 @@ public:
break;
}
+ case NKikimrBlobStorage::TConfigRequest::TCommand::kSanitizeGroup: {
+ const auto& cmd = q.GetSanitizeGroup();
+
+ const ui32 groupId = cmd.GetGroupId();
+ TStringStream msg;
+ msg << "Group sanitizing request"
+ << "<br/>GroupId# " << groupId;
+
+ Events.emplace_front(timestamp, msg.Str(), std::move(reassign));
+ auto& j = Events.front().Json;
+ j["Event"] = "SanitizeGroup";
+ j["GroupId"] = ToString(groupId);
+ break;
+ }
+
case NKikimrBlobStorage::TConfigRequest::TCommand::kUpdateDriveStatus: {
const auto& cmd = q.GetUpdateDriveStatus();
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index 4e7178363fe..13c67af6eb0 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -26,7 +26,7 @@ namespace NKikimr::NBsController {
TActorId SelfHealId; // filled on bootstrap
const TGroupId GroupId;
const TEvControllerUpdateSelfHealInfo::TGroupContent Group;
- const TVDiskID VDiskToReplace;
+ const std::optional<TVDiskID> VDiskToReplace;
TBlobStorageGroupInfo::TTopology Topology;
THolder<TBlobStorageGroupInfo::TGroupVDisks> FailedGroupDisks;
THashSet<TVDiskID> PendingVDisks;
@@ -35,7 +35,7 @@ namespace NKikimr::NBsController {
public:
TReassignerActor(TActorId controllerId, TGroupId groupId, TEvControllerUpdateSelfHealInfo::TGroupContent group,
- TVDiskID vdiskToReplace)
+ std::optional<TVDiskID> vdiskToReplace)
: ControllerId(controllerId)
, GroupId(groupId)
, Group(std::move(group))
@@ -78,7 +78,7 @@ namespace NKikimr::NBsController {
FailedGroupDisks = MakeHolder<TBlobStorageGroupInfo::TGroupVDisks>(&Topology);
for (const auto& [vdiskId, vdisk] : Group.VDisks) {
- if (vdiskId == VDiskToReplace) {
+ if (VDiskToReplace && vdiskId == *VDiskToReplace) {
*FailedGroupDisks |= {&Topology, vdiskId};
continue; // skip disk we are going to replcate -- it will be wiped out anyway
}
@@ -158,12 +158,17 @@ namespace NKikimr::NBsController {
auto *request = record.MutableRequest();
request->SetIgnoreGroupReserve(true);
request->SetSettleOnlyOnOperationalDisks(true);
- auto *cmd = request->AddCommand()->MutableReassignGroupDisk();
- cmd->SetGroupId(VDiskToReplace.GroupID);
- cmd->SetGroupGeneration(VDiskToReplace.GroupGeneration);
- cmd->SetFailRealmIdx(VDiskToReplace.FailRealm);
- cmd->SetFailDomainIdx(VDiskToReplace.FailDomain);
- cmd->SetVDiskIdx(VDiskToReplace.VDisk);
+ if (VDiskToReplace) {
+ auto *cmd = request->AddCommand()->MutableReassignGroupDisk();
+ cmd->SetGroupId(VDiskToReplace->GroupID);
+ cmd->SetGroupGeneration(VDiskToReplace->GroupGeneration);
+ cmd->SetFailRealmIdx(VDiskToReplace->FailRealm);
+ cmd->SetFailDomainIdx(VDiskToReplace->FailDomain);
+ cmd->SetVDiskIdx(VDiskToReplace->VDisk);
+ } else {
+ auto *cmd = request->AddCommand()->MutableSanitizeGroup();
+ cmd->SetGroupId(GroupId);
+ }
Send(ControllerId, ev.Release());
}
@@ -341,12 +346,8 @@ namespace NKikimr::NBsController {
Y_VERIFY(!group.LayoutValid);
if (group.ReassignerActorId || now < group.NextRetryTimestamp) {
// nothing to do
- } else if (const auto v = FindVDiskToReplaceByLayout(group, now)) {
- group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, *v));
- } else if (group.LayoutValid) {
- GroupsWithInvalidLayout.Remove(&group);
} else {
- ++counter;
+ group.ReassignerActorId = Register(new TReassignerActor(ControllerId, group.GroupId, group.Content, std::nullopt));
}
}
}
@@ -401,40 +402,20 @@ namespace NKikimr::NBsController {
CheckGroups();
}
}
-
- std::optional<TVDiskID> FindVDiskToReplaceByLayout(TGroupRecord& group, TInstant now) {
- THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>> layout;
- for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
- Y_VERIFY(HostRecords);
+
+ using TVDiskInfo = TEvControllerUpdateSelfHealInfo::TGroupContent::TVDiskInfo;
+ TGroupMapper::TGroupDefinition MakeGroupDefinition(const TMap<TVDiskID, TVDiskInfo>& vdisks,
+ const TGroupGeometryInfo& geom) {
+ TGroupMapper::TGroupDefinition groupDefinition;
+ geom.ResizeGroup(groupDefinition);
+
+ for (const auto& [vdiskId, vdisk] : vdisks) {
if (!vdisk.Decommitted) {
- layout.emplace(vdiskId, std::make_pair(HostRecords->GetLocation(vdisk.Location.NodeId),
- vdisk.Location.ComprisingPDiskId()));
+ groupDefinition[vdiskId.FailRealm][vdiskId.FailDomain][vdiskId.VDisk] = vdisk.Location.ComprisingPDiskId();
}
}
- const TLayoutCheckResult checkResult = CheckGroupLayout(*group.Content.Geometry, layout);
- if (checkResult) { // group is valid
- group.LayoutValid = true;
- return std::nullopt;
- }
- THashSet<TVDiskIdShort> badDisks;
- for (const auto& [vdiskId, vdisk] : group.Content.VDisks) {
- const auto it = group.VDiskStatus.find(vdiskId);
- if (it == group.VDiskStatus.end() || it->second.GetStatus(now) != NKikimrBlobStorage::EVDiskStatus::READY || vdisk.Bad) {
- badDisks.insert(vdiskId);
- }
- }
- if (badDisks.empty()) {
- return TVDiskID(group.GroupId, group.Content.Generation, checkResult.Candidates.front());
- } else if (badDisks.size() == 1) {
- for (const auto& vdiskId : checkResult.Candidates) {
- if (badDisks.contains(vdiskId)) {
- return TVDiskID(group.GroupId, group.Content.Generation, vdiskId);
- }
- }
- }
-
- return std::nullopt;
+ return std::move(groupDefinition);
}
void HandleWakeup() {
diff --git a/ydb/core/mind/bscontroller/ut_selfheal/env.h b/ydb/core/mind/bscontroller/ut_selfheal/env.h
index 90643f56b2b..197ab4b4f76 100644
--- a/ydb/core/mind/bscontroller/ut_selfheal/env.h
+++ b/ydb/core/mind/bscontroller/ut_selfheal/env.h
@@ -214,4 +214,4 @@ struct TEnvironmentSetup {
WaitForEdgeActorEvent<TEvents::TEvWakeup>(edge);
}
-}; \ No newline at end of file
+};
diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto
index 20f8315654b..f901fb24269 100644
--- a/ydb/core/protos/blobstorage_config.proto
+++ b/ydb/core/protos/blobstorage_config.proto
@@ -271,6 +271,10 @@ message TReassignGroupDisk {
bool SuppressDonorMode = 7; // when set, donor mode is not used even if it is enabled through BSC
}
+message TSanitizeGroup {
+ uint32 GroupId = 1;
+}
+
enum EClusterFitAlgorithm {
QUADRATIC = 0;
HUNGARIAN = 1;
@@ -515,6 +519,7 @@ message TConfigRequest {
// commands intended for internal use
TReassignGroupDisk ReassignGroupDisk = 19;
+ TSanitizeGroup SanitizeGroup = 42;
}
}