author | robdrynkin <robdrynkin@yandex-team.com> | 2023-11-08 16:20:19 +0300
committer | robdrynkin <robdrynkin@yandex-team.com> | 2023-11-08 16:55:30 +0300
commit | 1cfb60d093d33df4c50825aa4fc71415a9d80db5 (patch)
tree | 37cc73fc52fc5529c971be177003eb08369067d6
parent | 212ec2d7ce6232b6f2a345d76a53f7d9dd75e9ed (diff)
download | ydb-1cfb60d093d33df4c50825aa4fc71415a9d80db5.tar.gz
KIKIMR-19590: Special option to run self heal locally
-rw-r--r-- | ydb/core/mind/bscontroller/cmds_storage_pool.cpp | 2
-rw-r--r-- | ydb/core/mind/bscontroller/config.h | 1
-rw-r--r-- | ydb/core/mind/bscontroller/config_cmd.cpp | 5
-rw-r--r-- | ydb/core/mind/bscontroller/config_fit_groups.cpp | 2
-rw-r--r-- | ydb/core/mind/bscontroller/impl.h | 1
-rw-r--r-- | ydb/core/mind/bscontroller/load_everything.cpp | 1
-rw-r--r-- | ydb/core/mind/bscontroller/scheme.h | 4
-rw-r--r-- | ydb/core/mind/bscontroller/self_heal.cpp | 2
-rw-r--r-- | ydb/core/mind/bscontroller/ut_selfheal/main.cpp | 236
-rw-r--r-- | ydb/core/protos/blobstorage_config.proto | 2
10 files changed, 185 insertions, 71 deletions
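The change is driven by a single Boolean setting, UseSelfHealLocalPolicy, persisted in the BSC State table and toggled through the usual TConfigRequest/TUpdateSettings path. As a rough illustration of how it is switched on (a sketch that mirrors the SetSelfHealLocalPolicyTimeout helper from the test added below; `env` and TEnvironmentSetup come from the ut_selfheal test harness, not a production API):

    // Sketch only, assuming a TEnvironmentSetup instance named `env` from the test harness.
    NKikimrBlobStorage::TConfigRequest request;
    auto *settings = request.AddCommand()->MutableUpdateSettings();
    settings->AddUseSelfHealLocalPolicy(true);   // new repeated bool, field 13 in TUpdateSettings
    settings->AddEnableDonorMode(true);          // the test enables donor mode alongside it
    auto response = env.Invoke(request);
    UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());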
diff --git a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
index 8056880c57..ad0f037e26 100644
--- a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
+++ b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
@@ -364,6 +364,8 @@ namespace NKikimr::NBsController {
                     throw TExError() << "TargetPDiskId# " << pdiskId.ToString() << " not found";
                 }
                 targetPDiskId = pdiskId;
+            } else if (cmd.HasTargetNodeId() && Self.UseSelfHealLocalPolicy) {
+                TargetNodeId = cmd.GetTargetNodeId();
             }
             ExplicitReconfigureMap.emplace(vslotId, targetPDiskId);
diff --git a/ydb/core/mind/bscontroller/config.h b/ydb/core/mind/bscontroller/config.h
index 6e92bc9858..b915c91491 100644
--- a/ydb/core/mind/bscontroller/config.h
+++ b/ydb/core/mind/bscontroller/config.h
@@ -81,6 +81,7 @@ namespace NKikimr {
             THashMap<TVSlotId, TPDiskId> ExplicitReconfigureMap;
             std::set<TVSlotId> SuppressDonorMode;
             std::unordered_set<ui32> SanitizingRequests;
+            std::optional<ui32> TargetNodeId;
 
             // just-created vslots, which are not yet committed to the storage
             TSet<TVSlotId> UncommittedVSlots;
diff --git a/ydb/core/mind/bscontroller/config_cmd.cpp b/ydb/core/mind/bscontroller/config_cmd.cpp
index a252371ddc..4abaffe222 100644
--- a/ydb/core/mind/bscontroller/config_cmd.cpp
+++ b/ydb/core/mind/bscontroller/config_cmd.cpp
@@ -142,6 +142,10 @@ namespace NKikimr::NBsController {
                     ev->AllowMultipleRealmsOccupation = Self->AllowMultipleRealmsOccupation;
                     Self->Send(Self->SelfHealId, ev.release());
                 }
+                for (bool value : settings.GetUseSelfHealLocalPolicy()) {
+                    Self->UseSelfHealLocalPolicy = value;
+                    db.Table<T>().Key(true).Update<T::UseSelfHealLocalPolicy>(Self->UseSelfHealLocalPolicy);
+                }
 
                 return true;
             }
@@ -307,6 +311,7 @@ namespace NKikimr::NBsController {
                 state.SanitizingRequests.clear();
                 state.ExplicitReconfigureMap.clear();
                 state.SuppressDonorMode.clear();
+                state.TargetNodeId.reset();
 
                 switch (cmd.GetCommandCase()) {
 #define HANDLE_COMMAND(NAME) \
                     case NKikimrBlobStorage::TConfigRequest::TCommand::k ## NAME: return state.ExecuteStep(cmd.Get ## NAME(), status);
diff --git a/ydb/core/mind/bscontroller/config_fit_groups.cpp b/ydb/core/mind/bscontroller/config_fit_groups.cpp
index 50b036f8bf..9ffb64c749 100644
--- a/ydb/core/mind/bscontroller/config_fit_groups.cpp
+++ b/ydb/core/mind/bscontroller/config_fit_groups.cpp
@@ -464,7 +464,7 @@ namespace NKikimr {
                 }
 
                 for (const auto& filter : StoragePool.PDiskFilters) {
-                    if (filter.MatchPDisk(info)) {
+                    if (filter.MatchPDisk(info) && (!State.TargetNodeId.has_value() || *State.TargetNodeId == id.NodeId)) {
                         const bool inserted = RegisterPDisk(id, info, true);
                         Y_ABORT_UNLESS(inserted);
                         break;
diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h
index 388588cb0b..2ecfcfa2d9 100644
--- a/ydb/core/mind/bscontroller/impl.h
+++ b/ydb/core/mind/bscontroller/impl.h
@@ -1496,6 +1496,7 @@ private:
     TActorId StatProcessorActorId;
     TInstant LastMetricsCommit;
     bool SelfHealEnable = false;
+    bool UseSelfHealLocalPolicy;
     bool DonorMode = false;
     TDuration ScrubPeriodicity;
     NKikimrBlobStorage::TStorageConfig StorageConfig;
diff --git a/ydb/core/mind/bscontroller/load_everything.cpp b/ydb/core/mind/bscontroller/load_everything.cpp
index 7125f5b951..6c99b5ac9d 100644
--- a/ydb/core/mind/bscontroller/load_everything.cpp
+++ b/ydb/core/mind/bscontroller/load_everything.cpp
@@ -89,6 +89,7 @@ public:
             Self->GroupLayoutSanitizerEnabled = state.GetValue<T::GroupLayoutSanitizer>();
             Self->AllowMultipleRealmsOccupation = state.GetValueOrDefault<T::AllowMultipleRealmsOccupation>();
             Self->SysViewChangedSettings = true;
+            Self->UseSelfHealLocalPolicy = state.GetValue<T::UseSelfHealLocalPolicy>();
         }
     }
diff --git a/ydb/core/mind/bscontroller/scheme.h b/ydb/core/mind/bscontroller/scheme.h
index a2a00fd9f9..57eca4c019 100644
--- a/ydb/core/mind/bscontroller/scheme.h
+++ b/ydb/core/mind/bscontroller/scheme.h
@@ -105,12 +105,14 @@ struct Schema : NIceDb::Schema {
         struct NextVirtualGroupId : Column<19, Group::ID::ColumnType> { static constexpr Type Default = 0; };
         struct AllowMultipleRealmsOccupation : Column<20, NScheme::NTypeIds::Bool> { static constexpr Type Default = true; };
         struct CompatibilityInfo : Column<21, NScheme::NTypeIds::String> {};
+        struct UseSelfHealLocalPolicy : Column<22, NScheme::NTypeIds::Bool> { static constexpr Type Default = false; };
 
         using TKey = TableKey<FixedKey>;
         using TColumns = TableColumns<FixedKey, NextGroupID, SchemaVersion, NextOperationLogIndex, DefaultMaxSlots,
             InstanceId, SelfHealEnable, DonorModeEnable, ScrubPeriodicity, SerialManagementStage, NextStoragePoolId,
             PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder,
-            GroupLayoutSanitizer, NextVirtualGroupId, AllowMultipleRealmsOccupation, CompatibilityInfo>;
+            GroupLayoutSanitizer, NextVirtualGroupId, AllowMultipleRealmsOccupation, CompatibilityInfo,
+            UseSelfHealLocalPolicy>;
     };
 
     struct VSlot : Table<5> {
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index 6cb8643a6b..c234911121 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -152,6 +152,7 @@ namespace NKikimr::NBsController {
             request->SetIgnoreGroupReserve(true);
             request->SetSettleOnlyOnOperationalDisks(true);
             request->SetIsSelfHealReasonDecommit(IsSelfHealReasonDecommit);
+            request->SetAllowUnusableDisks(true);
             if (VDiskToReplace) {
                 ev->SelfHeal = true;
                 auto *cmd = request->AddCommand()->MutableReassignGroupDisk();
@@ -160,6 +161,7 @@
                 cmd->SetFailRealmIdx(VDiskToReplace->FailRealm);
                 cmd->SetFailDomainIdx(VDiskToReplace->FailDomain);
                 cmd->SetVDiskIdx(VDiskToReplace->VDisk);
+                cmd->SetTargetNodeId(Group.VDisks.at(*VDiskToReplace).Location.NodeId);
             } else {
                 ev->GroupLayoutSanitizer = true;
                 auto *cmd = request->AddCommand()->MutableSanitizeGroup();
diff --git a/ydb/core/mind/bscontroller/ut_selfheal/main.cpp b/ydb/core/mind/bscontroller/ut_selfheal/main.cpp
index 30592dbf25..f9f0a04bdb 100644
--- a/ydb/core/mind/bscontroller/ut_selfheal/main.cpp
+++ b/ydb/core/mind/bscontroller/ut_selfheal/main.cpp
@@ -9,83 +9,107 @@ Y_UNIT_TEST_SUITE(BsControllerTest) {
-    void TestSelfHeal(const ui32 numDCs = 3, ui32 numRacksPerDC = 4, const ui32 numUnitsPerRack = 4, const ui32 numDisksPerNode = 2, const ui32 numGroups = 64,
-            TString erasure = "block-4-2", TBlobStorageGroupType groupType = TBlobStorageGroupType::Erasure4Plus2Block) {
-        ui32 numNodes = numDCs * numRacksPerDC * numUnitsPerRack;
-        auto locationGenerator = [=](ui32 nodeId) {
-            NActorsInterconnect::TNodeLocation proto;
-            proto.SetDataCenter(ToString((nodeId - 1) / (numUnitsPerRack * numRacksPerDC)));
-            proto.SetRack(ToString((nodeId - 1) / numUnitsPerRack));
-            proto.SetUnit(ToString((nodeId - 1)));
-            return TNodeLocation(proto);
-        };
-
-        TEnvironmentSetup env(numNodes, locationGenerator);
-
-        const TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
-        ui32 disksNum = geom.GetNumFailRealms() * geom.GetNumFailDomainsPerFailRealm() * geom.GetNumVDisksPerFailDomain();
-
-        NKikimrBlobStorage::TConfigRequest request;
-        TVector<TEnvironmentSetup::TDrive> drives;
-        for (ui32 i = 0; i < numDisksPerNode; ++i) {
-            drives.push_back({ .Path = "/dev/disk" + std::to_string(1 + i)});
+    struct TTestSelfHeal {
+        TTestSelfHeal(
+            ui32 numDCs = 3, ui32 numRacksPerDC = 4, ui32 numUnitsPerRack = 4, ui32 numDisksPerNode = 2, ui32 numGroups = 64,
+            TString erasure = "block-4-2", TBlobStorageGroupType groupType = TBlobStorageGroupType::Erasure4Plus2Block
+        )
+            : NumDCs(numDCs)
+            , NumRacksPerDC(numRacksPerDC)
+            , NumUnitsPerRack(numUnitsPerRack)
+            , NumNodes(NumDCs * NumRacksPerDC * NumUnitsPerRack)
+            , NumDisksPerNode(numDisksPerNode)
+            , NumGroups(numGroups)
+            , Erasure(erasure)
+            , GroupType(groupType)
+            , Env(NumNodes, [=](ui32 nodeId) {
+                NActorsInterconnect::TNodeLocation proto;
+                proto.SetDataCenter(ToString((nodeId - 1) / (NumUnitsPerRack * NumRacksPerDC)));
+                proto.SetRack(ToString((nodeId - 1) / NumUnitsPerRack));
+                proto.SetUnit(ToString((nodeId - 1)));
+                return TNodeLocation(proto);
+            })
+            , Geom(CreateGroupGeometry(GroupType))
+        {
         }
-        env.DefineBox(1, drives, {{1, numNodes}}, &request);
-        env.DefineStoragePool(1, 1, numGroups, NKikimrBlobStorage::ROT, {}, &request, erasure);
-        auto response = env.Invoke(request);
-        UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
-
-        env.WaitForNodeWardensToConnect();
-
-        request.Clear();
-        auto *cmd = request.AddCommand()->MutableEnableSelfHeal();
-        cmd->SetEnable(true);
-        response = env.Invoke(request);
-        UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
-
-        std::set<TPDiskId> active, faulty;
-
-        request = {};
-        env.QueryBaseConfig(&request);
-        response = env.Invoke(request);
-        UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
-        for (const auto& pdisk : response.GetStatus(0).GetBaseConfig().GetPDisk()) {
-            active.emplace(pdisk.GetNodeId(), pdisk.GetPDiskId());
+
+        void InitCluster() {
+            NKikimrBlobStorage::TConfigRequest request;
+            TVector<TEnvironmentSetup::TDrive> drives;
+            for (ui32 i = 0; i < NumDisksPerNode; ++i) {
+                drives.push_back({ .Path = "/dev/disk" + std::to_string(1 + i)});
+            }
+            Env.DefineBox(1, drives, {{1, NumNodes}}, &request);
+            Env.DefineStoragePool(1, 1, NumGroups, NKikimrBlobStorage::ROT, {}, &request, Erasure);
+            auto response = Env.Invoke(request);
+            UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
+
+            Env.WaitForNodeWardensToConnect();
+
+            request.Clear();
+            auto *cmd = request.AddCommand()->MutableEnableSelfHeal();
+            cmd->SetEnable(true);
+            response = Env.Invoke(request);
+            UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
         }
-        TString error;
-        UNIT_ASSERT_C(CheckBaseConfigLayout(geom, response.GetStatus(0).GetBaseConfig(), true, error),
-            "Initial group layout is incorrect, ErrorReason# " << error);
+        void SetSelfHealLocalPolicyTimeout(TDuration timeout) {
+            NKikimrBlobStorage::TConfigRequest request;
+            auto *cmd = request.AddCommand()->MutableUpdateSettings();
+            Y_UNUSED(timeout);
+            cmd->AddUseSelfHealLocalPolicy(true);
+            cmd->AddGroupReserveMin(8 + 4);
+            cmd->AddEnableDonorMode(true);
+            auto response = Env.Invoke(request);
+            UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
         }
-        UNIT_ASSERT_VALUES_EQUAL(active.size(), numNodes * numDisksPerNode);
+        std::set<TPDiskId> GetActiveDisks() {
+            std::set<TPDiskId> active;
-        auto move = [&](auto& from, auto& to, NKikimrBlobStorage::EDriveStatus status) {
+            NKikimrBlobStorage::TConfigRequest request;
+            Env.QueryBaseConfig(&request);
+            auto response = Env.Invoke(request);
+            UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
+            for (const auto& pdisk : response.GetStatus(0).GetBaseConfig().GetPDisk()) {
+                active.emplace(pdisk.GetNodeId(), pdisk.GetPDiskId());
+            }
+
+            TString error;
+            UNIT_ASSERT_C(CheckBaseConfigLayout(Geom, response.GetStatus(0).GetBaseConfig(), true, error),
+                "Initial group layout is incorrect, ErrorReason# " << error);
+
+            UNIT_ASSERT_VALUES_EQUAL(active.size(), NumNodes * NumDisksPerNode);
+
+            return active;
+        }
+
+        TPDiskId Move(std::set<TPDiskId>& from, std::set<TPDiskId>& to, NKikimrBlobStorage::EDriveStatus status) {
             auto it = from.begin();
+            auto pDiskId = *it;
             std::advance(it, RandomNumber(from.size()));
             Ctest << "PDisk# " << *it << " setting status to " << NKikimrBlobStorage::EDriveStatus_Name(status) << Endl;
-            request = {};
-            env.UpdateDriveStatus(*it, status, &request);
-            response = env.Invoke(request);
+            NKikimrBlobStorage::TConfigRequest request;
+            Env.UpdateDriveStatus(*it, status, &request);
+            auto response = Env.Invoke(request);
             UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
             to.insert(from.extract(it));
-        };
-
-        for (size_t i = 0; i < numNodes; ++i) {
-            env.Wait(TDuration::Seconds(300));
-            if (faulty.size() < disksNum) {
-                move(active, faulty, NKikimrBlobStorage::FAULTY);
-            } else {
-                move(faulty, active, NKikimrBlobStorage::ACTIVE);
-            }
-            env.Wait(TDuration::Seconds(300));
+            return pDiskId;
+        }
-            request = {};
-            env.QueryBaseConfig(&request);
-            response = env.Invoke(request);
+        auto RequestBasicConfig() {
+            NKikimrBlobStorage::TConfigRequest request;
+            Env.QueryBaseConfig(&request);
+            auto response = Env.Invoke(request);
             UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
-            for (const auto& pdisk : response.GetStatus(0).GetBaseConfig().GetPDisk()) {
+            return response.GetStatus(0).GetBaseConfig();
+        }
+
+        void CheckDiskStatuses(const std::set<TPDiskId>& active, std::set<TPDiskId>& faulty) {
+            auto conf = RequestBasicConfig();
+            for (const auto& pdisk : conf.GetPDisk()) {
                 const TPDiskId pdiskId(pdisk.GetNodeId(), pdisk.GetPDiskId());
                 if (pdisk.GetDriveStatus() == NKikimrBlobStorage::ACTIVE) {
                     UNIT_ASSERT(active.count(pdiskId));
@@ -94,7 +118,11 @@ Y_UNIT_TEST_SUITE(BsControllerTest) {
                     UNIT_ASSERT(faulty.count(pdiskId));
                 }
             }
-            for (const auto& vslot : response.GetStatus(0).GetBaseConfig().GetVSlot()) {
+        }
+
+        void CheckDiskLocations(const std::set<TPDiskId>& active, std::set<TPDiskId>& faulty) {
+            auto conf = RequestBasicConfig();
+            for (const auto& vslot : conf.GetVSlot()) {
                 const auto& id = vslot.GetVSlotId();
                 const TPDiskId pdiskId(id.GetNodeId(), id.GetPDiskId());
                 if (!active.count(pdiskId)) {
@@ -112,17 +140,83 @@ Y_UNIT_TEST_SUITE(BsControllerTest) {
                     UNIT_FAIL("non-active disk is present in group");
                 }
             }
-            UNIT_ASSERT_C(CheckBaseConfigLayout(geom, response.GetStatus(0).GetBaseConfig(), true, error), "Error on step# " << i
-                << ", ErrorReason# " << error);
         }
-    }
+
+        void TestCorrectMoves() {
+            ui32 disksNum = Geom.GetNumFailRealms() * Geom.GetNumFailDomainsPerFailRealm() * Geom.GetNumVDisksPerFailDomain();
+            std::set<TPDiskId> active = GetActiveDisks(), faulty;
+
+            for (size_t i = 0; i < NumNodes; ++i) {
+                Env.Wait(TDuration::Seconds(300));
+                if (faulty.size() < disksNum) {
+                    Move(active, faulty, NKikimrBlobStorage::FAULTY);
+                } else {
+                    Move(faulty, active, NKikimrBlobStorage::ACTIVE);
+                }
+                Env.Wait(TDuration::Seconds(300));
+
+                CheckDiskStatuses(active, faulty);
+                CheckDiskLocations(active, faulty);
+
+                TString error;
+                UNIT_ASSERT_C(CheckBaseConfigLayout(Geom, RequestBasicConfig(), true, error), "Error on step# " << i
+                    << ", ErrorReason# " << error);
+            }
+        }
+
+        void RunTestCorrectMoves() {
+            InitCluster();
+            TestCorrectMoves();
+        }
+
+        THashMap<ui32, ui32> CountVDisksPerNode() {
+            THashMap<ui32, ui32> result;
+            const auto conf = RequestBasicConfig();
+            for (const auto& vslot : conf.GetVSlot()) {
+                ++result[vslot.GetVSlotId().GetNodeId()];
+            }
+            return result;
+        }
+
+        void RunTestCorrectLocalMoves() {
+            InitCluster();
+            SetSelfHealLocalPolicyTimeout(TDuration::Days(1));
+            std::set<TPDiskId> active = GetActiveDisks(), faulty;
+            auto checkVDisksPerNode = [&]() {
+                for (const auto& [_, count]: CountVDisksPerNode()) {
+                    UNIT_ASSERT_VALUES_EQUAL(count, 8 * NumDisksPerNode);
+                }
+            };
+            checkVDisksPerNode();
+
+            Env.Wait(TDuration::Seconds(300));
+            Move(active, faulty, NKikimrBlobStorage::FAULTY);
+            Env.Wait(TDuration::Seconds(300 * 8));
+
+            CheckDiskStatuses(active, faulty);
+            checkVDisksPerNode();
+            CheckDiskLocations(active, faulty);
+        }
+
+        const ui32 NumDCs;
+        const ui32 NumRacksPerDC;
+        const ui32 NumUnitsPerRack;
+        const ui32 NumNodes;
+        const ui32 NumDisksPerNode;
+        const ui32 NumGroups;
+        const TString Erasure;
+        const TBlobStorageGroupType GroupType;
+
+        TEnvironmentSetup Env;
+        const TGroupGeometryInfo Geom;
+    };
 
     Y_UNIT_TEST(SelfHealBlock4Plus2) {
-        TestSelfHeal(1, 32, 1, 2, 64, "block-4-2", TBlobStorageGroupType::Erasure4Plus2Block);
+        TTestSelfHeal(1, 32, 1, 2, 64, "block-4-2", TBlobStorageGroupType::Erasure4Plus2Block).RunTestCorrectMoves();
     }
 
     Y_UNIT_TEST(SelfHealMirror3dc) {
-        TestSelfHeal(3, 4, 3, 4, 128, "mirror-3-dc", TBlobStorageGroupType::ErasureMirror3dc);
+        TTestSelfHeal(3, 4, 3, 4, 128, "mirror-3-dc", TBlobStorageGroupType::ErasureMirror3dc).RunTestCorrectMoves();
     }
 
     Y_UNIT_TEST(DecommitRejected) {
@@ -205,4 +299,8 @@ Y_UNIT_TEST_SUITE(BsControllerTest) {
             UNIT_ASSERT_EQUAL(group1nodes, (THashSet<ui32>{{1, 2, 3, 10, 11, 12, 13, 14, 15}}));
         }
     }
+
+    Y_UNIT_TEST(TestLocalSelfHeal) {
+        TTestSelfHeal(3, 4, 3, 4, 128, "mirror-3-dc", TBlobStorageGroupType::ErasureMirror3dc).RunTestCorrectLocalMoves();
+    }
 }
diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto
index 14010e891a..bb756302f7 100644
--- a/ydb/core/protos/blobstorage_config.proto
+++ b/ydb/core/protos/blobstorage_config.proto
@@ -259,6 +259,7 @@ message TReassignGroupDisk {
    uint32 VDiskIdx = 5;
    TPDiskId TargetPDiskId = 6; // optional; when not specified, selected automatically
    bool SuppressDonorMode = 7; // when set, donor mode is not used even if it is enabled through BSC
+   optional uint32 TargetNodeId = 8; // when set and flag UseSelfHealLocalPolicy=true, bscontroller would reassign disk to this node, if it is not possible error would be returned
 }
 
 message TSanitizeGroup {
@@ -450,6 +451,7 @@ message TUpdateSettings {
    // TODO
    // repeated TSerialManagementStage.E SerialManagementStage = 11;
    repeated bool AllowMultipleRealmsOccupation = 12;
+   repeated bool UseSelfHealLocalPolicy = 13;
 }
 
 message TBoxStoragePoolId {
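With the policy enabled, the self-heal actor fills the new TargetNodeId field of TReassignGroupDisk with the node of the VDisk being replaced, so the replacement VSlot stays on the same node. A hand-written request would look roughly like the sketch below; the realm/domain/disk indices and node id are placeholder values, and the group-identification fields of TReassignGroupDisk are omitted for brevity:

    // Sketch only: a manual ReassignGroupDisk request pinned to one node.
    // With UseSelfHealLocalPolicy enabled, BSC considers only PDisks on TargetNodeId
    // and returns an error if no suitable disk exists there.
    const ui32 failRealmIdx = 0, failDomainIdx = 0, vdiskIdx = 0, targetNodeId = 1;  // placeholders
    NKikimrBlobStorage::TConfigRequest request;
    auto *cmd = request.AddCommand()->MutableReassignGroupDisk();
    cmd->SetFailRealmIdx(failRealmIdx);
    cmd->SetFailDomainIdx(failDomainIdx);
    cmd->SetVDiskIdx(vdiskIdx);
    cmd->SetTargetNodeId(targetNodeId);  // new optional uint32, field 8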