author      robdrynkin <robdrynkin@yandex-team.com>  2023-11-08 16:20:19 +0300
committer   robdrynkin <robdrynkin@yandex-team.com>  2023-11-08 16:55:30 +0300
commit      1cfb60d093d33df4c50825aa4fc71415a9d80db5 (patch)
tree        37cc73fc52fc5529c971be177003eb08369067d6
parent      212ec2d7ce6232b6f2a345d76a53f7d9dd75e9ed (diff)
download    ydb-1cfb60d093d33df4c50825aa4fc71415a9d80db5.tar.gz
KIKIMR-19590: Special option to run self heal locally
-rw-r--r--  ydb/core/mind/bscontroller/cmds_storage_pool.cpp |   2
-rw-r--r--  ydb/core/mind/bscontroller/config.h              |   1
-rw-r--r--  ydb/core/mind/bscontroller/config_cmd.cpp        |   5
-rw-r--r--  ydb/core/mind/bscontroller/config_fit_groups.cpp |   2
-rw-r--r--  ydb/core/mind/bscontroller/impl.h                |   1
-rw-r--r--  ydb/core/mind/bscontroller/load_everything.cpp   |   1
-rw-r--r--  ydb/core/mind/bscontroller/scheme.h              |   4
-rw-r--r--  ydb/core/mind/bscontroller/self_heal.cpp         |   2
-rw-r--r--  ydb/core/mind/bscontroller/ut_selfheal/main.cpp  | 236
-rw-r--r--  ydb/core/protos/blobstorage_config.proto         |   2
10 files changed, 185 insertions, 71 deletions
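
In short: this commit adds a persistent BS controller setting, UseSelfHealLocalPolicy (State table column 22, default false), togglable through the TUpdateSettings command, plus an optional TargetNodeId field on TReassignGroupDisk. When the policy is enabled, self heal pins every disk replacement to the node that currently hosts the VDisk being replaced. A minimal sketch of enabling the setting, mirroring the unit test changed below (all names are taken from this diff):

    NKikimrBlobStorage::TConfigRequest request;
    auto *settings = request.AddCommand()->MutableUpdateSettings();
    settings->AddUseSelfHealLocalPolicy(true); // repeated bool used as an optional: applied only when present
    // submit through any BSC config channel; in the unit tests below:
    //   auto response = Env.Invoke(request);
    //   UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());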
diff --git a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
index 8056880c57..ad0f037e26 100644
--- a/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
+++ b/ydb/core/mind/bscontroller/cmds_storage_pool.cpp
@@ -364,6 +364,8 @@ namespace NKikimr::NBsController {
throw TExError() << "TargetPDiskId# " << pdiskId.ToString() << " not found";
}
targetPDiskId = pdiskId;
+ } else if (cmd.HasTargetNodeId() && Self.UseSelfHealLocalPolicy) {
+ TargetNodeId = cmd.GetTargetNodeId(); // picked up by group fitting, which restricts candidate PDisks to this node
}
ExplicitReconfigureMap.emplace(vslotId, targetPDiskId);
diff --git a/ydb/core/mind/bscontroller/config.h b/ydb/core/mind/bscontroller/config.h
index 6e92bc9858..b915c91491 100644
--- a/ydb/core/mind/bscontroller/config.h
+++ b/ydb/core/mind/bscontroller/config.h
@@ -81,6 +81,7 @@ namespace NKikimr {
THashMap<TVSlotId, TPDiskId> ExplicitReconfigureMap;
std::set<TVSlotId> SuppressDonorMode;
std::unordered_set<ui32> SanitizingRequests;
+ std::optional<ui32> TargetNodeId; // set by ReassignGroupDisk when the local self-heal policy is active
// just-created vslots, which are not yet committed to the storage
TSet<TVSlotId> UncommittedVSlots;
diff --git a/ydb/core/mind/bscontroller/config_cmd.cpp b/ydb/core/mind/bscontroller/config_cmd.cpp
index a252371ddc..4abaffe222 100644
--- a/ydb/core/mind/bscontroller/config_cmd.cpp
+++ b/ydb/core/mind/bscontroller/config_cmd.cpp
@@ -142,6 +142,10 @@ namespace NKikimr::NBsController {
ev->AllowMultipleRealmsOccupation = Self->AllowMultipleRealmsOccupation;
Self->Send(Self->SelfHealId, ev.release());
}
+ for (bool value : settings.GetUseSelfHealLocalPolicy()) { // repeated field used as an optional: the body runs only when a value is present
+ Self->UseSelfHealLocalPolicy = value;
+ db.Table<T>().Key(true).Update<T::UseSelfHealLocalPolicy>(Self->UseSelfHealLocalPolicy);
+ }
return true;
}
@@ -307,6 +311,7 @@ namespace NKikimr::NBsController {
state.SanitizingRequests.clear();
state.ExplicitReconfigureMap.clear();
state.SuppressDonorMode.clear();
+ state.TargetNodeId.reset();
switch (cmd.GetCommandCase()) {
#define HANDLE_COMMAND(NAME) \
case NKikimrBlobStorage::TConfigRequest::TCommand::k ## NAME: return state.ExecuteStep(cmd.Get ## NAME(), status);
diff --git a/ydb/core/mind/bscontroller/config_fit_groups.cpp b/ydb/core/mind/bscontroller/config_fit_groups.cpp
index 50b036f8bf..9ffb64c749 100644
--- a/ydb/core/mind/bscontroller/config_fit_groups.cpp
+++ b/ydb/core/mind/bscontroller/config_fit_groups.cpp
@@ -464,7 +464,7 @@ namespace NKikimr {
}
for (const auto& filter : StoragePool.PDiskFilters) {
- if (filter.MatchPDisk(info)) {
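+ // when TargetNodeId is set, only PDisks on that node remain candidates for the reassignment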
+ if (filter.MatchPDisk(info) && (!State.TargetNodeId.has_value() || *State.TargetNodeId == id.NodeId)) {
const bool inserted = RegisterPDisk(id, info, true);
Y_ABORT_UNLESS(inserted);
break;
diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h
index 388588cb0b..2ecfcfa2d9 100644
--- a/ydb/core/mind/bscontroller/impl.h
+++ b/ydb/core/mind/bscontroller/impl.h
@@ -1496,6 +1496,7 @@ private:
TActorId StatProcessorActorId;
TInstant LastMetricsCommit;
bool SelfHealEnable = false;
+ bool UseSelfHealLocalPolicy = false;
bool DonorMode = false;
TDuration ScrubPeriodicity;
NKikimrBlobStorage::TStorageConfig StorageConfig;
diff --git a/ydb/core/mind/bscontroller/load_everything.cpp b/ydb/core/mind/bscontroller/load_everything.cpp
index 7125f5b951..6c99b5ac9d 100644
--- a/ydb/core/mind/bscontroller/load_everything.cpp
+++ b/ydb/core/mind/bscontroller/load_everything.cpp
@@ -89,6 +89,7 @@ public:
Self->GroupLayoutSanitizerEnabled = state.GetValue<T::GroupLayoutSanitizer>();
Self->AllowMultipleRealmsOccupation = state.GetValueOrDefault<T::AllowMultipleRealmsOccupation>();
Self->SysViewChangedSettings = true;
+ Self->UseSelfHealLocalPolicy = state.GetValue<T::UseSelfHealLocalPolicy>();
}
}
diff --git a/ydb/core/mind/bscontroller/scheme.h b/ydb/core/mind/bscontroller/scheme.h
index a2a00fd9f9..57eca4c019 100644
--- a/ydb/core/mind/bscontroller/scheme.h
+++ b/ydb/core/mind/bscontroller/scheme.h
@@ -105,12 +105,14 @@ struct Schema : NIceDb::Schema {
struct NextVirtualGroupId : Column<19, Group::ID::ColumnType> { static constexpr Type Default = 0; };
struct AllowMultipleRealmsOccupation : Column<20, NScheme::NTypeIds::Bool> { static constexpr Type Default = true; };
struct CompatibilityInfo : Column<21, NScheme::NTypeIds::String> {};
+ struct UseSelfHealLocalPolicy : Column<22, NScheme::NTypeIds::Bool> { static constexpr Type Default = false; };
using TKey = TableKey<FixedKey>;
using TColumns = TableColumns<FixedKey, NextGroupID, SchemaVersion, NextOperationLogIndex, DefaultMaxSlots,
InstanceId, SelfHealEnable, DonorModeEnable, ScrubPeriodicity, SerialManagementStage, NextStoragePoolId,
PDiskSpaceMarginPromille, GroupReserveMin, GroupReservePart, MaxScrubbedDisksAtOnce, PDiskSpaceColorBorder,
- GroupLayoutSanitizer, NextVirtualGroupId, AllowMultipleRealmsOccupation, CompatibilityInfo>;
+ GroupLayoutSanitizer, NextVirtualGroupId, AllowMultipleRealmsOccupation, CompatibilityInfo,
+ UseSelfHealLocalPolicy>;
};
struct VSlot : Table<5> {
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index 6cb8643a6b..c234911121 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -152,6 +152,7 @@ namespace NKikimr::NBsController {
request->SetIgnoreGroupReserve(true);
request->SetSettleOnlyOnOperationalDisks(true);
request->SetIsSelfHealReasonDecommit(IsSelfHealReasonDecommit);
+ request->SetAllowUnusableDisks(true);
if (VDiskToReplace) {
ev->SelfHeal = true;
auto *cmd = request->AddCommand()->MutableReassignGroupDisk();
@@ -160,6 +161,7 @@ namespace NKikimr::NBsController {
cmd->SetFailRealmIdx(VDiskToReplace->FailRealm);
cmd->SetFailDomainIdx(VDiskToReplace->FailDomain);
cmd->SetVDiskIdx(VDiskToReplace->VDisk);
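+ // pin the replacement to the node that currently hosts the VDisk being replaced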
+ cmd->SetTargetNodeId(Group.VDisks.at(*VDiskToReplace).Location.NodeId);
} else {
ev->GroupLayoutSanitizer = true;
auto *cmd = request->AddCommand()->MutableSanitizeGroup();
diff --git a/ydb/core/mind/bscontroller/ut_selfheal/main.cpp b/ydb/core/mind/bscontroller/ut_selfheal/main.cpp
index 30592dbf25..f9f0a04bdb 100644
--- a/ydb/core/mind/bscontroller/ut_selfheal/main.cpp
+++ b/ydb/core/mind/bscontroller/ut_selfheal/main.cpp
@@ -9,83 +9,107 @@
Y_UNIT_TEST_SUITE(BsControllerTest) {
- void TestSelfHeal(const ui32 numDCs = 3, ui32 numRacksPerDC = 4, const ui32 numUnitsPerRack = 4, const ui32 numDisksPerNode = 2, const ui32 numGroups = 64,
- TString erasure = "block-4-2", TBlobStorageGroupType groupType = TBlobStorageGroupType::Erasure4Plus2Block) {
- ui32 numNodes = numDCs * numRacksPerDC * numUnitsPerRack;
- auto locationGenerator = [=](ui32 nodeId) {
- NActorsInterconnect::TNodeLocation proto;
- proto.SetDataCenter(ToString((nodeId - 1) / (numUnitsPerRack * numRacksPerDC)));
- proto.SetRack(ToString((nodeId - 1) / numUnitsPerRack));
- proto.SetUnit(ToString((nodeId - 1)));
- return TNodeLocation(proto);
- };
-
- TEnvironmentSetup env(numNodes, locationGenerator);
-
- const TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
- ui32 disksNum = geom.GetNumFailRealms() * geom.GetNumFailDomainsPerFailRealm() * geom.GetNumVDisksPerFailDomain();
-
- NKikimrBlobStorage::TConfigRequest request;
- TVector<TEnvironmentSetup::TDrive> drives;
- for (ui32 i = 0; i < numDisksPerNode; ++i) {
- drives.push_back({ .Path = "/dev/disk" + std::to_string(1 + i)});
+ struct TTestSelfHeal {
+ TTestSelfHeal(
+ ui32 numDCs = 3, ui32 numRacksPerDC = 4, ui32 numUnitsPerRack = 4, ui32 numDisksPerNode = 2, ui32 numGroups = 64,
+ TString erasure = "block-4-2", TBlobStorageGroupType groupType = TBlobStorageGroupType::Erasure4Plus2Block
+ )
+ : NumDCs(numDCs)
+ , NumRacksPerDC(numRacksPerDC)
+ , NumUnitsPerRack(numUnitsPerRack)
+ , NumNodes(NumDCs * NumRacksPerDC * NumUnitsPerRack)
+ , NumDisksPerNode(numDisksPerNode)
+ , NumGroups(numGroups)
+ , Erasure(erasure)
+ , GroupType(groupType)
+ , Env(NumNodes, [=](ui32 nodeId) {
+ NActorsInterconnect::TNodeLocation proto;
+ proto.SetDataCenter(ToString((nodeId - 1) / (NumUnitsPerRack * NumRacksPerDC)));
+ proto.SetRack(ToString((nodeId - 1) / NumUnitsPerRack));
+ proto.SetUnit(ToString((nodeId - 1)));
+ return TNodeLocation(proto);
+ })
+ , Geom(CreateGroupGeometry(GroupType))
+ {
}
- env.DefineBox(1, drives, {{1, numNodes}}, &request);
- env.DefineStoragePool(1, 1, numGroups, NKikimrBlobStorage::ROT, {}, &request, erasure);
- auto response = env.Invoke(request);
- UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
-
- env.WaitForNodeWardensToConnect();
-
- request.Clear();
- auto *cmd = request.AddCommand()->MutableEnableSelfHeal();
- cmd->SetEnable(true);
- response = env.Invoke(request);
- UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
-
- std::set<TPDiskId> active, faulty;
-
- request = {};
- env.QueryBaseConfig(&request);
- response = env.Invoke(request);
- UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
- for (const auto& pdisk : response.GetStatus(0).GetBaseConfig().GetPDisk()) {
- active.emplace(pdisk.GetNodeId(), pdisk.GetPDiskId());
+
+ void InitCluster() {
+ NKikimrBlobStorage::TConfigRequest request;
+ TVector<TEnvironmentSetup::TDrive> drives;
+ for (ui32 i = 0; i < NumDisksPerNode; ++i) {
+ drives.push_back({ .Path = "/dev/disk" + std::to_string(1 + i)});
+ }
+ Env.DefineBox(1, drives, {{1, NumNodes}}, &request);
+ Env.DefineStoragePool(1, 1, NumGroups, NKikimrBlobStorage::ROT, {}, &request, Erasure);
+ auto response = Env.Invoke(request);
+ UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
+
+ Env.WaitForNodeWardensToConnect();
+
+ request.Clear();
+ auto *cmd = request.AddCommand()->MutableEnableSelfHeal();
+ cmd->SetEnable(true);
+ response = Env.Invoke(request);
+ UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
}
- TString error;
- UNIT_ASSERT_C(CheckBaseConfigLayout(geom, response.GetStatus(0).GetBaseConfig(), true, error),
- "Initial group layout is incorrect, ErrorReason# " << error);
+ void SetSelfHealLocalPolicyTimeout(TDuration timeout) {
+ NKikimrBlobStorage::TConfigRequest request;
+ auto *cmd = request.AddCommand()->MutableUpdateSettings();
+ Y_UNUSED(timeout); // the timeout argument is accepted but not yet applied to the settings
+ cmd->AddUseSelfHealLocalPolicy(true);
+ cmd->AddGroupReserveMin(8 + 4);
+ cmd->AddEnableDonorMode(true);
+ auto response = Env.Invoke(request);
+ UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
+ }
- UNIT_ASSERT_VALUES_EQUAL(active.size(), numNodes * numDisksPerNode);
+ std::set<TPDiskId> GetActiveDisks() {
+ std::set<TPDiskId> active;
- auto move = [&](auto& from, auto& to, NKikimrBlobStorage::EDriveStatus status) {
+ NKikimrBlobStorage::TConfigRequest request;
+ Env.QueryBaseConfig(&request);
+ auto response = Env.Invoke(request);
+ UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
+ for (const auto& pdisk : response.GetStatus(0).GetBaseConfig().GetPDisk()) {
+ active.emplace(pdisk.GetNodeId(), pdisk.GetPDiskId());
+ }
+
+ TString error;
+ UNIT_ASSERT_C(CheckBaseConfigLayout(Geom, response.GetStatus(0).GetBaseConfig(), true, error),
+ "Initial group layout is incorrect, ErrorReason# " << error);
+
+ UNIT_ASSERT_VALUES_EQUAL(active.size(), NumNodes * NumDisksPerNode);
+
+ return active;
+ }
+
+ TPDiskId Move(std::set<TPDiskId>& from, std::set<TPDiskId>& to, NKikimrBlobStorage::EDriveStatus status) {
auto it = from.begin();
std::advance(it, RandomNumber(from.size()));
+ auto pDiskId = *it; // capture after advancing, so the returned id is the disk actually moved
Ctest << "PDisk# " << *it
<< " setting status to " << NKikimrBlobStorage::EDriveStatus_Name(status)
<< Endl;
- request = {};
- env.UpdateDriveStatus(*it, status, &request);
- response = env.Invoke(request);
+ NKikimrBlobStorage::TConfigRequest request;
+ Env.UpdateDriveStatus(*it, status, &request);
+ auto response = Env.Invoke(request);
UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
to.insert(from.extract(it));
- };
-
- for (size_t i = 0; i < numNodes; ++i) {
- env.Wait(TDuration::Seconds(300));
- if (faulty.size() < disksNum) {
- move(active, faulty, NKikimrBlobStorage::FAULTY);
- } else {
- move(faulty, active, NKikimrBlobStorage::ACTIVE);
- }
- env.Wait(TDuration::Seconds(300));
+ return pDiskId;
+ }
- request = {};
- env.QueryBaseConfig(&request);
- response = env.Invoke(request);
+ auto RequestBaseConfig() {
+ NKikimrBlobStorage::TConfigRequest request;
+ Env.QueryBaseConfig(&request);
+ auto response = Env.Invoke(request);
UNIT_ASSERT_C(response.GetSuccess(), response.GetErrorDescription());
- for (const auto& pdisk : response.GetStatus(0).GetBaseConfig().GetPDisk()) {
+ return response.GetStatus(0).GetBaseConfig();
+ }
+
+ void CheckDiskStatuses(const std::set<TPDiskId>& active, std::set<TPDiskId>& faulty) {
+ auto conf = RequestBaseConfig();
+ for (const auto& pdisk : conf.GetPDisk()) {
const TPDiskId pdiskId(pdisk.GetNodeId(), pdisk.GetPDiskId());
if (pdisk.GetDriveStatus() == NKikimrBlobStorage::ACTIVE) {
UNIT_ASSERT(active.count(pdiskId));
@@ -94,7 +118,11 @@ Y_UNIT_TEST_SUITE(BsControllerTest) {
UNIT_ASSERT(faulty.count(pdiskId));
}
}
- for (const auto& vslot : response.GetStatus(0).GetBaseConfig().GetVSlot()) {
+ }
+
+ void CheckDiskLocations(const std::set<TPDiskId>& active, std::set<TPDiskId>& faulty) {
+ auto conf = RequestBaseConfig();
+ for (const auto& vslot : conf.GetVSlot()) {
const auto& id = vslot.GetVSlotId();
const TPDiskId pdiskId(id.GetNodeId(), id.GetPDiskId());
if (!active.count(pdiskId)) {
@@ -112,17 +140,83 @@ Y_UNIT_TEST_SUITE(BsControllerTest) {
UNIT_FAIL("non-active disk is present in group");
}
}
- UNIT_ASSERT_C(CheckBaseConfigLayout(geom, response.GetStatus(0).GetBaseConfig(), true, error), "Error on step# " << i
- << ", ErrorReason# " << error);
}
- }
+
+ void TestCorrectMoves() {
+ ui32 disksNum = Geom.GetNumFailRealms() * Geom.GetNumFailDomainsPerFailRealm() * Geom.GetNumVDisksPerFailDomain();
+ std::set<TPDiskId> active = GetActiveDisks(), faulty;
+
+ for (size_t i = 0; i < NumNodes; ++i) {
+ Env.Wait(TDuration::Seconds(300));
+ if (faulty.size() < disksNum) {
+ Move(active, faulty, NKikimrBlobStorage::FAULTY);
+ } else {
+ Move(faulty, active, NKikimrBlobStorage::ACTIVE);
+ }
+ Env.Wait(TDuration::Seconds(300));
+
+ CheckDiskStatuses(active, faulty);
+ CheckDiskLocations(active, faulty);
+
+ TString error;
+ UNIT_ASSERT_C(CheckBaseConfigLayout(Geom, RequestBaseConfig(), true, error), "Error on step# " << i
+ << ", ErrorReason# " << error);
+ }
+ }
+
+ void RunTestCorrectMoves() {
+ InitCluster();
+ TestCorrectMoves();
+ }
+
+ THashMap<ui32, ui32> CountVDisksPerNode() {
+ THashMap<ui32, ui32> result;
+ const auto conf = RequestBaseConfig();
+ for (const auto& vslot : conf.GetVSlot()) {
+ ++result[vslot.GetVSlotId().GetNodeId()];
+ }
+ return result;
+ }
+
+ void RunTestCorrectLocalMoves() {
+ InitCluster();
+ SetSelfHealLocalPolicyTimeout(TDuration::Days(1));
+ std::set<TPDiskId> active = GetActiveDisks(), faulty;
+ auto checkVDisksPerNode = [&]() {
+ for (const auto& [_, count]: CountVDisksPerNode()) {
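+ // mirror-3-dc with 128 groups over 36 nodes: 128 * 9 / 36 = 32 vdisks per node, i.e. 8 per pdisk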
+ UNIT_ASSERT_VALUES_EQUAL(count, 8 * NumDisksPerNode);
+ }
+ };
+ checkVDisksPerNode();
+
+ Env.Wait(TDuration::Seconds(300));
+ Move(active, faulty, NKikimrBlobStorage::FAULTY);
+ Env.Wait(TDuration::Seconds(300 * 8));
+
+ CheckDiskStatuses(active, faulty);
+ checkVDisksPerNode();
+ CheckDiskLocations(active, faulty);
+ }
+
+ const ui32 NumDCs;
+ const ui32 NumRacksPerDC;
+ const ui32 NumUnitsPerRack;
+ const ui32 NumNodes;
+ const ui32 NumDisksPerNode;
+ const ui32 NumGroups;
+ const TString Erasure;
+ const TBlobStorageGroupType GroupType;
+
+ TEnvironmentSetup Env;
+ const TGroupGeometryInfo Geom;
+ };
Y_UNIT_TEST(SelfHealBlock4Plus2) {
- TestSelfHeal(1, 32, 1, 2, 64, "block-4-2", TBlobStorageGroupType::Erasure4Plus2Block);
+ TTestSelfHeal(1, 32, 1, 2, 64, "block-4-2", TBlobStorageGroupType::Erasure4Plus2Block).RunTestCorrectMoves();
}
Y_UNIT_TEST(SelfHealMirror3dc) {
- TestSelfHeal(3, 4, 3, 4, 128, "mirror-3-dc", TBlobStorageGroupType::ErasureMirror3dc);
+ TTestSelfHeal(3, 4, 3, 4, 128, "mirror-3-dc", TBlobStorageGroupType::ErasureMirror3dc).RunTestCorrectMoves();
}
Y_UNIT_TEST(DecommitRejected) {
@@ -205,4 +299,8 @@ Y_UNIT_TEST_SUITE(BsControllerTest) {
UNIT_ASSERT_EQUAL(group1nodes, (THashSet<ui32>{{1, 2, 3, 10, 11, 12, 13, 14, 15}}));
}
}
+
+ Y_UNIT_TEST(TestLocalSelfHeal) {
+ TTestSelfHeal(3, 4, 3, 4, 128, "mirror-3-dc", TBlobStorageGroupType::ErasureMirror3dc).RunTestCorrectLocalMoves();
+ }
}
diff --git a/ydb/core/protos/blobstorage_config.proto b/ydb/core/protos/blobstorage_config.proto
index 14010e891a..bb756302f7 100644
--- a/ydb/core/protos/blobstorage_config.proto
+++ b/ydb/core/protos/blobstorage_config.proto
@@ -259,6 +259,7 @@ message TReassignGroupDisk {
uint32 VDiskIdx = 5;
TPDiskId TargetPDiskId = 6; // optional; when not specified, selected automatically
bool SuppressDonorMode = 7; // when set, donor mode is not used even if it is enabled through BSC
+ optional uint32 TargetNodeId = 8; // when set and UseSelfHealLocalPolicy=true, the BS controller reassigns the disk to this node; if that is not possible, an error is returned
}
message TSanitizeGroup {
@@ -450,6 +451,7 @@ message TUpdateSettings {
// TODO
// repeated TSerialManagementStage.E SerialManagementStage = 11;
repeated bool AllowMultipleRealmsOccupation = 12;
+ repeated bool UseSelfHealLocalPolicy = 13;
}
message TBoxStoragePoolId {