aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSergey Belyakov <serg-belyakov@ydb.tech>2025-02-28 19:13:40 +0300
committerGitHub <noreply@github.com>2025-02-28 19:13:40 +0300
commitbf4f55b8e360284b921e123eaec0ee6202d2b969 (patch)
treeea153216f1eaee64f5c5f9be1020817bf0e87a32
parent62279d7c155ab55fcab3d2fabaa5c6adfeaa5cb0 (diff)
downloadydb-bf4f55b8e360284b921e123eaec0ee6202d2b969.tar.gz
GroupLayoutSanitizer always monitors invalid groups, add UTs (#15026)
-rw-r--r--ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp66
-rw-r--r--ydb/core/mind/bscontroller/group_layout_checker.cpp44
-rw-r--r--ydb/core/mind/bscontroller/group_layout_checker.h2
-rw-r--r--ydb/core/mind/bscontroller/impl.h1
-rw-r--r--ydb/core/mind/bscontroller/self_heal.cpp20
-rw-r--r--ydb/core/protos/counters_bs_controller.proto1
6 files changed, 57 insertions, 77 deletions
diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
index 26c40dd7d1..34e660ca02 100644
--- a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
+++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp
@@ -25,8 +25,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
}
}
- void CreateEnv(std::unique_ptr<TEnvironmentSetup>& env, std::vector<TNodeLocation>& locations) {
- TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
+ void CreateEnv(std::unique_ptr<TEnvironmentSetup>& env, std::vector<TNodeLocation>& locations,
+ TBlobStorageGroupType groupType) {
const ui32 numNodes = locations.size();
env.reset(new TEnvironmentSetup({
@@ -37,39 +37,49 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
const ui32 disksPerNode = 1;
const ui32 slotsPerDisk = 3;
+
+ env->Runtime->FilterFunction = CatchSanitizeRequests;
env->CreateBoxAndPool(disksPerNode, numNodes * disksPerNode * slotsPerDisk / 9);
+ env->Runtime->FilterFunction = {};
}
- Y_UNIT_TEST(Test3dc) {
+ NActorsInterconnect::TNodeLocation LocationGenerator(ui32 dc, ui32 rack, ui32 unit) {
+ NActorsInterconnect::TNodeLocation proto;
+ proto.SetDataCenter(ToString(dc));
+ proto.SetRack(ToString(rack));
+ proto.SetUnit(ToString(unit));
+ return proto;
+ }
+
+ void Test(TBlobStorageGroupType groupType, ui32 dcs, ui32 racks, ui32 units) {
std::vector<TNodeLocation> locations;
- TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) {
- NActorsInterconnect::TNodeLocation proto;
- proto.SetDataCenter(ToString(dc));
- proto.SetRack(ToString(rack));
- proto.SetUnit(ToString(unit));
- return proto;
- };
- MakeLocations(locations, 3, 5, 1, locationGenerator);
+ MakeLocations(locations, dcs, racks, units, LocationGenerator);
std::unique_ptr<TEnvironmentSetup> env;
- CreateEnv(env, locations);
- TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
- TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
+ CreateEnv(env, locations, groupType);
+
+ // Ensure that the sanitizer doesn't send requests to initially allocated groups
env->Runtime->FilterFunction = CatchSanitizeRequests;
+ env->UpdateSettings(true, false, true);
+ env->Sim(TDuration::Minutes(3));
+ env->UpdateSettings(false, false, false);
+
+ TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
TString error;
auto cfg = env->FetchBaseConfig();
UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error);
- env->Cleanup();
// Shuffle node locations, ensure that a layout error occurred
- std::random_shuffle(locations.begin(), locations.end());
- env->Initialize();
- env->Sim(TDuration::Seconds(100));
- cfg = env->FetchBaseConfig();
- CheckBaseConfigLayout(geom, cfg, true, error);
+ do {
+ env->Cleanup();
+ std::random_shuffle(locations.begin(), locations.end());
+ env->Initialize();
+ env->Sim(TDuration::Seconds(100));
+ cfg = env->FetchBaseConfig();
+ } while (CheckBaseConfigLayout(geom, cfg, true, error));
Cerr << error << Endl;
// Sanitize groups
@@ -86,6 +96,18 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error);
}
+ Y_UNIT_TEST(Test3dc) {
+ Test(TBlobStorageGroupType::ErasureMirror3dc, 3, 5, 1);
+ }
+
+ Y_UNIT_TEST(TestBlock4Plus2) {
+ Test(TBlobStorageGroupType::Erasure4Plus2Block, 1, 10, 2);
+ }
+
+ Y_UNIT_TEST(TestMirror3of4) {
+ Test(TBlobStorageGroupType::ErasureMirror3of4, 1, 10, 2);
+ }
+
TString PrintGroups(TBlobStorageGroupType groupType, const NKikimrBlobStorage::TBaseConfig& cfg,
std::vector<TNodeLocation> locations) {
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
@@ -137,6 +159,7 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
}
void TestMultipleRealmsOccupation(bool allowMultipleRealmsOccupation) {
+ TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
std::vector<TNodeLocation> locations;
TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) {
NActorsInterconnect::TNodeLocation proto;
@@ -152,9 +175,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) {
};
MakeLocations(locations, 4, 5, 1, locationGenerator);
std::unique_ptr<TEnvironmentSetup> env;
- CreateEnv(env, locations);
+ CreateEnv(env, locations, groupType);
- TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc;
TGroupGeometryInfo geom = CreateGroupGeometry(groupType);
env->Runtime->FilterFunction = CatchSanitizeRequests;
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp
index 8ab76e3e4f..31e822eb4f 100644
--- a/ydb/core/mind/bscontroller/group_layout_checker.cpp
+++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp
@@ -1,47 +1,3 @@
#include "group_layout_checker.h"
-#include "group_geometry_info.h"
-
-namespace NKikimr::NBsController {
-
- TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout) {
- using namespace NLayoutChecker;
-
- if (layout.empty()) {
- return {};
- }
-
- TBlobStorageGroupInfo::TTopology topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(),
- geom.GetNumVDisksPerFailDomain(), true);
- TGroupLayout group(topology);
- TDomainMapper mapper;
- THashMap<TVDiskIdShort, TPDiskLayoutPosition> map;
- for (const auto& [vdiskId, p] : layout) {
- const auto& [location, pdiskId] = p;
- TPDiskLayoutPosition pos(mapper, location, pdiskId, geom);
- group.AddDisk(pos, topology.GetOrderNumber(vdiskId));
- map.emplace(vdiskId, pos);
- }
-
- std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard;
- for (const auto& [vdiskId, pos] : map) {
- scoreboard.emplace_back(group.GetCandidateScore(pos, topology.GetOrderNumber(vdiskId)), vdiskId);
- }
-
- auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; };
- std::sort(scoreboard.begin(), scoreboard.end(), comp1);
-
- auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); };
- std::sort(scoreboard.begin(), scoreboard.end(), comp);
- TLayoutCheckResult res;
- const auto reference = scoreboard.back().first;
- if (!reference.SameAs({})) { // not perfectly correct layout
- for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) {
- res.Candidates.push_back(scoreboard.back().second);
- }
- }
- return res;
- }
-
-} // NKikimr::NBsController
Y_DECLARE_OUT_SPEC(, NKikimr::NBsController::NLayoutChecker::TEntityId, stream, value) { value.Output(stream); }
diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h
index 3c42fef3d4..192a6690c9 100644
--- a/ydb/core/mind/bscontroller/group_layout_checker.h
+++ b/ydb/core/mind/bscontroller/group_layout_checker.h
@@ -289,6 +289,4 @@ namespace NKikimr::NBsController {
}
};
- TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout);
-
} // NKikimr::NBsController
diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h
index e5b745e3f6..e91c805e48 100644
--- a/ydb/core/mind/bscontroller/impl.h
+++ b/ydb/core/mind/bscontroller/impl.h
@@ -1520,6 +1520,7 @@ public:
private:
TString InstanceId;
std::shared_ptr<std::atomic_uint64_t> SelfHealUnreassignableGroups = std::make_shared<std::atomic_uint64_t>();
+ std::shared_ptr<std::atomic_uint64_t> GroupLayoutSanitizerInvalidGroups = std::make_shared<std::atomic_uint64_t>();
TMaybe<TActorId> MigrationId;
TVSlots VSlots; // ordering is important
TPDisks PDisks; // ordering is important
diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp
index 7618d1a93e..de5282bce9 100644
--- a/ydb/core/mind/bscontroller/self_heal.cpp
+++ b/ydb/core/mind/bscontroller/self_heal.cpp
@@ -284,6 +284,7 @@ namespace NKikimr::NBsController {
bool DonorMode;
THostRecordMap HostRecords;
std::shared_ptr<TControlWrapper> EnableSelfHealWithDegraded;
+ std::shared_ptr<std::atomic_uint64_t> GroupsWithInvalidLayoutCounter;
using TTopologyDescr = std::tuple<TBlobStorageGroupType::EErasureSpecies, ui32, ui32, ui32>;
THashMap<TTopologyDescr, std::shared_ptr<TBlobStorageGroupInfo::TTopology>> Topologies;
@@ -296,7 +297,8 @@ namespace NKikimr::NBsController {
public:
TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups, THostRecordMap hostRecords,
bool groupLayoutSanitizerEnabled, bool allowMultipleRealmsOccupation, bool donorMode,
- std::shared_ptr<TControlWrapper> enableSelfHealWithDegraded)
+ std::shared_ptr<TControlWrapper> enableSelfHealWithDegraded,
+ std::shared_ptr<std::atomic_uint64_t> groupsWithInvalidLayoutCounter)
: TabletId(tabletId)
, UnreassignableGroups(std::move(unreassignableGroups))
, GroupLayoutSanitizerEnabled(groupLayoutSanitizerEnabled)
@@ -304,6 +306,7 @@ namespace NKikimr::NBsController {
, DonorMode(donorMode)
, HostRecords(std::move(hostRecords))
, EnableSelfHealWithDegraded(std::move(enableSelfHealWithDegraded))
+ , GroupsWithInvalidLayoutCounter(std::move(groupsWithInvalidLayoutCounter))
{}
void Bootstrap(const TActorId& parentId) {
@@ -318,17 +321,16 @@ namespace NKikimr::NBsController {
void Handle(TEvControllerUpdateSelfHealInfo::TPtr& ev) {
if (const auto& setting = ev->Get()->GroupLayoutSanitizerEnabled) {
- bool previousSetting = std::exchange(GroupLayoutSanitizerEnabled, *setting);
- if (!previousSetting && GroupLayoutSanitizerEnabled) {
- UpdateLayoutInformationForAllGroups();
- }
+ std::exchange(GroupLayoutSanitizerEnabled, *setting);
}
+
if (const auto& setting = ev->Get()->AllowMultipleRealmsOccupation) {
bool previousSetting = std::exchange(AllowMultipleRealmsOccupation, *setting);
if (previousSetting != AllowMultipleRealmsOccupation) {
UpdateLayoutInformationForAllGroups();
}
}
+
if (const auto& setting = ev->Get()->DonorMode) {
DonorMode = *setting;
}
@@ -345,9 +347,7 @@ namespace NKikimr::NBsController {
g.Content = std::move(*data);
- if (GroupLayoutSanitizerEnabled) {
- UpdateGroupLayoutInformation(g);
- }
+ UpdateGroupLayoutInformation(g);
ui32 numFailRealms = 0;
ui32 numFailDomainsPerFailRealm = 0;
@@ -500,6 +500,7 @@ namespace NKikimr::NBsController {
}
}
+ GroupsWithInvalidLayoutCounter->store(GroupsWithInvalidLayout.Size());
UnreassignableGroups->store(counter);
}
@@ -899,7 +900,7 @@ namespace NKikimr::NBsController {
IActor *TBlobStorageController::CreateSelfHealActor() {
Y_ABORT_UNLESS(HostRecords);
return new TSelfHealActor(TabletID(), SelfHealUnreassignableGroups, HostRecords, GroupLayoutSanitizerEnabled,
- AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded);
+ AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded, GroupLayoutSanitizerInvalidGroups);
}
void TBlobStorageController::InitializeSelfHealState() {
@@ -1159,6 +1160,7 @@ namespace NKikimr::NBsController {
);
TabletCounters->Simple()[NBlobStorageController::COUNTER_SELF_HEAL_UNREASSIGNABLE_GROUPS] = SelfHealUnreassignableGroups->load();
+ TabletCounters->Simple()[NBlobStorageController::COUNTER_GROUP_LAYOUT_SANITIZER_INVALID_GROUPS] = GroupLayoutSanitizerInvalidGroups->load();
Schedule(TDuration::Seconds(15), new TEvPrivate::TEvUpdateSelfHealCounters);
}
diff --git a/ydb/core/protos/counters_bs_controller.proto b/ydb/core/protos/counters_bs_controller.proto
index 5d6cdae97e..82642fa424 100644
--- a/ydb/core/protos/counters_bs_controller.proto
+++ b/ydb/core/protos/counters_bs_controller.proto
@@ -28,6 +28,7 @@ enum ESimpleCounters {
COUNTER_DISK_SCRUB_CUR_DISKS = 18 [(CounterOpts) = {Name: "CurrentlyScrubbedDisks"}];
COUNTER_DISK_SCRUB_CUR_GROUPS = 19 [(CounterOpts) = {Name: "CurrentlyScrubbedGroups"}];
COUNTER_SELF_HEAL_UNREASSIGNABLE_GROUPS = 20 [(CounterOpts) = {Name: "SelfHealUnreassignableGroups"}];
+ COUNTER_GROUP_LAYOUT_SANITIZER_INVALID_GROUPS = 21 [(CounterOpts) = {Name: "GroupLayoutSanitizerInvlaidGroups"}];
}
enum ECumulativeCounters {