diff options
author | Sergey Belyakov <serg-belyakov@ydb.tech> | 2025-02-28 19:13:40 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-02-28 19:13:40 +0300 |
commit | bf4f55b8e360284b921e123eaec0ee6202d2b969 (patch) | |
tree | ea153216f1eaee64f5c5f9be1020817bf0e87a32 | |
parent | 62279d7c155ab55fcab3d2fabaa5c6adfeaa5cb0 (diff) | |
download | ydb-bf4f55b8e360284b921e123eaec0ee6202d2b969.tar.gz |
GroupLayoutSanitizer always monitors invalid groups, add UTs (#15026)
-rw-r--r-- | ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp | 66 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/group_layout_checker.cpp | 44 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/group_layout_checker.h | 2 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/impl.h | 1 | ||||
-rw-r--r-- | ydb/core/mind/bscontroller/self_heal.cpp | 20 | ||||
-rw-r--r-- | ydb/core/protos/counters_bs_controller.proto | 1 |
6 files changed, 57 insertions, 77 deletions
diff --git a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp index 26c40dd7d1..34e660ca02 100644 --- a/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp +++ b/ydb/core/blobstorage/ut_blobstorage/sanitize_groups.cpp @@ -25,8 +25,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { } } - void CreateEnv(std::unique_ptr<TEnvironmentSetup>& env, std::vector<TNodeLocation>& locations) { - TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc; + void CreateEnv(std::unique_ptr<TEnvironmentSetup>& env, std::vector<TNodeLocation>& locations, + TBlobStorageGroupType groupType) { const ui32 numNodes = locations.size(); env.reset(new TEnvironmentSetup({ @@ -37,39 +37,49 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { const ui32 disksPerNode = 1; const ui32 slotsPerDisk = 3; + + env->Runtime->FilterFunction = CatchSanitizeRequests; env->CreateBoxAndPool(disksPerNode, numNodes * disksPerNode * slotsPerDisk / 9); + env->Runtime->FilterFunction = {}; } - Y_UNIT_TEST(Test3dc) { + NActorsInterconnect::TNodeLocation LocationGenerator(ui32 dc, ui32 rack, ui32 unit) { + NActorsInterconnect::TNodeLocation proto; + proto.SetDataCenter(ToString(dc)); + proto.SetRack(ToString(rack)); + proto.SetUnit(ToString(unit)); + return proto; + } + + void Test(TBlobStorageGroupType groupType, ui32 dcs, ui32 racks, ui32 units) { std::vector<TNodeLocation> locations; - TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) { - NActorsInterconnect::TNodeLocation proto; - proto.SetDataCenter(ToString(dc)); - proto.SetRack(ToString(rack)); - proto.SetUnit(ToString(unit)); - return proto; - }; - MakeLocations(locations, 3, 5, 1, locationGenerator); + MakeLocations(locations, dcs, racks, units, LocationGenerator); std::unique_ptr<TEnvironmentSetup> env; - CreateEnv(env, locations); - TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc; - TGroupGeometryInfo geom = CreateGroupGeometry(groupType); + CreateEnv(env, locations, groupType); + + // Assure that sanitizer doesn't send request to initially allocated groups env->Runtime->FilterFunction = CatchSanitizeRequests; + env->UpdateSettings(true, false, true); + env->Sim(TDuration::Minutes(3)); + env->UpdateSettings(false, false, false); + + TGroupGeometryInfo geom = CreateGroupGeometry(groupType); TString error; auto cfg = env->FetchBaseConfig(); UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error); - env->Cleanup(); // Shuffle node locayion, assure that layout error occured - std::random_shuffle(locations.begin(), locations.end()); - env->Initialize(); - env->Sim(TDuration::Seconds(100)); - cfg = env->FetchBaseConfig(); - CheckBaseConfigLayout(geom, cfg, true, error); + do { + env->Cleanup(); + std::random_shuffle(locations.begin(), locations.end()); + env->Initialize(); + env->Sim(TDuration::Seconds(100)); + cfg = env->FetchBaseConfig(); + } while (CheckBaseConfigLayout(geom, cfg, true, error)); Cerr << error << Endl; // Sanitize groups @@ -86,6 +96,18 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { UNIT_ASSERT_C(CheckBaseConfigLayout(geom, cfg, true, error), error); } + Y_UNIT_TEST(Test3dc) { + Test(TBlobStorageGroupType::ErasureMirror3dc, 3, 5, 1); + } + + Y_UNIT_TEST(TestBlock4Plus2) { + Test(TBlobStorageGroupType::Erasure4Plus2Block, 1, 10, 2); + } + + Y_UNIT_TEST(TestMirror3of4) { + Test(TBlobStorageGroupType::ErasureMirror3of4, 1, 10, 2); + } + TString PrintGroups(TBlobStorageGroupType groupType, const NKikimrBlobStorage::TBaseConfig& cfg, std::vector<TNodeLocation> locations) { TGroupGeometryInfo geom = CreateGroupGeometry(groupType); @@ -137,6 +159,7 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { } void TestMultipleRealmsOccupation(bool allowMultipleRealmsOccupation) { + TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc; std::vector<TNodeLocation> locations; TLocationGenerator locationGenerator = [](ui32 dc, ui32 rack, ui32 unit) { NActorsInterconnect::TNodeLocation proto; @@ -152,9 +175,8 @@ Y_UNIT_TEST_SUITE(GroupLayoutSanitizer) { }; MakeLocations(locations, 4, 5, 1, locationGenerator); std::unique_ptr<TEnvironmentSetup> env; - CreateEnv(env, locations); + CreateEnv(env, locations, groupType); - TBlobStorageGroupType groupType = TBlobStorageGroupType::ErasureMirror3dc; TGroupGeometryInfo geom = CreateGroupGeometry(groupType); env->Runtime->FilterFunction = CatchSanitizeRequests; diff --git a/ydb/core/mind/bscontroller/group_layout_checker.cpp b/ydb/core/mind/bscontroller/group_layout_checker.cpp index 8ab76e3e4f..31e822eb4f 100644 --- a/ydb/core/mind/bscontroller/group_layout_checker.cpp +++ b/ydb/core/mind/bscontroller/group_layout_checker.cpp @@ -1,47 +1,3 @@ #include "group_layout_checker.h" -#include "group_geometry_info.h" - -namespace NKikimr::NBsController { - - TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout) { - using namespace NLayoutChecker; - - if (layout.empty()) { - return {}; - } - - TBlobStorageGroupInfo::TTopology topology(geom.GetType(), geom.GetNumFailRealms(), geom.GetNumFailDomainsPerFailRealm(), - geom.GetNumVDisksPerFailDomain(), true); - TGroupLayout group(topology); - TDomainMapper mapper; - THashMap<TVDiskIdShort, TPDiskLayoutPosition> map; - for (const auto& [vdiskId, p] : layout) { - const auto& [location, pdiskId] = p; - TPDiskLayoutPosition pos(mapper, location, pdiskId, geom); - group.AddDisk(pos, topology.GetOrderNumber(vdiskId)); - map.emplace(vdiskId, pos); - } - - std::vector<std::pair<TScore, TVDiskIdShort>> scoreboard; - for (const auto& [vdiskId, pos] : map) { - scoreboard.emplace_back(group.GetCandidateScore(pos, topology.GetOrderNumber(vdiskId)), vdiskId); - } - - auto comp1 = [](const auto& x, const auto& y) { return x.second < y.second; }; - std::sort(scoreboard.begin(), scoreboard.end(), comp1); - - auto comp = [](const auto& x, const auto& y) { return x.first.BetterThan(y.first); }; - std::sort(scoreboard.begin(), scoreboard.end(), comp); - TLayoutCheckResult res; - const auto reference = scoreboard.back().first; - if (!reference.SameAs({})) { // not perfectly correct layout - for (; !scoreboard.empty() && !scoreboard.back().first.BetterThan(reference); scoreboard.pop_back()) { - res.Candidates.push_back(scoreboard.back().second); - } - } - return res; - } - -} // NKikimr::NBsController Y_DECLARE_OUT_SPEC(, NKikimr::NBsController::NLayoutChecker::TEntityId, stream, value) { value.Output(stream); } diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h index 3c42fef3d4..192a6690c9 100644 --- a/ydb/core/mind/bscontroller/group_layout_checker.h +++ b/ydb/core/mind/bscontroller/group_layout_checker.h @@ -289,6 +289,4 @@ namespace NKikimr::NBsController { } }; - TLayoutCheckResult CheckGroupLayout(const TGroupGeometryInfo& geom, const THashMap<TVDiskIdShort, std::pair<TNodeLocation, TPDiskId>>& layout); - } // NKikimr::NBsController diff --git a/ydb/core/mind/bscontroller/impl.h b/ydb/core/mind/bscontroller/impl.h index e5b745e3f6..e91c805e48 100644 --- a/ydb/core/mind/bscontroller/impl.h +++ b/ydb/core/mind/bscontroller/impl.h @@ -1520,6 +1520,7 @@ public: private: TString InstanceId; std::shared_ptr<std::atomic_uint64_t> SelfHealUnreassignableGroups = std::make_shared<std::atomic_uint64_t>(); + std::shared_ptr<std::atomic_uint64_t> GroupLayoutSanitizerInvalidGroups = std::make_shared<std::atomic_uint64_t>(); TMaybe<TActorId> MigrationId; TVSlots VSlots; // ordering is important TPDisks PDisks; // ordering is important diff --git a/ydb/core/mind/bscontroller/self_heal.cpp b/ydb/core/mind/bscontroller/self_heal.cpp index 7618d1a93e..de5282bce9 100644 --- a/ydb/core/mind/bscontroller/self_heal.cpp +++ b/ydb/core/mind/bscontroller/self_heal.cpp @@ -284,6 +284,7 @@ namespace NKikimr::NBsController { bool DonorMode; THostRecordMap HostRecords; std::shared_ptr<TControlWrapper> EnableSelfHealWithDegraded; + std::shared_ptr<std::atomic_uint64_t> GroupsWithInvalidLayoutCounter; using TTopologyDescr = std::tuple<TBlobStorageGroupType::EErasureSpecies, ui32, ui32, ui32>; THashMap<TTopologyDescr, std::shared_ptr<TBlobStorageGroupInfo::TTopology>> Topologies; @@ -296,7 +297,8 @@ namespace NKikimr::NBsController { public: TSelfHealActor(ui64 tabletId, std::shared_ptr<std::atomic_uint64_t> unreassignableGroups, THostRecordMap hostRecords, bool groupLayoutSanitizerEnabled, bool allowMultipleRealmsOccupation, bool donorMode, - std::shared_ptr<TControlWrapper> enableSelfHealWithDegraded) + std::shared_ptr<TControlWrapper> enableSelfHealWithDegraded, + std::shared_ptr<std::atomic_uint64_t> groupsWithInvalidLayoutCounter) : TabletId(tabletId) , UnreassignableGroups(std::move(unreassignableGroups)) , GroupLayoutSanitizerEnabled(groupLayoutSanitizerEnabled) @@ -304,6 +306,7 @@ namespace NKikimr::NBsController { , DonorMode(donorMode) , HostRecords(std::move(hostRecords)) , EnableSelfHealWithDegraded(std::move(enableSelfHealWithDegraded)) + , GroupsWithInvalidLayoutCounter(std::move(groupsWithInvalidLayoutCounter)) {} void Bootstrap(const TActorId& parentId) { @@ -318,17 +321,16 @@ namespace NKikimr::NBsController { void Handle(TEvControllerUpdateSelfHealInfo::TPtr& ev) { if (const auto& setting = ev->Get()->GroupLayoutSanitizerEnabled) { - bool previousSetting = std::exchange(GroupLayoutSanitizerEnabled, *setting); - if (!previousSetting && GroupLayoutSanitizerEnabled) { - UpdateLayoutInformationForAllGroups(); - } + std::exchange(GroupLayoutSanitizerEnabled, *setting); } + if (const auto& setting = ev->Get()->AllowMultipleRealmsOccupation) { bool previousSetting = std::exchange(AllowMultipleRealmsOccupation, *setting); if (previousSetting != AllowMultipleRealmsOccupation) { UpdateLayoutInformationForAllGroups(); } } + if (const auto& setting = ev->Get()->DonorMode) { DonorMode = *setting; } @@ -345,9 +347,7 @@ namespace NKikimr::NBsController { g.Content = std::move(*data); - if (GroupLayoutSanitizerEnabled) { - UpdateGroupLayoutInformation(g); - } + UpdateGroupLayoutInformation(g); ui32 numFailRealms = 0; ui32 numFailDomainsPerFailRealm = 0; @@ -500,6 +500,7 @@ namespace NKikimr::NBsController { } } + GroupsWithInvalidLayoutCounter->store(GroupsWithInvalidLayout.Size()); UnreassignableGroups->store(counter); } @@ -899,7 +900,7 @@ namespace NKikimr::NBsController { IActor *TBlobStorageController::CreateSelfHealActor() { Y_ABORT_UNLESS(HostRecords); return new TSelfHealActor(TabletID(), SelfHealUnreassignableGroups, HostRecords, GroupLayoutSanitizerEnabled, - AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded); + AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded, GroupLayoutSanitizerInvalidGroups); } void TBlobStorageController::InitializeSelfHealState() { @@ -1159,6 +1160,7 @@ namespace NKikimr::NBsController { ); TabletCounters->Simple()[NBlobStorageController::COUNTER_SELF_HEAL_UNREASSIGNABLE_GROUPS] = SelfHealUnreassignableGroups->load(); + TabletCounters->Simple()[NBlobStorageController::COUNTER_GROUP_LAYOUT_SANITIZER_INVALID_GROUPS] = GroupLayoutSanitizerInvalidGroups->load(); Schedule(TDuration::Seconds(15), new TEvPrivate::TEvUpdateSelfHealCounters); } diff --git a/ydb/core/protos/counters_bs_controller.proto b/ydb/core/protos/counters_bs_controller.proto index 5d6cdae97e..82642fa424 100644 --- a/ydb/core/protos/counters_bs_controller.proto +++ b/ydb/core/protos/counters_bs_controller.proto @@ -28,6 +28,7 @@ enum ESimpleCounters { COUNTER_DISK_SCRUB_CUR_DISKS = 18 [(CounterOpts) = {Name: "CurrentlyScrubbedDisks"}]; COUNTER_DISK_SCRUB_CUR_GROUPS = 19 [(CounterOpts) = {Name: "CurrentlyScrubbedGroups"}]; COUNTER_SELF_HEAL_UNREASSIGNABLE_GROUPS = 20 [(CounterOpts) = {Name: "SelfHealUnreassignableGroups"}]; + COUNTER_GROUP_LAYOUT_SANITIZER_INVALID_GROUPS = 21 [(CounterOpts) = {Name: "GroupLayoutSanitizerInvlaidGroups"}]; } enum ECumulativeCounters { |