diff options
author | t1mursadykov <t1mursadykov@ydb.tech> | 2023-05-04 18:58:35 +0300 |
---|---|---|
committer | t1mursadykov <t1mursadykov@ydb.tech> | 2023-05-04 18:58:35 +0300 |
commit | d8041116809f56885d742109346f7d530e7b0def (patch) | |
tree | 621b10bc3f620085689f725692844275963d884e | |
parent | 82951621d9c50b5360d435b26f44d38adfd03f0d (diff) | |
download | ydb-d8041116809f56885d742109346f7d530e7b0def.tar.gz |
Unify checking in CMS
-rw-r--r-- | ydb/core/cms/CMakeLists.darwin-x86_64.txt | 11 | ||||
-rw-r--r-- | ydb/core/cms/CMakeLists.linux-aarch64.txt | 11 | ||||
-rw-r--r-- | ydb/core/cms/CMakeLists.linux-x86_64.txt | 11 | ||||
-rw-r--r-- | ydb/core/cms/CMakeLists.windows-x86_64.txt | 11 | ||||
-rw-r--r-- | ydb/core/cms/checkers_ut.cpp | 294 | ||||
-rw-r--r-- | ydb/core/cms/cluster_info.cpp | 113 | ||||
-rw-r--r-- | ydb/core/cms/cluster_info.h | 34 | ||||
-rw-r--r-- | ydb/core/cms/cluster_info_ut.cpp | 37 | ||||
-rw-r--r-- | ydb/core/cms/cms.cpp | 107 | ||||
-rw-r--r-- | ydb/core/cms/cms_impl.h | 3 | ||||
-rw-r--r-- | ydb/core/cms/cms_ut.cpp | 90 | ||||
-rw-r--r-- | ydb/core/cms/cms_ut_common.cpp | 10 | ||||
-rw-r--r-- | ydb/core/cms/erasure_checkers.cpp | 482 | ||||
-rw-r--r-- | ydb/core/cms/erasure_checkers.h | 144 | ||||
-rw-r--r-- | ydb/core/cms/node_checkers.cpp | 280 | ||||
-rw-r--r-- | ydb/core/cms/node_checkers.h | 102 | ||||
-rw-r--r-- | ydb/core/cms/ut/CMakeLists.darwin-x86_64.txt | 1 | ||||
-rw-r--r-- | ydb/core/cms/ut/CMakeLists.linux-aarch64.txt | 1 | ||||
-rw-r--r-- | ydb/core/cms/ut/CMakeLists.linux-x86_64.txt | 1 | ||||
-rw-r--r-- | ydb/core/cms/ut/CMakeLists.windows-x86_64.txt | 1 | ||||
-rw-r--r-- | ydb/public/api/protos/draft/ydb_maintenance.proto | 10 |
21 files changed, 1241 insertions, 513 deletions
diff --git a/ydb/core/cms/CMakeLists.darwin-x86_64.txt b/ydb/core/cms/CMakeLists.darwin-x86_64.txt index cf839e87ff8..7ef104adaa9 100644 --- a/ydb/core/cms/CMakeLists.darwin-x86_64.txt +++ b/ydb/core/cms/CMakeLists.darwin-x86_64.txt @@ -22,6 +22,12 @@ get_built_tool_path( enum_parser ) get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) +get_built_tool_path( TOOL_rescompiler_bin TOOL_rescompiler_dependency tools/rescompiler/bin @@ -93,6 +99,11 @@ generate_enum_serilization(ydb-core-cms INCLUDE_HEADERS ydb/core/cms/node_checkers.h ) +generate_enum_serilization(ydb-core-cms + ${CMAKE_SOURCE_DIR}/ydb/core/cms/erasure_checkers.h + INCLUDE_HEADERS + ydb/core/cms/erasure_checkers.h +) add_global_library_for(ydb-core-cms.global ydb-core-cms) target_link_libraries(ydb-core-cms.global PUBLIC diff --git a/ydb/core/cms/CMakeLists.linux-aarch64.txt b/ydb/core/cms/CMakeLists.linux-aarch64.txt index e1dcbee6629..54abfb25b63 100644 --- a/ydb/core/cms/CMakeLists.linux-aarch64.txt +++ b/ydb/core/cms/CMakeLists.linux-aarch64.txt @@ -22,6 +22,12 @@ get_built_tool_path( enum_parser ) get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) +get_built_tool_path( TOOL_rescompiler_bin TOOL_rescompiler_dependency tools/rescompiler/bin @@ -94,6 +100,11 @@ generate_enum_serilization(ydb-core-cms INCLUDE_HEADERS ydb/core/cms/node_checkers.h ) +generate_enum_serilization(ydb-core-cms + ${CMAKE_SOURCE_DIR}/ydb/core/cms/erasure_checkers.h + INCLUDE_HEADERS + ydb/core/cms/erasure_checkers.h +) add_global_library_for(ydb-core-cms.global ydb-core-cms) target_link_libraries(ydb-core-cms.global PUBLIC diff --git a/ydb/core/cms/CMakeLists.linux-x86_64.txt b/ydb/core/cms/CMakeLists.linux-x86_64.txt index e1dcbee6629..54abfb25b63 100644 --- a/ydb/core/cms/CMakeLists.linux-x86_64.txt +++ b/ydb/core/cms/CMakeLists.linux-x86_64.txt @@ -22,6 +22,12 @@ get_built_tool_path( enum_parser ) get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) +get_built_tool_path( TOOL_rescompiler_bin TOOL_rescompiler_dependency tools/rescompiler/bin @@ -94,6 +100,11 @@ generate_enum_serilization(ydb-core-cms INCLUDE_HEADERS ydb/core/cms/node_checkers.h ) +generate_enum_serilization(ydb-core-cms + ${CMAKE_SOURCE_DIR}/ydb/core/cms/erasure_checkers.h + INCLUDE_HEADERS + ydb/core/cms/erasure_checkers.h +) add_global_library_for(ydb-core-cms.global ydb-core-cms) target_link_libraries(ydb-core-cms.global PUBLIC diff --git a/ydb/core/cms/CMakeLists.windows-x86_64.txt b/ydb/core/cms/CMakeLists.windows-x86_64.txt index cf839e87ff8..7ef104adaa9 100644 --- a/ydb/core/cms/CMakeLists.windows-x86_64.txt +++ b/ydb/core/cms/CMakeLists.windows-x86_64.txt @@ -22,6 +22,12 @@ get_built_tool_path( enum_parser ) get_built_tool_path( + TOOL_enum_parser_bin + TOOL_enum_parser_dependency + tools/enum_parser/enum_parser + enum_parser +) +get_built_tool_path( TOOL_rescompiler_bin TOOL_rescompiler_dependency tools/rescompiler/bin @@ -93,6 +99,11 @@ generate_enum_serilization(ydb-core-cms INCLUDE_HEADERS ydb/core/cms/node_checkers.h ) +generate_enum_serilization(ydb-core-cms + ${CMAKE_SOURCE_DIR}/ydb/core/cms/erasure_checkers.h + INCLUDE_HEADERS + ydb/core/cms/erasure_checkers.h +) add_global_library_for(ydb-core-cms.global ydb-core-cms) target_link_libraries(ydb-core-cms.global PUBLIC diff --git a/ydb/core/cms/checkers_ut.cpp b/ydb/core/cms/checkers_ut.cpp new file mode 100644 index 00000000000..0964432c5f6 --- /dev/null +++ b/ydb/core/cms/checkers_ut.cpp @@ -0,0 +1,294 @@ +#include "cluster_info.h" +#include "erasure_checkers.h" +#include "node_checkers.h" +#include "ut_helpers.h" +#include "util/string/cast.h" + +#include <ydb/core/protos/cms.pb.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/ptr.h> +#include <util/generic/vector.h> + +#include <bitset> +#include <string> + +namespace NKikimr::NCmsTest { + +using namespace NCms; +using namespace NKikimrCms; +using namespace Ydb::Maintenance; + +TVector<TVDiskID> GenerateDefaultBlock42Group() { + TVector<TVDiskID> group; + for (ui32 i = 0; i < 8; ++i) { + group.push_back(TVDiskID(0, 1, 0, i % 8, 0)); + } + return group; +} + +TVector<TVDiskID> GenerateDefaultMirror3dcGroup() { + TVector<TVDiskID> group; + for (ui32 i = 0; i < 9; ++i) { + group.push_back(TVDiskID(0, 1, i / 3, i % 3, 0)); + } + return group; +} + +Y_UNIT_TEST_SUITE(TCmsCheckersTest) { + Y_UNIT_TEST(DefaultErasureCheckerAvailabilityMode) + { + TDefaultErasureChecker checker(0); + + auto vdisks = GenerateDefaultBlock42Group(); + for (auto vdisk : vdisks) { + checker.UpdateVDisk(vdisk, UP); + } + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[0], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_OK); + + checker.UpdateVDisk(vdisks[0], DOWN); + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[0], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[1], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[0], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[1], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + + checker.LockVDisk(vdisks[0]); + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[0], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_ALREADY_LOCKED); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[1], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[0], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_ALREADY_LOCKED); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[1], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + } + + Y_UNIT_TEST(Mirror3dcCheckerAvailabilityMode) + { + TMirror3dcChecker checker(0); + + auto vdisks = GenerateDefaultMirror3dcGroup(); + for (auto vdisk : vdisks) { + checker.UpdateVDisk(vdisk, UP); + } + + // One disabled disk for max availability + for (ui32 i = 0; i < 9; ++i) { + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_OK); + checker.LockVDisk(vdisks[i]); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_ALREADY_LOCKED); + + for (ui32 j = 0; j < 9; ++j) { + if (i == j) + continue; + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[j], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + } + + checker.UnlockVDisk(vdisks[i]); + } + + for (ui32 dc = 0; dc < 3; ++dc) { + // Minus 1 dc + checker.LockVDisk(vdisks[dc * 3]); + checker.LockVDisk(vdisks[dc * 3 + 1]); + checker.LockVDisk(vdisks[dc * 3 + 2]); + + for (ui32 i = 0; i < 9; ++i) { + if ((i <= dc * 3 + 2) && (i >= dc * 3)) { + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_ALREADY_LOCKED); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_ALREADY_LOCKED); + continue; + } + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + } + + // Minus 2 in dc + for (ui32 i = 0; i < 3; ++i) { + checker.UnlockVDisk(vdisks[dc * 3 + i]); + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[dc * 3 + i], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[dc * 3 + i], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + + for (ui32 j = 0; j < 9; ++j) { + if ((j <= dc * 3 + 2) && (j >= dc * 3)) + continue; + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + } + + checker.LockVDisk(vdisks[dc * 3 + i]); + } + + checker.UnlockVDisk(vdisks[dc * 3]); + checker.UnlockVDisk(vdisks[dc * 3 + 1]); + checker.UnlockVDisk(vdisks[dc * 3 + 2]); + } + + // Minus 1 in each dc + for (ui32 i = 0; i < 3; ++i) { + checker.LockVDisk(vdisks[i]); + checker.LockVDisk(vdisks[i + 3]); + checker.LockVDisk(vdisks[i + 6]); + + for (ui32 j = 0; j < 9; ++j) { + if (j % 3 == i) + continue; + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[j], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[j], MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + } + + checker.UnlockVDisk(vdisks[i]); + checker.UnlockVDisk(vdisks[i + 3]); + checker.UnlockVDisk(vdisks[i + 6]); + } + + checker.UpdateVDisk(vdisks[0], DOWN); + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[0], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[1], MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + } + + Y_UNIT_TEST(DefaultErasureCheckerPriorities) + { + TDefaultErasureChecker checker(0); + + auto vdisks = GenerateDefaultBlock42Group(); + for (auto vdisk : vdisks) { + checker.UpdateVDisk(vdisk, UP); + } + + // Check one scheduled task with order + for (ui32 i = 0; i < 8; ++i) { + checker.EmplaceTask(vdisks[i], 0, 1, "task-1"); + + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_MAX_AVAILABILITY, 0, 2), ActionState::ACTION_REASON_LOW_PRIORITY); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[i], MODE_KEEP_AVAILABLE, 0, 2), ActionState::ACTION_REASON_LOW_PRIORITY); + + for (ui32 j = 0; j < 8; ++j) { + if (j == i) + continue; + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[j], MODE_MAX_AVAILABILITY, 0, 2), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[j], MODE_KEEP_AVAILABLE, 0, 2), ActionState::ACTION_REASON_OK); + } + + checker.RemoveTask("task-1"); + } + + // Check two scheduled task with priority and order + checker.EmplaceTask(vdisks[1], 1, 1, "task-1"); + checker.EmplaceTask(vdisks[2], 2, 1, "task-2"); + + // Priority is higher than task-1 but lower than task-2 + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[3], MODE_MAX_AVAILABILITY, 2, 2), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[3], MODE_KEEP_AVAILABLE, 2, 2), ActionState::ACTION_REASON_OK); + + checker.RemoveTask("task-1"); + checker.RemoveTask("task-2"); + } + + Y_UNIT_TEST(Mirror3dcCheckerPriorities) + { + TMirror3dcChecker checker(0); + + auto vdisks = GenerateDefaultMirror3dcGroup(); + for (auto vdisk : vdisks) { + checker.UpdateVDisk(vdisk, UP); + } + + // task-1 > task-2 > task-3 > task-4 + checker.EmplaceTask(vdisks[0], 2, 2, "task-1"); + checker.EmplaceTask(vdisks[1], 2, 5, "task-2"); + checker.EmplaceTask(vdisks[2], 1, 2, "task-3"); + checker.EmplaceTask(vdisks[3], 1, 4, "task-4"); + + // Highest priority + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[4], MODE_MAX_AVAILABILITY, 3, 1), ActionState::ACTION_REASON_OK); + // Blocked by all tasks + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[4], MODE_KEEP_AVAILABLE, 1, 6), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS); + // Blocked by task-1, task-2, task-3 + UNIT_ASSERT_EQUAL(checker.TryToLockVDisk(vdisks[3], MODE_KEEP_AVAILABLE, 1, 3), ActionState::ACTION_REASON_OK); + } + + Y_UNIT_TEST(ClusterNodesCounter) + { + const ui32 nodeCount = 30; + TClusterLimitsCounter checker(0, 0); + + for (ui32 i = 1; i <= nodeCount; ++i) { + checker.UpdateNode(i, UP); + } + + // Without limit allow all nodes + for (ui32 i = 1; i < nodeCount; ++i) { + checker.LockNode(i); + } + + UNIT_ASSERT_EQUAL(checker.TryToLockNode(nodeCount, MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockNode(nodeCount, MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + + for (ui32 i = 1; i < nodeCount; ++i) { + checker.UnlockNode(i); + } + + // Limit 15 nodes + checker.ApplyLimits(15, 0); + for (ui32 i = 1; i < 15; ++i) { + checker.LockNode(i); + } + + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + + checker.ApplyLimits(0, 50); + + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_OK); + + checker.LockNode(15); + checker.ApplyLimits(15, 0); + + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_MAX_AVAILABILITY, 0, 0), ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED); + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_KEEP_AVAILABLE, 0, 0), ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED); + + checker.ApplyLimits(0, 50); + + for (ui32 i = 1; i <= 15; ++i) { + checker.UnlockNode(i); + } + + checker.ApplyLimits(0, 0); + for (ui32 i = 1; i < nodeCount; ++i) { + checker.EmplaceTask(i, 1, 3, "task-1"); + } + + UNIT_ASSERT_EQUAL(checker.TryToLockNode(nodeCount, MODE_MAX_AVAILABILITY, 1, 1), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockNode(nodeCount, MODE_KEEP_AVAILABLE, 1, 1), ActionState::ACTION_REASON_OK); + + UNIT_ASSERT_EQUAL(checker.TryToLockNode(nodeCount, MODE_MAX_AVAILABILITY, 1, 4), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockNode(nodeCount, MODE_KEEP_AVAILABLE, 1, 4), ActionState::ACTION_REASON_OK); + + checker.RemoveTask("task-1"); + + checker.ApplyLimits(15, 0); + for (ui32 i = 1; i < 15; ++i) { + checker.EmplaceTask(i, 1, 3, "task-2"); + } + checker.EmplaceTask(15, 1, 4, "task-3"); + + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_MAX_AVAILABILITY, 1, 1), ActionState::ACTION_REASON_OK); + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_KEEP_AVAILABLE, 1, 1), ActionState::ACTION_REASON_OK); + + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_MAX_AVAILABILITY, 1, 5), ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED); + UNIT_ASSERT_EQUAL(checker.TryToLockNode(17, MODE_KEEP_AVAILABLE, 1, 5), ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED); + + checker.RemoveTask("task-2"); + checker.RemoveTask("task-3"); + } + +} +} diff --git a/ydb/core/cms/cluster_info.cpp b/ydb/core/cms/cluster_info.cpp index 28e6baeddbf..1a3c85d5b33 100644 --- a/ydb/core/cms/cluster_info.cpp +++ b/ydb/core/cms/cluster_info.cpp @@ -1,8 +1,11 @@ #include "cluster_info.h" #include "cms_state.h" #include "node_checkers.h" +#include "erasure_checkers.h" +#include <ydb/core/protos/cms.pb.h> #include <ydb/core/protos/services.pb.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> #include <library/cpp/actors/core/actor.h> #include <library/cpp/actors/core/log.h> @@ -401,7 +404,7 @@ void TClusterInfo::SetNodeState(ui32 nodeId, NKikimrCms::EState state, const NKi } } - node.UpdateNodeState(); + node.UpdateNodeState(); } void TClusterInfo::ClearNode(ui32 nodeId) @@ -416,6 +419,10 @@ void TClusterInfo::ClearNode(ui32 nodeId) node.HasTenantInfo = false; node.State = NKikimrCms::DOWN; node.UpdateNodeState(); + + for (auto& vdisk : node.VDisks) { + BSGroup(vdisk.GroupID).GroupChecker->UpdateVDisk(vdisk, DOWN); + } } void TClusterInfo::ApplyInitialNodeTenants(const TActorContext& ctx, const THashMap<ui32, TString>& nodeTenants) @@ -489,7 +496,15 @@ void TClusterInfo::UpdatePDiskState(const TPDiskID &id, const NKikimrWhiteboard: } auto &pdisk = PDiskRef(id); - pdisk.State = info.GetState() == NKikimrBlobStorage::TPDiskState::Normal ? UP : DOWN; + auto state = info.GetState() == NKikimrBlobStorage::TPDiskState::Normal ? UP : DOWN; + pdisk.State = state; + + if (state == UP) + return; + + for (auto &vdisk : pdisk.VDisks) { + BSGroup(vdisk.GroupID).GroupChecker->UpdateVDisk(vdisk, state); + } } void TClusterInfo::AddVDisk(const NKikimrBlobStorage::TBaseConfig::TVSlot &info) @@ -545,10 +560,13 @@ void TClusterInfo::UpdateVDiskState(const TVDiskID &id, const NKikimrWhiteboard: } auto &vdisk = VDiskRef(id); - if (info.GetVDiskState() == NKikimrWhiteboard::OK && info.GetReplicated()) + if (info.GetVDiskState() == NKikimrWhiteboard::OK && info.GetReplicated()) { vdisk.State = UP; - else + BSGroup(vdisk.VDiskId.GroupID).GroupChecker->UpdateVDisk(id, UP); + } else { vdisk.State = DOWN; + BSGroup(vdisk.VDiskId.GroupID).GroupChecker->UpdateVDisk(id, DOWN); + } } void TClusterInfo::AddBSGroup(const NKikimrBlobStorage::TBaseConfig::TGroup &info) @@ -557,6 +575,8 @@ void TClusterInfo::AddBSGroup(const NKikimrBlobStorage::TBaseConfig::TGroup &inf bsgroup.GroupId = info.GetGroupId(); if (info.GetErasureSpecies()) bsgroup.Erasure = {TErasureType::ErasureSpeciesByName(info.GetErasureSpecies())}; + + bsgroup.GroupChecker = CreateStorageGroupChecker(bsgroup.Erasure.GetErasure(), bsgroup.GroupId); for (const auto &vdisk : info.GetVSlotId()) { TPDiskID pdiskId = {vdisk.GetNodeId(), vdisk.GetPDiskId()}; Y_VERIFY_DEBUG(HasPDisk(pdiskId)); @@ -576,8 +596,10 @@ void TClusterInfo::AddBSGroup(const NKikimrBlobStorage::TBaseConfig::TGroup &inf bsgroup.VDisks.insert(pdisk.VSlots.at(vdisk.GetVSlotId())); } - for (auto &vdisk : bsgroup.VDisks) + for (auto &vdisk : bsgroup.VDisks) { VDiskRef(vdisk).BSGroups.insert(bsgroup.GroupId); + bsgroup.GroupChecker->AddVDisk(vdisk); + } BSGroups[bsgroup.GroupId] = std::move(bsgroup); } @@ -612,12 +634,18 @@ void TClusterInfo::AddPDiskTempLock(TPDiskID pdiskId, const NKikimrCms::TAction { auto &pdisk = PDiskRef(pdiskId); pdisk.TempLocks.push_back({RollbackPoint, action}); + + for (auto& vdisk : pdisk.VDisks) { + LogManager.AddLockVDiskOperation(vdisk ,BSGroup(vdisk.GroupID).GroupChecker); + } } void TClusterInfo::AddVDiskTempLock(TVDiskID vdiskId, const NKikimrCms::TAction &action) { auto &vdisk = VDiskRef(vdiskId); vdisk.TempLocks.push_back({RollbackPoint, action}); + + LogManager.AddLockVDiskOperation(vdiskId, BSGroup(vdiskId.GroupID).GroupChecker); } static TServices MakeServices(const NKikimrCms::TAction &action) { @@ -650,6 +678,10 @@ void TClusterInfo::ApplyActionWithoutLog(const NKikimrCms::TAction &action) for (const auto node : nodes) { for (auto &nodeGroup: node->NodeGroups) nodeGroup->LockNode(node->NodeId); + + for (auto vdisk : node->VDisks) { + BSGroup(vdisk.GroupID).GroupChecker->LockVDisk(vdisk); + } } } break; @@ -657,12 +689,12 @@ void TClusterInfo::ApplyActionWithoutLog(const NKikimrCms::TAction &action) for (const auto &device : action.GetDevices()) { if (HasPDisk(device)) { auto pdisk = &PDiskRef(device); - for (auto &nodeGroup: NodeRef(pdisk->NodeId).NodeGroups) - nodeGroup->LockNode(pdisk->NodeId); + for (auto vdisk : pdisk->VDisks) + BSGroup(vdisk.GroupID).GroupChecker->LockVDisk(vdisk); } else if (HasVDisk(device)) { auto vdisk = &VDiskRef(device); - for (auto &nodeGroup: NodeRef(vdisk->NodeId).NodeGroups) - nodeGroup->LockNode(vdisk->NodeId); + + BSGroup(vdisk->VDiskId.GroupID).GroupChecker->LockVDisk(vdisk->VDiskId); } } break; @@ -825,8 +857,9 @@ ui64 TClusterInfo::AddTempLocks(const NKikimrCms::TAction &action, const TActorC LogManager.ApplyAction(action, this); - for (auto item : items) + for (auto item : items) { item->TempLocks.push_back({RollbackPoint, action}); + } return items.size(); } @@ -841,6 +874,38 @@ ui64 TClusterInfo::ScheduleActions(const TRequestInfo &request, const TActorCont item->ScheduleLock({action, request.Owner, request.RequestId, request.Order}); locks += items.size(); + + switch (action.GetType()) { + case NKikimrCms::TAction::RESTART_SERVICES: + case NKikimrCms::TAction::SHUTDOWN_HOST: + if (auto nodes = NodePtrs(action.GetHost(), MakeServices(action))) { + for (const auto node : nodes) { + for (auto& group : node->NodeGroups) { + group->EmplaceTask(node->NodeId, 0, request.Order, request.RequestId); + } + for (auto &vdisk: node->VDisks) { + BSGroup(vdisk.GroupID).GroupChecker->EmplaceTask(vdisk, 0, request.Order, request.RequestId); + } + } + } + break; + case NKikimrCms::TAction::REPLACE_DEVICES: + for (const auto &device : action.GetDevices()) { + if (HasPDisk(device)) { + auto pdisk = &PDisk(device); + for (auto &vdisk: pdisk->VDisks) { + BSGroup(vdisk.GroupID).GroupChecker->EmplaceTask(vdisk, 0, request.Order, request.RequestId); + } + } else if (HasVDisk(device)) { + auto vdisk = &VDisk(device); + BSGroup(vdisk->VDiskId.GroupID).GroupChecker->EmplaceTask(vdisk->VDiskId, 0, request.Order, request.RequestId); + } + } + break; + + default: + break; + } } return locks; @@ -850,6 +915,20 @@ void TClusterInfo::UnscheduleActions(const TString &requestId) { for (auto &entry : LockableItems) entry.second->RemoveScheduledLocks(requestId); + + for (auto &group : BSGroups) { + group.second.GroupChecker->RemoveTask(requestId); + } + + ClusterNodes->RemoveTask(requestId); + + for (auto& [_, tenantChecker] : TenantNodesChecker) { + tenantChecker->RemoveTask(requestId); + } + + for (auto& [_, sysNodesCheckers] : SysNodesCheckers) { + sysNodesCheckers->RemoveTask(requestId); + } } void TClusterInfo::DeactivateScheduledLocks(ui64 order) @@ -918,7 +997,7 @@ void TClusterInfo::GenerateTenantNodesCheckers() { void TClusterInfo::GenerateSysTabletsNodesCheckers() { for (auto tablet : BootstrapConfig.GetTablet()) { - SysNodesCheckers[tablet.GetType()] = TSimpleSharedPtr<TSysTabletsNodesCounter>(new TSysTabletsNodesCounter(tablet.GetType())); + SysNodesCheckers[tablet.GetType()] = TSimpleSharedPtr<TSysTabletsNodesCounter>(new TSysTabletsNodesCounter(tablet.GetType())); for (auto nodeId : tablet.GetNode()) { NodeToTabletTypes[nodeId].push_back(tablet.GetType()); @@ -999,6 +1078,10 @@ void TOperationLogManager::ApplyAction(const NKikimrCms::TAction &action, for (const auto node : nodes) { for (auto &nodeGroup: node->NodeGroups) AddNodeLockOperation(node->NodeId, nodeGroup); + + for (auto& vdisk : node->VDisks) { + AddLockVDiskOperation(vdisk, clusterState->BSGroup(vdisk.GroupID).GroupChecker); + } } } break; @@ -1006,13 +1089,13 @@ void TOperationLogManager::ApplyAction(const NKikimrCms::TAction &action, for (const auto &device : action.GetDevices()) { if (clusterState->HasPDisk(device)) { auto pdisk = &clusterState->PDisk(device); - for (auto &nodeGroup: clusterState->NodeRef(pdisk->NodeId).NodeGroups) - AddNodeLockOperation(pdisk->NodeId, nodeGroup); + for (auto& vdisk : pdisk->VDisks) { + AddLockVDiskOperation(vdisk, clusterState->BSGroup(vdisk.GroupID).GroupChecker); + } } else if (clusterState->HasVDisk(device)) { auto vdisk = &clusterState->VDisk(device); - for (auto &nodeGroup: clusterState->NodeRef(vdisk->NodeId).NodeGroups) - AddNodeLockOperation(vdisk->NodeId, nodeGroup); + AddLockVDiskOperation(vdisk->VDiskId, clusterState->BSGroup(vdisk->VDiskId.GroupID).GroupChecker); } } break; diff --git a/ydb/core/cms/cluster_info.h b/ydb/core/cms/cluster_info.h index 923b5a7d05f..0b7253d3688 100644 --- a/ydb/core/cms/cluster_info.h +++ b/ydb/core/cms/cluster_info.h @@ -3,6 +3,7 @@ #include "defs.h" #include "config.h" #include "downtime.h" +#include "erasure_checkers.h" #include "node_checkers.h" #include "services.h" @@ -14,6 +15,7 @@ #include <ydb/core/protos/cms.pb.h> #include <ydb/core/protos/config.pb.h> #include <ydb/core/protos/console.pb.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> #include <library/cpp/actors/core/actor.h> #include <library/cpp/actors/interconnect/interconnect.h> @@ -471,6 +473,8 @@ struct TBSGroupInfo { ui32 GroupId = 0; TErasureType Erasure; TSet<TVDiskID> VDisks; + + TSimpleSharedPtr<IStorageGroupChecker> GroupChecker; }; /** @@ -596,6 +600,32 @@ public: } }; +class TLockDiskOperation : public TOperationBase { +private: + TVDiskID VDiskId; + +private: + TSimpleSharedPtr<IStorageGroupChecker> StorageGroupChecker; + +public: + TLockDiskOperation(const TVDiskID& vdiskId, TSimpleSharedPtr<IStorageGroupChecker> checker) + : TOperationBase(OPERATION_TYPE_LOCK_DISK) + , VDiskId(vdiskId) + , StorageGroupChecker(checker) + { + } + + void Do() override final { + StorageGroupChecker->LockVDisk(VDiskId); + } + + void Undo() override final { + StorageGroupChecker->UnlockVDisk(VDiskId); + } + + +}; + class TLogRollbackPoint : public TOperationBase { public: TLogRollbackPoint() : TOperationBase(OPERATION_TYPE_ROLLBACK_POINT) @@ -624,6 +654,10 @@ public: Log.emplace_back(new TLockNodeOperation(nodeId, nodesState))->Do(); } + void AddLockVDiskOperation(const TVDiskID& vdiskId, TSimpleSharedPtr<IStorageGroupChecker> checker) { + Log.emplace_back(new TLockDiskOperation(vdiskId, checker))->Do(); + } + void RollbackOperations() { while (!Log.empty() && Log.back()->Type != OPERATION_TYPE_ROLLBACK_POINT) { Log.back()->Undo(); diff --git a/ydb/core/cms/cluster_info_ut.cpp b/ydb/core/cms/cluster_info_ut.cpp index 0e79306a3dd..e56a530f2b5 100644 --- a/ydb/core/cms/cluster_info_ut.cpp +++ b/ydb/core/cms/cluster_info_ut.cpp @@ -292,56 +292,57 @@ Y_UNIT_TEST_SUITE(TClusterInfoTest) { UNIT_ASSERT_VALUES_EQUAL(cluster->NodesCount("localhost"), 2); cluster->AddPDisk(MakePDiskConfig(1, 1)); + cluster->AddPDisk(MakePDiskConfig(2, 2)); + cluster->AddVDisk(MakeVSlotConfig(1, {0, 1, 0, 0, 0}, 1, 0)); + cluster->AddVDisk(MakeVSlotConfig(2, {0, 1, 0, 1, 0}, 2, 0)); + cluster->AddBSGroup(MakeBSGroup(0, "none", 1, 1, 0, 2, 2, 0)); + cluster->UpdatePDiskState(NCms::TPDiskID(1, 1), MakePDiskInfo(1)); UNIT_ASSERT(cluster->HasPDisk(NCms::TPDiskID(1, 1))); UNIT_ASSERT(!cluster->HasPDisk(NCms::TPDiskID(1, 2))); UNIT_ASSERT(!cluster->HasPDisk(NCms::TPDiskID(2, 1))); - CheckPDisk(cluster->PDisk(NCms::TPDiskID(1, 1)), 1, 1, UP, 0); + CheckPDisk(cluster->PDisk(NCms::TPDiskID(1, 1)), 1, 1, UP, 1); UNIT_ASSERT(cluster->Node(1).PDisks.contains(NCms::TPDiskID(1, 1))); - cluster->AddVDisk(MakeVSlotConfig(1, {0, 1, 0, 0, 0}, 1, 0)); cluster->UpdateVDiskState({0, 1, 0, 0, 0}, MakeVDiskInfo({0, 1, 0, 0, 0}, 1, 0)); UNIT_ASSERT(cluster->HasVDisk({0, 1, 0, 0, 0})); - UNIT_ASSERT(!cluster->HasVDisk({0, 1, 0, 1, 0})); - CheckVDisk(cluster->VDisk({0, 1, 0, 0, 0}), {0, 1, 0, 0, 0}, 1, UP, 1, 0); + CheckVDisk(cluster->VDisk({0, 1, 0, 0, 0}), {0, 1, 0, 0, 0}, 1, UP, 1, 1); UNIT_ASSERT_VALUES_EQUAL(cluster->PDisk(NCms::TPDiskID(1, 1)).VDisks.size(), 1); UNIT_ASSERT(cluster->PDisk(NCms::TPDiskID(1, 1)).VDisks.contains(TVDiskID(0, 1, 0, 0, 0))); - cluster->AddPDisk(MakePDiskConfig(2, 2)); cluster->UpdatePDiskState(NCms::TPDiskID(2, 2), MakePDiskInfo(2)); UNIT_ASSERT(cluster->HasPDisk(NCms::TPDiskID(2, 2))); - CheckPDisk(cluster->PDisk(NCms::TPDiskID(2, 2)), 2, 2, UP, 0); + CheckPDisk(cluster->PDisk(NCms::TPDiskID(2, 2)), 2, 2, UP, 1); - cluster->AddVDisk(MakeVSlotConfig(2, {0, 1, 0, 1, 0}, 2, 0)); cluster->UpdateVDiskState({0, 1, 0, 1, 0}, MakeVDiskInfo({0, 1, 0, 1, 0}, 2, 0)); UNIT_ASSERT(cluster->HasVDisk({0, 1, 0, 1, 0})); UNIT_ASSERT(cluster->HasPDisk(NCms::TPDiskID(2, 2))); CheckPDisk(cluster->PDisk(NCms::TPDiskID(2, 2)), 2, 2, UP, 1); UNIT_ASSERT(cluster->PDisk(NCms::TPDiskID(2, 2)).VDisks.contains(TVDiskID(0, 1, 0, 1, 0))); - cluster->AddBSGroup(MakeBSGroup(1, "none", 1, 1, 0, 2, 2, 0)); - UNIT_ASSERT(cluster->HasBSGroup(1)); - UNIT_ASSERT(!cluster->HasBSGroup(2)); - CheckBSGroup(cluster->BSGroup(1), 1, TErasureType::ErasureNone, 2, + UNIT_ASSERT(cluster->HasBSGroup(0)); + UNIT_ASSERT(!cluster->HasBSGroup(1)); + CheckBSGroup(cluster->BSGroup(0), 0, TErasureType::ErasureNone, 2, TVDiskID(0, 1, 0, 0, 0), TVDiskID(0, 1, 0, 1, 0)); - CheckVDisk(cluster->VDisk({0, 1, 0, 0, 0}), 1, 1); - CheckVDisk(cluster->VDisk({0, 1, 0, 1, 0}), 1, 1); + CheckVDisk(cluster->VDisk({0, 1, 0, 0, 0}), 1, 0); + CheckVDisk(cluster->VDisk({0, 1, 0, 1, 0}), 1, 0); cluster->AddPDisk(MakePDiskConfig(3, 3)); + cluster->AddVDisk(MakeVSlotConfig(3, {0, 1, 0, 2, 0}, 3, 0)); + cluster->AddBSGroup(MakeBSGroup(2, "none", 1, 1, 0, 3, 3, 0)); + cluster->UpdatePDiskState(NCms::TPDiskID(3, 3), MakePDiskInfo(3)); UNIT_ASSERT(cluster->HasPDisk(NCms::TPDiskID(3, 3))); - CheckPDisk(cluster->PDisk(NCms::TPDiskID(3, 3)), 3, 3, UP, 0); + CheckPDisk(cluster->PDisk(NCms::TPDiskID(3, 3)), 3, 3, UP, 1); - cluster->AddVDisk(MakeVSlotConfig(3, {0, 1, 0, 2, 0}, 3, 0)); cluster->UpdateVDiskState({0, 1, 0, 2, 0}, MakeVDiskInfo({0, 1, 0, 2, 0}, 3, 0)); UNIT_ASSERT(cluster->HasVDisk({0, 1, 0, 2, 0})); - CheckVDisk(cluster->VDisk({0, 1, 0, 2, 0}), TVDiskID(0, 1, 0, 2, 0), 3, UP, 3, 0); + CheckVDisk(cluster->VDisk({0, 1, 0, 2, 0}), TVDiskID(0, 1, 0, 2, 0), 3, UP, 3, 1); - cluster->AddBSGroup(MakeBSGroup(2, "none", 1, 1, 0, 3, 3, 0)); UNIT_ASSERT(cluster->HasBSGroup(2)); CheckBSGroup(cluster->BSGroup(2), 2, TErasureType::ErasureNone, 2, TVDiskID(0, 1, 0, 0, 0), TVDiskID(0, 1, 0, 2, 0)); - CheckVDisk(cluster->VDisk({0, 1, 0, 0, 0}), 2, 1, 2); + CheckVDisk(cluster->VDisk({0, 1, 0, 0, 0}), 2, 0, 2); UNIT_ASSERT(cluster->HasVDisk({0, 1, 0, 2, 0})); CheckVDisk(cluster->VDisk({0, 1, 0, 2, 0}), TVDiskID(0, 1, 0, 2, 0), 3, UP, 3, 1, 2); diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index 211a7e0c622..c400618ccaa 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -17,6 +17,7 @@ #include <ydb/core/protos/config_units.pb.h> #include <ydb/core/protos/counters_cms.pb.h> #include <ydb/core/tablet_flat/tablet_flat_executed.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> #include <library/cpp/actors/core/actor.h> #include <library/cpp/actors/core/hfunc.h> @@ -110,6 +111,7 @@ namespace { bool TCms::CheckPermissionRequest(const TPermissionRequest &request, TPermissionResponse &response, TPermissionRequest &scheduled, + ui64 requestOrder, const TActorContext &ctx) { static THashMap<EStatusCode, ui32> CodesRate = BuildCodesRateMap({ @@ -166,6 +168,7 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request, opts.TenantPolicy = request.GetTenantPolicy(); opts.AvailabilityMode = request.GetAvailabilityMode(); opts.PartialPermissionAllowed = allowPartial; + opts.Order = requestOrder; TErrorInfo error; @@ -335,7 +338,7 @@ bool TCms::CheckAction(const TAction &action, case TAction::SHUTDOWN_HOST: return CheckActionShutdownHost(action, opts, error, ctx); case TAction::REPLACE_DEVICES: - return CheckActionReplaceDevices(action, opts.PermissionDuration, error); + return CheckActionReplaceDevices(action, opts, error); case TAction::START_SERVICES: case TAction::STOP_SERVICES: case TAction::ADD_HOST: @@ -542,9 +545,10 @@ bool TCms::CheckSysTabletsNode(const TActionOptions &opts, } for (auto &tabletType : ClusterInfo->NodeToTabletTypes[node.NodeId]) { - if (!ClusterInfo->SysNodesCheckers[tabletType]->TryToLockNode(node.NodeId, opts.AvailabilityMode)) { + auto reason = ClusterInfo->SysNodesCheckers[tabletType]->TryToLockNode(node.NodeId, opts.AvailabilityMode, 0, opts.Order); + if (reason != Ydb::Maintenance::ActionState::ACTION_REASON_OK) { error.Code = TStatus::DISALLOW_TEMP; - error.Reason = ClusterInfo->SysNodesCheckers[tabletType]->ReadableReason(node.NodeId, opts.AvailabilityMode); + error.Reason = ClusterInfo->SysNodesCheckers[tabletType]->ReadableReason(node.NodeId, opts.AvailabilityMode, reason); error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime; return false; } @@ -561,24 +565,27 @@ bool TCms::TryToLockNode(const TAction& action, TDuration duration = TDuration::MicroSeconds(action.GetDuration()); duration += opts.PermissionDuration; - if (!ClusterInfo->ClusterNodes->TryToLockNode(node.NodeId, opts.AvailabilityMode)) + auto clusterReason = ClusterInfo->ClusterNodes->TryToLockNode(node.NodeId, opts.AvailabilityMode, 0, opts.Order); + if (clusterReason != Ydb::Maintenance::ActionState::ACTION_REASON_OK) { error.Code = TStatus::DISALLOW_TEMP; - error.Reason = ClusterInfo->ClusterNodes->ReadableReason(node.NodeId, opts.AvailabilityMode); + error.Reason = ClusterInfo->ClusterNodes->ReadableReason(node.NodeId, opts.AvailabilityMode, clusterReason); error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime; return false; } if (node.Tenant - && opts.TenantPolicy != NONE - && !ClusterInfo->TenantNodesChecker[node.Tenant]->TryToLockNode(node.NodeId, opts.AvailabilityMode)) - { - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = ClusterInfo->TenantNodesChecker[node.Tenant]->ReadableReason(node.NodeId, opts.AvailabilityMode); - error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime; + && opts.TenantPolicy != NONE) { + auto tenantReason = ClusterInfo->TenantNodesChecker[node.Tenant]->TryToLockNode(node.NodeId, opts.AvailabilityMode, 0, opts.Order); - return false; + if (tenantReason != Ydb::Maintenance::ActionState::ACTION_REASON_OK) { + error.Code = TStatus::DISALLOW_TEMP; + error.Reason = ClusterInfo->TenantNodesChecker[node.Tenant]->ReadableReason(node.NodeId, opts.AvailabilityMode, tenantReason); + error.Deadline = TActivationContext::Now() + State->Config.DefaultRetryTime; + + return false; + } } return true; @@ -657,35 +664,37 @@ bool TCms::TryToLockVDisk(const TActionOptions& opts, return false; } - auto counters = CreateErasureCounter(ClusterInfo->BSGroup(groupId).Erasure.GetErasure(), vdisk, groupId); - counters->CountGroupState(ClusterInfo, State->Config.DefaultRetryTime, duration, error); + ui32 tempLocksCount = 0; + for (auto& vdiskId : group.VDisks) { + if (vdisk.VDiskId == vdiskId) + continue; - switch (opts.AvailabilityMode) { - case MODE_MAX_AVAILABILITY: - if (!counters->CheckForMaxAvailability(error, defaultDeadline, opts.PartialPermissionAllowed)) { - return false; + if (!ClusterInfo->VDisk(vdiskId).TempLocks.empty() + || !ClusterInfo->Node(ClusterInfo->VDisk(vdiskId).NodeId).TempLocks.empty() + || !ClusterInfo->PDisk(ClusterInfo->VDisk(vdiskId).PDiskId).TempLocks.empty()) { + tempLocksCount += 1; } - break; - case MODE_KEEP_AVAILABLE: - if (!counters->CheckForKeepAvailability(ClusterInfo, error, defaultDeadline, opts.PartialPermissionAllowed)) { - return false; - } - break; - case MODE_FORCE_RESTART: - if ( counters->GroupAlreadyHasLockedDisks() && opts.PartialPermissionAllowed) { - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = "You cannot get two or more disks from the same group at the same time" - " without specifying the PartialPermissionAllowed parameter"; - error.Deadline = defaultDeadline; - return false; - } - // Any number of down disks is OK for this mode. - break; - default: - error.Code = TStatus::WRONG_REQUEST; - error.Reason = Sprintf("Unknown availability mode: %s (%" PRIu32 ")", - EAvailabilityMode_Name(opts.AvailabilityMode).data(), - static_cast<ui32>(opts.AvailabilityMode)); + } + + if (opts.PartialPermissionAllowed && tempLocksCount == 1) { + error.Code = TStatus::DISALLOW_TEMP; + error.Reason = "You cannot get two or more disks from the same group at the same time"; + error.Deadline = defaultDeadline; + return false; + } + + auto result = group.GroupChecker->TryToLockVDisk(vdisk.VDiskId, opts.AvailabilityMode, 0, opts.Order); + bool resultIsOk = result == Ydb::Maintenance::ActionState::ACTION_REASON_OK; + + if (!resultIsOk && !opts.PartialPermissionAllowed && tempLocksCount > 0) { + error.Code = TStatus::DISALLOW; + error.Reason = "Request is incorrect. You will never get a permissions. Try with PartialPermissionAllowed"; + return false; + } + + if (!resultIsOk) { + error.Code = TStatus::DISALLOW_TEMP; + error.Reason = group.GroupChecker->ReadableReason(vdisk.VDiskId, opts.AvailabilityMode, result); error.Deadline = defaultDeadline; return false; } @@ -1550,13 +1559,9 @@ void TCms::Handle(TEvCms::TEvPermissionRequest::TPtr &ev, } } - ClusterInfo->LogManager.PushRollbackPoint(); - for (const auto &scheduled_request : State->ScheduledRequests) { - for (auto &action : scheduled_request.second.Request.GetActions()) - ClusterInfo->LogManager.ApplyAction(action, ClusterInfo); - } - bool ok = CheckPermissionRequest(rec, resp->Record, scheduled.Request, ctx); - ClusterInfo->LogManager.RollbackOperations(); + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Next request Id: " << State->NextRequestId); + + bool ok = CheckPermissionRequest(rec, resp->Record, scheduled.Request, State->NextRequestId, ctx); // Schedule request if required. if (rec.GetDryRun()) { @@ -1616,20 +1621,12 @@ void TCms::Handle(TEvCms::TEvCheckRequest::TPtr &ev, const TActorContext &ctx) auto requestStartTime = TInstant::Now(); - ClusterInfo->LogManager.PushRollbackPoint(); - for (const auto &scheduled_request : State->ScheduledRequests) { - if (scheduled_request.second.Order < request.Order) { - for (auto &action : scheduled_request.second.Request.GetActions()) - ClusterInfo->LogManager.ApplyAction(action, ClusterInfo); - } - } // Deactivate locks of this and later requests to // avoid false conflicts. ClusterInfo->DeactivateScheduledLocks(request.Order); request.Request.SetAvailabilityMode(rec.GetAvailabilityMode()); - bool ok = CheckPermissionRequest(request.Request, resp->Record, scheduled.Request, ctx); + bool ok = CheckPermissionRequest(request.Request, resp->Record, scheduled.Request, request.Order, ctx); ClusterInfo->ReactivateScheduledLocks(); - ClusterInfo->LogManager.RollbackOperations(); // Schedule request if required. if (rec.GetDryRun()) { diff --git a/ydb/core/cms/cms_impl.h b/ydb/core/cms/cms_impl.h index 9cb688b125a..d2038ec6e6f 100644 --- a/ydb/core/cms/cms_impl.h +++ b/ydb/core/cms/cms_impl.h @@ -105,12 +105,14 @@ private: NKikimrCms::ETenantPolicy TenantPolicy; NKikimrCms::EAvailabilityMode AvailabilityMode; bool PartialPermissionAllowed; + ui64 Order; TActionOptions(TDuration dur) : PermissionDuration(dur) , TenantPolicy(NKikimrCms::DEFAULT) , AvailabilityMode(NKikimrCms::MODE_MAX_AVAILABILITY) , PartialPermissionAllowed(false) + , Order(0) {} }; @@ -275,6 +277,7 @@ private: bool CheckPermissionRequest(const NKikimrCms::TPermissionRequest &request, NKikimrCms::TPermissionResponse &response, NKikimrCms::TPermissionRequest &scheduled, + const ui64 requestOrder, const TActorContext &ctx); bool IsActionHostValid(const NKikimrCms::TAction &action, TErrorInfo &error) const; bool ParseServices(const NKikimrCms::TAction &action, TServices &services, TErrorInfo &error) const; diff --git a/ydb/core/cms/cms_ut.cpp b/ydb/core/cms/cms_ut.cpp index 4b7ea52ce1f..4ec398add89 100644 --- a/ydb/core/cms/cms_ut.cpp +++ b/ydb/core/cms/cms_ut.cpp @@ -729,92 +729,6 @@ Y_UNIT_TEST_SUITE(TCmsTest) { env.CheckWalleCheckTask("task-2", TStatus::ALLOW, env.GetNodeId(1)); } - Y_UNIT_TEST(Notifications) - { - TCmsTestEnv env(8); - env.AdvanceCurrentTime(TDuration::Minutes(20)); - - // User is not specified. - env.CheckNotification(TStatus::WRONG_REQUEST, "", env.GetCurrentTime(), - MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)); - // Too old. - env.CheckNotification(TStatus::WRONG_REQUEST, "user", env.GetCurrentTime() - TDuration::Minutes(10), - MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)); - // Store notification user-1. - auto id1 = env.CheckNotification - (TStatus::OK, "user", env.GetCurrentTime() + TDuration::Minutes(10), - MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 60000000, env.PDiskName(1, 0))); - - // OK to replace the same device before notification start time. - env.CheckPermissionRequest("user", false, true, true, true, TStatus::ALLOW, - MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 60000000, env.PDiskName(1, 0))); - - // Intersects with notification. - env.CheckPermissionRequest("user", false, true, true, true, TStatus::DISALLOW_TEMP, - MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 10 * 60000000, env.PDiskName(1, 0))); - - // Store notification user-2. - auto id2 = env.CheckNotification(TStatus::OK, "user", env.GetCurrentTime(), - MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 60000000, env.PDiskName(2, 0))); - // Store notificaiton user1-3. - auto id3 = env.CheckNotification(TStatus::OK, "user1", env.GetCurrentTime(), - MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 60000000, env.PDiskName(3, 0))); - // Get notification with no user. - env.CheckGetNotification("", id1, TStatus::WRONG_REQUEST); - // Get user-1. - env.CheckGetNotification("user", id1, TStatus::OK); - // Get with wrong user. - env.CheckGetNotification("user1", id1, TStatus::WRONG_REQUEST); - // Get with wrong id. - env.CheckGetNotification("user", "wrong-id", TStatus::WRONG_REQUEST); - // List notifications for user. - env.CheckListNotifications("user", TStatus::OK, 2); - // List notifications for user1. - env.CheckListNotifications("user1", TStatus::OK, 1); - // List with no user. - env.CheckListNotifications("", TStatus::WRONG_REQUEST, 0); - // Reject notification with no user. - env.CheckRejectNotification("", id1, TStatus::WRONG_REQUEST); - // Reject notification with wrong user. - env.CheckRejectNotification("user1", id1, TStatus::WRONG_REQUEST); - // Reject user-1 (dry run) - env.CheckRejectNotification("user", id1, TStatus::OK, true); - // Get user-1. - env.CheckGetNotification("user", id1, TStatus::OK); - // Reject user1-3. - env.CheckRejectNotification("user1", id3, TStatus::OK); - // Reject user-2. - env.CheckRejectNotification("user", id2, TStatus::OK); - // List notifications for user. - env.CheckListNotifications("user", TStatus::OK, 1); - // List notifications for user1. - env.CheckListNotifications("user1", TStatus::OK, 0); - // Get rejected user1-3. - env.CheckGetNotification("user1", id3, TStatus::WRONG_REQUEST); - // Get rejected user-2. - env.CheckGetNotification("user", id2, TStatus::WRONG_REQUEST); - // Get user-1. - env.CheckGetNotification("user", id1, TStatus::OK); - } - - Y_UNIT_TEST(PermissionDuration) { - TCmsTestEnv env(8); - - // Store notification user-1. - auto id1 = env.CheckNotification - (TStatus::OK, "user", env.GetCurrentTime() + TDuration::Minutes(10), - MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 60000000, env.PDiskName(1, 0))); - - // Intersects with notification. - const TDuration _10minutes = TDuration::Minutes(10); - env.CheckPermissionRequest("user", false, true, true, true, _10minutes, TStatus::DISALLOW_TEMP, - MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), _10minutes.MicroSeconds(), env.PDiskName(1, 0))); - - // OK with default duration. - env.CheckPermissionRequest("user", false, true, true, true, TStatus::ALLOW, - MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 60000000, env.PDiskName(1, 0))); - } - Y_UNIT_TEST(ActionWithZeroDuration) { TCmsTestEnv env(8); @@ -1309,7 +1223,6 @@ Y_UNIT_TEST_SUITE(TCmsTest) { MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage")); TFakeNodeWhiteboardService::Info[env.GetNodeId(1)].Connected = false; - env.RestartCms(); env.CheckPermissionRequest("user", false, true, false, true, MODE_MAX_AVAILABILITY, TStatus::DISALLOW_TEMP, MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage")); @@ -1317,14 +1230,12 @@ Y_UNIT_TEST_SUITE(TCmsTest) { TFakeNodeWhiteboardService::Info[env.GetNodeId(7)].Connected = false; TFakeNodeWhiteboardService::Info[env.GetNodeId(4)].Connected = false; TFakeNodeWhiteboardService::Info[env.GetNodeId(1)].Connected = false; - env.RestartCms(); env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage")); // 2dc disabled TFakeNodeWhiteboardService::Info[env.GetNodeId(7)].Connected = true; - env.RestartCms(); env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(7), 60000000, "storage")); @@ -1334,7 +1245,6 @@ Y_UNIT_TEST_SUITE(TCmsTest) { MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(5), 60000000, "storage")); TFakeNodeWhiteboardService::Info[env.GetNodeId(5)].Connected = false; - env.RestartCms(); env.CheckPermissionRequest("user", false, true, false, true, MODE_KEEP_AVAILABLE, TStatus::DISALLOW_TEMP, MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(2), 60000000, "storage")); diff --git a/ydb/core/cms/cms_ut_common.cpp b/ydb/core/cms/cms_ut_common.cpp index 96421cc7a50..444ff73933a 100644 --- a/ydb/core/cms/cms_ut_common.cpp +++ b/ydb/core/cms/cms_ut_common.cpp @@ -316,10 +316,12 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC ui32 vdiskId = pdiskIndex * vdiskPerPdisk + vdiskIndex; ui32 groupId = groupShift + vdiskId; ui32 failRealm = 0; - if (useMirror3dcErasure) + ui32 failDomain = nodeIndex % 8; + if (useMirror3dcErasure) { failRealm = (nodeIndex % 9) / 3; - - TVDiskID id = {(ui8)groupId, 1, (ui8)failRealm, (ui8)(nodeIndex % 8), (ui8)0}; + failDomain = (nodeIndex % 9) % 3; + } + TVDiskID id = {(ui8)groupId, 1, (ui8)failRealm, (ui8)failDomain, (ui8)0}; auto &vdisk = node.VDiskStateInfo[id]; VDiskIDFromVDiskID(id, vdisk.MutableVDiskId()); @@ -337,7 +339,7 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC vdiskConfig.SetGroupId(groupId); vdiskConfig.SetGroupGeneration(1); vdiskConfig.SetFailRealmIdx(failRealm); - vdiskConfig.SetFailDomainIdx(nodeIndex % 8); + vdiskConfig.SetFailDomainIdx(failDomain); config->MutableGroup(groupId)->AddVSlotId() ->CopyFrom(vdiskConfig.GetVSlotId()); diff --git a/ydb/core/cms/erasure_checkers.cpp b/ydb/core/cms/erasure_checkers.cpp index b98748c7932..28634af6f09 100644 --- a/ydb/core/cms/erasure_checkers.cpp +++ b/ydb/core/cms/erasure_checkers.cpp @@ -1,233 +1,361 @@ #include "erasure_checkers.h" +#include <ydb/core/protos/cms.pb.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> + +#include <library/cpp/actors/core/log.h> + +#include <util/string/cast.h> +#include <util/system/backtrace.h> +#include <util/system/yassert.h> + +#include <bitset> +#include <sstream> +#include <vector> + namespace NKikimr::NCms { -bool TErasureCounterBase::IsDown(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration &retryTime, TErrorInfo &error) { - const auto &node = info->Node(vdisk.NodeId); - const auto &pdisk = info->PDisk(vdisk.PDiskId); - const auto defaultDeadline = TActivationContext::Now() + retryTime; +using namespace Ydb::Maintenance; - // Check we received info for PDisk. - if (!pdisk.NodeId) { - ++Down; - error.Reason = TStringBuilder() << "Missing info for " << pdisk.ItemName(); - return false; +IStorageGroupChecker::EVDiskState IStorageGroupChecker::VDiskState(NKikimrCms::EState state) { + switch (state) { + case NKikimrCms::UP: + return VDISK_STATE_UP; + case NKikimrCms::UNKNOWN: + return VDISK_STATE_UNSPECIFIED; + case NKikimrCms::DOWN: + return VDISK_STATE_DOWN; + case NKikimrCms::RESTART: + return VDISK_STATE_RESTART; + default: + Y_FAIL("Unknown EState"); } +} - return (node.NodeId != VDisk.NodeId && node.IsDown(error, defaultDeadline)) - || (pdisk.PDiskId != VDisk.PDiskId && pdisk.IsDown(error, defaultDeadline)) - || vdisk.IsDown(error, defaultDeadline); +TSimpleSharedPtr<IStorageGroupChecker> CreateStorageGroupChecker(TErasureType::EErasureSpecies es, ui32 groupId) { + switch (es) { + case TErasureType::ErasureNone: + case TErasureType::ErasureMirror3: + case TErasureType::Erasure3Plus1Block: + case TErasureType::Erasure3Plus1Stripe: + case TErasureType::Erasure4Plus2Block: + case TErasureType::Erasure3Plus2Block: + case TErasureType::Erasure4Plus2Stripe: + case TErasureType::Erasure3Plus2Stripe: + case TErasureType::ErasureMirror3Plus2: + case TErasureType::Erasure4Plus3Block: + case TErasureType::Erasure4Plus3Stripe: + case TErasureType::Erasure3Plus3Block: + case TErasureType::Erasure3Plus3Stripe: + case TErasureType::Erasure2Plus3Block: + case TErasureType::Erasure2Plus3Stripe: + case TErasureType::Erasure2Plus2Block: + case TErasureType::Erasure2Plus2Stripe: + case TErasureType::ErasureMirror3of4: + return TSimpleSharedPtr<IStorageGroupChecker>(new TDefaultErasureChecker(groupId)); + case TErasureType::ErasureMirror3dc: + return TSimpleSharedPtr<IStorageGroupChecker>(new TMirror3dcChecker(groupId)); + default: + Y_FAIL("Unknown erasure type: %d", es); + } } -bool TErasureCounterBase::IsLocked(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration &retryTime, - TDuration &duration, TErrorInfo &error) -{ - const auto &node = info->Node(vdisk.NodeId); - const auto &pdisk = info->PDisk(vdisk.PDiskId); - // Check we received info for VDisk. - if (!vdisk.NodeId || !vdisk.PDiskId) { - ++Down; - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = TStringBuilder() << "Missing info for " << vdisk.ItemName(); - return false; +void TErasureCheckerBase::AddVDisk(const TVDiskID& vdiskId) { + if (DiskToState.contains(vdiskId)) { + return; } - - return node.IsLocked(error, retryTime, TActivationContext::Now(), duration) - || pdisk.IsLocked(error, retryTime, TActivationContext::Now(), duration) - || vdisk.IsLocked(error, retryTime, TActivationContext::Now(), duration); + DiskToState[vdiskId].State = VDISK_STATE_UNSPECIFIED; } -bool TErasureCounterBase::GroupAlreadyHasLockedDisks() const { - return HasAlreadyLockedDisks; -} +void TErasureCheckerBase::UpdateVDisk(const TVDiskID& vdiskId, EState state) { + AddVDisk(vdiskId); -bool TErasureCounterBase::CheckForMaxAvailability(TErrorInfo &error, TInstant &defaultDeadline, bool allowPartial) const { - if (Locked + Down > 1) { - if (HasAlreadyLockedDisks && !allowPartial) { - error.Code = TStatus::DISALLOW; - error.Reason = "The request is incorrect: too many disks from the one group. " - "Fix the request or set PartialPermissionAllowed to true"; - return false; - } + const auto newState = VDiskState(state); + + // The disk is marked based on the information obtained by InfoCollector. + // If we marked the disk DOWN, it means that there was a reason + if (DiskToState[vdiskId].State == VDISK_STATE_DOWN + && newState == VDISK_STATE_UP) + return; + + if (DiskToState[vdiskId].State == VDISK_STATE_DOWN) { + --DownVDisksCount; + } + + if (DiskToState[vdiskId].State == VDISK_STATE_LOCKED || + DiskToState[vdiskId].State == VDISK_STATE_RESTART) { + --LockedVDisksCount; + } + + DiskToState[vdiskId].State = newState; - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = TStringBuilder() << "Issue in affected group " << GroupId - << ". " << "Too many locked and down vdisks: " << Locked + Down; - error.Deadline = defaultDeadline; - return false; + if (newState == VDISK_STATE_RESTART || newState == VDISK_STATE_LOCKED) { + ++LockedVDisksCount; + } + + if (newState == VDISK_STATE_DOWN) { + ++DownVDisksCount; } - return true; } -void TDefaultErasureCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime, - TDuration duration, TErrorInfo &error) -{ - Y_VERIFY_DEBUG(vdisk.VDiskId != VDisk.VDiskId); - - // Check locks. - TErrorInfo err; - if (IsLocked(vdisk, info, retryTime, duration, err)) { - ++Locked; - error.Code = err.Code; - error.Reason = TStringBuilder() << "Issue in affected group " << GroupId - << ". " << err.Reason; - error.Deadline = Max(error.Deadline, err.Deadline); - return; +void TErasureCheckerBase::LockVDisk(const TVDiskID& vdiskId) { + Y_VERIFY(DiskToState.contains(vdiskId)); + + ++LockedVDisksCount; + if (DiskToState[vdiskId].State == VDISK_STATE_DOWN) { + DiskToState[vdiskId].State = VDISK_STATE_RESTART; + --DownVDisksCount; + } else { + DiskToState[vdiskId].State = VDISK_STATE_LOCKED; } +} - // Check if disk is down. - if (IsDown(vdisk, info, retryTime, err)) { - ++Down; - error.Code = err.Code; - error.Reason = TStringBuilder() << "Issue in affected group " << GroupId - << ". " << err.Reason; - error.Deadline = Max(error.Deadline, err.Deadline); +void TErasureCheckerBase::UnlockVDisk(const TVDiskID& vdiskId) { + Y_VERIFY(DiskToState.contains(vdiskId)); + + --LockedVDisksCount; + if (DiskToState[vdiskId].State == VDISK_STATE_RESTART) { + DiskToState[vdiskId].State = VDISK_STATE_DOWN; + ++DownVDisksCount; + } else { + DiskToState[vdiskId].State = VDISK_STATE_UP; } } -bool TDefaultErasureCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo &error, - TInstant &defaultDeadline, bool allowPartial) const -{ - if (HasAlreadyLockedDisks && allowPartial) { - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = "You cannot get two or more disks from the same group at the same time" - " without specifying the PartialPermissionAllowed parameter"; - error.Deadline = defaultDeadline; - return false; - } - - if (Down + Locked > info->BSGroup(GroupId).Erasure.ParityParts()) { - if (HasAlreadyLockedDisks && !allowPartial) { - error.Code = TStatus::DISALLOW; - error.Reason = "The request is incorrect: too many disks from the one group. " - "Fix the request or set PartialPermissionAllowed to true"; - return false; +void TErasureCheckerBase::EmplaceTask(const TVDiskID &vdiskId, i32 priority, + ui64 order, const std::string &taskUId) { + + auto& priorities = DiskToState[vdiskId].Priorities; + auto it = priorities.lower_bound(TVDiskState::TTaskPriority(priority, order, "")); + + if (it != priorities.end() && (it->Order == order && it->Priority == priority)) { + if (it->TaskUId == taskUId) { + return; } - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = TStringBuilder() << "Cannot lock disk " << VDisk.PrettyItemName() - << ". Too many locked nodes for group " << GroupId; - error.Deadline = defaultDeadline; - return false; + Y_FAIL("Task with the same priority and order already exists"); + } else { + priorities.emplace_hint(it, priority, order, taskUId); } - return true; } -bool TMirror3dcCounter::CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo &error, - TInstant &defaultDeadline, bool allowPartial) const -{ - Y_UNUSED(info); +void TErasureCheckerBase::RemoveTask(const std::string &taskUId) { + auto taskUIdsEqual = [&taskUId](const TVDiskState::TTaskPriority &p) { + return p.TaskUId == taskUId; + }; + + for (auto &[vdiskId, vdiskState] : DiskToState) { + auto it = std::find_if(vdiskState.Priorities.begin(), + vdiskState.Priorities.end(), taskUIdsEqual); + + if (it == vdiskState.Priorities.end()) { + continue; + } + + vdiskState.Priorities.erase(it); - if (HasAlreadyLockedDisks && allowPartial) { - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = "You cannot get two or more disks from the same group at the same time" - " without specifying the PartialPermissionAllowed parameter"; - error.Deadline = defaultDeadline; - return false; } +} - if (DataCenterDisabledNodes.size() <= 1) - return true; +ActionState::ActionReason TDefaultErasureChecker::TryToLockVDisk(const TVDiskID &vdiskId, EAvailabilityMode mode, i32 priority, ui64 order) const { + Y_VERIFY(DiskToState.contains(vdiskId)); - if (DataCenterDisabledNodes.size() == 2 - && (DataCenterDisabledNodes.begin()->second <= 1 - || (++DataCenterDisabledNodes.begin())->second <= 1)) - { - return true; + const auto& diskState = DiskToState.at(vdiskId); + + if (diskState.State == VDISK_STATE_RESTART + || diskState.State == VDISK_STATE_LOCKED) { + return ActionState::ACTION_REASON_ALREADY_LOCKED; } - if (HasAlreadyLockedDisks && !allowPartial) { - error.Code = TStatus::DISALLOW; - error.Reason = "The request is incorrect: too many disks from the one group. " - "Fix the request or set PartialPermissionAllowed to true"; - return false; + auto taskPriority = TVDiskState::TTaskPriority(priority, order, ""); + if (!diskState.Priorities.empty() && taskPriority < *diskState.Priorities.rbegin()) { + return ActionState::ACTION_REASON_LOW_PRIORITY; } - if (DataCenterDisabledNodes.size() > 2) { - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = TStringBuilder() << "Issue in affected group " << GroupId - << ". Too many data centers have unavailable vdisks: " - << DataCenterDisabledNodes.size(); - error.Deadline = defaultDeadline; - return false; + if (mode == NKikimrCms::MODE_FORCE_RESTART) { + return ActionState::ACTION_REASON_OK; } - error.Code = TStatus::DISALLOW_TEMP; - error.Reason = TStringBuilder() << "Issue in affected group " << GroupId - << ". Data centers have too many unavailable vdisks"; - error.Deadline = defaultDeadline; + // Check how many disks are waiting for higher prioriry task to be locked + ui32 priorityLockedCount = 0; + for (auto &[id, vdiskState] : DiskToState) { + if (vdiskState.State != VDISK_STATE_UP) { + continue; + } - return false; -} + if (!vdiskState.Priorities.empty() && taskPriority < *vdiskState.Priorities.rbegin()) { + ++priorityLockedCount; + } + } -void TMirror3dcCounter::CountVDisk(const TVDiskInfo &vdisk, TClusterInfoPtr info, TDuration retryTime, - TDuration duration, TErrorInfo &error) -{ - Y_VERIFY_DEBUG(vdisk.VDiskId != VDisk.VDiskId); - - // Check locks. - TErrorInfo err; - if (IsLocked(vdisk, info, retryTime, duration, err) - || IsDown(vdisk, info, retryTime, err)) { - error.Code = err.Code; - error.Reason = TStringBuilder() << "Issue in affected group " << GroupId - << ". " << err.Reason; - error.Deadline = Max(error.Deadline, err.Deadline); - ++Locked; - ++DataCenterDisabledNodes[vdisk.VDiskId.FailRealm]; + ui32 disksLimit = 0; + if (diskState.State == VDISK_STATE_DOWN) { + disksLimit = 1; } + + switch (mode) { + case NKikimrCms::MODE_MAX_AVAILABILITY: + if ((LockedVDisksCount + DownVDisksCount + priorityLockedCount) > disksLimit) { + return ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS; + } + break; + case NKikimrCms::MODE_KEEP_AVAILABLE: + if ((LockedVDisksCount + DownVDisksCount + priorityLockedCount) >= disksLimit + 2) { + return ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS; + } + break; + default: + Y_FAIL("Unexpected Availability mode"); + } + + return ActionState::ACTION_REASON_OK; } -void TMirror3dcCounter::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) { - for (const auto &vdId : info->BSGroup(GroupId).VDisks) { - if (vdId != VDisk.VDiskId) - CountVDisk(info->VDisk(vdId), info, retryTime, duration, error); +ActionState::ActionReason TMirror3dcChecker::TryToLockVDisk(const TVDiskID &vdiskId, EAvailabilityMode mode, i32 priority, ui64 order) const { + Y_VERIFY(DiskToState.contains(vdiskId)); + + const auto& diskState = DiskToState.at(vdiskId); + const auto taskPriority = TVDiskState::TTaskPriority(priority, order, ""); + + if (!diskState.Priorities.empty() && taskPriority < *diskState.Priorities.rbegin()) { + return ActionState::ACTION_REASON_LOW_PRIORITY; + } + + if (mode == MODE_FORCE_RESTART) { + return ActionState::ACTION_REASON_OK; + } + + if (diskState.State == VDISK_STATE_LOCKED + || diskState.State == VDISK_STATE_RESTART) { + return ActionState::ACTION_REASON_ALREADY_LOCKED; + } + + const std::vector<std::bitset<9>> MaxOkGroups = { + 0x1E0, 0x1D0, 0x1C8, 0x1C4, 0x1C2, 0x1C1, + 0x138, 0xB8, 0x78, 0x3C, 0x3A, 0x39, + 0x107, 0x87, 0x47, 0x27, 0x17, 0xF, + }; + + ui32 priorityLockedCount = 0; + std::bitset<9> groupState(0); + for (auto& [id, state] : DiskToState) { + if (id == vdiskId) + continue; + + if (state.State != VDISK_STATE_UP + || (!state.Priorities.empty() && taskPriority < *state.Priorities.rbegin())) { + groupState |= (1 << (id.FailRealm * 3 + id.FailDomain)); + } + + if (!state.Priorities.empty() && taskPriority < *state.Priorities.rbegin()) { + ++priorityLockedCount; + } + } + groupState |= (1 << (vdiskId.FailRealm * 3 + vdiskId.FailDomain)); + + ui32 downVDisks = diskState.State == VDISK_STATE_DOWN ? DownVDisksCount - 1 : DownVDisksCount; + if (mode == NKikimrCms::MODE_MAX_AVAILABILITY) { + if ((downVDisks + LockedVDisksCount + priorityLockedCount) == 0) { + return ActionState::ACTION_REASON_OK; + } + return ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS; + } + + size_t minCount = 9; + for (auto okGroup : MaxOkGroups) { + auto xoredState = (~okGroup) & groupState; + minCount = std::min(minCount, xoredState.count()); } - ++Locked; - ++DataCenterDisabledNodes[VDisk.VDiskId.FailRealm]; - if (Locked && error.Code == TStatus::DISALLOW) { - HasAlreadyLockedDisks = true; + if (minCount > 0) { + return ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS; } + + return ActionState::ACTION_REASON_OK; } -void TDefaultErasureCounter::CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) { - for (const auto &vdId : info->BSGroup(GroupId).VDisks) { - if (vdId != VDisk.VDiskId) - CountVDisk(info->VDisk(vdId), info, retryTime, duration, error); +std::string TDefaultErasureChecker::ReadableReason(const TVDiskID &vdiskId, + EAvailabilityMode mode, ActionState::ActionReason reason) const { + std::stringstream readableReason; + + if (reason == ActionState::ACTION_REASON_OK) { + readableReason << "Action is OK"; + return readableReason.str(); } - if (Locked && error.Code == TStatus::DISALLOW) { - HasAlreadyLockedDisks = true; + + readableReason << "Cannot lock vdisk" << vdiskId.ToString() << ". "; + + switch (reason) { + case ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS: + readableReason << "Group " << GroupId + << " has too many unavailable vdisks. " + << "Down disks count: " << DownVDisksCount + << ". Locked disks count: " << LockedVDisksCount; + + if (mode == NKikimrCms::MODE_KEEP_AVAILABLE) { + readableReason << ". Limit of unavailable disks for mode " << NKikimrCms::EAvailabilityMode_Name(mode) + << " is " << 2; + } + if (mode == NKikimrCms::MODE_MAX_AVAILABILITY) { + readableReason << ". Limit of unavailable disks for mode " << NKikimrCms::EAvailabilityMode_Name(mode) + << " is " << 1; + } + break; + case ActionState::ACTION_REASON_ALREADY_LOCKED: + // TODO:: add info about lock id + readableReason << "Disk is already locked"; + break; + case ActionState::ACTION_REASON_LOW_PRIORITY: + // TODO:: add info about task with higher priority + readableReason << "Task with higher priority in progress"; + break; + default: + Y_FAIL("Unexpected Reason"); } - ++Locked; + + return readableReason.str(); } -TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo &vdisk, ui32 groupId) { - switch (es) { - case TErasureType::ErasureNone: - case TErasureType::ErasureMirror3: - case TErasureType::Erasure3Plus1Block: - case TErasureType::Erasure3Plus1Stripe: - case TErasureType::Erasure4Plus2Block: - case TErasureType::Erasure3Plus2Block: - case TErasureType::Erasure4Plus2Stripe: - case TErasureType::Erasure3Plus2Stripe: - case TErasureType::ErasureMirror3Plus2: - case TErasureType::Erasure4Plus3Block: - case TErasureType::Erasure4Plus3Stripe: - case TErasureType::Erasure3Plus3Block: - case TErasureType::Erasure3Plus3Stripe: - case TErasureType::Erasure2Plus3Block: - case TErasureType::Erasure2Plus3Stripe: - case TErasureType::Erasure2Plus2Block: - case TErasureType::Erasure2Plus2Stripe: - case TErasureType::ErasureMirror3of4: - return TSimpleSharedPtr<IErasureCounter>(new TDefaultErasureCounter(vdisk, groupId)); - case TErasureType::ErasureMirror3dc: - return TSimpleSharedPtr<IErasureCounter>(new TMirror3dcCounter(vdisk, groupId)); +std::string TMirror3dcChecker::ReadableReason(const TVDiskID &vdiskId, + EAvailabilityMode mode, ActionState::ActionReason reason) const { + std::stringstream readableReason; + + if (reason == ActionState::ACTION_REASON_OK) { + readableReason << "Action is OK"; + return readableReason.str(); + } + + readableReason << "Cannot lock vdisk" << vdiskId.ToString() << ". "; + + switch (reason) { + case ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS: + readableReason << "Group " << GroupId + << " has too many unavailable vdisks. " + << "Down disks count: " << DownVDisksCount + << ". Locked disks count: " << LockedVDisksCount; + + if (mode == NKikimrCms::MODE_KEEP_AVAILABLE) { + readableReason << ". Limit of unavailable disks for mode " << NKikimrCms::EAvailabilityMode_Name(mode) + << " is 1"; + } + if (mode == NKikimrCms::MODE_MAX_AVAILABILITY) { + readableReason << ". Limit of unavailable disks for mode " << NKikimrCms::EAvailabilityMode_Name(mode) + << "4, 3 of which are in the same data center"; + } + break; + case ActionState::ACTION_REASON_ALREADY_LOCKED: + // TODO:: add info about lock id + readableReason << "Disk is already locked"; + break; + case ActionState::ACTION_REASON_LOW_PRIORITY: + // TODO:: add info about task with higher priority + readableReason << "Task with higher priority in progress"; + break; default: - Y_FAIL("Unknown erasure type: %d", es); + Y_FAIL("Unexpected Reason"); } + + return readableReason.str(); } } // namespace NKikimr::NCms diff --git a/ydb/core/cms/erasure_checkers.h b/ydb/core/cms/erasure_checkers.h index 0c0de0be766..c958630e273 100644 --- a/ydb/core/cms/erasure_checkers.h +++ b/ydb/core/cms/erasure_checkers.h @@ -1,79 +1,133 @@ #pragma once #include "defs.h" -#include "cluster_info.h" +#include <ydb/core/blobstorage/base/blobstorage_vdiskid.h> #include <ydb/core/erasure/erasure.h> #include <ydb/core/protos/cms.pb.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> + +#include <util/generic/queue.h> +#include <util/system/compiler.h> + +#include <algorithm> +#include <functional> +#include <queue> +#include <string> namespace NKikimr::NCms { using namespace NKikimrCms; -class IErasureCounter { +class IStorageGroupChecker { +public: + enum EVDiskState : ui32 { + VDISK_STATE_UNSPECIFIED /* "Unspecified" */, + VDISK_STATE_UP /* "Up" */, + VDISK_STATE_LOCKED /* "Locked" */, + VDISK_STATE_RESTART /* "Restart" */, + VDISK_STATE_DOWN /* "Down" */, + VDISK_STATE_SCHEDULED_LOCKED /* "Scheduled locked" */ + }; + +protected: + EVDiskState VDiskState(NKikimrCms::EState state); + public: - virtual ~IErasureCounter() = default; + virtual ~IStorageGroupChecker() = default; + + virtual void AddVDisk(const TVDiskID& vdiskId) = 0; + virtual void UpdateVDisk(const TVDiskID& vdiskId, EState state) = 0; + + virtual void LockVDisk(const TVDiskID& vdiskId) = 0; + virtual void UnlockVDisk(const TVDiskID& vdiskId) = 0; - virtual bool GroupAlreadyHasLockedDisks() const = 0; - virtual bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0; - virtual bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const = 0; - virtual void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0; - virtual void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) = 0; + virtual void EmplaceTask(const TVDiskID& vdiskId, i32 priority, ui64 order, const std::string& taskUId) = 0; + virtual void RemoveTask(const std::string& taskUId) = 0; + + virtual Ydb::Maintenance::ActionState::ActionReason TryToLockVDisk(const TVDiskID& vdiskId, EAvailabilityMode mode, i32 priority, ui64 order) const = 0; + virtual std::string ReadableReason(const TVDiskID& vdiskId, EAvailabilityMode mode, Ydb::Maintenance::ActionState::ActionReason reason) const = 0; }; -class TErasureCounterBase: public IErasureCounter { +class TErasureCheckerBase : public IStorageGroupChecker { protected: - ui32 Down; - ui32 Locked; - const TVDiskInfo& VDisk; - const ui32 GroupId; - bool HasAlreadyLockedDisks; + /** Structure to hold information about vdisk state and priorities and orders of some task. + * + * Requests with equal priority are processed in the order of arrival at CMS. + */ + struct TVDiskState { + public: + struct TTaskPriority { + i32 Priority; + ui64 Order; + std::string TaskUId; + + explicit TTaskPriority(i32 priority, ui64 order, const std::string& taskUId) + : Priority(priority) + , Order(order) + , TaskUId(taskUId) + {} + + bool operator<(const TTaskPriority& rhs) const { + return Priority < rhs.Priority || (Priority == rhs.Priority && Order > rhs.Order); + } + }; + public: + EVDiskState State; + std::set<TTaskPriority> Priorities; + }; protected: - bool IsDown(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TErrorInfo& error); - bool IsLocked(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration& retryTime, TDuration& duration, TErrorInfo& error); + ui32 GroupId; + + THashMap<TVDiskID, TVDiskState> DiskToState; + ui32 DownVDisksCount; + ui32 LockedVDisksCount; public: - TErasureCounterBase(const TVDiskInfo& vdisk, ui32 groupId) - : Down(0) - , Locked(0) - , VDisk(vdisk) - , GroupId(groupId) - , HasAlreadyLockedDisks(false) + explicit TErasureCheckerBase(ui32 groupId) + : GroupId(groupId) + , DownVDisksCount(0) + , LockedVDisksCount(0) { } + virtual ~TErasureCheckerBase() = default; - bool GroupAlreadyHasLockedDisks() const final; - bool CheckForMaxAvailability(TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const final; -}; + void AddVDisk(const TVDiskID& vdiskId) override final; + void UpdateVDisk(const TVDiskID& vdiskId, EState state) override final; -class TDefaultErasureCounter: public TErasureCounterBase { -public: - TDefaultErasureCounter(const TVDiskInfo& vdisk, ui32 groupId) - : TErasureCounterBase(vdisk, groupId) - { - } + void LockVDisk(const TVDiskID& vdiskId) override final; + void UnlockVDisk(const TVDiskID& vdiskId) override final; - void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override; - bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const override; - void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override; + void EmplaceTask(const TVDiskID &vdiskId, i32 priority, ui64 order, + const std::string &taskUId) override final; + void RemoveTask(const std::string &taskUId) override final; }; -class TMirror3dcCounter: public TErasureCounterBase { -private: - THashMap<ui8, ui32> DataCenterDisabledNodes; +class TDefaultErasureChecker : public TErasureCheckerBase { +public: + explicit TDefaultErasureChecker(ui32 groupId) + : TErasureCheckerBase(groupId) + {} + + virtual Ydb::Maintenance::ActionState::ActionReason TryToLockVDisk(const TVDiskID& vdiskId, EAvailabilityMode mode, i32 priority, ui64 order) const override final; + + virtual std::string ReadableReason(const TVDiskID &vdiskId, + EAvailabilityMode mode, Ydb::Maintenance::ActionState::ActionReason reason) const override final; +}; +class TMirror3dcChecker : public TErasureCheckerBase { public: - TMirror3dcCounter(const TVDiskInfo& vdisk, ui32 groupId) - : TErasureCounterBase(vdisk, groupId) - { - } + explicit TMirror3dcChecker(ui32 groupId) + : TErasureCheckerBase(groupId) + {} + + virtual Ydb::Maintenance::ActionState::ActionReason TryToLockVDisk(const TVDiskID& vdiskId, EAvailabilityMode mode, i32 priority, ui64 order) const override final; - void CountGroupState(TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo &error) override; - bool CheckForKeepAvailability(TClusterInfoPtr info, TErrorInfo& error, TInstant& defaultDeadline, bool allowPartial) const override; - void CountVDisk(const TVDiskInfo& vdisk, TClusterInfoPtr info, TDuration retryTime, TDuration duration, TErrorInfo& error) override; + virtual std::string ReadableReason(const TVDiskID &vdiskId, + EAvailabilityMode mode, Ydb::Maintenance::ActionState::ActionReason reason) const override final; }; -TSimpleSharedPtr<IErasureCounter> CreateErasureCounter(TErasureType::EErasureSpecies es, const TVDiskInfo& vdisk, ui32 groupId); +TSimpleSharedPtr<IStorageGroupChecker> CreateStorageGroupChecker(TErasureType::EErasureSpecies es, ui32 groupId); } // namespace NKikimr::NCms diff --git a/ydb/core/cms/node_checkers.cpp b/ydb/core/cms/node_checkers.cpp index 8a857b0a63f..9be56b669a4 100644 --- a/ydb/core/cms/node_checkers.cpp +++ b/ydb/core/cms/node_checkers.cpp @@ -1,4 +1,6 @@ #include "node_checkers.h" +#include "util/system/yassert.h" +#include "ydb/public/api/protos/draft/ydb_maintenance.pb.h" #include <ydb/core/protos/cms.pb.h> @@ -9,6 +11,8 @@ namespace NKikimr::NCms { #define NCH_LOG_D(stream) LOG_DEBUG_S (*TlsActivationContext, NKikimrServices::CMS, "[Nodes Counter] " << stream) #define NCH_LOG_T(stream) LOG_TRACE_S (*TlsActivationContext, NKikimrServices::CMS, "[Nodes Counter] " << stream) +using namespace Ydb::Maintenance; + TNodesLimitsCounterBase::ENodeState INodesChecker::NodeState(NKikimrCms::EState state) { switch (state) { case NKikimrCms::UP: @@ -28,7 +32,7 @@ void TNodesCounterBase::AddNode(ui32 nodeId) { if (NodeToState.contains(nodeId)) { return; } - NodeToState[nodeId] = NODE_STATE_UNSPECIFIED; + NodeToState[nodeId].State = NODE_STATE_UNSPECIFIED; } void TNodesCounterBase::UpdateNode(ui32 nodeId, NKikimrCms::EState state) { @@ -36,17 +40,17 @@ void TNodesCounterBase::UpdateNode(ui32 nodeId, NKikimrCms::EState state) { AddNode(nodeId); } - if (NodeToState[nodeId] == NODE_STATE_DOWN) { + if (NodeToState[nodeId].State == NODE_STATE_DOWN) { --DownNodesCount; } - if (NodeToState[nodeId] == NODE_STATE_LOCKED || - NodeToState[nodeId] == NODE_STATE_RESTART) { + if (NodeToState[nodeId].State == NODE_STATE_LOCKED || + NodeToState[nodeId].State == NODE_STATE_RESTART) { --LockedNodesCount; } const auto nodeState = NodeState(state); - NodeToState[nodeId] = nodeState; + NodeToState[nodeId].State = nodeState; if (nodeState == NODE_STATE_RESTART || nodeState == NODE_STATE_LOCKED) { ++LockedNodesCount; @@ -61,11 +65,11 @@ void TNodesCounterBase::LockNode(ui32 nodeId) { Y_VERIFY(NodeToState.contains(nodeId)); ++LockedNodesCount; - if (NodeToState[nodeId] == NODE_STATE_DOWN) { - NodeToState[nodeId] = NODE_STATE_RESTART; + if (NodeToState[nodeId].State == NODE_STATE_DOWN) { + NodeToState[nodeId].State = NODE_STATE_RESTART; --DownNodesCount; } else { - NodeToState[nodeId] = NODE_STATE_LOCKED; + NodeToState[nodeId].State = NODE_STATE_LOCKED; } } @@ -73,89 +77,144 @@ void TNodesCounterBase::UnlockNode(ui32 nodeId) { Y_VERIFY(NodeToState.contains(nodeId)); --LockedNodesCount; - if (NodeToState[nodeId] == NODE_STATE_RESTART) { - NodeToState[nodeId] = NODE_STATE_DOWN; + if (NodeToState[nodeId].State == NODE_STATE_RESTART) { + NodeToState[nodeId].State = NODE_STATE_DOWN; ++DownNodesCount; } else { - NodeToState[nodeId] = NODE_STATE_UP; + NodeToState[nodeId].State = NODE_STATE_UP; + } +} + +void TNodesCounterBase::EmplaceTask(const ui32 nodeId, i32 priority, ui64 order, const std::string& taskUId) { + auto& priorities = NodeToState[nodeId].Priorities; + auto it = priorities.lower_bound(TNodeState::TTaskPriority(priority, order, "")); + + if (it != priorities.end() && (it->Order == order && it->Priority == priority)) { + if (it->TaskUId == taskUId) { + return; + } + Y_FAIL("Task with the same priority and order already exists"); + } else { + priorities.emplace_hint(it, priority, order, taskUId); + } + + NodesWithScheduledTasks.insert(nodeId); +} + +void TNodesCounterBase::RemoveTask(const std::string& taskUId) { + auto taskUIdsEqual = [&taskUId](const TNodeState::TTaskPriority &p) { + return p.TaskUId == taskUId; + }; + + TVector<ui32> NodesToRemove; + for (auto nodeId : NodesWithScheduledTasks) { + auto& nodeState = NodeToState[nodeId]; + auto it = std::find_if(nodeState.Priorities.begin(), + nodeState.Priorities.end(), taskUIdsEqual); + if (it == nodeState.Priorities.end()) { + continue; + } + + nodeState.Priorities.erase(it); + + if (nodeState.Priorities.empty()) { + NodesToRemove.push_back(nodeId); + } + } + + for (auto nodeId : NodesToRemove) { + NodesWithScheduledTasks.erase(nodeId); } } -bool TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const { +ActionState::ActionReason TNodesLimitsCounterBase::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, i32 priority, ui64 order) const { Y_VERIFY(NodeToState.contains(nodeId)); - auto nodeState = NodeToState.at(nodeId); - bool isForceRestart = mode == NKikimrCms::MODE_FORCE_RESTART; + const auto& nodeState = NodeToState.at(nodeId); + const auto taskPriority = TNodeState::TTaskPriority(priority, order, ""); - NCH_LOG_D("Checking Node: " - << nodeId << ", with state: " << ToString(nodeState) - << ", with limit: " << DisabledNodesLimit - << ", with ratio limit: " << DisabledNodesRatioLimit - << ", locked nodes: " << LockedNodesCount - << ", down nodes: " << DownNodesCount); + if (!nodeState.Priorities.empty() && (taskPriority < *nodeState.Priorities.rbegin())) { + return ActionState::ACTION_REASON_LOW_PRIORITY; + } - // Allow to maintain down/unavailable node - if (nodeState == NODE_STATE_DOWN) { - return true; + if (nodeState.State == NODE_STATE_RESTART || + nodeState.State == NODE_STATE_LOCKED || + nodeState.State == NODE_STATE_UNSPECIFIED) { + + return ActionState::ACTION_REASON_ALREADY_LOCKED; } - if (nodeState == NODE_STATE_RESTART || - nodeState == NODE_STATE_LOCKED || - nodeState == NODE_STATE_UNSPECIFIED) { + ui32 priorityLockedCount = 0; + for (auto id : NodesWithScheduledTasks) { + Y_VERIFY(!NodeToState.at(id).Priorities.empty()); - return false; + if (taskPriority < *NodeToState.at(id).Priorities.rbegin()) { + ++priorityLockedCount; + } } + ui32 downNodes = nodeState.State == NODE_STATE_DOWN ? DownNodesCount - 1 : DownNodesCount; // Always allow at least one node - if (LockedNodesCount + DownNodesCount == 0) { - return true; + if (LockedNodesCount + downNodes + priorityLockedCount == 0) { + return ActionState::ACTION_REASON_OK; } + bool isForceRestart = mode == NKikimrCms::MODE_FORCE_RESTART; + if (isForceRestart && !LockedNodesCount) { - return true; + return ActionState::ACTION_REASON_OK; } if (DisabledNodesLimit > 0 && - (LockedNodesCount + DownNodesCount + 1 > DisabledNodesLimit)) { - return false; + (LockedNodesCount + downNodes + priorityLockedCount + 1 > DisabledNodesLimit)) { + return ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED; } if (DisabledNodesRatioLimit > 0 && - ((LockedNodesCount + DownNodesCount + 1) * 100 > - (NodeToState.size() * DisabledNodesRatioLimit))) { - return false; + ((LockedNodesCount + downNodes + priorityLockedCount + 1) * 100 > (NodeToState.size() * DisabledNodesRatioLimit))) { + return ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED; } - return true; + return ActionState::ACTION_REASON_OK; } -bool TSysTabletsNodesCounter::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const { +ActionState::ActionReason TSysTabletsNodesCounter::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, i32 priority, ui64 order) const { Y_VERIFY(NodeToState.contains(nodeId)); - auto nodeState = NodeToState.at(nodeId); - NCH_LOG_D("Checking limits for sys tablet: " << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType) - << ", on node: " << nodeId - << ", with state: " << ToString(nodeState) - << ", locked nodes: " << LockedNodesCount - << ", down nodes: " << DownNodesCount); + const auto& nodeState = NodeToState.at(nodeId); + const auto taskPriority = TNodeState::TTaskPriority(priority, order, ""); - if (nodeState == NODE_STATE_RESTART || - nodeState == NODE_STATE_LOCKED || - nodeState == NODE_STATE_UNSPECIFIED) { + if (!nodeState.Priorities.empty() && (taskPriority < *nodeState.Priorities.rbegin())) { + return ActionState::ACTION_REASON_LOW_PRIORITY; + } + + if (nodeState.State == NODE_STATE_RESTART || + nodeState.State == NODE_STATE_LOCKED || + nodeState.State == NODE_STATE_UNSPECIFIED) { + + return ActionState::ACTION_REASON_ALREADY_LOCKED; + } + + ui32 priorityLockedCount = 0; + for (auto id : NodesWithScheduledTasks) { + Y_VERIFY(!NodeToState.at(id).Priorities.empty()); - return false; + if (taskPriority < *NodeToState.at(id).Priorities.rbegin()) { + ++priorityLockedCount; + } } + ui32 downNodes = nodeState.State == NODE_STATE_DOWN ? DownNodesCount - 1 : DownNodesCount; ui32 tabletNodes = NodeToState.size(); switch (mode) { case NKikimrCms::MODE_MAX_AVAILABILITY: - if (tabletNodes > 1 && (DownNodesCount + LockedNodesCount + 1) * 2 > tabletNodes){ - return false; + if (tabletNodes > 1 && (downNodes + LockedNodesCount + priorityLockedCount + 1) * 2 > tabletNodes){ + return ActionState::ACTION_REASON_SYS_TABLETS_NODE_LIMIT_REACHED; } break; case NKikimrCms::MODE_KEEP_AVAILABLE: - if (tabletNodes > 1 && (DownNodesCount + LockedNodesCount + 1) > tabletNodes - 1) { - return false; + if (tabletNodes > 1 && (downNodes + LockedNodesCount + priorityLockedCount + 1) > tabletNodes - 1) { + return ActionState::ACTION_REASON_SYS_TABLETS_NODE_LIMIT_REACHED; } break; case NKikimrCms::MODE_FORCE_RESTART: @@ -164,7 +223,124 @@ bool TSysTabletsNodesCounter::TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabili Y_FAIL("Unknown availability mode"); } - return true; + return ActionState::ACTION_REASON_OK; } +std::string TTenantLimitsCounter::ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, + ActionState::ActionReason reason) const { + Y_UNUSED(mode); + + std::stringstream readableReason; + + if (reason == ActionState::ACTION_REASON_OK) { + readableReason << "Action is OK"; + return readableReason.str(); + } + + readableReason << "Cannot lock node: " << nodeId; + + switch (reason) { + case ActionState::ACTION_REASON_ALREADY_LOCKED: + readableReason << "Node is already locked"; + break; + case ActionState::ACTION_REASON_LOW_PRIORITY: + readableReason << "Task with higher priority in progress: " << (*NodeToState.at(nodeId).Priorities.rbegin()).TaskUId; + break; + case ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED: + readableReason << ". Too many locked nodes for tenant " << TenantName + << "; locked: " << LockedNodesCount + << "; down: " << DownNodesCount + << "; total: " << NodeToState.size() + << "; limit: " << DisabledNodesLimit + << "; ratio limit: " << DisabledNodesRatioLimit << "%"; + break; + default: + Y_FAIL("Unexpected reason"); + break; + } + return readableReason.str(); +} + +std::string TClusterLimitsCounter::ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, + ActionState::ActionReason reason) const { + Y_UNUSED(mode); + + std::stringstream readableReason; + + if (reason == ActionState::ACTION_REASON_OK) { + readableReason << "Action is OK"; + return readableReason.str(); + } + + if (mode == NKikimrCms::MODE_FORCE_RESTART) { + return readableReason.str(); + } + + readableReason << "Cannot lock node: " << nodeId; + + switch (reason) { + case ActionState::ACTION_REASON_ALREADY_LOCKED: + readableReason << "Node is already locked"; + break; + case ActionState::ACTION_REASON_LOW_PRIORITY: + readableReason << "Task with higher priority in progress: " << (*NodeToState.at(nodeId).Priorities.rbegin()).TaskUId; + break; + case ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED: + readableReason << ". Too many locked nodes in cluster" + << "; locked: " << LockedNodesCount + << "; down: " << DownNodesCount + << "; total: " << NodeToState.size() + << "; limit: " << DisabledNodesLimit + << "; ratio limit: " << DisabledNodesRatioLimit << "%"; + break; + default: + Y_FAIL("Unexpected reason"); + break; + } + + return readableReason.str(); +} + + +std::string TSysTabletsNodesCounter::ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, + ActionState::ActionReason reason) const { + std::stringstream readableReason; + + if (reason == ActionState::ACTION_REASON_OK) { + readableReason << "Action is OK"; + return readableReason.str(); + } + + if (mode == NKikimrCms::MODE_FORCE_RESTART) { + return readableReason.str(); + } + + switch (reason) { + case ActionState::ACTION_REASON_ALREADY_LOCKED: + readableReason << "Node is already locked"; + break; + case ActionState::ACTION_REASON_LOW_PRIORITY: + readableReason << "Task with higher priority in progress: " << (*NodeToState.at(nodeId).Priorities.rbegin()).TaskUId; + break; + case ActionState::ACTION_REASON_SYS_TABLETS_NODE_LIMIT_REACHED: + readableReason << "Cannot lock node: " << nodeId << ". Tablet " + << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType) + << " has too many unavailable nodes. Locked: " + << LockedNodesCount << ". Down: " << DownNodesCount; + + if (mode == NKikimrCms::MODE_MAX_AVAILABILITY) { + readableReason << ". Limit: " << NodeToState.size() / 2 << " (50%)"; + } + + if (mode == NKikimrCms::MODE_KEEP_AVAILABLE) { + readableReason << ". Limit: " << NodeToState.size() - 1; + } + break; + default: + Y_FAIL("Unexpected reason"); + + } + + return readableReason.str(); +} } // namespace NKikimr::NCms diff --git a/ydb/core/cms/node_checkers.h b/ydb/core/cms/node_checkers.h index 72362c84dba..047a7a5c5ca 100644 --- a/ydb/core/cms/node_checkers.h +++ b/ydb/core/cms/node_checkers.h @@ -6,6 +6,7 @@ #include <ydb/core/erasure/erasure.h> #include <ydb/core/protos/cms.pb.h> #include <ydb/core/protos/config.pb.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> #include <library/cpp/actors/core/log.h> @@ -36,6 +37,7 @@ public: NODE_STATE_DOWN /* "Down" */ }; + protected: static ENodeState NodeState(NKikimrCms::EState state); @@ -48,9 +50,12 @@ public: virtual void LockNode(ui32 nodeId) = 0; virtual void UnlockNode(ui32 nodeId) = 0; - virtual bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const = 0; + virtual void EmplaceTask(const ui32 nodeId, i32 priority, ui64 order, const std::string& taskUId) = 0; + virtual void RemoveTask(const std::string& taskUId) = 0; + + virtual Ydb::Maintenance::ActionState::ActionReason TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, i32 priority, ui64 order) const = 0; - virtual std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const = 0; + virtual std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, Ydb::Maintenance::ActionState::ActionReason reason) const = 0; }; /** @@ -58,7 +63,35 @@ public: */ class TNodesCounterBase : public INodesChecker { protected: - THashMap<ui32, ENodeState> NodeToState; + /** Structure to hold information about vdisk state and priorities and orders of some task. + * + * Requests with equal priority are processed in the order of arrival at CMS. + */ + struct TNodeState { + public: + struct TTaskPriority { + i32 Priority; + ui64 Order; + std::string TaskUId; + + explicit TTaskPriority(i32 priority, ui64 order, const std::string& taskUId) + : Priority(priority) + , Order(order) + , TaskUId(taskUId) + {} + + bool operator<(const TTaskPriority& rhs) const { + return Priority < rhs.Priority || (Priority == rhs.Priority && Order > rhs.Order); + } + }; + public: + ENodeState State; + std::set<TTaskPriority> Priorities; + }; + +protected: + THashMap<ui32, TNodeState> NodeToState; + THashSet<ui32> NodesWithScheduledTasks; ui32 LockedNodesCount; ui32 DownNodesCount; @@ -73,6 +106,9 @@ public: void AddNode(ui32 nodeId) override; void UpdateNode(ui32 nodeId, NKikimrCms::EState) override; + void EmplaceTask(const ui32 nodeId, i32 priority, ui64 order, const std::string& taskUId) override final; + virtual void RemoveTask(const std::string& taskUId) override final; + void LockNode(ui32 nodeId) override; void UnlockNode(ui32 nodeId) override; }; @@ -103,7 +139,7 @@ public: DisabledNodesRatioLimit = ratioLimit; } - bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final; + Ydb::Maintenance::ActionState::ActionReason TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, i32 priority, ui64 order) const override final; }; class TTenantLimitsCounter : public TNodesLimitsCounterBase { @@ -117,20 +153,8 @@ public: { } - std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final { - Y_UNUSED(mode); - - std::stringstream reason; - reason << "Cannot lock node: " << nodeId - << ". Too many locked nodes for tenant " << TenantName - << "; locked: " << LockedNodesCount - << "; down: " << DownNodesCount - << "; total: " << NodeToState.size() - << "; limit: " << DisabledNodesLimit - << "; ratio limit: " << DisabledNodesRatioLimit << "%"; - - return reason.str(); - } + std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, + Ydb::Maintenance::ActionState::ActionReason reason) const override final; }; class TClusterLimitsCounter : public TNodesLimitsCounterBase { @@ -140,20 +164,8 @@ public: { } - std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final { - Y_UNUSED(mode); - - std::stringstream reason; - reason << "Cannot lock node: " << nodeId - <<". Too many locked nodes in cluster" - << "; locked: " << LockedNodesCount - << "; down: " << DownNodesCount - << "; total: " << NodeToState.size() - << "; limit: " << DisabledNodesLimit - << "; ratio limit: " << DisabledNodesRatioLimit << "%"; - - return reason.str(); - } + std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, + Ydb::Maintenance::ActionState::ActionReason reason) const override final; }; /** @@ -171,30 +183,10 @@ public: : TabletType(tabletType) {} - bool TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final; - - std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode) const override final { - std::stringstream reason; + Ydb::Maintenance::ActionState::ActionReason TryToLockNode(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, i32 priority, ui64 order) const override final; - if (mode == NKikimrCms::MODE_FORCE_RESTART) { - return reason.str(); - } - - reason << "Cannot lock node: " << nodeId - << ". Tablet " - << NKikimrConfig::TBootstrap_ETabletType_Name(TabletType) - << " has too many unavailable nodes. Locked: " << LockedNodesCount - << ". Down: " << DownNodesCount; - if (mode == NKikimrCms::MODE_MAX_AVAILABILITY) { - reason << ". Limit: " << NodeToState.size() / 2 << " (50%)"; - } - - if (mode == NKikimrCms::MODE_KEEP_AVAILABLE) { - reason << ". Limit: " << NodeToState.size() - 1; - } - - return reason.str(); - } + std::string ReadableReason(ui32 nodeId, NKikimrCms::EAvailabilityMode mode, + Ydb::Maintenance::ActionState::ActionReason reason) const override final; }; } // namespace NKikimr::NCms diff --git a/ydb/core/cms/ut/CMakeLists.darwin-x86_64.txt b/ydb/core/cms/ut/CMakeLists.darwin-x86_64.txt index 922dc62a7b9..1cff84774ce 100644 --- a/ydb/core/cms/ut/CMakeLists.darwin-x86_64.txt +++ b/ydb/core/cms/ut/CMakeLists.darwin-x86_64.txt @@ -32,6 +32,7 @@ target_link_options(ydb-core-cms-ut PRIVATE CoreFoundation ) target_sources(ydb-core-cms-ut PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/core/cms/checkers_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cluster_info_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms_tenants_ut.cpp diff --git a/ydb/core/cms/ut/CMakeLists.linux-aarch64.txt b/ydb/core/cms/ut/CMakeLists.linux-aarch64.txt index 64ee05e5931..890df0f266e 100644 --- a/ydb/core/cms/ut/CMakeLists.linux-aarch64.txt +++ b/ydb/core/cms/ut/CMakeLists.linux-aarch64.txt @@ -35,6 +35,7 @@ target_link_options(ydb-core-cms-ut PRIVATE -ldl ) target_sources(ydb-core-cms-ut PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/core/cms/checkers_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cluster_info_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms_tenants_ut.cpp diff --git a/ydb/core/cms/ut/CMakeLists.linux-x86_64.txt b/ydb/core/cms/ut/CMakeLists.linux-x86_64.txt index ede8d036c2a..3d68909dc52 100644 --- a/ydb/core/cms/ut/CMakeLists.linux-x86_64.txt +++ b/ydb/core/cms/ut/CMakeLists.linux-x86_64.txt @@ -36,6 +36,7 @@ target_link_options(ydb-core-cms-ut PRIVATE -ldl ) target_sources(ydb-core-cms-ut PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/core/cms/checkers_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cluster_info_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms_tenants_ut.cpp diff --git a/ydb/core/cms/ut/CMakeLists.windows-x86_64.txt b/ydb/core/cms/ut/CMakeLists.windows-x86_64.txt index 1dbbee1959a..29a6d9015d2 100644 --- a/ydb/core/cms/ut/CMakeLists.windows-x86_64.txt +++ b/ydb/core/cms/ut/CMakeLists.windows-x86_64.txt @@ -25,6 +25,7 @@ target_link_libraries(ydb-core-cms-ut PUBLIC core-testlib-default ) target_sources(ydb-core-cms-ut PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/core/cms/checkers_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cluster_info_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms_ut.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms_tenants_ut.cpp diff --git a/ydb/public/api/protos/draft/ydb_maintenance.proto b/ydb/public/api/protos/draft/ydb_maintenance.proto index cbd1481ed1a..329c1532479 100644 --- a/ydb/public/api/protos/draft/ydb_maintenance.proto +++ b/ydb/public/api/protos/draft/ydb_maintenance.proto @@ -87,11 +87,17 @@ message ActionState { // State storage broken. Too many (more than (nToSelect - 1) / 2) unavailable rings ACTION_REASON_STATE_STORAGE_BROKEN = 5; // Issue in cluster disabled nodes limit - ACTION_REASON_DISABLED_NODES_LIMIT_RICHED = 6; + ACTION_REASON_DISABLED_NODES_LIMIT_REACHED = 6; // Issue in tenant limits - ACTION_REASON_TENANT_DISABLED_NODES_LIMIT_RICHED = 7; + ACTION_REASON_TENANT_DISABLED_NODES_LIMIT_REACHED = 7; // Wrong request ACTION_REASON_WRONG_REQUEST = 8; + // Low priority + ACTION_REASON_LOW_PRIORITY = 9; + // Lock is granded to this item + ACTION_REASON_ALREADY_LOCKED = 10; + // Limit to nodes with sys tablets + ACTION_REASON_SYS_TABLETS_NODE_LIMIT_REACHED = 11; } Action action = 1; |