diff options
author | ilnaz <ilnaz@ydb.tech> | 2023-11-01 11:57:00 +0300 |
---|---|---|
committer | ilnaz <ilnaz@ydb.tech> | 2023-11-01 12:27:23 +0300 |
commit | 9793ffdc8db2b9940f15deebe3b951504bac64be (patch) | |
tree | 9b0299831bb7349201df3e0388cdc5651bc916d3 | |
parent | 7f97144a42658fab5a97c83c0c66b0ad957b5b0f (diff) | |
download | ydb-9793ffdc8db2b9940f15deebe3b951504bac64be.tar.gz |
Allow to reboot down nodes KIKIMR-8420
-rw-r--r-- | ydb/core/cms/cluster_info.cpp | 4 | ||||
-rw-r--r-- | ydb/core/cms/cms.cpp | 6 | ||||
-rw-r--r-- | ydb/core/cms/cms_ut.cpp | 13 | ||||
-rw-r--r-- | ydb/core/cms/walle_create_task_adapter.cpp | 89 | ||||
-rw-r--r-- | ydb/core/protos/cms.proto | 1 |
5 files changed, 55 insertions, 58 deletions
diff --git a/ydb/core/cms/cluster_info.cpp b/ydb/core/cms/cluster_info.cpp index 0893fa2ebc..d14f6de637 100644 --- a/ydb/core/cms/cluster_info.cpp +++ b/ydb/core/cms/cluster_info.cpp @@ -646,6 +646,7 @@ void TClusterInfo::ApplyActionWithoutLog(const NKikimrCms::TAction &action) switch (action.GetType()) { case TAction::RESTART_SERVICES: case TAction::SHUTDOWN_HOST: + case TAction::REBOOT_HOST: if (auto nodes = NodePtrs(action.GetHost(), MakeServices(action))) { for (const auto node : nodes) { for (auto &nodeGroup: node->NodeGroups) @@ -696,6 +697,7 @@ TSet<TLockableItem *> TClusterInfo::FindLockedItems(const NKikimrCms::TAction &a switch (action.GetType()) { case TAction::RESTART_SERVICES: case TAction::SHUTDOWN_HOST: + case TAction::REBOOT_HOST: if (auto nodes = NodePtrs(action.GetHost(), MakeServices(action))) { for (const auto node : nodes) { res.insert(node); @@ -750,6 +752,7 @@ ui64 TClusterInfo::AddLocks(const TPermissionInfo &permission, const TActorConte if (item->State == DOWN && (permission.Action.GetType() == TAction::RESTART_SERVICES || permission.Action.GetType() == TAction::SHUTDOWN_HOST + || permission.Action.GetType() == TAction::REBOOT_HOST || permission.Action.GetType() == TAction::REPLACE_DEVICES)) { item->State = RESTART; lock = true;; @@ -1001,6 +1004,7 @@ void TOperationLogManager::ApplyAction(const NKikimrCms::TAction &action, switch (action.GetType()) { case NKikimrCms::TAction::RESTART_SERVICES: case NKikimrCms::TAction::SHUTDOWN_HOST: + case NKikimrCms::TAction::REBOOT_HOST: if (auto nodes = clusterState->NodePtrs(action.GetHost(), MakeServices(action))) { for (const auto node : nodes) { for (auto &nodeGroup: node->NodeGroups) diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index ffca58da2c..0834ef50c4 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -436,6 +436,7 @@ void TCms::AddPermissionExtensions(const TAction& action, TPermission& perm) con switch (action.GetType()) { case TAction::RESTART_SERVICES: case TAction::SHUTDOWN_HOST: + case TAction::REBOOT_HOST: AddHostExtensions(action.GetHost(), perm); break; default: @@ -493,6 +494,7 @@ bool TCms::CheckAction(const TAction &action, case TAction::RESTART_SERVICES: return CheckActionRestartServices(action, opts, error, ctx); case TAction::SHUTDOWN_HOST: + case TAction::REBOOT_HOST: return CheckActionShutdownHost(action, opts, error, ctx); case TAction::REPLACE_DEVICES: return CheckActionReplaceDevices(action, opts.PermissionDuration, error); @@ -581,8 +583,12 @@ bool TCms::CheckActionShutdownHost(const TAction &action, TErrorInfo &error, const TActorContext &ctx) const { + const bool forciblyAllow = action.GetType() == TAction::REBOOT_HOST; for (const auto node : ClusterInfo->HostNodes(action.GetHost())) { if (!CheckActionShutdownNode(action, opts, *node, error, ctx)) { + if (forciblyAllow && node->State == DOWN) { + continue; + } return false; } } diff --git a/ydb/core/cms/cms_ut.cpp b/ydb/core/cms/cms_ut.cpp index 3ff189cbd2..a82ea3f608 100644 --- a/ydb/core/cms/cms_ut.cpp +++ b/ydb/core/cms/cms_ut.cpp @@ -729,6 +729,19 @@ Y_UNIT_TEST_SUITE(TCmsTest) { env.CheckWalleCheckTask("task-2", TStatus::ALLOW, env.GetNodeId(1)); } + Y_UNIT_TEST(WalleRebootDownNode) + { + TCmsTestEnv env(8); + + // alllow + env.CheckWalleCreateTask("task-1", "reboot", false, TStatus::ALLOW, env.GetNodeId(0)); + // disallow (up) + env.CheckWalleCreateTask("task-2", "reboot", false, TStatus::DISALLOW_TEMP, env.GetNodeId(1)); + // allow (down) + TFakeNodeWhiteboardService::Info[env.GetNodeId(1)].Connected = false; + env.CheckWalleCheckTask("task-2", TStatus::ALLOW, env.GetNodeId(1)); + } + Y_UNIT_TEST(Notifications) { TCmsTestEnv env(8); diff --git a/ydb/core/cms/walle_create_task_adapter.cpp b/ydb/core/cms/walle_create_task_adapter.cpp index c1a2866e37..5c7ce0393b 100644 --- a/ydb/core/cms/walle_create_task_adapter.cpp +++ b/ydb/core/cms/walle_create_task_adapter.cpp @@ -5,6 +5,8 @@ #include <library/cpp/actors/core/actor_bootstrapped.h> #include <library/cpp/actors/core/hfunc.h> +#include <optional> + namespace NKikimr::NCms { using namespace NKikimrCms; @@ -27,19 +29,7 @@ public: LOG_INFO(ctx, NKikimrServices::CMS, "Processing Wall-E request: %s", rec.ShortDebugString().data()); - if (rec.GetAction() != "reboot" - && rec.GetAction() != "power-off" - && rec.GetAction() != "change-disk" - && rec.GetAction() != "change-memory" - && rec.GetAction() != "profile" - && rec.GetAction() != "redeploy" - && rec.GetAction() != "prepare" - && rec.GetAction() != "repair-link" - && rec.GetAction() != "repair-bmc" - && rec.GetAction() != "repair-overheat" - && rec.GetAction() != "repair-capping" - && rec.GetAction() != "deactivate" - && rec.GetAction() != "temporary-unreachable") { + if (!Actions.contains(rec.GetAction())) { ReplyWithErrorAndDie(TStatus::WRONG_REQUEST, "Unsupported action", ctx); return; } @@ -132,9 +122,10 @@ private: request->Record.SetSchedule(true); request->Record.SetDryRun(task.GetDryRun()); - TAction action; - if (task.GetAction() == "prepare" - || task.GetAction() == "deactivate") { + auto it = Actions.find(task.GetAction()); + Y_ABORT_UNLESS(it != Actions.end()); + + if (!it->second) { TAutoPtr<TEvCms::TEvWalleCreateTaskResponse> resp = new TEvCms::TEvWalleCreateTaskResponse; resp->Record.SetTaskId(task.GetTaskId()); resp->Record.MutableHosts()->CopyFrom(task.GetHosts()); @@ -142,52 +133,17 @@ private: ReplyAndDie(resp.Release(), ctx); return; } else { - // We always use infinite duration. - // Wall-E MUST delete processed tasks. - if (task.GetAction() == "reboot") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "power-off") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "change-disk") { - action.SetType(TAction::REPLACE_DEVICES); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "change-memory") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "profile") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "redeploy") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "repair-link") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "repair-bmc") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "repair-overheat") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "repair-capping") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else if (task.GetAction() == "temporary-unreachable") { - action.SetType(TAction::SHUTDOWN_HOST); - action.SetDuration(TDuration::Max().GetValue()); - } else - Y_ABORT("Unknown action"); - for (auto &host : task.GetHosts()) { - auto &hostAction = *request->Record.AddActions(); - hostAction.CopyFrom(action); - hostAction.SetHost(host); + auto &action = *request->Record.AddActions(); + action.SetHost(host); + action.SetType(*it->second); + // We always use infinite duration. + // Wall-E MUST delete processed tasks. + action.SetDuration(TDuration::Max().GetValue()); if (action.GetType() == TAction::REPLACE_DEVICES) { for (const auto node : cluster->HostNodes(host)) { for (auto &pdiskId : node->PDisks) - *hostAction.AddDevices() = cluster->PDisk(pdiskId).GetDeviceName(); + *action.AddDevices() = cluster->PDisk(pdiskId).GetDeviceName(); } } } @@ -204,11 +160,28 @@ private: ReplyAndDie(Response, ctx); } + static const THashMap<TString, std::optional<TAction::EType>> Actions; TEvCms::TEvWalleCreateTaskRequest::TPtr RequestEvent; TAutoPtr<TEvCms::TEvWalleCreateTaskResponse> Response; TActorId Cms; }; +const THashMap<TString, std::optional<TAction::EType>> TWalleCreateTaskAdapter::Actions = { + {"reboot", TAction::REBOOT_HOST}, + {"power-off", TAction::SHUTDOWN_HOST}, + {"change-disk", TAction::REPLACE_DEVICES}, + {"change-memory", TAction::SHUTDOWN_HOST}, + {"profile", TAction::SHUTDOWN_HOST}, + {"redeploy", TAction::SHUTDOWN_HOST}, + {"repair-link", TAction::SHUTDOWN_HOST}, + {"repair-bmc", TAction::SHUTDOWN_HOST}, + {"repair-overheat", TAction::SHUTDOWN_HOST}, + {"repair-capping", TAction::SHUTDOWN_HOST}, + {"temporary-unreachable", TAction::SHUTDOWN_HOST}, + {"prepare", std::nullopt}, + {"deactivate", std::nullopt}, +}; + IActor *CreateWalleAdapter(TEvCms::TEvWalleCreateTaskRequest::TPtr &ev, TActorId cms) { return new TWalleCreateTaskAdapter(ev, cms); } diff --git a/ydb/core/protos/cms.proto b/ydb/core/protos/cms.proto index 6658e0611f..86ee47ce66 100644 --- a/ydb/core/protos/cms.proto +++ b/ydb/core/protos/cms.proto @@ -106,6 +106,7 @@ message TAction { ADD_DEVICES = 7; REPLACE_DEVICES = 8; REMOVE_DEVICES = 9; + REBOOT_HOST = 10; // Same as SHUTDOWN_HOST, but forcibly allowed if host is down } optional EType Type = 1; |