aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorilnaz <ilnaz@ydb.tech>2023-11-01 11:57:00 +0300
committerilnaz <ilnaz@ydb.tech>2023-11-01 12:27:23 +0300
commit9793ffdc8db2b9940f15deebe3b951504bac64be (patch)
tree9b0299831bb7349201df3e0388cdc5651bc916d3
parent7f97144a42658fab5a97c83c0c66b0ad957b5b0f (diff)
downloadydb-9793ffdc8db2b9940f15deebe3b951504bac64be.tar.gz
Allow to reboot down nodes KIKIMR-8420
-rw-r--r--ydb/core/cms/cluster_info.cpp4
-rw-r--r--ydb/core/cms/cms.cpp6
-rw-r--r--ydb/core/cms/cms_ut.cpp13
-rw-r--r--ydb/core/cms/walle_create_task_adapter.cpp89
-rw-r--r--ydb/core/protos/cms.proto1
5 files changed, 55 insertions, 58 deletions
diff --git a/ydb/core/cms/cluster_info.cpp b/ydb/core/cms/cluster_info.cpp
index 0893fa2ebc..d14f6de637 100644
--- a/ydb/core/cms/cluster_info.cpp
+++ b/ydb/core/cms/cluster_info.cpp
@@ -646,6 +646,7 @@ void TClusterInfo::ApplyActionWithoutLog(const NKikimrCms::TAction &action)
switch (action.GetType()) {
case TAction::RESTART_SERVICES:
case TAction::SHUTDOWN_HOST:
+ case TAction::REBOOT_HOST:
if (auto nodes = NodePtrs(action.GetHost(), MakeServices(action))) {
for (const auto node : nodes) {
for (auto &nodeGroup: node->NodeGroups)
@@ -696,6 +697,7 @@ TSet<TLockableItem *> TClusterInfo::FindLockedItems(const NKikimrCms::TAction &a
switch (action.GetType()) {
case TAction::RESTART_SERVICES:
case TAction::SHUTDOWN_HOST:
+ case TAction::REBOOT_HOST:
if (auto nodes = NodePtrs(action.GetHost(), MakeServices(action))) {
for (const auto node : nodes) {
res.insert(node);
@@ -750,6 +752,7 @@ ui64 TClusterInfo::AddLocks(const TPermissionInfo &permission, const TActorConte
if (item->State == DOWN
&& (permission.Action.GetType() == TAction::RESTART_SERVICES
|| permission.Action.GetType() == TAction::SHUTDOWN_HOST
+ || permission.Action.GetType() == TAction::REBOOT_HOST
|| permission.Action.GetType() == TAction::REPLACE_DEVICES)) {
item->State = RESTART;
lock = true;;
@@ -1001,6 +1004,7 @@ void TOperationLogManager::ApplyAction(const NKikimrCms::TAction &action,
switch (action.GetType()) {
case NKikimrCms::TAction::RESTART_SERVICES:
case NKikimrCms::TAction::SHUTDOWN_HOST:
+ case NKikimrCms::TAction::REBOOT_HOST:
if (auto nodes = clusterState->NodePtrs(action.GetHost(), MakeServices(action))) {
for (const auto node : nodes) {
for (auto &nodeGroup: node->NodeGroups)
diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp
index ffca58da2c..0834ef50c4 100644
--- a/ydb/core/cms/cms.cpp
+++ b/ydb/core/cms/cms.cpp
@@ -436,6 +436,7 @@ void TCms::AddPermissionExtensions(const TAction& action, TPermission& perm) con
switch (action.GetType()) {
case TAction::RESTART_SERVICES:
case TAction::SHUTDOWN_HOST:
+ case TAction::REBOOT_HOST:
AddHostExtensions(action.GetHost(), perm);
break;
default:
@@ -493,6 +494,7 @@ bool TCms::CheckAction(const TAction &action,
case TAction::RESTART_SERVICES:
return CheckActionRestartServices(action, opts, error, ctx);
case TAction::SHUTDOWN_HOST:
+ case TAction::REBOOT_HOST:
return CheckActionShutdownHost(action, opts, error, ctx);
case TAction::REPLACE_DEVICES:
return CheckActionReplaceDevices(action, opts.PermissionDuration, error);
@@ -581,8 +583,12 @@ bool TCms::CheckActionShutdownHost(const TAction &action,
TErrorInfo &error,
const TActorContext &ctx) const
{
+ const bool forciblyAllow = action.GetType() == TAction::REBOOT_HOST;
for (const auto node : ClusterInfo->HostNodes(action.GetHost())) {
if (!CheckActionShutdownNode(action, opts, *node, error, ctx)) {
+ if (forciblyAllow && node->State == DOWN) {
+ continue;
+ }
return false;
}
}
diff --git a/ydb/core/cms/cms_ut.cpp b/ydb/core/cms/cms_ut.cpp
index 3ff189cbd2..a82ea3f608 100644
--- a/ydb/core/cms/cms_ut.cpp
+++ b/ydb/core/cms/cms_ut.cpp
@@ -729,6 +729,19 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
env.CheckWalleCheckTask("task-2", TStatus::ALLOW, env.GetNodeId(1));
}
+ Y_UNIT_TEST(WalleRebootDownNode)
+ {
+ TCmsTestEnv env(8);
+
+ // allow
+ env.CheckWalleCreateTask("task-1", "reboot", false, TStatus::ALLOW, env.GetNodeId(0));
+ // disallow (up)
+ env.CheckWalleCreateTask("task-2", "reboot", false, TStatus::DISALLOW_TEMP, env.GetNodeId(1));
+ // allow (down)
+ TFakeNodeWhiteboardService::Info[env.GetNodeId(1)].Connected = false;
+ env.CheckWalleCheckTask("task-2", TStatus::ALLOW, env.GetNodeId(1));
+ }
+
Y_UNIT_TEST(Notifications)
{
TCmsTestEnv env(8);
diff --git a/ydb/core/cms/walle_create_task_adapter.cpp b/ydb/core/cms/walle_create_task_adapter.cpp
index c1a2866e37..5c7ce0393b 100644
--- a/ydb/core/cms/walle_create_task_adapter.cpp
+++ b/ydb/core/cms/walle_create_task_adapter.cpp
@@ -5,6 +5,8 @@
#include <library/cpp/actors/core/actor_bootstrapped.h>
#include <library/cpp/actors/core/hfunc.h>
+#include <optional>
+
namespace NKikimr::NCms {
using namespace NKikimrCms;
@@ -27,19 +29,7 @@ public:
LOG_INFO(ctx, NKikimrServices::CMS, "Processing Wall-E request: %s",
rec.ShortDebugString().data());
- if (rec.GetAction() != "reboot"
- && rec.GetAction() != "power-off"
- && rec.GetAction() != "change-disk"
- && rec.GetAction() != "change-memory"
- && rec.GetAction() != "profile"
- && rec.GetAction() != "redeploy"
- && rec.GetAction() != "prepare"
- && rec.GetAction() != "repair-link"
- && rec.GetAction() != "repair-bmc"
- && rec.GetAction() != "repair-overheat"
- && rec.GetAction() != "repair-capping"
- && rec.GetAction() != "deactivate"
- && rec.GetAction() != "temporary-unreachable") {
+ if (!Actions.contains(rec.GetAction())) {
ReplyWithErrorAndDie(TStatus::WRONG_REQUEST, "Unsupported action", ctx);
return;
}
@@ -132,9 +122,10 @@ private:
request->Record.SetSchedule(true);
request->Record.SetDryRun(task.GetDryRun());
- TAction action;
- if (task.GetAction() == "prepare"
- || task.GetAction() == "deactivate") {
+ auto it = Actions.find(task.GetAction());
+ Y_ABORT_UNLESS(it != Actions.end());
+
+ if (!it->second) {
TAutoPtr<TEvCms::TEvWalleCreateTaskResponse> resp = new TEvCms::TEvWalleCreateTaskResponse;
resp->Record.SetTaskId(task.GetTaskId());
resp->Record.MutableHosts()->CopyFrom(task.GetHosts());
@@ -142,52 +133,17 @@ private:
ReplyAndDie(resp.Release(), ctx);
return;
} else {
- // We always use infinite duration.
- // Wall-E MUST delete processed tasks.
- if (task.GetAction() == "reboot") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "power-off") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "change-disk") {
- action.SetType(TAction::REPLACE_DEVICES);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "change-memory") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "profile") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "redeploy") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "repair-link") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "repair-bmc") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "repair-overheat") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "repair-capping") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else if (task.GetAction() == "temporary-unreachable") {
- action.SetType(TAction::SHUTDOWN_HOST);
- action.SetDuration(TDuration::Max().GetValue());
- } else
- Y_ABORT("Unknown action");
-
for (auto &host : task.GetHosts()) {
- auto &hostAction = *request->Record.AddActions();
- hostAction.CopyFrom(action);
- hostAction.SetHost(host);
+ auto &action = *request->Record.AddActions();
+ action.SetHost(host);
+ action.SetType(*it->second);
+ // We always use infinite duration.
+ // Wall-E MUST delete processed tasks.
+ action.SetDuration(TDuration::Max().GetValue());
if (action.GetType() == TAction::REPLACE_DEVICES) {
for (const auto node : cluster->HostNodes(host)) {
for (auto &pdiskId : node->PDisks)
- *hostAction.AddDevices() = cluster->PDisk(pdiskId).GetDeviceName();
+ *action.AddDevices() = cluster->PDisk(pdiskId).GetDeviceName();
}
}
}
@@ -204,11 +160,28 @@ private:
ReplyAndDie(Response, ctx);
}
+ static const THashMap<TString, std::optional<TAction::EType>> Actions;
TEvCms::TEvWalleCreateTaskRequest::TPtr RequestEvent;
TAutoPtr<TEvCms::TEvWalleCreateTaskResponse> Response;
TActorId Cms;
};
+const THashMap<TString, std::optional<TAction::EType>> TWalleCreateTaskAdapter::Actions = {
+ {"reboot", TAction::REBOOT_HOST},
+ {"power-off", TAction::SHUTDOWN_HOST},
+ {"change-disk", TAction::REPLACE_DEVICES},
+ {"change-memory", TAction::SHUTDOWN_HOST},
+ {"profile", TAction::SHUTDOWN_HOST},
+ {"redeploy", TAction::SHUTDOWN_HOST},
+ {"repair-link", TAction::SHUTDOWN_HOST},
+ {"repair-bmc", TAction::SHUTDOWN_HOST},
+ {"repair-overheat", TAction::SHUTDOWN_HOST},
+ {"repair-capping", TAction::SHUTDOWN_HOST},
+ {"temporary-unreachable", TAction::SHUTDOWN_HOST},
+ {"prepare", std::nullopt},
+ {"deactivate", std::nullopt},
+};
+
IActor *CreateWalleAdapter(TEvCms::TEvWalleCreateTaskRequest::TPtr &ev, TActorId cms) {
return new TWalleCreateTaskAdapter(ev, cms);
}
diff --git a/ydb/core/protos/cms.proto b/ydb/core/protos/cms.proto
index 6658e0611f..86ee47ce66 100644
--- a/ydb/core/protos/cms.proto
+++ b/ydb/core/protos/cms.proto
@@ -106,6 +106,7 @@ message TAction {
ADD_DEVICES = 7;
REPLACE_DEVICES = 8;
REMOVE_DEVICES = 9;
+ REBOOT_HOST = 10; // Same as SHUTDOWN_HOST, but forcibly allowed if host is down
}
optional EType Type = 1;