aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorilnaz <ilnaz@ydb.tech>2023-12-06 13:42:57 +0300
committerilnaz <ilnaz@ydb.tech>2023-12-06 14:29:00 +0300
commitbfb11ee270b827ddd066a35ee4aa96c389c87603 (patch)
tree85ec6e8a319b1bea22a1156761f80216b6a7d319
parentef5138754487f032c1a415f0faf2101e9e42496a (diff)
downloadydb-bfb11ee270b827ddd066a35ee4aa96c389c87603.tar.gz
Prepare host before granting permission KIKIMR-20254
-rw-r--r--ydb/core/cms/cluster_info.cpp12
-rw-r--r--ydb/core/cms/cluster_info.h4
-rw-r--r--ydb/core/cms/cms.cpp175
-rw-r--r--ydb/core/cms/cms_impl.h14
-rw-r--r--ydb/core/cms/cms_state.h1
-rw-r--r--ydb/core/cms/cms_tx_load_state.cpp12
-rw-r--r--ydb/core/cms/cms_tx_remove_permissions.cpp36
-rw-r--r--ydb/core/cms/cms_tx_remove_request.cpp12
-rw-r--r--ydb/core/cms/cms_tx_store_permissions.cpp16
-rw-r--r--ydb/core/cms/cms_ut.cpp103
-rw-r--r--ydb/core/cms/cms_ut_common.cpp56
-rw-r--r--ydb/core/cms/cms_ut_common.h15
-rw-r--r--ydb/core/cms/sentinel.cpp54
-rw-r--r--ydb/core/cms/sentinel.h16
-rw-r--r--ydb/core/cms/sentinel_impl.h8
-rw-r--r--ydb/core/cms/sentinel_ut.cpp2
-rw-r--r--ydb/core/cms/ut_helpers.cpp4
-rw-r--r--ydb/core/cms/ut_helpers.h41
-rw-r--r--ydb/core/protos/cms.proto2
-rw-r--r--ydb/core/protos/out/out_cms.cpp4
20 files changed, 506 insertions, 81 deletions
diff --git a/ydb/core/cms/cluster_info.cpp b/ydb/core/cms/cluster_info.cpp
index ad4d5fb367..ffeab7b70d 100644
--- a/ydb/core/cms/cluster_info.cpp
+++ b/ydb/core/cms/cluster_info.cpp
@@ -795,6 +795,18 @@ ui64 TClusterInfo::AddExternalLocks(const TNotificationInfo &notification, const
return locks;
}
+void TClusterInfo::SetHostMarkers(const TString &hostName, const THashSet<NKikimrCms::EMarker> &markers) {
+ for (auto node : NodePtrs(hostName)) {
+ node->Markers.insert(markers.begin(), markers.end());
+ }
+}
+
+void TClusterInfo::ResetHostMarkers(const TString &hostName) {
+ for (auto node : NodePtrs(hostName)) {
+ node->Markers.clear();
+ }
+}
+
void TClusterInfo::ApplyDowntimes(const TDowntimes &downtimes)
{
for (auto &pr : downtimes.NodeDowntimes) {
diff --git a/ydb/core/cms/cluster_info.h b/ydb/core/cms/cluster_info.h
index d118336636..5ba7fddd12 100644
--- a/ydb/core/cms/cluster_info.h
+++ b/ydb/core/cms/cluster_info.h
@@ -297,6 +297,7 @@ public:
std::list<TScheduledLock> ScheduledLocks;
TVector<TTemporaryLock> TempLocks;
ui64 DeactivatedLocksOrder = Max<ui64>();
+ THashSet<NKikimrCms::EMarker> Markers;
};
using TLockableItemPtr = TIntrusivePtr<TLockableItem>;
@@ -902,6 +903,9 @@ public:
ui64 AddExternalLocks(const TNotificationInfo &notification, const TActorContext *ctx);
+ void SetHostMarkers(const TString &hostName, const THashSet<NKikimrCms::EMarker> &markers);
+ void ResetHostMarkers(const TString &hostName);
+
void ApplyDowntimes(const TDowntimes &downtimes);
void UpdateDowntimes(TDowntimes &downtimes, const TActorContext &ctx);
diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp
index 5416b1870d..8623a2f098 100644
--- a/ydb/core/cms/cms.cpp
+++ b/ydb/core/cms/cms.cpp
@@ -253,6 +253,8 @@ void TCms::AdjustInfo(TClusterInfoPtr &info, const TActorContext &ctx) const
info->AddLocks(entry.second, &ctx);
for (const auto &entry : State->Notifications)
info->AddExternalLocks(entry.second, &ctx);
+ for (const auto &entry : State->HostMarkers)
+ info->SetHostMarkers(entry.first, entry.second);
}
namespace {
@@ -284,13 +286,20 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
TStatus::UNKNOWN,
});
bool allowPartial = request.GetPartialPermissionAllowed();
- bool schedule = request.GetSchedule() && !request.GetDryRun();
+ bool schedule = (request.GetSchedule() || request.GetPrepare()) && !request.GetDryRun();
+
+ if (request.GetPrepare() && request.ActionsSize() > 1) {
+ response.MutableStatus()->SetCode(TStatus::WRONG_REQUEST);
+ response.MutableStatus()->SetReason("Cannot prepare more than one action at once");
+ return false;
+ }
response.MutableStatus()->SetCode(TStatus::ALLOW);
if (schedule) {
scheduled.SetUser(request.GetUser());
scheduled.SetPartialPermissionAllowed(allowPartial);
- scheduled.SetSchedule(true);
+ scheduled.SetSchedule(request.GetSchedule());
+ scheduled.SetPrepare(request.GetPrepare());
scheduled.SetReason(request.GetReason());
if (request.HasDuration())
scheduled.SetDuration(request.GetDuration());
@@ -331,7 +340,12 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
LOG_DEBUG(ctx, NKikimrServices::CMS, "Checking action: %s", action.ShortDebugString().data());
- if (CheckAction(action, opts, error, ctx)) {
+ bool prepared = !request.GetPrepare();
+ if (!prepared) {
+ prepared = CheckPrepareHost(action, error);
+ }
+
+ if (prepared && CheckAction(action, opts, error, ctx)) {
LOG_DEBUG(ctx, NKikimrServices::CMS, "Result: ALLOW");
auto *permission = response.AddPermissions();
@@ -482,11 +496,36 @@ bool TCms::CheckAccess(const TString &token,
return false;
}
-bool TCms::CheckAction(const TAction &action,
- const TActionOptions &opts,
- TErrorInfo &error,
- const TActorContext &ctx) const
-{
+bool TCms::CheckPrepareHost(const TAction &action, TErrorInfo &error) const {
+ if (!State->Sentinel) {
+ error.Code = TStatus::ERROR;
+ error.Reason = "Cannot prepare host while Sentinel (self heal) is disabled";
+ return false;
+ }
+
+ switch (action.GetType()) {
+ case TAction::RESTART_SERVICES:
+ case TAction::SHUTDOWN_HOST:
+ case TAction::REBOOT_HOST:
+ break;
+ default:
+ error.Code = TStatus::WRONG_REQUEST;
+ error.Reason = TStringBuilder() << "Cannot prepare host for action: " << action.GetType();
+ return false;
+ }
+
+ for (const auto node : ClusterInfo->HostNodes(action.GetHost())) {
+ if (!node->VDisks.empty()) {
+ error.Code = TStatus::DISALLOW_TEMP;
+ error.Reason = TStringBuilder() << "Host " << action.GetHost() << " is not prepared yet";
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool TCms::CheckAction(const TAction &action, const TActionOptions &opts, TErrorInfo &error, const TActorContext &ctx) const {
if (!IsActionHostValid(action, error))
return false;
@@ -505,12 +544,11 @@ bool TCms::CheckAction(const TAction &action,
case TAction::ADD_DEVICES:
case TAction::REMOVE_DEVICES:
error.Code = TStatus::ERROR;
- error.Reason = Sprintf("Unsupported action action '%s'",
- TAction::EType_Name(action.GetType()).data());
+ error.Reason = TStringBuilder() << "Unsupported action: " << action.GetType();
return false;
default:
- error.Code = TStatus::WRONG_REQUEST;
- error.Reason = Sprintf("Unknown action '%s'", TAction::EType_Name(action.GetType()).data());
+ error.Code = TStatus::WRONG_REQUEST;
+ error.Reason = TStringBuilder() << "Unknown action: " << static_cast<int>(action.GetType());
return false;
}
}
@@ -924,7 +962,10 @@ void TCms::AcceptPermissions(TPermissionResponse &resp, const TString &requestId
auto &permission = *resp.MutablePermissions(i);
permission.SetId(owner + "-p-" + ToString(State->NextPermissionId++));
State->Permissions.emplace(permission.GetId(), TPermissionInfo(permission, requestId, owner));
- LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Accepting permission");
+ LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Accepting permission"
+ << ": id# " << permission.GetId()
+ << ", requestId# " << requestId
+ << ", owner# " << owner);
ClusterInfo->AddLocks(permission, requestId, owner, &ctx);
if (!check) {
@@ -1120,6 +1161,9 @@ void TCms::AddHostState(const TClusterInfoPtr &clusterInfo, const TNodeInfo &nod
host->SetInterconnectPort(node.IcPort);
host->SetTimestamp(timestamp.GetValue());
node.Location.Serialize(host->MutableLocation(), false);
+ for (auto marker : node.Markers) {
+ host->AddMarkers(marker);
+ }
if (node.State == UP || node.VDisks || node.PDisks) {
for (const auto flag : GetEnumAllValues<EService>()) {
if (!(node.Services & flag)) {
@@ -1141,6 +1185,9 @@ void TCms::AddHostState(const TClusterInfoPtr &clusterInfo, const TNodeInfo &nod
device->SetName(vdisk.GetDeviceName());
device->SetState(vdisk.State);
device->SetTimestamp(timestamp.GetValue());
+ for (auto marker : vdisk.Markers) {
+ device->AddMarkers(marker);
+ }
}
for (const auto &pdId : node.PDisks) {
@@ -1149,6 +1196,9 @@ void TCms::AddHostState(const TClusterInfoPtr &clusterInfo, const TNodeInfo &nod
device->SetName(pdisk.GetDeviceName());
device->SetState(pdisk.State);
device->SetTimestamp(timestamp.GetValue());
+ for (auto marker : pdisk.Markers) {
+ device->AddMarkers(marker);
+ }
}
}
}
@@ -1291,10 +1341,17 @@ void TCms::RemoveRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActor
resp->Record.MutableStatus()->SetReason("Unknown request " + id);
} else {
const auto &request = it->second;
+
if (request.Owner != user) {
resp->Record.MutableStatus()->SetCode(TStatus::WRONG_REQUEST);
resp->Record.MutableStatus()->SetReason(Sprintf("Request %s doesn't belong to %s", id.data(), user.data()));
}
+
+ if (request.Request.GetPrepare() && request.Request.ActionsSize() < 1) {
+ resp->Record.MutableStatus()->SetCode(TStatus::WRONG_REQUEST);
+ resp->Record.MutableStatus()->SetReason(
+ Sprintf("Request %s used to prepare action and cannot be deleted while permission is valid", id.data()));
+ }
}
LOG_DEBUG(ctx, NKikimrServices::CMS, "Resulting status: %s %s",
@@ -1470,6 +1527,65 @@ void TCms::PersistNodeTenants(TTransactionContext& txc, const TActorContext& ctx
}
}
+TVector<TCms::THostMarkers> TCms::SetHostMarker(const TString &host, NKikimrCms::EMarker marker, TTransactionContext &txc, const TActorContext &ctx) {
+ if (State->HostMarkers.contains(host)) {
+ return {};
+ }
+
+ AuditLog(ctx, TStringBuilder() << "Add host marker"
+ << ": host# " << host
+ << ", marker# " << marker);
+
+ State->HostMarkers[host] = {marker};
+ NIceDb::TNiceDb db(txc.DB);
+ db.Table<Schema::HostMarkers>().Key(host).Update(
+ NIceDb::TUpdate<Schema::HostMarkers::Markers>(TVector<NKikimrCms::EMarker>{marker})
+ );
+
+ TVector<TCms::THostMarkers> updateMarkers;
+ if (ClusterInfo) {
+ for (const auto node : ClusterInfo->HostNodes(host)) {
+ updateMarkers.push_back({
+ .NodeId = node->NodeId,
+ .Markers = {marker},
+ });
+ }
+ }
+
+ return updateMarkers;
+}
+
+TVector<TCms::THostMarkers> TCms::ResetHostMarkers(const TString &host, TTransactionContext &txc, const TActorContext &ctx) {
+ if (!State->HostMarkers.contains(host)) {
+ return {};
+ }
+
+ AuditLog(ctx, TStringBuilder() << "Reset host markers"
+ << ": host# " << host);
+
+ State->HostMarkers.erase(host);
+ NIceDb::TNiceDb db(txc.DB);
+ db.Table<Schema::HostMarkers>().Key(host).Delete();
+
+ TVector<TCms::THostMarkers> updateMarkers;
+ if (ClusterInfo) {
+ for (const auto node : ClusterInfo->HostNodes(host)) {
+ updateMarkers.push_back({
+ .NodeId = node->NodeId,
+ .Markers = {},
+ });
+ }
+ }
+
+ return updateMarkers;
+}
+
+void TCms::SentinelUpdateHostMarkers(TVector<TCms::THostMarkers> &&updateMarkers, const TActorContext &ctx) {
+ if (updateMarkers) {
+ ctx.Send(State->Sentinel, new TEvSentinel::TEvUpdateHostMarkers(std::move(updateMarkers)));
+ }
+}
+
void TCms::ProcessQueue()
{
// To avoid getting stuck in the processing queue for too long,
@@ -1728,6 +1844,21 @@ void TCms::Handle(TEvCms::TEvPermissionRequest::TPtr &ev,
}
}
+ if (rec.GetPrepare()) {
+ for (const auto &action : rec.GetActions()) {
+ if (State->HostMarkers.contains(action.GetHost())) {
+ return ReplyWithError<TEvCms::TEvPermissionResponse>(
+ ev, TStatus::WRONG_REQUEST, TStringBuilder() << "Host '" << action.GetHost() << "' is preparing", ctx);
+ }
+ for (const auto node : ClusterInfo->HostNodes(action.GetHost())) {
+ if (State->HostMarkers.contains(ToString(node->NodeId))) {
+ return ReplyWithError<TEvCms::TEvPermissionResponse>(
+ ev, TStatus::WRONG_REQUEST, TStringBuilder() << "Node '" << node->NodeId << "' is preparing", ctx);
+ }
+ }
+ }
+ }
+
bool ok = CheckPermissionRequest(rec, resp->Record, scheduled.Request, ctx);
// Schedule request if required.
@@ -1738,7 +1869,7 @@ void TCms::Handle(TEvCms::TEvPermissionRequest::TPtr &ev,
resp->Record.SetRequestId(reqId);
TAutoPtr<TRequestInfo> copy;
- if (scheduled.Request.ActionsSize()) {
+ if (scheduled.Request.ActionsSize() || scheduled.Request.GetPrepare()) {
scheduled.Owner = user;
scheduled.Order = State->NextRequestId - 1;
scheduled.RequestId = reqId;
@@ -1803,14 +1934,14 @@ void TCms::Handle(TEvCms::TEvCheckRequest::TPtr &ev, const TActorContext &ctx)
auto order = request.Order;
State->ScheduledRequests.erase(it);
- if (scheduled.Request.ActionsSize()) {
+ if (scheduled.Request.ActionsSize() || scheduled.Request.GetPrepare()) {
scheduled.Owner = user;
scheduled.Order = order;
scheduled.RequestId = rec.GetRequestId();
resp->Record.SetRequestId(scheduled.RequestId);
copy = new TRequestInfo(scheduled);
- State->ScheduledRequests.emplace(scheduled.RequestId, std::move(scheduled));
+ State->ScheduledRequests.emplace(rec.GetRequestId(), std::move(scheduled));
} else {
scheduled.RequestId = rec.GetRequestId();
scheduled.Owner = user;
@@ -1818,7 +1949,7 @@ void TCms::Handle(TEvCms::TEvCheckRequest::TPtr &ev, const TActorContext &ctx)
}
if (ok)
- AcceptPermissions(resp->Record, scheduled.RequestId, user, ctx, true);
+ AcceptPermissions(resp->Record, rec.GetRequestId(), user, ctx, true);
auto handle = new IEventHandle(ev->Sender, SelfId(), resp.Release(), 0, ev->Cookie);
Execute(CreateTxStorePermissions(std::move(ev->Release()), handle, user, std::move(copy)), ctx);
@@ -1897,6 +2028,7 @@ bool TCms::IsValidNotificationAction(const TAction &action, TInstant time,
case TAction::RESTART_SERVICES:
return CheckNotificationRestartServices(action, time, error, ctx);
case TAction::SHUTDOWN_HOST:
+ case TAction::REBOOT_HOST:
return CheckNotificationShutdownHost(action, time, error, ctx);
case TAction::REPLACE_DEVICES:
return CheckNotificationReplaceDevices(action, time, error, ctx);
@@ -1907,12 +2039,11 @@ bool TCms::IsValidNotificationAction(const TAction &action, TInstant time,
case TAction::ADD_DEVICES:
case TAction::REMOVE_DEVICES:
error.Code = TStatus::ERROR;
- error.Reason = Sprintf("Unsupported action action '%s'",
- TAction::EType_Name(action.GetType()).data());
+ error.Reason = TStringBuilder() << "Unsupported action: " << action.GetType();
return false;
default:
- error.Code = TStatus::WRONG_REQUEST;
- error.Reason = Sprintf("Unknown action '%s'", TAction::EType_Name(action.GetType()).data());
+ error.Code = TStatus::WRONG_REQUEST;
+ error.Reason = TStringBuilder() << "Unknown action: " << static_cast<int>(action.GetType());
return false;
}
}
@@ -2079,7 +2210,7 @@ void TCms::Handle(TEvCms::TEvGetLogTailRequest::TPtr &ev, const TActorContext &c
void TCms::Handle(TEvCms::TEvGetSentinelStateRequest::TPtr &ev, const TActorContext &ctx)
{
- if(State->Sentinel) {
+ if (State->Sentinel) {
ctx.Send(ev->Forward(State->Sentinel));
} else {
auto Response = MakeHolder<TEvCms::TEvGetSentinelStateResponse>();
diff --git a/ydb/core/cms/cms_impl.h b/ydb/core/cms/cms_impl.h
index d2dace9f3e..7bcced14e7 100644
--- a/ydb/core/cms/cms_impl.h
+++ b/ydb/core/cms/cms_impl.h
@@ -4,6 +4,7 @@
#include "cms.h"
#include "config.h"
#include "logger.h"
+#include "sentinel.h"
#include "services.h"
#include "walle.h"
@@ -78,6 +79,11 @@ public:
void PersistNodeTenants(TTransactionContext &txc, const TActorContext &ctx);
+ using THostMarkers = TEvSentinel::TEvUpdateHostMarkers::THostMarkers;
+ TVector<THostMarkers> SetHostMarker(const TString &host, NKikimrCms::EMarker marker, TTransactionContext &txc, const TActorContext &ctx);
+ TVector<THostMarkers> ResetHostMarkers(const TString &host, TTransactionContext &txc, const TActorContext &ctx);
+ void SentinelUpdateHostMarkers(TVector<THostMarkers> &&updateMarkers, const TActorContext &ctx);
+
static void AddHostState(const TClusterInfoPtr &clusterInfo, const TNodeInfo &node, NKikimrCms::TClusterStateResponse &resp, TInstant timestamp);
private:
@@ -294,8 +300,12 @@ private:
NKikimrCms::TStatus::ECode &code,
TString &error,
const TActorContext &ctx);
- bool CheckAction(const NKikimrCms::TAction &action, const TActionOptions &options,
- TErrorInfo &error, const TActorContext &ctx) const;
+ bool CheckPrepareHost(const NKikimrCms::TAction &action,
+ TErrorInfo &error) const;
+ bool CheckAction(const NKikimrCms::TAction &action,
+ const TActionOptions &opts,
+ TErrorInfo &error,
+ const TActorContext &ctx) const;
bool CheckActionShutdownNode(const NKikimrCms::TAction &action,
const TActionOptions &options,
const TNodeInfo &node,
diff --git a/ydb/core/cms/cms_state.h b/ydb/core/cms/cms_state.h
index d2581b5936..8a2c33ad95 100644
--- a/ydb/core/cms/cms_state.h
+++ b/ydb/core/cms/cms_state.h
@@ -31,6 +31,7 @@ struct TCmsState : public TAtomicRefCount<TCmsState> {
THashMap<TString, TPermissionInfo> Permissions;
THashMap<TString, TRequestInfo> ScheduledRequests;
THashMap<TString, TNotificationInfo> Notifications;
+ THashMap<TString, THashSet<NKikimrCms::EMarker>> HostMarkers;
TDowntimes Downtimes;
ui64 NextPermissionId = 0;
ui64 NextRequestId = 0;
diff --git a/ydb/core/cms/cms_tx_load_state.cpp b/ydb/core/cms/cms_tx_load_state.cpp
index c77e30157a..829ce9f945 100644
--- a/ydb/core/cms/cms_tx_load_state.cpp
+++ b/ydb/core/cms/cms_tx_load_state.cpp
@@ -34,6 +34,7 @@ public:
auto maintenanceTasksRowset = db.Table<Schema::MaintenanceTasks>().Range().Select<Schema::MaintenanceTasks::TColumns>();
auto notificationRowset = db.Table<Schema::Notification>().Range().Select<Schema::Notification::TColumns>();
auto nodeTenantRowset = db.Table<Schema::NodeTenant>().Range().Select<Schema::NodeTenant::TColumns>();
+ auto hostMarkersRowset = db.Table<Schema::HostMarkers>().Range().Select<Schema::HostMarkers::TColumns>();
auto logRowset = db.Table<Schema::LogRecords>().Range().Select<Schema::LogRecords::Timestamp>();
if (!paramRow.IsReady()
@@ -42,6 +43,7 @@ public:
|| !walleTaskRowset.IsReady()
|| !maintenanceTasksRowset.IsReady()
|| !notificationRowset.IsReady()
+ || !hostMarkersRowset.IsReady()
|| !logRowset.IsReady())
return false;
@@ -205,6 +207,16 @@ public:
return false;
}
+ while (!hostMarkersRowset.EndOfSet()) {
+ TString host = hostMarkersRowset.GetValue<Schema::HostMarkers::Host>();
+ TVector<NKikimrCms::EMarker> markers = hostMarkersRowset.GetValue<Schema::HostMarkers::Markers>();
+
+ state->HostMarkers[host].insert(markers.begin(), markers.end());
+
+ if (!hostMarkersRowset.Next())
+ return false;
+ }
+
if (!state->Downtimes.DbLoadState(txc, ctx))
return false;
diff --git a/ydb/core/cms/cms_tx_remove_permissions.cpp b/ydb/core/cms/cms_tx_remove_permissions.cpp
index 31d3c6b4a8..14a6869b5a 100644
--- a/ydb/core/cms/cms_tx_remove_permissions.cpp
+++ b/ydb/core/cms/cms_tx_remove_permissions.cpp
@@ -6,6 +6,12 @@
namespace NKikimr::NCms {
class TCms::TTxRemovePermissions : public TTransactionBase<TCms> {
+ void RemoveRequest(NIceDb::TNiceDb &db, const TString &reqId, const TActorContext &ctx, const TString &reason) {
+ Self->State->ScheduledRequests.erase(reqId);
+ db.Table<Schema::Request>().Key(reqId).Delete();
+ Self->AuditLog(ctx, reason);
+ }
+
public:
TTxRemovePermissions(TCms *self, TVector<TString> &&ids, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp, bool expired)
: TBase(self)
@@ -26,17 +32,29 @@ public:
if (!Self->State->Permissions.contains(id))
continue;
- auto requestId = Self->State->Permissions.find(id)->second.RequestId;
+ const auto &permission = Self->State->Permissions.find(id)->second;
+ const TString requestId = permission.RequestId;
+ const TString host = permission.Action.GetHost();
+
Self->State->Permissions.erase(id);
db.Table<Schema::Permission>().Key(id).Delete();
- if (Expired && Self->State->ScheduledRequests.contains(requestId)) {
- Self->State->ScheduledRequests.erase(requestId);
- db.Table<Schema::Request>().Key(requestId).Delete();
-
- Self->AuditLog(ctx, TStringBuilder() << "Remove request"
- << ": id# " << requestId
- << ", reason# " << "permission " << id << " has expired");
+ auto it = Self->State->ScheduledRequests.find(requestId);
+ if (it != Self->State->ScheduledRequests.end()) {
+ if (Expired) {
+ RemoveRequest(db, requestId, ctx, TStringBuilder() << "Remove request"
+ << ": id# " << requestId
+ << ", reason# " << "permission " << id << " has expired");
+ }
+
+ if (it->second.Request.GetPrepare()) {
+ auto ret = Self->ResetHostMarkers(host, txc, ctx);
+ std::move(ret.begin(), ret.end(), std::back_inserter(UpdateMarkers));
+
+ RemoveRequest(db, requestId, ctx, TStringBuilder() << "Remove request"
+ << ": id# " << requestId
+ << ", reason# " << "permission " << id << " was removed");
+ }
}
if (Self->State->WalleRequests.contains(requestId)) {
@@ -66,6 +84,7 @@ public:
}
Self->RemoveEmptyTasks(ctx);
+ Self->SentinelUpdateHostMarkers(std::move(UpdateMarkers), ctx);
}
private:
@@ -73,6 +92,7 @@ private:
TAutoPtr<IEventHandle> Response;
TVector<TString> Ids;
bool Expired;
+ TVector<TEvSentinel::TEvUpdateHostMarkers::THostMarkers> UpdateMarkers;
};
ITransaction *TCms::CreateTxRemovePermissions(TVector<TString> ids, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp,
diff --git a/ydb/core/cms/cms_tx_remove_request.cpp b/ydb/core/cms/cms_tx_remove_request.cpp
index f584979d01..c2191eec84 100644
--- a/ydb/core/cms/cms_tx_remove_request.cpp
+++ b/ydb/core/cms/cms_tx_remove_request.cpp
@@ -20,6 +20,16 @@ public:
bool Execute(TTransactionContext &txc, const TActorContext &ctx) override {
LOG_DEBUG(ctx, NKikimrServices::CMS, "TTxRemoveRequest Execute");
+ auto it = Self->State->ScheduledRequests.find(Id);
+ if (it != Self->State->ScheduledRequests.end()) {
+ if (it->second.Request.GetPrepare()) {
+ for (const auto &action : it->second.Request.GetActions()) {
+ auto ret = Self->ResetHostMarkers(action.GetHost(), txc, ctx);
+ std::move(ret.begin(), ret.end(), std::back_inserter(UpdateMarkers));
+ }
+ }
+ }
+
NIceDb::TNiceDb db(txc.DB);
db.Table<Schema::Request>().Key(Id).Delete();
Self->State->ScheduledRequests.erase(Id);
@@ -40,12 +50,14 @@ public:
}
Self->RemoveEmptyTasks(ctx);
+ Self->SentinelUpdateHostMarkers(std::move(UpdateMarkers), ctx);
}
private:
THolder<IEventBase> Request;
TAutoPtr<IEventHandle> Response;
TString Id;
+ TVector<TEvSentinel::TEvUpdateHostMarkers::THostMarkers> UpdateMarkers;
};
ITransaction *TCms::CreateTxRemoveRequest(const TString &id, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp) {
diff --git a/ydb/core/cms/cms_tx_store_permissions.cpp b/ydb/core/cms/cms_tx_store_permissions.cpp
index 069b722598..d0967340fd 100644
--- a/ydb/core/cms/cms_tx_store_permissions.cpp
+++ b/ydb/core/cms/cms_tx_store_permissions.cpp
@@ -69,13 +69,18 @@ public:
<< ": id# " << id
<< ", validity# " << TInstant::MicroSeconds(deadline)
<< ", action# " << actionStr);
+
+ if (Scheduled && Scheduled->Request.GetPrepare()) {
+ auto ret = Self->SetHostMarker(permission.GetAction().GetHost(), NKikimrCms::MARKER_DISK_FAULTY, txc, ctx);
+ std::move(ret.begin(), ret.end(), std::back_inserter(UpdateMarkers));
+ }
}
if (Scheduled) {
auto &id = Scheduled->RequestId;
auto &owner = Scheduled->Owner;
- if (Scheduled->Request.ActionsSize()) {
+ if (Scheduled->Request.ActionsSize() || Scheduled->Request.GetPrepare()) {
ui64 order = Scheduled->Order;
TString requestStr;
google::protobuf::TextFormat::PrintToString(Scheduled->Request, &requestStr);
@@ -90,6 +95,13 @@ public:
<< ", owner# " << owner
<< ", order# " << order
<< ", body# " << requestStr);
+
+ if (Scheduled->Request.GetPrepare()) {
+ for (const auto &action : Scheduled->Request.GetActions()) {
+ auto ret = Self->SetHostMarker(action.GetHost(), NKikimrCms::MARKER_DISK_FAULTY, txc, ctx);
+ std::move(ret.begin(), ret.end(), std::back_inserter(UpdateMarkers));
+ }
+ }
} else {
db.Table<Schema::Request>().Key(id).Delete();
@@ -108,6 +120,7 @@ public:
Self->Reply(Request.Get(), Response, ctx);
Self->SchedulePermissionsCleanup(ctx);
+ Self->SentinelUpdateHostMarkers(std::move(UpdateMarkers), ctx);
}
private:
@@ -118,6 +131,7 @@ private:
const TMaybe<TString> MaintenanceTaskId;
ui64 NextPermissionId;
ui64 NextRequestId;
+ TVector<TEvSentinel::TEvUpdateHostMarkers::THostMarkers> UpdateMarkers;
};
ITransaction *TCms::CreateTxStorePermissions(THolder<IEventBase> req, TAutoPtr<IEventHandle> resp,
diff --git a/ydb/core/cms/cms_ut.cpp b/ydb/core/cms/cms_ut.cpp
index 77adaf0837..01253e002f 100644
--- a/ydb/core/cms/cms_ut.cpp
+++ b/ydb/core/cms/cms_ut.cpp
@@ -1684,6 +1684,109 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
env.DestroyDefaultCmsPipe();
}
+
+ Y_UNIT_TEST(PrepareHostShouldFailWhileSentinelIsDisabled)
+ {
+ TCmsTestEnv env(TTestEnvOpts(8).WithoutSentinel());
+ env.CheckPermissionRequest(
+ MakePermissionRequest(TRequestOptions("user").WithPrepare(),
+ MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
+ ),
+ TStatus::ERROR
+ );
+ }
+
+ Y_UNIT_TEST(PrepareHostShouldFailOnUnsupportedAction)
+ {
+ TCmsTestEnv env(TTestEnvOpts(8).WithSentinel());
+ env.CheckPermissionRequest(
+ MakePermissionRequest(TRequestOptions("user").WithPrepare(),
+ MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 60000000, env.PDiskName(0))
+ ),
+ TStatus::WRONG_REQUEST
+ );
+ }
+
+ Y_UNIT_TEST(PrepareHostShouldFailOnMultipleActions)
+ {
+ TCmsTestEnv env(TTestEnvOpts(8).WithSentinel());
+ env.CheckPermissionRequest(
+ MakePermissionRequest(TRequestOptions("user").WithPrepare(),
+ MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000),
+ MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000)
+ ),
+ TStatus::WRONG_REQUEST
+ );
+ }
+
+ Y_UNIT_TEST(PrepareHost)
+ {
+ auto opts = TTestEnvOpts(8).WithSentinel();
+ TCmsTestEnv env(opts);
+ env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG);
+
+ // ok
+ auto request1 = env.CheckPermissionRequest(
+ MakePermissionRequest(TRequestOptions("user").WithPrepare(),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage")
+ ),
+ TStatus::DISALLOW_TEMP
+ );
+ // forbid another prepare request for same host
+ env.CheckPermissionRequest(
+ MakePermissionRequest(TRequestOptions("user").WithPrepare(),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage")
+ ),
+ TStatus::WRONG_REQUEST
+ );
+
+ // "move" vdisks
+ auto& node = TFakeNodeWhiteboardService::Info[env.GetNodeId(0)];
+ node.VDisksMoved = true;
+ node.VDiskStateInfo.clear();
+ env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts);
+
+ // prepared
+ auto permission1 = env.CheckRequest("user", request1.GetRequestId(), false, TStatus::ALLOW, 1);
+ env.CheckRejectRequest("user", request1.GetRequestId(), false, TStatus::WRONG_REQUEST);
+ env.CheckDonePermission("user", permission1.GetPermissions(0).GetId());
+
+ // allow immediately
+ auto request2 = env.CheckPermissionRequest(
+ MakePermissionRequest(TRequestOptions("user").WithPrepare(),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage")
+ ),
+ TStatus::ALLOW
+ );
+ UNIT_ASSERT_VALUES_EQUAL(request2.PermissionsSize(), 1);
+
+ // check markers after restart
+ env.RestartCms();
+ env.CheckPermissionRequest(
+ MakePermissionRequest(TRequestOptions("user").WithPrepare(),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage")
+ ),
+ TStatus::WRONG_REQUEST
+ );
+
+ env.CheckRejectRequest("user", request2.GetRequestId(), false, TStatus::WRONG_REQUEST);
+ env.CheckDonePermission("user", request2.GetPermissions(0).GetId());
+
+ // restore vdisks
+ node.VDisksMoved = false;
+ env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts);
+
+ // prepare
+ auto request3 = env.CheckPermissionRequest(
+ MakePermissionRequest(TRequestOptions("user").WithPrepare(),
+ MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage")
+ ),
+ TStatus::DISALLOW_TEMP
+ );
+
+ // reject until prepared
+ env.CheckRejectRequest("user", request3.GetRequestId());
+ }
}
}
diff --git a/ydb/core/cms/cms_ut_common.cpp b/ydb/core/cms/cms_ut_common.cpp
index 01be0de05e..ab8843330d 100644
--- a/ydb/core/cms/cms_ut_common.cpp
+++ b/ydb/core/cms/cms_ut_common.cpp
@@ -187,23 +187,6 @@ void TFakeNodeWhiteboardService::Handle(TEvWhiteboard::TEvSystemStateRequest::TP
namespace {
-struct TFakeNodeInfo {
- struct TVDiskIDComparator {
- bool operator ()(const TVDiskID& a, const TVDiskID& b) const {
- return std::make_tuple(a.GroupID, a.FailRealm, a.FailDomain, a.VDisk)
- < std::make_tuple(b.GroupID, b.FailRealm, b.FailDomain, b.VDisk);
- }
- };
-
- TMap<TTabletId, NKikimrWhiteboard::TTabletStateInfo> TabletStateInfo;
- TMap<TString, NKikimrWhiteboard::TNodeStateInfo> NodeStateInfo;
- TMap<ui32, NKikimrWhiteboard::TPDiskStateInfo> PDiskStateInfo;
- TMap<TVDiskID, NKikimrWhiteboard::TVDiskStateInfo, TVDiskIDComparator> VDiskStateInfo;
- TMap<ui32, NKikimrWhiteboard::TBSGroupStateInfo> BSGroupStateInfo;
- NKikimrWhiteboard::TSystemStateInfo SystemStateInfo;
- bool Connected = true;
-};
-
class TFakeTenantPool : public TActorBootstrapped<TFakeTenantPool> {
public:
TVector<TString> Tenants;
@@ -243,7 +226,6 @@ public:
void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseConfig *config,
ui32 pdisks, ui32 vdiskPerPdisk = 4, const TNodeTenantsMap &tenants = {}, bool useMirror3dcErasure = false)
{
- TGuard<TMutex> guard(TFakeNodeWhiteboardService::Mutex);
ui32 numNodes = runtime.GetNodeCount();
ui32 numNodeGroups = pdisks * vdiskPerPdisk;
ui32 numGroups;
@@ -271,14 +253,16 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC
group.SetErasureSpecies("none");
}
- TFakeNodeWhiteboardService::Info.clear();
for (ui32 nodeIndex = 0; nodeIndex < numNodes; ++nodeIndex) {
ui32 nodeId = runtime.GetNodeId(nodeIndex);
- auto &node = TFakeNodeWhiteboardService::Info[nodeId];
- node.SystemStateInfo.SetVersion(ToString(GetProgramSvnRevision()));
- node.SystemStateInfo.SetStartTime(now.GetValue());
- node.SystemStateInfo.SetChangeTime(now.GetValue());
+ auto ret = TFakeNodeWhiteboardService::Info.emplace(nodeId, TFakeNodeInfo());
+ auto &node = ret.first->second;
+ if (ret.second) {
+ node.SystemStateInfo.SetVersion(ToString(GetProgramSvnRevision()));
+ node.SystemStateInfo.SetStartTime(now.GetValue());
+ node.SystemStateInfo.SetChangeTime(now.GetValue());
+ }
if (tenants.contains(nodeIndex)) {
node.SystemStateInfo.AddRoles("Tenant");
@@ -312,6 +296,10 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC
pdiskConfig.SetGuid(1);
pdiskConfig.SetDriveStatus(NKikimrBlobStorage::ACTIVE);
+ if (node.VDisksMoved) {
+ continue;
+ }
+
for (ui8 vdiskIndex = 0; vdiskIndex < vdiskPerPdisk; ++vdiskIndex) {
ui32 vdiskId = pdiskIndex * vdiskPerPdisk + vdiskIndex;
ui32 groupId = groupShift + vdiskId;
@@ -400,9 +388,7 @@ static NKikimrConfig::TBootstrap GenerateBootstrapConfig(TTestActorRuntime &runt
return res;
}
-static void SetupServices(TTestActorRuntime &runtime,
- const TNodeTenantsMap &tenants)
-{
+static void SetupServices(TTestActorRuntime &runtime, const TTestEnvOpts &options) {
const ui32 domainsNum = 1;
const ui32 disksInDomain = 1;
@@ -491,8 +477,8 @@ static void SetupServices(TTestActorRuntime &runtime,
runtime.AddLocalService(NNodeWhiteboard::MakeNodeWhiteboardServiceId(runtime.GetNodeId(nodeIndex)),
TActorSetupCmd(CreateFakeNodeWhiteboardService(), TMailboxType::Simple, 0), nodeIndex);
TVector<TString> nodeTenants;
- if (tenants.contains(nodeIndex))
- nodeTenants = tenants.at(nodeIndex);
+ if (options.Tenants.contains(nodeIndex))
+ nodeTenants = options.Tenants.at(nodeIndex);
runtime.AddLocalService(MakeTenantPoolID(runtime.GetNodeId(nodeIndex)),
TActorSetupCmd(new TFakeTenantPool(nodeTenants), TMailboxType::Simple, 0), nodeIndex);
}
@@ -512,7 +498,7 @@ static void SetupServices(TTestActorRuntime &runtime,
runtime.GetAppData().BootstrapConfig = TFakeNodeWhiteboardService::BootstrapConfig;
NKikimrCms::TCmsConfig cmsConfig;
- cmsConfig.MutableSentinelConfig()->SetEnable(false);
+ cmsConfig.MutableSentinelConfig()->SetEnable(options.EnableSentinel);
runtime.GetAppData().DefaultCmsConfig = MakeHolder<NKikimrCms::TCmsConfig>(cmsConfig);
if (!runtime.IsRealThreads()) {
@@ -546,6 +532,8 @@ TCmsTestEnv::TCmsTestEnv(const TTestEnvOpts &options)
TFakeNodeWhiteboardService::BootstrapConfig = GenerateBootstrapConfig(*this, options.NodeCount, options.Tenants);
+ TGuard<TMutex> guard(TFakeNodeWhiteboardService::Mutex);
+ TFakeNodeWhiteboardService::Info.clear();
GenerateExtendedInfo(*this, config, options.VDisks, 4, options.Tenants, options.UseMirror3dcErasure);
SetObserverFunc([](TAutoPtr<IEventHandle> &event) -> auto {
@@ -572,7 +560,7 @@ TCmsTestEnv::TCmsTestEnv(const TTestEnvOpts &options)
SetupStateStorage(*this, nodeIndex);
}
}
- SetupServices(*this, options.Tenants);
+ SetupServices(*this, options);
Sender = AllocateEdgeActor();
ClientId = TActorId();
@@ -580,7 +568,7 @@ TCmsTestEnv::TCmsTestEnv(const TTestEnvOpts &options)
NKikimrCms::TCmsConfig cmsConfig;
cmsConfig.MutableTenantLimits()->SetDisabledNodesRatioLimit(0);
cmsConfig.MutableClusterLimits()->SetDisabledNodesRatioLimit(0);
- cmsConfig.MutableSentinelConfig()->SetEnable(false);
+ cmsConfig.MutableSentinelConfig()->SetEnable(options.EnableSentinel);
SetCmsConfig(cmsConfig);
// Need to allow restart state storage nodes
@@ -1170,5 +1158,11 @@ void TCmsTestEnv::EnableNoisyBSCPipe() {
TFakeNodeWhiteboardService::NoisyBSCPipe = true;
}
+void TCmsTestEnv::RegenerateBSConfig(NKikimrBlobStorage::TBaseConfig *config, const TTestEnvOpts &opts) {
+ TGuard<TMutex> guard(TFakeNodeWhiteboardService::Mutex);
+ config->Clear();
+ GenerateExtendedInfo(*this, config, opts.VDisks, 4, opts.Tenants, opts.UseMirror3dcErasure);
+}
+
} // namespace NCmsTest
} // namespace NKikimr
diff --git a/ydb/core/cms/cms_ut_common.h b/ydb/core/cms/cms_ut_common.h
index c76597a96b..be43e21b74 100644
--- a/ydb/core/cms/cms_ut_common.h
+++ b/ydb/core/cms/cms_ut_common.h
@@ -34,6 +34,7 @@ struct TFakeNodeInfo {
TMap<TVDiskID, NKikimrWhiteboard::TVDiskStateInfo, TVDiskIDComparator> VDiskStateInfo;
NKikimrWhiteboard::TSystemStateInfo SystemStateInfo;
bool Connected = true;
+ bool VDisksMoved = false;
};
class TFakeNodeWhiteboardService : public TActorBootstrapped<TFakeNodeWhiteboardService> {
@@ -84,6 +85,7 @@ struct TTestEnvOpts {
TNodeTenantsMap Tenants;
bool UseMirror3dcErasure;
bool AdvanceCurrentTime;
+ bool EnableSentinel;
TTestEnvOpts() = default;
@@ -99,8 +101,19 @@ struct TTestEnvOpts {
, Tenants(tenants)
, UseMirror3dcErasure(false)
, AdvanceCurrentTime(false)
+ , EnableSentinel(false)
{
}
+
+ TTestEnvOpts& WithSentinel() {
+ EnableSentinel = true;
+ return *this;
+ }
+
+ TTestEnvOpts& WithoutSentinel() {
+ EnableSentinel = false;
+ return *this;
+ }
};
class TCmsTestEnv : public TTestBasicRuntime {
@@ -369,6 +382,8 @@ public:
const ui64 CmsId;
+ void RegenerateBSConfig(NKikimrBlobStorage::TBaseConfig *config, const TTestEnvOpts &opts);
+
private:
void SetupLogging();
diff --git a/ydb/core/cms/sentinel.cpp b/ydb/core/cms/sentinel.cpp
index 16000a0a0d..e16d38b694 100644
--- a/ydb/core/cms/sentinel.cpp
+++ b/ydb/core/cms/sentinel.cpp
@@ -62,6 +62,11 @@ void TPDiskStatusComputer::AddState(EPDiskState state) {
}
EPDiskStatus TPDiskStatusComputer::Compute(EPDiskStatus current, TString& reason) const {
+ if (ForcedStatus) {
+ reason = "Forced status";
+ return *ForcedStatus;
+ }
+
if (!StateCounter) {
reason = "Uninitialized StateCounter";
return current;
@@ -116,6 +121,14 @@ void TPDiskStatusComputer::Reset() {
StateCounter = 0;
}
+void TPDiskStatusComputer::SetForcedStatus(EPDiskStatus status) {
+ ForcedStatus = status;
+}
+
+void TPDiskStatusComputer::ResetForcedStatus() {
+ ForcedStatus.Clear();
+}
+
/// TPDiskStatus
TPDiskStatus::TPDiskStatus(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits)
@@ -129,13 +142,9 @@ void TPDiskStatus::AddState(EPDiskState state) {
TPDiskStatusComputer::AddState(state);
}
-bool TPDiskStatus::IsChanged(TString& reason) const {
- return Current != Compute(Current, reason);
-}
-
bool TPDiskStatus::IsChanged() const {
TString unused;
- return IsChanged(unused);
+ return Current != Compute(Current, unused);
}
void TPDiskStatus::ApplyChanges(TString& reason) {
@@ -196,6 +205,13 @@ void TPDiskInfo::AddState(EPDiskState state) {
Touch();
}
+/// TNodeInfo
+
+bool TNodeInfo::HasFaultyMarker() const {
+ return Markers.contains(NKikimrCms::MARKER_DISK_FAULTY)
+ || Markers.contains(NKikimrCms::MARKER_DISK_BROKEN);
+}
+
/// TClusterMap
TClusterMap::TClusterMap(TSentinelState::TPtr state)
@@ -412,9 +428,15 @@ class TConfigUpdater: public TUpdaterBase<TEvSentinel::TEvConfigUpdated, TConfig
SentinelState->Nodes.clear();
for (const auto& host : record.GetState().GetHosts()) {
if (host.HasNodeId() && host.HasLocation() && host.HasName()) {
+ THashSet<NKikimrCms::EMarker> markers;
+ for (auto marker : host.GetMarkers()) {
+ markers.insert(static_cast<NKikimrCms::EMarker>(marker));
+ }
+
SentinelState->Nodes.emplace(host.GetNodeId(), TNodeInfo{
.Host = host.GetName(),
.Location = NActors::TNodeLocation(host.GetLocation()),
+ .Markers = std::move(markers),
});
}
}
@@ -862,13 +884,20 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
const TPDiskID& id = pdisk.first;
TPDiskInfo& info = *(pdisk.second);
- if (!SentinelState->Nodes.contains(id.NodeId)) {
+ auto it = SentinelState->Nodes.find(id.NodeId);
+ if (it == SentinelState->Nodes.end()) {
LOG_E("Missing node info"
<< ": pdiskId# " << id);
info.IgnoreReason = NKikimrCms::TPDiskInfo::MISSING_NODE;
continue;
}
+ if (it->second.HasFaultyMarker()) {
+ info.SetForcedStatus(EPDiskStatus::FAULTY);
+ } else {
+ info.ResetForcedStatus();
+ }
+
all.AddPDisk(id);
if (info.IsChanged()) {
if (info.IsNewStatusGood()) {
@@ -961,6 +990,18 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
NTabletPipe::SendData(SelfId(), CmsState->BSControllerPipe, request.Release(), ++SentinelState->ChangeRequestId);
}
+ void Handle(TEvSentinel::TEvUpdateHostMarkers::TPtr& ev) {
+ for (auto& [nodeId, markers] : ev->Get()->HostMarkers) {
+ auto it = SentinelState->Nodes.find(nodeId);
+ if (it == SentinelState->Nodes.end()) {
+ // markers will be updated upon next ConfigUpdate iteration
+ continue;
+ }
+
+ it->second.Markers = std::move(markers);
+ }
+ }
+
void Handle(TEvCms::TEvGetSentinelStateRequest::TPtr& ev) {
const auto& reqRecord = ev->Get()->Record;
@@ -1192,6 +1233,7 @@ public:
sFunc(TEvSentinel::TEvConfigUpdated, OnConfigUpdated);
sFunc(TEvSentinel::TEvUpdateState, UpdateState);
sFunc(TEvSentinel::TEvStateUpdated, OnStateUpdated);
+ hFunc(TEvSentinel::TEvUpdateHostMarkers, Handle);
sFunc(TEvSentinel::TEvBSCPipeDisconnected, OnPipeDisconnected);
hFunc(TEvCms::TEvGetSentinelStateRequest, Handle);
diff --git a/ydb/core/cms/sentinel.h b/ydb/core/cms/sentinel.h
index f0d1249348..d5734aef51 100644
--- a/ydb/core/cms/sentinel.h
+++ b/ydb/core/cms/sentinel.h
@@ -16,6 +16,8 @@ struct TEvSentinel {
EvTimeout,
EvBSCPipeDisconnected,
+ EvUpdateHostMarkers,
+
EvEnd,
};
@@ -30,6 +32,20 @@ struct TEvSentinel {
struct TEvTimeout: public TEventLocal<TEvTimeout, EvTimeout> {};
struct TEvBSCPipeDisconnected: public TEventLocal<TEvBSCPipeDisconnected, EvBSCPipeDisconnected> {};
+ struct TEvUpdateHostMarkers: public TEventLocal<TEvUpdateHostMarkers, EvUpdateHostMarkers> {
+ struct THostMarkers {
+ ui32 NodeId;
+ THashSet<NKikimrCms::EMarker> Markers;
+ };
+
+ TVector<THostMarkers> HostMarkers;
+
+ explicit TEvUpdateHostMarkers(TVector<THostMarkers>&& hostMarkers)
+ : HostMarkers(std::move(hostMarkers))
+ {
+ }
+ };
+
}; // TEvSentinel
IActor* CreateSentinel(TCmsStatePtr state);
diff --git a/ydb/core/cms/sentinel_impl.h b/ydb/core/cms/sentinel_impl.h
index 8ed385184d..7f65e0faab 100644
--- a/ydb/core/cms/sentinel_impl.h
+++ b/ydb/core/cms/sentinel_impl.h
@@ -28,6 +28,9 @@ public:
void Reset();
+ void SetForcedStatus(EPDiskStatus status);
+ void ResetForcedStatus();
+
private:
const ui32& DefaultStateLimit;
const TLimitsMap& StateLimits;
@@ -35,6 +38,7 @@ private:
EPDiskState State = NKikimrBlobStorage::TPDiskState::Unknown;
mutable EPDiskState PrevState = State;
ui64 StateCounter;
+ TMaybe<EPDiskStatus> ForcedStatus;
}; // TPDiskStatusComputer
@@ -43,7 +47,6 @@ public:
explicit TPDiskStatus(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits);
void AddState(EPDiskState state);
- bool IsChanged(TString& reason) const;
bool IsChanged() const;
void ApplyChanges(TString& reason);
void ApplyChanges();
@@ -104,6 +107,9 @@ private:
struct TNodeInfo {
TString Host;
NActors::TNodeLocation Location;
+ THashSet<NKikimrCms::EMarker> Markers;
+
+ bool HasFaultyMarker() const;
};
struct TConfigUpdaterState {
diff --git a/ydb/core/cms/sentinel_ut.cpp b/ydb/core/cms/sentinel_ut.cpp
index b32fbeefbe..2647806481 100644
--- a/ydb/core/cms/sentinel_ut.cpp
+++ b/ydb/core/cms/sentinel_ut.cpp
@@ -156,7 +156,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
location.SetUnit(ToString(id));
state->ClusterInfo->AddNode(TEvInterconnect::TNodeInfo(id, name, name, name, 10000, TNodeLocation(location)), nullptr);
- sentinelState->Nodes[id] = NSentinel::TNodeInfo{name, NActors::TNodeLocation(location)};
+ sentinelState->Nodes[id] = NSentinel::TNodeInfo{name, NActors::TNodeLocation(location), {}};
for (ui64 npdisk : xrange(pdisksPerNode)) {
NKikimrBlobStorage::TBaseConfig::TPDisk pdisk;
diff --git a/ydb/core/cms/ut_helpers.cpp b/ydb/core/cms/ut_helpers.cpp
index 64625436e6..1a5246157b 100644
--- a/ydb/core/cms/ut_helpers.cpp
+++ b/ydb/core/cms/ut_helpers.cpp
@@ -1,9 +1,5 @@
#include "ut_helpers.h"
-Y_DECLARE_OUT_SPEC(, NKikimrCms::TAction::EType, os, type) {
- os << NKikimrCms::TAction::EType_Name(type);
-}
-
Y_DECLARE_OUT_SPEC(, NKikimrWhiteboard::TTabletStateInfo::ETabletState, os, state) {
os << NKikimrWhiteboard::TTabletStateInfo::ETabletState_Name(state);
}
diff --git a/ydb/core/cms/ut_helpers.h b/ydb/core/cms/ut_helpers.h
index b2d89a6e1e..696e6726d4 100644
--- a/ydb/core/cms/ut_helpers.h
+++ b/ydb/core/cms/ut_helpers.h
@@ -67,18 +67,49 @@ void AddActions(TAutoPtr<NCms::TEvCms::TEvPermissionRequest> &event, const NKiki
AddActions(event, actions...);
}
+struct TRequestOptions {
+ TString User;
+ bool Partial;
+ bool DryRun;
+ bool Schedule;
+ bool Prepare;
+
+ explicit TRequestOptions(const TString &user, bool partial, bool dry, bool schedule)
+ : User(user)
+ , Partial(partial)
+ , DryRun(dry)
+ , Schedule(schedule)
+ , Prepare(false)
+ {}
+
+ explicit TRequestOptions(const TString &user)
+ : TRequestOptions(user, false, false, false)
+ {}
+
+ TRequestOptions& WithPrepare() {
+ Prepare = true;
+ return *this;
+ }
+};
+
template <typename... Ts>
-TAutoPtr<NCms::TEvCms::TEvPermissionRequest> MakePermissionRequest(const TString &user, bool partial, bool dry, bool schedule, Ts... actions) {
+TAutoPtr<NCms::TEvCms::TEvPermissionRequest> MakePermissionRequest(const TRequestOptions &opts, Ts... actions) {
TAutoPtr<NCms::TEvCms::TEvPermissionRequest> event = new NCms::TEvCms::TEvPermissionRequest;
- event->Record.SetUser(user);
- event->Record.SetPartialPermissionAllowed(partial);
- event->Record.SetDryRun(dry);
- event->Record.SetSchedule(schedule);
+ event->Record.SetUser(opts.User);
+ event->Record.SetPartialPermissionAllowed(opts.Partial);
+ event->Record.SetDryRun(opts.DryRun);
+ event->Record.SetSchedule(opts.Schedule);
+ event->Record.SetPrepare(opts.Prepare);
AddActions(event, actions...);
return event;
}
+template <typename... Ts>
+TAutoPtr<NCms::TEvCms::TEvPermissionRequest> MakePermissionRequest(const TString &user, bool partial, bool dry, bool schedule, Ts... actions) {
+ return MakePermissionRequest(TRequestOptions(user, partial, dry, schedule), actions...);
+}
+
inline void AddPermissions(TAutoPtr<NCms::TEvCms::TEvManagePermissionRequest> &ev, const TString &id) {
*ev->Record.AddPermissions() = id;
}
diff --git a/ydb/core/protos/cms.proto b/ydb/core/protos/cms.proto
index 9be2f66d20..c84a2f366d 100644
--- a/ydb/core/protos/cms.proto
+++ b/ydb/core/protos/cms.proto
@@ -161,6 +161,8 @@ message TPermissionRequest {
// Availability mode is not preserved for scheduled events.
optional EAvailabilityMode AvailabilityMode = 9 [default = MODE_MAX_AVAILABILITY];
optional string MaintenanceTaskId = 10;
+ // Prepare item (host/device) before granting permission
+ optional bool Prepare = 11 [default = false];
}
enum EExtensionType {
diff --git a/ydb/core/protos/out/out_cms.cpp b/ydb/core/protos/out/out_cms.cpp
index a681fdb380..71012587da 100644
--- a/ydb/core/protos/out/out_cms.cpp
+++ b/ydb/core/protos/out/out_cms.cpp
@@ -25,3 +25,7 @@ Y_DECLARE_OUT_SPEC(, NKikimrCms::ETextFormat, stream, value) {
Y_DECLARE_OUT_SPEC(, NKikimrCms::TLogRecordData::EType, stream, value) {
stream << NKikimrCms::TLogRecordData::EType_Name(value);
}
+
+Y_DECLARE_OUT_SPEC(, NKikimrCms::TAction::EType, stream, value) {
+ stream << NKikimrCms::TAction::EType_Name(value);
+}