diff options
author | ilnaz <ilnaz@ydb.tech> | 2023-12-06 13:42:57 +0300 |
---|---|---|
committer | ilnaz <ilnaz@ydb.tech> | 2023-12-06 14:29:00 +0300 |
commit | bfb11ee270b827ddd066a35ee4aa96c389c87603 (patch) | |
tree | 85ec6e8a319b1bea22a1156761f80216b6a7d319 | |
parent | ef5138754487f032c1a415f0faf2101e9e42496a (diff) | |
download | ydb-bfb11ee270b827ddd066a35ee4aa96c389c87603.tar.gz |
Prepare host before granting permission KIKIMR-20254
-rw-r--r-- | ydb/core/cms/cluster_info.cpp | 12 | ||||
-rw-r--r-- | ydb/core/cms/cluster_info.h | 4 | ||||
-rw-r--r-- | ydb/core/cms/cms.cpp | 175 | ||||
-rw-r--r-- | ydb/core/cms/cms_impl.h | 14 | ||||
-rw-r--r-- | ydb/core/cms/cms_state.h | 1 | ||||
-rw-r--r-- | ydb/core/cms/cms_tx_load_state.cpp | 12 | ||||
-rw-r--r-- | ydb/core/cms/cms_tx_remove_permissions.cpp | 36 | ||||
-rw-r--r-- | ydb/core/cms/cms_tx_remove_request.cpp | 12 | ||||
-rw-r--r-- | ydb/core/cms/cms_tx_store_permissions.cpp | 16 | ||||
-rw-r--r-- | ydb/core/cms/cms_ut.cpp | 103 | ||||
-rw-r--r-- | ydb/core/cms/cms_ut_common.cpp | 56 | ||||
-rw-r--r-- | ydb/core/cms/cms_ut_common.h | 15 | ||||
-rw-r--r-- | ydb/core/cms/sentinel.cpp | 54 | ||||
-rw-r--r-- | ydb/core/cms/sentinel.h | 16 | ||||
-rw-r--r-- | ydb/core/cms/sentinel_impl.h | 8 | ||||
-rw-r--r-- | ydb/core/cms/sentinel_ut.cpp | 2 | ||||
-rw-r--r-- | ydb/core/cms/ut_helpers.cpp | 4 | ||||
-rw-r--r-- | ydb/core/cms/ut_helpers.h | 41 | ||||
-rw-r--r-- | ydb/core/protos/cms.proto | 2 | ||||
-rw-r--r-- | ydb/core/protos/out/out_cms.cpp | 4 |
20 files changed, 506 insertions, 81 deletions
diff --git a/ydb/core/cms/cluster_info.cpp b/ydb/core/cms/cluster_info.cpp index ad4d5fb367..ffeab7b70d 100644 --- a/ydb/core/cms/cluster_info.cpp +++ b/ydb/core/cms/cluster_info.cpp @@ -795,6 +795,18 @@ ui64 TClusterInfo::AddExternalLocks(const TNotificationInfo ¬ification, const return locks; } +void TClusterInfo::SetHostMarkers(const TString &hostName, const THashSet<NKikimrCms::EMarker> &markers) { + for (auto node : NodePtrs(hostName)) { + node->Markers.insert(markers.begin(), markers.end()); + } +} + +void TClusterInfo::ResetHostMarkers(const TString &hostName) { + for (auto node : NodePtrs(hostName)) { + node->Markers.clear(); + } +} + void TClusterInfo::ApplyDowntimes(const TDowntimes &downtimes) { for (auto &pr : downtimes.NodeDowntimes) { diff --git a/ydb/core/cms/cluster_info.h b/ydb/core/cms/cluster_info.h index d118336636..5ba7fddd12 100644 --- a/ydb/core/cms/cluster_info.h +++ b/ydb/core/cms/cluster_info.h @@ -297,6 +297,7 @@ public: std::list<TScheduledLock> ScheduledLocks; TVector<TTemporaryLock> TempLocks; ui64 DeactivatedLocksOrder = Max<ui64>(); + THashSet<NKikimrCms::EMarker> Markers; }; using TLockableItemPtr = TIntrusivePtr<TLockableItem>; @@ -902,6 +903,9 @@ public: ui64 AddExternalLocks(const TNotificationInfo ¬ification, const TActorContext *ctx); + void SetHostMarkers(const TString &hostName, const THashSet<NKikimrCms::EMarker> &markers); + void ResetHostMarkers(const TString &hostName); + void ApplyDowntimes(const TDowntimes &downtimes); void UpdateDowntimes(TDowntimes &downtimes, const TActorContext &ctx); diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index 5416b1870d..8623a2f098 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -253,6 +253,8 @@ void TCms::AdjustInfo(TClusterInfoPtr &info, const TActorContext &ctx) const info->AddLocks(entry.second, &ctx); for (const auto &entry : State->Notifications) info->AddExternalLocks(entry.second, &ctx); + for (const auto &entry : State->HostMarkers) + info->SetHostMarkers(entry.first, entry.second); } namespace { @@ -284,13 +286,20 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request, TStatus::UNKNOWN, }); bool allowPartial = request.GetPartialPermissionAllowed(); - bool schedule = request.GetSchedule() && !request.GetDryRun(); + bool schedule = (request.GetSchedule() || request.GetPrepare()) && !request.GetDryRun(); + + if (request.GetPrepare() && request.ActionsSize() > 1) { + response.MutableStatus()->SetCode(TStatus::WRONG_REQUEST); + response.MutableStatus()->SetReason("Cannot prepare more than one action at once"); + return false; + } response.MutableStatus()->SetCode(TStatus::ALLOW); if (schedule) { scheduled.SetUser(request.GetUser()); scheduled.SetPartialPermissionAllowed(allowPartial); - scheduled.SetSchedule(true); + scheduled.SetSchedule(request.GetSchedule()); + scheduled.SetPrepare(request.GetPrepare()); scheduled.SetReason(request.GetReason()); if (request.HasDuration()) scheduled.SetDuration(request.GetDuration()); @@ -331,7 +340,12 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request, LOG_DEBUG(ctx, NKikimrServices::CMS, "Checking action: %s", action.ShortDebugString().data()); - if (CheckAction(action, opts, error, ctx)) { + bool prepared = !request.GetPrepare(); + if (!prepared) { + prepared = CheckPrepareHost(action, error); + } + + if (prepared && CheckAction(action, opts, error, ctx)) { LOG_DEBUG(ctx, NKikimrServices::CMS, "Result: ALLOW"); auto *permission = response.AddPermissions(); @@ -482,11 +496,36 @@ bool TCms::CheckAccess(const TString &token, return false; } -bool TCms::CheckAction(const TAction &action, - const TActionOptions &opts, - TErrorInfo &error, - const TActorContext &ctx) const -{ +bool TCms::CheckPrepareHost(const TAction &action, TErrorInfo &error) const { + if (!State->Sentinel) { + error.Code = TStatus::ERROR; + error.Reason = "Cannot prepare host while Sentinel (self heal) is disabled"; + return false; + } + + switch (action.GetType()) { + case TAction::RESTART_SERVICES: + case TAction::SHUTDOWN_HOST: + case TAction::REBOOT_HOST: + break; + default: + error.Code = TStatus::WRONG_REQUEST; + error.Reason = TStringBuilder() << "Cannot prepare host for action: " << action.GetType(); + return false; + } + + for (const auto node : ClusterInfo->HostNodes(action.GetHost())) { + if (!node->VDisks.empty()) { + error.Code = TStatus::DISALLOW_TEMP; + error.Reason = TStringBuilder() << "Host " << action.GetHost() << " is not prepared yet"; + return false; + } + } + + return true; +} + +bool TCms::CheckAction(const TAction &action, const TActionOptions &opts, TErrorInfo &error, const TActorContext &ctx) const { if (!IsActionHostValid(action, error)) return false; @@ -505,12 +544,11 @@ bool TCms::CheckAction(const TAction &action, case TAction::ADD_DEVICES: case TAction::REMOVE_DEVICES: error.Code = TStatus::ERROR; - error.Reason = Sprintf("Unsupported action action '%s'", - TAction::EType_Name(action.GetType()).data()); + error.Reason = TStringBuilder() << "Unsupported action: " << action.GetType(); return false; default: - error.Code = TStatus::WRONG_REQUEST; - error.Reason = Sprintf("Unknown action '%s'", TAction::EType_Name(action.GetType()).data()); + error.Code = TStatus::WRONG_REQUEST; + error.Reason = TStringBuilder() << "Unknown action: " << static_cast<int>(action.GetType()); return false; } } @@ -924,7 +962,10 @@ void TCms::AcceptPermissions(TPermissionResponse &resp, const TString &requestId auto &permission = *resp.MutablePermissions(i); permission.SetId(owner + "-p-" + ToString(State->NextPermissionId++)); State->Permissions.emplace(permission.GetId(), TPermissionInfo(permission, requestId, owner)); - LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Accepting permission"); + LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::CMS, "Accepting permission" + << ": id# " << permission.GetId() + << ", requestId# " << requestId + << ", owner# " << owner); ClusterInfo->AddLocks(permission, requestId, owner, &ctx); if (!check) { @@ -1120,6 +1161,9 @@ void TCms::AddHostState(const TClusterInfoPtr &clusterInfo, const TNodeInfo &nod host->SetInterconnectPort(node.IcPort); host->SetTimestamp(timestamp.GetValue()); node.Location.Serialize(host->MutableLocation(), false); + for (auto marker : node.Markers) { + host->AddMarkers(marker); + } if (node.State == UP || node.VDisks || node.PDisks) { for (const auto flag : GetEnumAllValues<EService>()) { if (!(node.Services & flag)) { @@ -1141,6 +1185,9 @@ void TCms::AddHostState(const TClusterInfoPtr &clusterInfo, const TNodeInfo &nod device->SetName(vdisk.GetDeviceName()); device->SetState(vdisk.State); device->SetTimestamp(timestamp.GetValue()); + for (auto marker : vdisk.Markers) { + device->AddMarkers(marker); + } } for (const auto &pdId : node.PDisks) { @@ -1149,6 +1196,9 @@ void TCms::AddHostState(const TClusterInfoPtr &clusterInfo, const TNodeInfo &nod device->SetName(pdisk.GetDeviceName()); device->SetState(pdisk.State); device->SetTimestamp(timestamp.GetValue()); + for (auto marker : pdisk.Markers) { + device->AddMarkers(marker); + } } } } @@ -1291,10 +1341,17 @@ void TCms::RemoveRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActor resp->Record.MutableStatus()->SetReason("Unknown request " + id); } else { const auto &request = it->second; + if (request.Owner != user) { resp->Record.MutableStatus()->SetCode(TStatus::WRONG_REQUEST); resp->Record.MutableStatus()->SetReason(Sprintf("Request %s doesn't belong to %s", id.data(), user.data())); } + + if (request.Request.GetPrepare() && request.Request.ActionsSize() < 1) { + resp->Record.MutableStatus()->SetCode(TStatus::WRONG_REQUEST); + resp->Record.MutableStatus()->SetReason( + Sprintf("Request %s used to prepare action and cannot be deleted while permission is valid", id.data())); + } } LOG_DEBUG(ctx, NKikimrServices::CMS, "Resulting status: %s %s", @@ -1470,6 +1527,65 @@ void TCms::PersistNodeTenants(TTransactionContext& txc, const TActorContext& ctx } } +TVector<TCms::THostMarkers> TCms::SetHostMarker(const TString &host, NKikimrCms::EMarker marker, TTransactionContext &txc, const TActorContext &ctx) { + if (State->HostMarkers.contains(host)) { + return {}; + } + + AuditLog(ctx, TStringBuilder() << "Add host marker" + << ": host# " << host + << ", marker# " << marker); + + State->HostMarkers[host] = {marker}; + NIceDb::TNiceDb db(txc.DB); + db.Table<Schema::HostMarkers>().Key(host).Update( + NIceDb::TUpdate<Schema::HostMarkers::Markers>(TVector<NKikimrCms::EMarker>{marker}) + ); + + TVector<TCms::THostMarkers> updateMarkers; + if (ClusterInfo) { + for (const auto node : ClusterInfo->HostNodes(host)) { + updateMarkers.push_back({ + .NodeId = node->NodeId, + .Markers = {marker}, + }); + } + } + + return updateMarkers; +} + +TVector<TCms::THostMarkers> TCms::ResetHostMarkers(const TString &host, TTransactionContext &txc, const TActorContext &ctx) { + if (!State->HostMarkers.contains(host)) { + return {}; + } + + AuditLog(ctx, TStringBuilder() << "Reset host markers" + << ": host# " << host); + + State->HostMarkers.erase(host); + NIceDb::TNiceDb db(txc.DB); + db.Table<Schema::HostMarkers>().Key(host).Delete(); + + TVector<TCms::THostMarkers> updateMarkers; + if (ClusterInfo) { + for (const auto node : ClusterInfo->HostNodes(host)) { + updateMarkers.push_back({ + .NodeId = node->NodeId, + .Markers = {}, + }); + } + } + + return updateMarkers; +} + +void TCms::SentinelUpdateHostMarkers(TVector<TCms::THostMarkers> &&updateMarkers, const TActorContext &ctx) { + if (updateMarkers) { + ctx.Send(State->Sentinel, new TEvSentinel::TEvUpdateHostMarkers(std::move(updateMarkers))); + } +} + void TCms::ProcessQueue() { // To avoid getting stuck in the processing queue for too long, @@ -1728,6 +1844,21 @@ void TCms::Handle(TEvCms::TEvPermissionRequest::TPtr &ev, } } + if (rec.GetPrepare()) { + for (const auto &action : rec.GetActions()) { + if (State->HostMarkers.contains(action.GetHost())) { + return ReplyWithError<TEvCms::TEvPermissionResponse>( + ev, TStatus::WRONG_REQUEST, TStringBuilder() << "Host '" << action.GetHost() << "' is preparing", ctx); + } + for (const auto node : ClusterInfo->HostNodes(action.GetHost())) { + if (State->HostMarkers.contains(ToString(node->NodeId))) { + return ReplyWithError<TEvCms::TEvPermissionResponse>( + ev, TStatus::WRONG_REQUEST, TStringBuilder() << "Node '" << node->NodeId << "' is preparing", ctx); + } + } + } + } + bool ok = CheckPermissionRequest(rec, resp->Record, scheduled.Request, ctx); // Schedule request if required. @@ -1738,7 +1869,7 @@ void TCms::Handle(TEvCms::TEvPermissionRequest::TPtr &ev, resp->Record.SetRequestId(reqId); TAutoPtr<TRequestInfo> copy; - if (scheduled.Request.ActionsSize()) { + if (scheduled.Request.ActionsSize() || scheduled.Request.GetPrepare()) { scheduled.Owner = user; scheduled.Order = State->NextRequestId - 1; scheduled.RequestId = reqId; @@ -1803,14 +1934,14 @@ void TCms::Handle(TEvCms::TEvCheckRequest::TPtr &ev, const TActorContext &ctx) auto order = request.Order; State->ScheduledRequests.erase(it); - if (scheduled.Request.ActionsSize()) { + if (scheduled.Request.ActionsSize() || scheduled.Request.GetPrepare()) { scheduled.Owner = user; scheduled.Order = order; scheduled.RequestId = rec.GetRequestId(); resp->Record.SetRequestId(scheduled.RequestId); copy = new TRequestInfo(scheduled); - State->ScheduledRequests.emplace(scheduled.RequestId, std::move(scheduled)); + State->ScheduledRequests.emplace(rec.GetRequestId(), std::move(scheduled)); } else { scheduled.RequestId = rec.GetRequestId(); scheduled.Owner = user; @@ -1818,7 +1949,7 @@ void TCms::Handle(TEvCms::TEvCheckRequest::TPtr &ev, const TActorContext &ctx) } if (ok) - AcceptPermissions(resp->Record, scheduled.RequestId, user, ctx, true); + AcceptPermissions(resp->Record, rec.GetRequestId(), user, ctx, true); auto handle = new IEventHandle(ev->Sender, SelfId(), resp.Release(), 0, ev->Cookie); Execute(CreateTxStorePermissions(std::move(ev->Release()), handle, user, std::move(copy)), ctx); @@ -1897,6 +2028,7 @@ bool TCms::IsValidNotificationAction(const TAction &action, TInstant time, case TAction::RESTART_SERVICES: return CheckNotificationRestartServices(action, time, error, ctx); case TAction::SHUTDOWN_HOST: + case TAction::REBOOT_HOST: return CheckNotificationShutdownHost(action, time, error, ctx); case TAction::REPLACE_DEVICES: return CheckNotificationReplaceDevices(action, time, error, ctx); @@ -1907,12 +2039,11 @@ bool TCms::IsValidNotificationAction(const TAction &action, TInstant time, case TAction::ADD_DEVICES: case TAction::REMOVE_DEVICES: error.Code = TStatus::ERROR; - error.Reason = Sprintf("Unsupported action action '%s'", - TAction::EType_Name(action.GetType()).data()); + error.Reason = TStringBuilder() << "Unsupported action: " << action.GetType(); return false; default: - error.Code = TStatus::WRONG_REQUEST; - error.Reason = Sprintf("Unknown action '%s'", TAction::EType_Name(action.GetType()).data()); + error.Code = TStatus::WRONG_REQUEST; + error.Reason = TStringBuilder() << "Unknown action: " << static_cast<int>(action.GetType()); return false; } } @@ -2079,7 +2210,7 @@ void TCms::Handle(TEvCms::TEvGetLogTailRequest::TPtr &ev, const TActorContext &c void TCms::Handle(TEvCms::TEvGetSentinelStateRequest::TPtr &ev, const TActorContext &ctx) { - if(State->Sentinel) { + if (State->Sentinel) { ctx.Send(ev->Forward(State->Sentinel)); } else { auto Response = MakeHolder<TEvCms::TEvGetSentinelStateResponse>(); diff --git a/ydb/core/cms/cms_impl.h b/ydb/core/cms/cms_impl.h index d2dace9f3e..7bcced14e7 100644 --- a/ydb/core/cms/cms_impl.h +++ b/ydb/core/cms/cms_impl.h @@ -4,6 +4,7 @@ #include "cms.h" #include "config.h" #include "logger.h" +#include "sentinel.h" #include "services.h" #include "walle.h" @@ -78,6 +79,11 @@ public: void PersistNodeTenants(TTransactionContext &txc, const TActorContext &ctx); + using THostMarkers = TEvSentinel::TEvUpdateHostMarkers::THostMarkers; + TVector<THostMarkers> SetHostMarker(const TString &host, NKikimrCms::EMarker marker, TTransactionContext &txc, const TActorContext &ctx); + TVector<THostMarkers> ResetHostMarkers(const TString &host, TTransactionContext &txc, const TActorContext &ctx); + void SentinelUpdateHostMarkers(TVector<THostMarkers> &&updateMarkers, const TActorContext &ctx); + static void AddHostState(const TClusterInfoPtr &clusterInfo, const TNodeInfo &node, NKikimrCms::TClusterStateResponse &resp, TInstant timestamp); private: @@ -294,8 +300,12 @@ private: NKikimrCms::TStatus::ECode &code, TString &error, const TActorContext &ctx); - bool CheckAction(const NKikimrCms::TAction &action, const TActionOptions &options, - TErrorInfo &error, const TActorContext &ctx) const; + bool CheckPrepareHost(const NKikimrCms::TAction &action, + TErrorInfo &error) const; + bool CheckAction(const NKikimrCms::TAction &action, + const TActionOptions &opts, + TErrorInfo &error, + const TActorContext &ctx) const; bool CheckActionShutdownNode(const NKikimrCms::TAction &action, const TActionOptions &options, const TNodeInfo &node, diff --git a/ydb/core/cms/cms_state.h b/ydb/core/cms/cms_state.h index d2581b5936..8a2c33ad95 100644 --- a/ydb/core/cms/cms_state.h +++ b/ydb/core/cms/cms_state.h @@ -31,6 +31,7 @@ struct TCmsState : public TAtomicRefCount<TCmsState> { THashMap<TString, TPermissionInfo> Permissions; THashMap<TString, TRequestInfo> ScheduledRequests; THashMap<TString, TNotificationInfo> Notifications; + THashMap<TString, THashSet<NKikimrCms::EMarker>> HostMarkers; TDowntimes Downtimes; ui64 NextPermissionId = 0; ui64 NextRequestId = 0; diff --git a/ydb/core/cms/cms_tx_load_state.cpp b/ydb/core/cms/cms_tx_load_state.cpp index c77e30157a..829ce9f945 100644 --- a/ydb/core/cms/cms_tx_load_state.cpp +++ b/ydb/core/cms/cms_tx_load_state.cpp @@ -34,6 +34,7 @@ public: auto maintenanceTasksRowset = db.Table<Schema::MaintenanceTasks>().Range().Select<Schema::MaintenanceTasks::TColumns>(); auto notificationRowset = db.Table<Schema::Notification>().Range().Select<Schema::Notification::TColumns>(); auto nodeTenantRowset = db.Table<Schema::NodeTenant>().Range().Select<Schema::NodeTenant::TColumns>(); + auto hostMarkersRowset = db.Table<Schema::HostMarkers>().Range().Select<Schema::HostMarkers::TColumns>(); auto logRowset = db.Table<Schema::LogRecords>().Range().Select<Schema::LogRecords::Timestamp>(); if (!paramRow.IsReady() @@ -42,6 +43,7 @@ public: || !walleTaskRowset.IsReady() || !maintenanceTasksRowset.IsReady() || !notificationRowset.IsReady() + || !hostMarkersRowset.IsReady() || !logRowset.IsReady()) return false; @@ -205,6 +207,16 @@ public: return false; } + while (!hostMarkersRowset.EndOfSet()) { + TString host = hostMarkersRowset.GetValue<Schema::HostMarkers::Host>(); + TVector<NKikimrCms::EMarker> markers = hostMarkersRowset.GetValue<Schema::HostMarkers::Markers>(); + + state->HostMarkers[host].insert(markers.begin(), markers.end()); + + if (!hostMarkersRowset.Next()) + return false; + } + if (!state->Downtimes.DbLoadState(txc, ctx)) return false; diff --git a/ydb/core/cms/cms_tx_remove_permissions.cpp b/ydb/core/cms/cms_tx_remove_permissions.cpp index 31d3c6b4a8..14a6869b5a 100644 --- a/ydb/core/cms/cms_tx_remove_permissions.cpp +++ b/ydb/core/cms/cms_tx_remove_permissions.cpp @@ -6,6 +6,12 @@ namespace NKikimr::NCms { class TCms::TTxRemovePermissions : public TTransactionBase<TCms> { + void RemoveRequest(NIceDb::TNiceDb &db, const TString &reqId, const TActorContext &ctx, const TString &reason) { + Self->State->ScheduledRequests.erase(reqId); + db.Table<Schema::Request>().Key(reqId).Delete(); + Self->AuditLog(ctx, reason); + } + public: TTxRemovePermissions(TCms *self, TVector<TString> &&ids, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp, bool expired) : TBase(self) @@ -26,17 +32,29 @@ public: if (!Self->State->Permissions.contains(id)) continue; - auto requestId = Self->State->Permissions.find(id)->second.RequestId; + const auto &permission = Self->State->Permissions.find(id)->second; + const TString requestId = permission.RequestId; + const TString host = permission.Action.GetHost(); + Self->State->Permissions.erase(id); db.Table<Schema::Permission>().Key(id).Delete(); - if (Expired && Self->State->ScheduledRequests.contains(requestId)) { - Self->State->ScheduledRequests.erase(requestId); - db.Table<Schema::Request>().Key(requestId).Delete(); - - Self->AuditLog(ctx, TStringBuilder() << "Remove request" - << ": id# " << requestId - << ", reason# " << "permission " << id << " has expired"); + auto it = Self->State->ScheduledRequests.find(requestId); + if (it != Self->State->ScheduledRequests.end()) { + if (Expired) { + RemoveRequest(db, requestId, ctx, TStringBuilder() << "Remove request" + << ": id# " << requestId + << ", reason# " << "permission " << id << " has expired"); + } + + if (it->second.Request.GetPrepare()) { + auto ret = Self->ResetHostMarkers(host, txc, ctx); + std::move(ret.begin(), ret.end(), std::back_inserter(UpdateMarkers)); + + RemoveRequest(db, requestId, ctx, TStringBuilder() << "Remove request" + << ": id# " << requestId + << ", reason# " << "permission " << id << " was removed"); + } } if (Self->State->WalleRequests.contains(requestId)) { @@ -66,6 +84,7 @@ public: } Self->RemoveEmptyTasks(ctx); + Self->SentinelUpdateHostMarkers(std::move(UpdateMarkers), ctx); } private: @@ -73,6 +92,7 @@ private: TAutoPtr<IEventHandle> Response; TVector<TString> Ids; bool Expired; + TVector<TEvSentinel::TEvUpdateHostMarkers::THostMarkers> UpdateMarkers; }; ITransaction *TCms::CreateTxRemovePermissions(TVector<TString> ids, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp, diff --git a/ydb/core/cms/cms_tx_remove_request.cpp b/ydb/core/cms/cms_tx_remove_request.cpp index f584979d01..c2191eec84 100644 --- a/ydb/core/cms/cms_tx_remove_request.cpp +++ b/ydb/core/cms/cms_tx_remove_request.cpp @@ -20,6 +20,16 @@ public: bool Execute(TTransactionContext &txc, const TActorContext &ctx) override { LOG_DEBUG(ctx, NKikimrServices::CMS, "TTxRemoveRequest Execute"); + auto it = Self->State->ScheduledRequests.find(Id); + if (it != Self->State->ScheduledRequests.end()) { + if (it->second.Request.GetPrepare()) { + for (const auto &action : it->second.Request.GetActions()) { + auto ret = Self->ResetHostMarkers(action.GetHost(), txc, ctx); + std::move(ret.begin(), ret.end(), std::back_inserter(UpdateMarkers)); + } + } + } + NIceDb::TNiceDb db(txc.DB); db.Table<Schema::Request>().Key(Id).Delete(); Self->State->ScheduledRequests.erase(Id); @@ -40,12 +50,14 @@ public: } Self->RemoveEmptyTasks(ctx); + Self->SentinelUpdateHostMarkers(std::move(UpdateMarkers), ctx); } private: THolder<IEventBase> Request; TAutoPtr<IEventHandle> Response; TString Id; + TVector<TEvSentinel::TEvUpdateHostMarkers::THostMarkers> UpdateMarkers; }; ITransaction *TCms::CreateTxRemoveRequest(const TString &id, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp) { diff --git a/ydb/core/cms/cms_tx_store_permissions.cpp b/ydb/core/cms/cms_tx_store_permissions.cpp index 069b722598..d0967340fd 100644 --- a/ydb/core/cms/cms_tx_store_permissions.cpp +++ b/ydb/core/cms/cms_tx_store_permissions.cpp @@ -69,13 +69,18 @@ public: << ": id# " << id << ", validity# " << TInstant::MicroSeconds(deadline) << ", action# " << actionStr); + + if (Scheduled && Scheduled->Request.GetPrepare()) { + auto ret = Self->SetHostMarker(permission.GetAction().GetHost(), NKikimrCms::MARKER_DISK_FAULTY, txc, ctx); + std::move(ret.begin(), ret.end(), std::back_inserter(UpdateMarkers)); + } } if (Scheduled) { auto &id = Scheduled->RequestId; auto &owner = Scheduled->Owner; - if (Scheduled->Request.ActionsSize()) { + if (Scheduled->Request.ActionsSize() || Scheduled->Request.GetPrepare()) { ui64 order = Scheduled->Order; TString requestStr; google::protobuf::TextFormat::PrintToString(Scheduled->Request, &requestStr); @@ -90,6 +95,13 @@ public: << ", owner# " << owner << ", order# " << order << ", body# " << requestStr); + + if (Scheduled->Request.GetPrepare()) { + for (const auto &action : Scheduled->Request.GetActions()) { + auto ret = Self->SetHostMarker(action.GetHost(), NKikimrCms::MARKER_DISK_FAULTY, txc, ctx); + std::move(ret.begin(), ret.end(), std::back_inserter(UpdateMarkers)); + } + } } else { db.Table<Schema::Request>().Key(id).Delete(); @@ -108,6 +120,7 @@ public: Self->Reply(Request.Get(), Response, ctx); Self->SchedulePermissionsCleanup(ctx); + Self->SentinelUpdateHostMarkers(std::move(UpdateMarkers), ctx); } private: @@ -118,6 +131,7 @@ private: const TMaybe<TString> MaintenanceTaskId; ui64 NextPermissionId; ui64 NextRequestId; + TVector<TEvSentinel::TEvUpdateHostMarkers::THostMarkers> UpdateMarkers; }; ITransaction *TCms::CreateTxStorePermissions(THolder<IEventBase> req, TAutoPtr<IEventHandle> resp, diff --git a/ydb/core/cms/cms_ut.cpp b/ydb/core/cms/cms_ut.cpp index 77adaf0837..01253e002f 100644 --- a/ydb/core/cms/cms_ut.cpp +++ b/ydb/core/cms/cms_ut.cpp @@ -1684,6 +1684,109 @@ Y_UNIT_TEST_SUITE(TCmsTest) { env.DestroyDefaultCmsPipe(); } + + Y_UNIT_TEST(PrepareHostShouldFailWhileSentinelIsDisabled) + { + TCmsTestEnv env(TTestEnvOpts(8).WithoutSentinel()); + env.CheckPermissionRequest( + MakePermissionRequest(TRequestOptions("user").WithPrepare(), + MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000) + ), + TStatus::ERROR + ); + } + + Y_UNIT_TEST(PrepareHostShouldFailOnUnsupportedAction) + { + TCmsTestEnv env(TTestEnvOpts(8).WithSentinel()); + env.CheckPermissionRequest( + MakePermissionRequest(TRequestOptions("user").WithPrepare(), + MakeAction(TAction::REPLACE_DEVICES, env.GetNodeId(0), 60000000, env.PDiskName(0)) + ), + TStatus::WRONG_REQUEST + ); + } + + Y_UNIT_TEST(PrepareHostShouldFailOnMultipleActions) + { + TCmsTestEnv env(TTestEnvOpts(8).WithSentinel()); + env.CheckPermissionRequest( + MakePermissionRequest(TRequestOptions("user").WithPrepare(), + MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000), + MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000) + ), + TStatus::WRONG_REQUEST + ); + } + + Y_UNIT_TEST(PrepareHost) + { + auto opts = TTestEnvOpts(8).WithSentinel(); + TCmsTestEnv env(opts); + env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG); + + // ok + auto request1 = env.CheckPermissionRequest( + MakePermissionRequest(TRequestOptions("user").WithPrepare(), + MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage") + ), + TStatus::DISALLOW_TEMP + ); + // forbid another prepare request for same host + env.CheckPermissionRequest( + MakePermissionRequest(TRequestOptions("user").WithPrepare(), + MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage") + ), + TStatus::WRONG_REQUEST + ); + + // "move" vdisks + auto& node = TFakeNodeWhiteboardService::Info[env.GetNodeId(0)]; + node.VDisksMoved = true; + node.VDiskStateInfo.clear(); + env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts); + + // prepared + auto permission1 = env.CheckRequest("user", request1.GetRequestId(), false, TStatus::ALLOW, 1); + env.CheckRejectRequest("user", request1.GetRequestId(), false, TStatus::WRONG_REQUEST); + env.CheckDonePermission("user", permission1.GetPermissions(0).GetId()); + + // allow immediately + auto request2 = env.CheckPermissionRequest( + MakePermissionRequest(TRequestOptions("user").WithPrepare(), + MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage") + ), + TStatus::ALLOW + ); + UNIT_ASSERT_VALUES_EQUAL(request2.PermissionsSize(), 1); + + // check markers after restart + env.RestartCms(); + env.CheckPermissionRequest( + MakePermissionRequest(TRequestOptions("user").WithPrepare(), + MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage") + ), + TStatus::WRONG_REQUEST + ); + + env.CheckRejectRequest("user", request2.GetRequestId(), false, TStatus::WRONG_REQUEST); + env.CheckDonePermission("user", request2.GetPermissions(0).GetId()); + + // restore vdisks + node.VDisksMoved = false; + env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts); + + // prepare + auto request3 = env.CheckPermissionRequest( + MakePermissionRequest(TRequestOptions("user").WithPrepare(), + MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage") + ), + TStatus::DISALLOW_TEMP + ); + + // reject until prepared + env.CheckRejectRequest("user", request3.GetRequestId()); + } } } diff --git a/ydb/core/cms/cms_ut_common.cpp b/ydb/core/cms/cms_ut_common.cpp index 01be0de05e..ab8843330d 100644 --- a/ydb/core/cms/cms_ut_common.cpp +++ b/ydb/core/cms/cms_ut_common.cpp @@ -187,23 +187,6 @@ void TFakeNodeWhiteboardService::Handle(TEvWhiteboard::TEvSystemStateRequest::TP namespace { -struct TFakeNodeInfo { - struct TVDiskIDComparator { - bool operator ()(const TVDiskID& a, const TVDiskID& b) const { - return std::make_tuple(a.GroupID, a.FailRealm, a.FailDomain, a.VDisk) - < std::make_tuple(b.GroupID, b.FailRealm, b.FailDomain, b.VDisk); - } - }; - - TMap<TTabletId, NKikimrWhiteboard::TTabletStateInfo> TabletStateInfo; - TMap<TString, NKikimrWhiteboard::TNodeStateInfo> NodeStateInfo; - TMap<ui32, NKikimrWhiteboard::TPDiskStateInfo> PDiskStateInfo; - TMap<TVDiskID, NKikimrWhiteboard::TVDiskStateInfo, TVDiskIDComparator> VDiskStateInfo; - TMap<ui32, NKikimrWhiteboard::TBSGroupStateInfo> BSGroupStateInfo; - NKikimrWhiteboard::TSystemStateInfo SystemStateInfo; - bool Connected = true; -}; - class TFakeTenantPool : public TActorBootstrapped<TFakeTenantPool> { public: TVector<TString> Tenants; @@ -243,7 +226,6 @@ public: void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseConfig *config, ui32 pdisks, ui32 vdiskPerPdisk = 4, const TNodeTenantsMap &tenants = {}, bool useMirror3dcErasure = false) { - TGuard<TMutex> guard(TFakeNodeWhiteboardService::Mutex); ui32 numNodes = runtime.GetNodeCount(); ui32 numNodeGroups = pdisks * vdiskPerPdisk; ui32 numGroups; @@ -271,14 +253,16 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC group.SetErasureSpecies("none"); } - TFakeNodeWhiteboardService::Info.clear(); for (ui32 nodeIndex = 0; nodeIndex < numNodes; ++nodeIndex) { ui32 nodeId = runtime.GetNodeId(nodeIndex); - auto &node = TFakeNodeWhiteboardService::Info[nodeId]; - node.SystemStateInfo.SetVersion(ToString(GetProgramSvnRevision())); - node.SystemStateInfo.SetStartTime(now.GetValue()); - node.SystemStateInfo.SetChangeTime(now.GetValue()); + auto ret = TFakeNodeWhiteboardService::Info.emplace(nodeId, TFakeNodeInfo()); + auto &node = ret.first->second; + if (ret.second) { + node.SystemStateInfo.SetVersion(ToString(GetProgramSvnRevision())); + node.SystemStateInfo.SetStartTime(now.GetValue()); + node.SystemStateInfo.SetChangeTime(now.GetValue()); + } if (tenants.contains(nodeIndex)) { node.SystemStateInfo.AddRoles("Tenant"); @@ -312,6 +296,10 @@ void GenerateExtendedInfo(TTestActorRuntime &runtime, NKikimrBlobStorage::TBaseC pdiskConfig.SetGuid(1); pdiskConfig.SetDriveStatus(NKikimrBlobStorage::ACTIVE); + if (node.VDisksMoved) { + continue; + } + for (ui8 vdiskIndex = 0; vdiskIndex < vdiskPerPdisk; ++vdiskIndex) { ui32 vdiskId = pdiskIndex * vdiskPerPdisk + vdiskIndex; ui32 groupId = groupShift + vdiskId; @@ -400,9 +388,7 @@ static NKikimrConfig::TBootstrap GenerateBootstrapConfig(TTestActorRuntime &runt return res; } -static void SetupServices(TTestActorRuntime &runtime, - const TNodeTenantsMap &tenants) -{ +static void SetupServices(TTestActorRuntime &runtime, const TTestEnvOpts &options) { const ui32 domainsNum = 1; const ui32 disksInDomain = 1; @@ -491,8 +477,8 @@ static void SetupServices(TTestActorRuntime &runtime, runtime.AddLocalService(NNodeWhiteboard::MakeNodeWhiteboardServiceId(runtime.GetNodeId(nodeIndex)), TActorSetupCmd(CreateFakeNodeWhiteboardService(), TMailboxType::Simple, 0), nodeIndex); TVector<TString> nodeTenants; - if (tenants.contains(nodeIndex)) - nodeTenants = tenants.at(nodeIndex); + if (options.Tenants.contains(nodeIndex)) + nodeTenants = options.Tenants.at(nodeIndex); runtime.AddLocalService(MakeTenantPoolID(runtime.GetNodeId(nodeIndex)), TActorSetupCmd(new TFakeTenantPool(nodeTenants), TMailboxType::Simple, 0), nodeIndex); } @@ -512,7 +498,7 @@ static void SetupServices(TTestActorRuntime &runtime, runtime.GetAppData().BootstrapConfig = TFakeNodeWhiteboardService::BootstrapConfig; NKikimrCms::TCmsConfig cmsConfig; - cmsConfig.MutableSentinelConfig()->SetEnable(false); + cmsConfig.MutableSentinelConfig()->SetEnable(options.EnableSentinel); runtime.GetAppData().DefaultCmsConfig = MakeHolder<NKikimrCms::TCmsConfig>(cmsConfig); if (!runtime.IsRealThreads()) { @@ -546,6 +532,8 @@ TCmsTestEnv::TCmsTestEnv(const TTestEnvOpts &options) TFakeNodeWhiteboardService::BootstrapConfig = GenerateBootstrapConfig(*this, options.NodeCount, options.Tenants); + TGuard<TMutex> guard(TFakeNodeWhiteboardService::Mutex); + TFakeNodeWhiteboardService::Info.clear(); GenerateExtendedInfo(*this, config, options.VDisks, 4, options.Tenants, options.UseMirror3dcErasure); SetObserverFunc([](TAutoPtr<IEventHandle> &event) -> auto { @@ -572,7 +560,7 @@ TCmsTestEnv::TCmsTestEnv(const TTestEnvOpts &options) SetupStateStorage(*this, nodeIndex); } } - SetupServices(*this, options.Tenants); + SetupServices(*this, options); Sender = AllocateEdgeActor(); ClientId = TActorId(); @@ -580,7 +568,7 @@ TCmsTestEnv::TCmsTestEnv(const TTestEnvOpts &options) NKikimrCms::TCmsConfig cmsConfig; cmsConfig.MutableTenantLimits()->SetDisabledNodesRatioLimit(0); cmsConfig.MutableClusterLimits()->SetDisabledNodesRatioLimit(0); - cmsConfig.MutableSentinelConfig()->SetEnable(false); + cmsConfig.MutableSentinelConfig()->SetEnable(options.EnableSentinel); SetCmsConfig(cmsConfig); // Need to allow restart state storage nodes @@ -1170,5 +1158,11 @@ void TCmsTestEnv::EnableNoisyBSCPipe() { TFakeNodeWhiteboardService::NoisyBSCPipe = true; } +void TCmsTestEnv::RegenerateBSConfig(NKikimrBlobStorage::TBaseConfig *config, const TTestEnvOpts &opts) { + TGuard<TMutex> guard(TFakeNodeWhiteboardService::Mutex); + config->Clear(); + GenerateExtendedInfo(*this, config, opts.VDisks, 4, opts.Tenants, opts.UseMirror3dcErasure); +} + } // namespace NCmsTest } // namespace NKikimr diff --git a/ydb/core/cms/cms_ut_common.h b/ydb/core/cms/cms_ut_common.h index c76597a96b..be43e21b74 100644 --- a/ydb/core/cms/cms_ut_common.h +++ b/ydb/core/cms/cms_ut_common.h @@ -34,6 +34,7 @@ struct TFakeNodeInfo { TMap<TVDiskID, NKikimrWhiteboard::TVDiskStateInfo, TVDiskIDComparator> VDiskStateInfo; NKikimrWhiteboard::TSystemStateInfo SystemStateInfo; bool Connected = true; + bool VDisksMoved = false; }; class TFakeNodeWhiteboardService : public TActorBootstrapped<TFakeNodeWhiteboardService> { @@ -84,6 +85,7 @@ struct TTestEnvOpts { TNodeTenantsMap Tenants; bool UseMirror3dcErasure; bool AdvanceCurrentTime; + bool EnableSentinel; TTestEnvOpts() = default; @@ -99,8 +101,19 @@ struct TTestEnvOpts { , Tenants(tenants) , UseMirror3dcErasure(false) , AdvanceCurrentTime(false) + , EnableSentinel(false) { } + + TTestEnvOpts& WithSentinel() { + EnableSentinel = true; + return *this; + } + + TTestEnvOpts& WithoutSentinel() { + EnableSentinel = false; + return *this; + } }; class TCmsTestEnv : public TTestBasicRuntime { @@ -369,6 +382,8 @@ public: const ui64 CmsId; + void RegenerateBSConfig(NKikimrBlobStorage::TBaseConfig *config, const TTestEnvOpts &opts); + private: void SetupLogging(); diff --git a/ydb/core/cms/sentinel.cpp b/ydb/core/cms/sentinel.cpp index 16000a0a0d..e16d38b694 100644 --- a/ydb/core/cms/sentinel.cpp +++ b/ydb/core/cms/sentinel.cpp @@ -62,6 +62,11 @@ void TPDiskStatusComputer::AddState(EPDiskState state) { } EPDiskStatus TPDiskStatusComputer::Compute(EPDiskStatus current, TString& reason) const { + if (ForcedStatus) { + reason = "Forced status"; + return *ForcedStatus; + } + if (!StateCounter) { reason = "Uninitialized StateCounter"; return current; @@ -116,6 +121,14 @@ void TPDiskStatusComputer::Reset() { StateCounter = 0; } +void TPDiskStatusComputer::SetForcedStatus(EPDiskStatus status) { + ForcedStatus = status; +} + +void TPDiskStatusComputer::ResetForcedStatus() { + ForcedStatus.Clear(); +} + /// TPDiskStatus TPDiskStatus::TPDiskStatus(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits) @@ -129,13 +142,9 @@ void TPDiskStatus::AddState(EPDiskState state) { TPDiskStatusComputer::AddState(state); } -bool TPDiskStatus::IsChanged(TString& reason) const { - return Current != Compute(Current, reason); -} - bool TPDiskStatus::IsChanged() const { TString unused; - return IsChanged(unused); + return Current != Compute(Current, unused); } void TPDiskStatus::ApplyChanges(TString& reason) { @@ -196,6 +205,13 @@ void TPDiskInfo::AddState(EPDiskState state) { Touch(); } +/// TNodeInfo + +bool TNodeInfo::HasFaultyMarker() const { + return Markers.contains(NKikimrCms::MARKER_DISK_FAULTY) + || Markers.contains(NKikimrCms::MARKER_DISK_BROKEN); +} + /// TClusterMap TClusterMap::TClusterMap(TSentinelState::TPtr state) @@ -412,9 +428,15 @@ class TConfigUpdater: public TUpdaterBase<TEvSentinel::TEvConfigUpdated, TConfig SentinelState->Nodes.clear(); for (const auto& host : record.GetState().GetHosts()) { if (host.HasNodeId() && host.HasLocation() && host.HasName()) { + THashSet<NKikimrCms::EMarker> markers; + for (auto marker : host.GetMarkers()) { + markers.insert(static_cast<NKikimrCms::EMarker>(marker)); + } + SentinelState->Nodes.emplace(host.GetNodeId(), TNodeInfo{ .Host = host.GetName(), .Location = NActors::TNodeLocation(host.GetLocation()), + .Markers = std::move(markers), }); } } @@ -862,13 +884,20 @@ class TSentinel: public TActorBootstrapped<TSentinel> { const TPDiskID& id = pdisk.first; TPDiskInfo& info = *(pdisk.second); - if (!SentinelState->Nodes.contains(id.NodeId)) { + auto it = SentinelState->Nodes.find(id.NodeId); + if (it == SentinelState->Nodes.end()) { LOG_E("Missing node info" << ": pdiskId# " << id); info.IgnoreReason = NKikimrCms::TPDiskInfo::MISSING_NODE; continue; } + if (it->second.HasFaultyMarker()) { + info.SetForcedStatus(EPDiskStatus::FAULTY); + } else { + info.ResetForcedStatus(); + } + all.AddPDisk(id); if (info.IsChanged()) { if (info.IsNewStatusGood()) { @@ -961,6 +990,18 @@ class TSentinel: public TActorBootstrapped<TSentinel> { NTabletPipe::SendData(SelfId(), CmsState->BSControllerPipe, request.Release(), ++SentinelState->ChangeRequestId); } + void Handle(TEvSentinel::TEvUpdateHostMarkers::TPtr& ev) { + for (auto& [nodeId, markers] : ev->Get()->HostMarkers) { + auto it = SentinelState->Nodes.find(nodeId); + if (it == SentinelState->Nodes.end()) { + // markers will be updated upon next ConfigUpdate iteration + continue; + } + + it->second.Markers = std::move(markers); + } + } + void Handle(TEvCms::TEvGetSentinelStateRequest::TPtr& ev) { const auto& reqRecord = ev->Get()->Record; @@ -1192,6 +1233,7 @@ public: sFunc(TEvSentinel::TEvConfigUpdated, OnConfigUpdated); sFunc(TEvSentinel::TEvUpdateState, UpdateState); sFunc(TEvSentinel::TEvStateUpdated, OnStateUpdated); + hFunc(TEvSentinel::TEvUpdateHostMarkers, Handle); sFunc(TEvSentinel::TEvBSCPipeDisconnected, OnPipeDisconnected); hFunc(TEvCms::TEvGetSentinelStateRequest, Handle); diff --git a/ydb/core/cms/sentinel.h b/ydb/core/cms/sentinel.h index f0d1249348..d5734aef51 100644 --- a/ydb/core/cms/sentinel.h +++ b/ydb/core/cms/sentinel.h @@ -16,6 +16,8 @@ struct TEvSentinel { EvTimeout, EvBSCPipeDisconnected, + EvUpdateHostMarkers, + EvEnd, }; @@ -30,6 +32,20 @@ struct TEvSentinel { struct TEvTimeout: public TEventLocal<TEvTimeout, EvTimeout> {}; struct TEvBSCPipeDisconnected: public TEventLocal<TEvBSCPipeDisconnected, EvBSCPipeDisconnected> {}; + struct TEvUpdateHostMarkers: public TEventLocal<TEvUpdateHostMarkers, EvUpdateHostMarkers> { + struct THostMarkers { + ui32 NodeId; + THashSet<NKikimrCms::EMarker> Markers; + }; + + TVector<THostMarkers> HostMarkers; + + explicit TEvUpdateHostMarkers(TVector<THostMarkers>&& hostMarkers) + : HostMarkers(std::move(hostMarkers)) + { + } + }; + }; // TEvSentinel IActor* CreateSentinel(TCmsStatePtr state); diff --git a/ydb/core/cms/sentinel_impl.h b/ydb/core/cms/sentinel_impl.h index 8ed385184d..7f65e0faab 100644 --- a/ydb/core/cms/sentinel_impl.h +++ b/ydb/core/cms/sentinel_impl.h @@ -28,6 +28,9 @@ public: void Reset(); + void SetForcedStatus(EPDiskStatus status); + void ResetForcedStatus(); + private: const ui32& DefaultStateLimit; const TLimitsMap& StateLimits; @@ -35,6 +38,7 @@ private: EPDiskState State = NKikimrBlobStorage::TPDiskState::Unknown; mutable EPDiskState PrevState = State; ui64 StateCounter; + TMaybe<EPDiskStatus> ForcedStatus; }; // TPDiskStatusComputer @@ -43,7 +47,6 @@ public: explicit TPDiskStatus(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits); void AddState(EPDiskState state); - bool IsChanged(TString& reason) const; bool IsChanged() const; void ApplyChanges(TString& reason); void ApplyChanges(); @@ -104,6 +107,9 @@ private: struct TNodeInfo { TString Host; NActors::TNodeLocation Location; + THashSet<NKikimrCms::EMarker> Markers; + + bool HasFaultyMarker() const; }; struct TConfigUpdaterState { diff --git a/ydb/core/cms/sentinel_ut.cpp b/ydb/core/cms/sentinel_ut.cpp index b32fbeefbe..2647806481 100644 --- a/ydb/core/cms/sentinel_ut.cpp +++ b/ydb/core/cms/sentinel_ut.cpp @@ -156,7 +156,7 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) { location.SetUnit(ToString(id)); state->ClusterInfo->AddNode(TEvInterconnect::TNodeInfo(id, name, name, name, 10000, TNodeLocation(location)), nullptr); - sentinelState->Nodes[id] = NSentinel::TNodeInfo{name, NActors::TNodeLocation(location)}; + sentinelState->Nodes[id] = NSentinel::TNodeInfo{name, NActors::TNodeLocation(location), {}}; for (ui64 npdisk : xrange(pdisksPerNode)) { NKikimrBlobStorage::TBaseConfig::TPDisk pdisk; diff --git a/ydb/core/cms/ut_helpers.cpp b/ydb/core/cms/ut_helpers.cpp index 64625436e6..1a5246157b 100644 --- a/ydb/core/cms/ut_helpers.cpp +++ b/ydb/core/cms/ut_helpers.cpp @@ -1,9 +1,5 @@ #include "ut_helpers.h" -Y_DECLARE_OUT_SPEC(, NKikimrCms::TAction::EType, os, type) { - os << NKikimrCms::TAction::EType_Name(type); -} - Y_DECLARE_OUT_SPEC(, NKikimrWhiteboard::TTabletStateInfo::ETabletState, os, state) { os << NKikimrWhiteboard::TTabletStateInfo::ETabletState_Name(state); } diff --git a/ydb/core/cms/ut_helpers.h b/ydb/core/cms/ut_helpers.h index b2d89a6e1e..696e6726d4 100644 --- a/ydb/core/cms/ut_helpers.h +++ b/ydb/core/cms/ut_helpers.h @@ -67,18 +67,49 @@ void AddActions(TAutoPtr<NCms::TEvCms::TEvPermissionRequest> &event, const NKiki AddActions(event, actions...); } +struct TRequestOptions { + TString User; + bool Partial; + bool DryRun; + bool Schedule; + bool Prepare; + + explicit TRequestOptions(const TString &user, bool partial, bool dry, bool schedule) + : User(user) + , Partial(partial) + , DryRun(dry) + , Schedule(schedule) + , Prepare(false) + {} + + explicit TRequestOptions(const TString &user) + : TRequestOptions(user, false, false, false) + {} + + TRequestOptions& WithPrepare() { + Prepare = true; + return *this; + } +}; + template <typename... Ts> -TAutoPtr<NCms::TEvCms::TEvPermissionRequest> MakePermissionRequest(const TString &user, bool partial, bool dry, bool schedule, Ts... actions) { +TAutoPtr<NCms::TEvCms::TEvPermissionRequest> MakePermissionRequest(const TRequestOptions &opts, Ts... actions) { TAutoPtr<NCms::TEvCms::TEvPermissionRequest> event = new NCms::TEvCms::TEvPermissionRequest; - event->Record.SetUser(user); - event->Record.SetPartialPermissionAllowed(partial); - event->Record.SetDryRun(dry); - event->Record.SetSchedule(schedule); + event->Record.SetUser(opts.User); + event->Record.SetPartialPermissionAllowed(opts.Partial); + event->Record.SetDryRun(opts.DryRun); + event->Record.SetSchedule(opts.Schedule); + event->Record.SetPrepare(opts.Prepare); AddActions(event, actions...); return event; } +template <typename... Ts> +TAutoPtr<NCms::TEvCms::TEvPermissionRequest> MakePermissionRequest(const TString &user, bool partial, bool dry, bool schedule, Ts... actions) { + return MakePermissionRequest(TRequestOptions(user, partial, dry, schedule), actions...); +} + inline void AddPermissions(TAutoPtr<NCms::TEvCms::TEvManagePermissionRequest> &ev, const TString &id) { *ev->Record.AddPermissions() = id; } diff --git a/ydb/core/protos/cms.proto b/ydb/core/protos/cms.proto index 9be2f66d20..c84a2f366d 100644 --- a/ydb/core/protos/cms.proto +++ b/ydb/core/protos/cms.proto @@ -161,6 +161,8 @@ message TPermissionRequest { // Availability mode is not preserved for scheduled events. optional EAvailabilityMode AvailabilityMode = 9 [default = MODE_MAX_AVAILABILITY]; optional string MaintenanceTaskId = 10; + // Prepare item (host/device) before granting permission + optional bool Prepare = 11 [default = false]; } enum EExtensionType { diff --git a/ydb/core/protos/out/out_cms.cpp b/ydb/core/protos/out/out_cms.cpp index a681fdb380..71012587da 100644 --- a/ydb/core/protos/out/out_cms.cpp +++ b/ydb/core/protos/out/out_cms.cpp @@ -25,3 +25,7 @@ Y_DECLARE_OUT_SPEC(, NKikimrCms::ETextFormat, stream, value) { Y_DECLARE_OUT_SPEC(, NKikimrCms::TLogRecordData::EType, stream, value) { stream << NKikimrCms::TLogRecordData::EType_Name(value); } + +Y_DECLARE_OUT_SPEC(, NKikimrCms::TAction::EType, stream, value) { + stream << NKikimrCms::TAction::EType_Name(value); +} |