diff options
| author | ilnaz <[email protected]> | 2023-06-14 18:49:37 +0300 |
|---|---|---|
| committer | ilnaz <[email protected]> | 2023-06-14 18:49:37 +0300 |
| commit | 3f3f15f2e06a1f183d72c229cacf1d0f8949aa57 (patch) | |
| tree | 48bdaa3b6f1ce0a3a21f38cf5ad4afab4b2c121c | |
| parent | c35c89f6e630781a97a2384c5783d2cbbe10d588 (diff) | |
Maintenance service api implementation
48 files changed, 1665 insertions, 257 deletions
diff --git a/ydb/core/cms/CMakeLists.darwin-x86_64.txt b/ydb/core/cms/CMakeLists.darwin-x86_64.txt index 6467f87857e..5af4c829e07 100644 --- a/ydb/core/cms/CMakeLists.darwin-x86_64.txt +++ b/ydb/core/cms/CMakeLists.darwin-x86_64.txt @@ -51,6 +51,7 @@ target_link_libraries(ydb-core-cms PUBLIC tools-enum_parser-enum_serialization_runtime ) target_sources(ydb-core-cms PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/core/cms/api_adapters.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/audit_log.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cluster_info.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms.cpp diff --git a/ydb/core/cms/CMakeLists.linux-aarch64.txt b/ydb/core/cms/CMakeLists.linux-aarch64.txt index 94b7c7387b3..3b403d7d220 100644 --- a/ydb/core/cms/CMakeLists.linux-aarch64.txt +++ b/ydb/core/cms/CMakeLists.linux-aarch64.txt @@ -52,6 +52,7 @@ target_link_libraries(ydb-core-cms PUBLIC tools-enum_parser-enum_serialization_runtime ) target_sources(ydb-core-cms PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/core/cms/api_adapters.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/audit_log.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cluster_info.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms.cpp diff --git a/ydb/core/cms/CMakeLists.linux-x86_64.txt b/ydb/core/cms/CMakeLists.linux-x86_64.txt index 94b7c7387b3..3b403d7d220 100644 --- a/ydb/core/cms/CMakeLists.linux-x86_64.txt +++ b/ydb/core/cms/CMakeLists.linux-x86_64.txt @@ -52,6 +52,7 @@ target_link_libraries(ydb-core-cms PUBLIC tools-enum_parser-enum_serialization_runtime ) target_sources(ydb-core-cms PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/core/cms/api_adapters.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/audit_log.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cluster_info.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms.cpp diff --git a/ydb/core/cms/CMakeLists.windows-x86_64.txt b/ydb/core/cms/CMakeLists.windows-x86_64.txt index 6467f87857e..5af4c829e07 100644 --- a/ydb/core/cms/CMakeLists.windows-x86_64.txt +++ b/ydb/core/cms/CMakeLists.windows-x86_64.txt @@ -51,6 +51,7 @@ target_link_libraries(ydb-core-cms PUBLIC tools-enum_parser-enum_serialization_runtime ) target_sources(ydb-core-cms PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/core/cms/api_adapters.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/audit_log.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cluster_info.cpp ${CMAKE_SOURCE_DIR}/ydb/core/cms/cms.cpp diff --git a/ydb/core/cms/api_adapters.cpp b/ydb/core/cms/api_adapters.cpp new file mode 100644 index 00000000000..3833d2426fe --- /dev/null +++ b/ydb/core/cms/api_adapters.cpp @@ -0,0 +1,773 @@ +#include "cms_impl.h" + +#include <ydb/core/util/yverify_stream.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> +#include <ydb/library/yql/public/issue/protos/issue_severity.pb.h> + +#include <library/cpp/actors/core/actor_bootstrapped.h> +#include <library/cpp/actors/core/hfunc.h> + +#include <google/protobuf/util/time_util.h> + +#include <util/string/cast.h> + +namespace NKikimr::NCms { + +using TimeUtil = ::google::protobuf::util::TimeUtil; + +namespace { + + NKikimrCms::EAvailabilityMode ConvertAvailabilityMode(Ydb::Maintenance::AvailabilityMode mode) { + switch (mode) { + case Ydb::Maintenance::AVAILABILITY_MODE_STRONG: + return NKikimrCms::MODE_MAX_AVAILABILITY; + case Ydb::Maintenance::AVAILABILITY_MODE_WEAK: + return NKikimrCms::MODE_KEEP_AVAILABLE; + case Ydb::Maintenance::AVAILABILITY_MODE_FORCE: + return NKikimrCms::MODE_FORCE_RESTART; + default: + Y_FAIL("unreachable"); + } + } + + Ydb::Maintenance::AvailabilityMode ConvertAvailabilityMode(NKikimrCms::EAvailabilityMode mode) { + switch (mode) { + case NKikimrCms::MODE_MAX_AVAILABILITY: + return Ydb::Maintenance::AVAILABILITY_MODE_STRONG; + case NKikimrCms::MODE_KEEP_AVAILABLE: + return Ydb::Maintenance::AVAILABILITY_MODE_WEAK; + case NKikimrCms::MODE_FORCE_RESTART: + return Ydb::Maintenance::AVAILABILITY_MODE_FORCE; + default: + Y_FAIL("unreachable"); + } + } + + void ConvertAction(const NKikimrCms::TAction& cmsAction, Ydb::Maintenance::LockAction& action) { + *action.mutable_duration() = TimeUtil::MicrosecondsToDuration(cmsAction.GetDuration()); + + ui32 nodeId; + if (TryFromString(cmsAction.GetHost(), nodeId)) { + action.mutable_scope()->set_node_id(nodeId); + } else { + action.mutable_scope()->set_host(cmsAction.GetHost()); + } + } + + void ConvertAction(const NKikimrCms::TAction& cmsAction, Ydb::Maintenance::ActionState& actionState) { + ConvertAction(cmsAction, *actionState.mutable_action()->mutable_lock_action()); + // FIXME: specify action_uid + actionState.set_status(Ydb::Maintenance::ActionState::ACTION_STATUS_PENDING); + actionState.set_reason(Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED); // FIXME: specify + } + + void ConvertActionUid(const TString& taskUid, const TString& permissionId, + Ydb::Maintenance::ActionUid& actionUid) + { + actionUid.set_task_uid(taskUid); + actionUid.set_action_id(permissionId); + } + + void ConvertPermission(const TString& taskUid, const NKikimrCms::TPermission& permission, + Ydb::Maintenance::ActionState& actionState) + { + ConvertAction(permission.GetAction(), *actionState.mutable_action()->mutable_lock_action()); + ConvertActionUid(taskUid, permission.GetId(), *actionState.mutable_action_uid()); + + actionState.set_status(Ydb::Maintenance::ActionState::ACTION_STATUS_PERFORMED); + actionState.set_reason(Ydb::Maintenance::ActionState::ACTION_REASON_OK); + *actionState.mutable_deadline() = TimeUtil::MicrosecondsToTimestamp(permission.GetDeadline()); + } + + void ConvertPermission(const TString& taskUid, const TPermissionInfo& permission, + Ydb::Maintenance::ActionState& actionState) + { + NKikimrCms::TPermission protoPermission; + permission.CopyTo(protoPermission); + ConvertPermission(taskUid, protoPermission, actionState); + } + +} // anonymous + +template <typename TDerived, typename TEvRequest, typename TEvResponse> +class TAdapterActor: public TActorBootstrapped<TDerived> { +protected: + using TBase = TAdapterActor<TDerived, TEvRequest, TEvResponse>; + + TCmsStatePtr GetCmsState() const { + Y_VERIFY(CmsState); + return CmsState; + } + + void Reply(THolder<TEvResponse>&& ev) { + this->Send(Request->Sender, std::move(ev)); + this->PassAway(); + } + + void Reply(Ydb::StatusIds::StatusCode code, const TString& error = {}) { + auto ev = MakeHolder<TEvResponse>(); + ev->Record.SetStatus(code); + + auto& issue = *ev->Record.AddIssues(); + issue.set_severity(NYql::TSeverityIds::S_ERROR); + issue.set_message(error); + + Reply(std::move(ev)); + } + +public: + static constexpr NKikimrServices::TActivity::EType ActorActivityType() { + return NKikimrServices::TActivity::CMS_API_ADAPTER; + } + + explicit TAdapterActor(typename TEvRequest::TPtr& ev, const TActorId& cmsActorId, TCmsStatePtr cmsState = nullptr) + : Request(ev) + , CmsActorId(cmsActorId) + , CmsState(cmsState) + { + } + +protected: + typename TEvRequest::TPtr Request; + const TActorId CmsActorId; + +private: + const TCmsStatePtr CmsState; + +}; // TAdapterActor + +class TListClusterNodes: public TAdapterActor< + TListClusterNodes, + TEvCms::TEvListClusterNodesRequest, + TEvCms::TEvListClusterNodesResponse> +{ + static Ydb::Maintenance::ItemState ConvertNodeState(NKikimrCms::EState state) { + switch (state) { + case NKikimrCms::UNKNOWN: + return Ydb::Maintenance::ITEM_STATE_UNSPECIFIED; + case NKikimrCms::UP: + return Ydb::Maintenance::ITEM_STATE_UP; + case NKikimrCms::RESTART: + return Ydb::Maintenance::ITEM_STATE_MAINTENANCE; + case NKikimrCms::DOWN: + return Ydb::Maintenance::ITEM_STATE_DOWN; + default: + Y_FAIL("unreachable"); + } + } + + static void ConvertNode(const TNodeInfo& in, Ydb::Maintenance::Node& out) { + out.set_node_id(in.NodeId); + out.set_host(in.Host); + out.set_port(in.IcPort); + out.set_state(ConvertNodeState(in.State)); + + auto& location = *out.mutable_location(); + location.set_data_center(in.Location.GetDataCenterId()); + location.set_module(in.Location.GetModuleId()); + location.set_rack(in.Location.GetRackId()); + location.set_unit(in.Location.GetUnitId()); + + if (in.Services & EService::DynamicNode) { + out.mutable_dynamic()->set_tenant(in.Tenant); + } else { + out.mutable_storage(); + } + } + +public: + using TBase::TBase; + + void Bootstrap() { + Send(CmsActorId, new TEvCms::TEvGetClusterInfoRequest()); + Become(&TThis::StateWork); + } + + STFUNC(StateWork) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvCms::TEvGetClusterInfoResponse, Handle); + } + } + + void Handle(TEvCms::TEvGetClusterInfoResponse::TPtr& ev) { + auto clusterInfo = ev->Get()->Info; + + if (clusterInfo->IsOutdated()) { + return Reply(Ydb::StatusIds::UNAVAILABLE, "Cannot collect cluster info"); + } + + auto response = MakeHolder<TEvCms::TEvListClusterNodesResponse>(); + response->Record.SetStatus(Ydb::StatusIds::SUCCESS); + + for (const auto& [_, node] : clusterInfo->AllNodes()) { + ConvertNode(*node, *response->Record.MutableResult()->add_nodes()); + } + + Reply(std::move(response)); + } + +}; // TListClusterNodes + +template <typename TDerived, typename TEvRequest> +class TPermissionResponseProcessor: public TAdapterActor<TDerived, TEvRequest, TEvCms::TEvMaintenanceTaskResponse> { +protected: + using TBase = TPermissionResponseProcessor<TDerived, TEvRequest>; + +public: + using TAdapterActor<TDerived, TEvRequest, TEvCms::TEvMaintenanceTaskResponse>::TAdapterActor; + + STFUNC(StateWork) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvCms::TEvPermissionResponse, Handle); + } + } + + void Handle(TEvCms::TEvPermissionResponse::TPtr& ev) { + const auto& record = ev->Get()->Record; + + switch (record.GetStatus().GetCode()) { + case NKikimrCms::TStatus::ALLOW: + case NKikimrCms::TStatus::ALLOW_PARTIAL: + case NKikimrCms::TStatus::DISALLOW: + case NKikimrCms::TStatus::DISALLOW_TEMP: + break; + case NKikimrCms::TStatus::ERROR_TEMP: + return this->Reply(Ydb::StatusIds::UNAVAILABLE, record.GetStatus().GetReason()); + case NKikimrCms::TStatus::WRONG_REQUEST: + case NKikimrCms::TStatus::ERROR: + case NKikimrCms::TStatus::NO_SUCH_HOST: + case NKikimrCms::TStatus::NO_SUCH_DEVICE: + case NKikimrCms::TStatus::NO_SUCH_SERVICE: + return this->Reply(Ydb::StatusIds::BAD_REQUEST, record.GetStatus().GetReason()); + case NKikimrCms::TStatus::UNAUTHORIZED: + return this->Reply(Ydb::StatusIds::UNAUTHORIZED, record.GetStatus().GetReason()); + case NKikimrCms::TStatus::OK: + case NKikimrCms::TStatus::UNKNOWN: + return this->Reply(Ydb::StatusIds::INTERNAL_ERROR, record.GetStatus().GetReason()); + } + + const auto& taskUid = static_cast<const TDerived*>(this)->GetTaskUid(); + + auto response = MakeHolder<TEvCms::TEvMaintenanceTaskResponse>(); + response->Record.SetStatus(Ydb::StatusIds::SUCCESS); + + auto& result = *response->Record.MutableResult(); + result.set_task_uid(taskUid); + + if (record.GetDeadline()) { + *result.mutable_retry_after() = TimeUtil::MicrosecondsToTimestamp(record.GetDeadline()); + } + + THashSet<TString> permissionsSeen; + // performed actions: new permissions + for (const auto& permission : record.GetPermissions()) { + permissionsSeen.insert(permission.GetId()); + ConvertPermission(taskUid, permission, *result.add_action_group_states()->add_action_states()); + } + + auto cmsState = this->GetCmsState(); + // performed actions: existing permissions + if (cmsState->MaintenanceTasks.contains(taskUid)) { + const auto& task = cmsState->MaintenanceTasks.at(taskUid); + for (const auto& id : task.Permissions) { + if (!cmsState->Permissions.contains(id) || permissionsSeen.contains(id)) { + continue; + } + + ConvertPermission(taskUid, cmsState->Permissions.at(id), + *result.add_action_group_states()->add_action_states()); + } + } + + // pending actions + if (cmsState->ScheduledRequests.contains(record.GetRequestId())) { + const auto& request = cmsState->ScheduledRequests.at(record.GetRequestId()); + for (const auto& action : request.Request.GetActions()) { + ConvertAction(action, *result.add_action_group_states()->add_action_states()); + } + } + + this->Reply(std::move(response)); + } + +}; // TPermissionResponseProcessor + +class TCreateMaintenanceTask: public TPermissionResponseProcessor< + TCreateMaintenanceTask, + TEvCms::TEvCreateMaintenanceTaskRequest> +{ + bool ValidateScope(const Ydb::Maintenance::ActionScope& scope) { + switch (scope.scope_case()) { + case Ydb::Maintenance::ActionScope::kNodeId: + case Ydb::Maintenance::ActionScope::kHost: + return true; + default: + Reply(Ydb::StatusIds::BAD_REQUEST, "Unknown scope"); + return false; + } + } + + bool ValidateAction(const Ydb::Maintenance::Action& action) { + switch (action.action_case()) { + case Ydb::Maintenance::Action::kLockAction: + return ValidateScope(action.lock_action().scope()); + default: + Reply(Ydb::StatusIds::BAD_REQUEST, "Unknown action"); + return false; + } + } + + bool ValidateRequest(const Ydb::Maintenance::CreateMaintenanceTaskRequest& request) { + switch (request.task_options().availability_mode()) { + case Ydb::Maintenance::AVAILABILITY_MODE_STRONG: + case Ydb::Maintenance::AVAILABILITY_MODE_WEAK: + case Ydb::Maintenance::AVAILABILITY_MODE_FORCE: + break; + default: + Reply(Ydb::StatusIds::BAD_REQUEST, "Unknown availability mode"); + return false; + } + + for (const auto& group : request.action_groups()) { + if (group.actions().size() < 1) { + Reply(Ydb::StatusIds::BAD_REQUEST, "Empty actions"); + return false; + } else if (group.actions().size() > 1) { + Reply(Ydb::StatusIds::UNSUPPORTED, "Composite action groups are not supported"); + return false; + } + + for (const auto& action : group.actions()) { + if (!ValidateAction(action)) { + return false; + } + } + } + + return true; + } + + static void ConvertAction(const Ydb::Maintenance::LockAction& action, NKikimrCms::TAction& cmsAction) { + cmsAction.SetType(NKikimrCms::TAction::SHUTDOWN_HOST); + cmsAction.SetDuration(TimeUtil::DurationToMicroseconds(action.duration())); + + const auto& scope = action.scope(); + switch (scope.scope_case()) { + case Ydb::Maintenance::ActionScope::kNodeId: + cmsAction.SetHost(ToString(scope.node_id())); + break; + case Ydb::Maintenance::ActionScope::kHost: + cmsAction.SetHost(scope.host()); + break; + default: + Y_FAIL("unreachable"); + } + } + + static void ConvertRequest(const TString& user, const Ydb::Maintenance::CreateMaintenanceTaskRequest& request, + NKikimrCms::TPermissionRequest& cmsRequest) + { + const auto& opts = request.task_options(); + + cmsRequest.SetMaintenanceTaskId(opts.task_uid()); + cmsRequest.SetUser(user); + cmsRequest.SetDryRun(opts.dry_run()); + cmsRequest.SetReason(opts.description()); + cmsRequest.SetAvailabilityMode(ConvertAvailabilityMode(opts.availability_mode())); + cmsRequest.SetPartialPermissionAllowed(true); + cmsRequest.SetSchedule(true); + + for (const auto& group : request.action_groups()) { + Y_VERIFY(group.actions().size() == 1); + for (const auto& action : group.actions()) { + if (action.has_lock_action()) { + ConvertAction(action.lock_action(), *cmsRequest.AddActions()); + } else { + Y_FAIL("unreachable"); + } + } + } + } + +public: + using TBase::TBase; + + void Bootstrap() { + const auto& user = Request->Get()->Record.GetUserSID(); + const auto& request = Request->Get()->Record.GetRequest(); + + if (!ValidateRequest(request)) { + return; + } + + auto cmsRequest = MakeHolder<TEvCms::TEvPermissionRequest>(); + ConvertRequest(user, request, cmsRequest->Record); + + Send(CmsActorId, std::move(cmsRequest)); + Become(&TThis::StateWork); + } + + const TString& GetTaskUid() const { + return Request->Get()->Record.GetRequest().task_options().task_uid(); + } + + // using processor's handler + +}; // TCreateMaintenanceTask + +class TRefreshMaintenanceTask: public TPermissionResponseProcessor< + TRefreshMaintenanceTask, + TEvCms::TEvRefreshMaintenanceTaskRequest> +{ +public: + using TBase::TBase; + + void Bootstrap() { + const auto& taskUid = GetTaskUid(); + auto cmsState = GetCmsState(); + + auto tit = cmsState->MaintenanceTasks.find(taskUid); + if (tit == cmsState->MaintenanceTasks.end()) { + return Reply(Ydb::StatusIds::BAD_REQUEST, "Task not found"); + } + + const auto& task = tit->second; + + auto rit = cmsState->ScheduledRequests.find(task.RequestId); + if (rit == cmsState->ScheduledRequests.end()) { + auto response = MakeHolder<TEvCms::TEvMaintenanceTaskResponse>(); + response->Record.SetStatus(Ydb::StatusIds::SUCCESS); + + auto& result = *response->Record.MutableResult(); + result.set_task_uid(taskUid); + + // performed actions + for (const auto& id : task.Permissions) { + if (!cmsState->Permissions.contains(id)) { + continue; + } + + ConvertPermission(taskUid, cmsState->Permissions.at(id), + *result.add_action_group_states()->add_action_states()); + } + + if (result.action_group_states().empty()) { + return Reply(Ydb::StatusIds::BAD_REQUEST, "Task has no active actions"); + } + + return Reply(std::move(response)); + } + + const auto& request = rit->second; + + auto cmsRequest = MakeHolder<TEvCms::TEvCheckRequest>(); + cmsRequest->Record.SetUser(task.Owner); + cmsRequest->Record.SetRequestId(task.RequestId); + cmsRequest->Record.SetAvailabilityMode(request.Request.GetAvailabilityMode()); + + Send(CmsActorId, std::move(cmsRequest)); + Become(&TThis::StateWork); + } + + const TString& GetTaskUid() const { + return Request->Get()->Record.GetRequest().task_uid(); + } + + // using processor's handler + +}; // TRefreshMaintenanceTask + +class TGetMaintenanceTask: public TAdapterActor< + TGetMaintenanceTask, + TEvCms::TEvGetMaintenanceTaskRequest, + TEvCms::TEvGetMaintenanceTaskResponse> +{ +public: + using TBase::TBase; + + void Bootstrap() { + const auto& taskUid = GetTaskUid(); + auto cmsState = GetCmsState(); + + auto it = cmsState->MaintenanceTasks.find(taskUid); + if (it == cmsState->MaintenanceTasks.end()) { + return Reply(Ydb::StatusIds::BAD_REQUEST, "Task not found"); + } + + const auto& task = it->second; + + if (!cmsState->ScheduledRequests.contains(task.RequestId)) { + auto response = MakeHolder<TEvCms::TEvGetMaintenanceTaskResponse>(); + response->Record.SetStatus(Ydb::StatusIds::SUCCESS); + + auto& result = *response->Record.MutableResult(); + result.mutable_task_options()->set_task_uid(taskUid); + + // performed actions + for (const auto& id : task.Permissions) { + if (!cmsState->Permissions.contains(id)) { + continue; + } + + ConvertPermission(taskUid, cmsState->Permissions.at(id), + *result.add_action_group_states()->add_action_states()); + } + + return Reply(std::move(response)); + } + + auto cmsRequest = MakeHolder<TEvCms::TEvManageRequestRequest>(); + cmsRequest->Record.SetUser(task.Owner); + cmsRequest->Record.SetRequestId(task.RequestId); + cmsRequest->Record.SetCommand(NKikimrCms::TManageRequestRequest::GET); + + Send(CmsActorId, std::move(cmsRequest)); + Become(&TThis::StateWork); + } + + const TString& GetTaskUid() const { + return Request->Get()->Record.GetRequest().task_uid(); + } + + STFUNC(StateWork) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvCms::TEvManageRequestResponse, Handle); + } + } + + void Handle(TEvCms::TEvManageRequestResponse::TPtr& ev) { + const auto& taskUid = GetTaskUid(); + const auto& record = ev->Get()->Record; + + switch (record.GetStatus().GetCode()) { + case NKikimrCms::TStatus::OK: + break; + case NKikimrCms::TStatus::WRONG_REQUEST: + return Reply(Ydb::StatusIds::BAD_REQUEST, record.GetStatus().GetReason()); + default: + return Reply(Ydb::StatusIds::INTERNAL_ERROR, record.GetStatus().GetReason()); + } + + auto response = MakeHolder<TEvCms::TEvGetMaintenanceTaskResponse>(); + response->Record.SetStatus(Ydb::StatusIds::SUCCESS); + + auto& result = *response->Record.MutableResult(); + for (const auto& request : record.GetRequests()) { + auto& opts = *result.mutable_task_options(); + opts.set_task_uid(taskUid); + opts.set_description(request.GetReason()); + opts.set_availability_mode(ConvertAvailabilityMode(request.GetAvailabilityMode())); + + // pending actions + for (const auto& action : request.GetActions()) { + ConvertAction(action, *result.add_action_group_states()->add_action_states()); + } + } + + auto cmsState = GetCmsState(); + // performed actions + if (cmsState->MaintenanceTasks.contains(taskUid)) { + const auto& task = cmsState->MaintenanceTasks.at(taskUid); + for (const auto& id : task.Permissions) { + if (!cmsState->Permissions.contains(id)) { + continue; + } + + ConvertPermission(taskUid, cmsState->Permissions.at(id), + *result.add_action_group_states()->add_action_states()); + } + } + + Reply(std::move(response)); + } + +}; // TGetMaintenanceTask + +class TListMaintenanceTasks: public TAdapterActor< + TListMaintenanceTasks, + TEvCms::TEvListMaintenanceTasksRequest, + TEvCms::TEvListMaintenanceTasksResponse> +{ +public: + using TBase::TBase; + + void Bootstrap() { + const auto& user = Request->Get()->Record.GetRequest().user(); + + auto response = MakeHolder<TEvCms::TEvListMaintenanceTasksResponse>(); + response->Record.SetStatus(Ydb::StatusIds::SUCCESS); + + auto cmsState = GetCmsState(); + for (const auto& [taskUid, task] : cmsState->MaintenanceTasks) { + if (!user || user == task.Owner) { + response->Record.MutableResult()->add_tasks_uids(taskUid); + } + } + + Reply(std::move(response)); + } + +}; // TListMaintenanceTasks + +class TDropMaintenanceTask: public TAdapterActor< + TDropMaintenanceTask, + TEvCms::TEvDropMaintenanceTaskRequest, + TEvCms::TEvManageMaintenanceTaskResponse> +{ +public: + using TBase::TBase; + + void Bootstrap() { + auto cmsState = GetCmsState(); + + auto it = cmsState->MaintenanceTasks.find(Request->Get()->Record.GetRequest().task_uid()); + if (it == cmsState->MaintenanceTasks.end()) { + return Reply(Ydb::StatusIds::BAD_REQUEST, "Task not found"); + } + + const auto& task = it->second; + if (cmsState->ScheduledRequests.contains(task.RequestId)) { + auto cmsRequest = MakeHolder<TEvCms::TEvManageRequestRequest>(); + cmsRequest->Record.SetUser(task.Owner); + cmsRequest->Record.SetRequestId(task.RequestId); + cmsRequest->Record.SetCommand(NKikimrCms::TManageRequestRequest::REJECT); + + Send(CmsActorId, std::move(cmsRequest)); + } else { + auto cmsRequest = MakeHolder<TEvCms::TEvManagePermissionRequest>(); + cmsRequest->Record.SetUser(task.Owner); + cmsRequest->Record.SetCommand(NKikimrCms::TManagePermissionRequest::REJECT); + + for (const auto& id : task.Permissions) { + cmsRequest->Record.AddPermissions(id); + } + + Send(CmsActorId, std::move(cmsRequest)); + } + + Become(&TThis::StateWork); + } + + STFUNC(StateWork) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvCms::TEvManageRequestResponse, Handle<TEvCms::TEvManageRequestResponse>); + hFunc(TEvCms::TEvManagePermissionResponse, Handle<TEvCms::TEvManagePermissionResponse>); + } + } + + template <typename TEvResponse> + void Handle(typename TEvResponse::TPtr& ev) { + const auto& record = ev->Get()->Record; + + switch (record.GetStatus().GetCode()) { + case NKikimrCms::TStatus::OK: + return Reply(Ydb::StatusIds::SUCCESS, record.GetStatus().GetReason()); + case NKikimrCms::TStatus::WRONG_REQUEST: + return Reply(Ydb::StatusIds::BAD_REQUEST, record.GetStatus().GetReason()); + default: + return Reply(Ydb::StatusIds::INTERNAL_ERROR, record.GetStatus().GetReason()); + } + } + +}; // TDropMaintenanceTask + +class TCompleteAction: public TAdapterActor< + TCompleteAction, + TEvCms::TEvCompleteActionRequest, + TEvCms::TEvManageActionResponse> +{ +public: + using TBase::TBase; + + void Bootstrap() { + auto cmsRequest = MakeHolder<TEvCms::TEvManagePermissionRequest>(); + cmsRequest->Record.SetCommand(NKikimrCms::TManagePermissionRequest::DONE); + + auto cmsState = GetCmsState(); + for (const auto& actionUid : Request->Get()->Record.GetRequest().action_uids()) { + auto it = cmsState->MaintenanceTasks.find(actionUid.task_uid()); + if (it == cmsState->MaintenanceTasks.end()) { + return Reply(Ydb::StatusIds::BAD_REQUEST, "Task not found"); + } + + const auto& task = it->second; + if (!cmsState->ScheduledRequests.contains(task.RequestId)) { + if (!task.Permissions.contains(actionUid.action_id())) { + return Reply(Ydb::StatusIds::BAD_REQUEST, "Action not found"); + } + } + + if (cmsRequest->Record.HasUser() && cmsRequest->Record.GetUser() != task.Owner) { + return Reply(Ydb::StatusIds::UNSUPPORTED, "Cannot complete actions of multiple owners at once"); + } else { + cmsRequest->Record.SetUser(task.Owner); + } + + cmsRequest->Record.AddPermissions(actionUid.action_id()); + } + + Send(CmsActorId, std::move(cmsRequest)); + Become(&TThis::StateWork); + } + + STFUNC(StateWork) { + switch (ev->GetTypeRewrite()) { + hFunc(TEvCms::TEvManagePermissionResponse, Handle); + } + } + + void Handle(TEvCms::TEvManagePermissionResponse::TPtr& ev) { + const auto& record = ev->Get()->Record; + + switch (record.GetStatus().GetCode()) { + case NKikimrCms::TStatus::OK: + break; + case NKikimrCms::TStatus::WRONG_REQUEST: + return Reply(Ydb::StatusIds::BAD_REQUEST, record.GetStatus().GetReason()); + default: + return Reply(Ydb::StatusIds::INTERNAL_ERROR, record.GetStatus().GetReason()); + } + + auto response = MakeHolder<TEvCms::TEvManageActionResponse>(); + response->Record.SetStatus(Ydb::StatusIds::SUCCESS); + + for (const auto& actionUid : Request->Get()->Record.GetRequest().action_uids()) { + auto& actionStatus = *response->Record.MutableResult()->add_action_statuses(); + *actionStatus.mutable_action_uid() = actionUid; + actionStatus.set_status(Ydb::StatusIds::SUCCESS); + } + + Reply(std::move(response)); + } + +}; // TCompleteAction + +void TCms::Handle(TEvCms::TEvListClusterNodesRequest::TPtr& ev, const TActorContext& ctx) { + ctx.RegisterWithSameMailbox(new TListClusterNodes(ev, SelfId())); +} + +void TCms::Handle(TEvCms::TEvCreateMaintenanceTaskRequest::TPtr& ev, const TActorContext& ctx) { + ctx.RegisterWithSameMailbox(new TCreateMaintenanceTask(ev, SelfId(), State)); +} + +void TCms::Handle(TEvCms::TEvRefreshMaintenanceTaskRequest::TPtr& ev, const TActorContext& ctx) { + ctx.RegisterWithSameMailbox(new TRefreshMaintenanceTask(ev, SelfId(), State)); +} + +void TCms::Handle(TEvCms::TEvGetMaintenanceTaskRequest::TPtr& ev, const TActorContext& ctx) { + ctx.RegisterWithSameMailbox(new TGetMaintenanceTask(ev, SelfId(), State)); +} + +void TCms::Handle(TEvCms::TEvListMaintenanceTasksRequest::TPtr& ev, const TActorContext& ctx) { + ctx.RegisterWithSameMailbox(new TListMaintenanceTasks(ev, SelfId(), State)); +} + +void TCms::Handle(TEvCms::TEvDropMaintenanceTaskRequest::TPtr& ev, const TActorContext& ctx) { + ctx.RegisterWithSameMailbox(new TDropMaintenanceTask(ev, SelfId(), State)); +} + +void TCms::Handle(TEvCms::TEvCompleteActionRequest::TPtr& ev, const TActorContext& ctx) { + ctx.RegisterWithSameMailbox(new TCompleteAction(ev, SelfId(), State)); +} + +} diff --git a/ydb/core/cms/cms.cpp b/ydb/core/cms/cms.cpp index 44ee2d5f340..e71a37aa33e 100644 --- a/ydb/core/cms/cms.cpp +++ b/ydb/core/cms/cms.cpp @@ -772,6 +772,7 @@ void TCms::AcceptPermissions(TPermissionResponse &resp, const TString &requestId } acceptTaskPermission(State->WalleTasks, State->WalleRequests, requestId, permission.GetId()); + acceptTaskPermission(State->MaintenanceTasks, State->MaintenanceRequests, requestId, permission.GetId()); } } @@ -930,6 +931,8 @@ void TCms::RemoveEmptyTasks(const TActorContext &ctx) { for (auto &id : FindEmptyTasks(State->WalleTasks, ctx)) Execute(CreateTxRemoveWalleTask(id), ctx); + for (auto &id : FindEmptyTasks(State->MaintenanceTasks, ctx)) + Execute(CreateTxRemoveMaintenanceTask(id), ctx); } void TCms::Cleanup(const TActorContext &ctx) @@ -1237,6 +1240,13 @@ void TCms::CheckAndEnqueueRequest(TEvCms::TEvPermissionRequest::TPtr &ev, const ev, TStatus::WRONG_REQUEST, "Missing user in request", ctx); } + if (rec.HasMaintenanceTaskId()) { + if (State->MaintenanceTasks.contains(rec.GetMaintenanceTaskId())) { + return ReplyWithError<TEvCms::TEvPermissionResponse>( + ev, TStatus::WRONG_REQUEST, "Maintenance task already exists", ctx); + } + } + EnqueueRequest(ev.Release(), ctx); } @@ -1581,7 +1591,7 @@ void TCms::Handle(TEvCms::TEvPermissionRequest::TPtr &ev, copy = new TRequestInfo(scheduled); State->ScheduledRequests.emplace(reqId, std::move(scheduled)); - } else if (user == WALLE_CMS_USER) { + } else if (user == WALLE_CMS_USER || rec.HasMaintenanceTaskId()) { scheduled.Owner = user; scheduled.RequestId = reqId; @@ -1591,8 +1601,13 @@ void TCms::Handle(TEvCms::TEvPermissionRequest::TPtr &ev, if (ok) AcceptPermissions(resp->Record, reqId, user, ctx); + TMaybe<TString> maintenanceTaskId; + if (rec.HasMaintenanceTaskId()) { + maintenanceTaskId.ConstructInPlace(rec.GetMaintenanceTaskId()); + } + auto handle = new IEventHandle(ev->Sender, SelfId(), resp.Release(), 0, ev->Cookie); - Execute(CreateTxStorePermissions(std::move(ev->Release()), handle, user, std::move(copy)), ctx); + Execute(CreateTxStorePermissions(std::move(ev->Release()), handle, user, std::move(copy), maintenanceTaskId), ctx); } TabletCounters->Percentile()[COUNTER_LATENCY_PERMISSION_REQUEST].IncrementFor((TInstant::Now() - requestStartTime).MilliSeconds()); diff --git a/ydb/core/cms/cms.h b/ydb/core/cms/cms.h index 6b64b952cf0..b947dacd04e 100644 --- a/ydb/core/cms/cms.h +++ b/ydb/core/cms/cms.h @@ -5,6 +5,7 @@ #include "cms_state.h" #include <ydb/core/protos/cms.pb.h> +#include <ydb/core/protos/maintenance.pb.h> #include <library/cpp/actors/interconnect/events_local.h> #include <library/cpp/actors/core/actor.h> @@ -57,6 +58,20 @@ struct TEvCms { EvGetSentinelStateRequest, EvGetSentinelStateResponse, + EvListClusterNodesRequest, + EvListClusterNodesResponse, + EvCreateMaintenanceTaskRequest, + EvRefreshMaintenanceTaskRequest, + EvMaintenanceTaskResponse, + EvGetMaintenanceTaskRequest, + EvGetMaintenanceTaskResponse, + EvListMaintenanceTasksRequest, + EvListMaintenanceTasksResponse, + EvDropMaintenanceTaskRequest, + EvManageMaintenanceTaskResponse, + EvCompleteActionRequest, + EvManageActionResponse, + EvWalleCreateTaskRequest = EvClusterStateRequest + 512, EvWalleCreateTaskResponse, EvWalleListTasksRequest, @@ -251,7 +266,7 @@ struct TEvCms { return Sprintf("%s { TaskId: %s }", ToStringHeader().data(), TaskId.data()); } }; - + struct TEvGetClusterInfoRequest : public TEventLocal<TEvGetClusterInfoRequest, EvGetClusterInfoRequest> { TString ToString() const override { return "Get Cluster Info Request"; @@ -259,7 +274,7 @@ struct TEvCms { }; struct TEvGetClusterInfoResponse : public TEventLocal<TEvGetClusterInfoResponse, EvGetClusterInfoResponse> { - TClusterInfoPtr Info; + TClusterInfoPtr Info; TString ToString() const override { return "Get Cluster Info Response"; @@ -325,6 +340,71 @@ struct TEvCms { NKikimrCms::TGetSentinelStateResponse, EvGetSentinelStateResponse> { }; + + struct TEvListClusterNodesRequest : public TEventPB<TEvListClusterNodesRequest, + NKikimrMaintenance::TListClusterNodesRequest, + EvListClusterNodesRequest> { + }; + + struct TEvListClusterNodesResponse : public TEventPB<TEvListClusterNodesResponse, + NKikimrMaintenance::TListClusterNodesResponse, + EvListClusterNodesResponse> { + }; + + struct TEvCreateMaintenanceTaskRequest : public TEventPB<TEvCreateMaintenanceTaskRequest, + NKikimrMaintenance::TCreateMaintenanceTaskRequest, + EvCreateMaintenanceTaskRequest> { + }; + + struct TEvRefreshMaintenanceTaskRequest : public TEventPB<TEvRefreshMaintenanceTaskRequest, + NKikimrMaintenance::TRefreshMaintenanceTaskRequest, + EvRefreshMaintenanceTaskRequest> { + }; + + struct TEvMaintenanceTaskResponse : public TEventPB<TEvMaintenanceTaskResponse, + NKikimrMaintenance::TMaintenanceTaskResponse, + EvMaintenanceTaskResponse> { + }; + + struct TEvGetMaintenanceTaskRequest : public TEventPB<TEvGetMaintenanceTaskRequest, + NKikimrMaintenance::TGetMaintenanceTaskRequest, + EvGetMaintenanceTaskRequest> { + }; + + struct TEvGetMaintenanceTaskResponse : public TEventPB<TEvGetMaintenanceTaskResponse, + NKikimrMaintenance::TGetMaintenanceTaskResponse, + EvGetMaintenanceTaskResponse> { + }; + + struct TEvListMaintenanceTasksRequest : public TEventPB<TEvListMaintenanceTasksRequest, + NKikimrMaintenance::TListMaintenanceTasksRequest, + EvListMaintenanceTasksRequest> { + }; + + struct TEvListMaintenanceTasksResponse : public TEventPB<TEvListMaintenanceTasksResponse, + NKikimrMaintenance::TListMaintenanceTasksResponse, + EvListMaintenanceTasksResponse> { + }; + + struct TEvDropMaintenanceTaskRequest : public TEventPB<TEvDropMaintenanceTaskRequest, + NKikimrMaintenance::TDropMaintenanceTaskRequest, + EvDropMaintenanceTaskRequest> { + }; + + struct TEvManageMaintenanceTaskResponse : public TEventPB<TEvManageMaintenanceTaskResponse, + NKikimrMaintenance::TManageMaintenanceTaskResponse, + EvManageMaintenanceTaskResponse> { + }; + + struct TEvCompleteActionRequest : public TEventPB<TEvCompleteActionRequest, + NKikimrMaintenance::TCompleteActionRequest, + EvCompleteActionRequest> { + }; + + struct TEvManageActionResponse : public TEventPB<TEvManageActionResponse, + NKikimrMaintenance::TManageActionResponse, + EvManageActionResponse> { + }; }; IActor *CreateCms(const TActorId &tablet, TTabletStorageInfo *info); diff --git a/ydb/core/cms/cms_impl.h b/ydb/core/cms/cms_impl.h index a448846a9bb..f8daaada1ca 100644 --- a/ydb/core/cms/cms_impl.h +++ b/ydb/core/cms/cms_impl.h @@ -135,8 +135,10 @@ private: ITransaction *CreateTxRemoveRequest(const TString &id, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp); ITransaction *CreateTxRemovePermissions(TVector<TString> ids, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp, bool expired = false); ITransaction *CreateTxRemoveWalleTask(const TString &id); + ITransaction *CreateTxRemoveMaintenanceTask(const TString &id); ITransaction *CreateTxStorePermissions(THolder<IEventBase> req, TAutoPtr<IEventHandle> resp, - const TString &owner, TAutoPtr<TRequestInfo> scheduled); + const TString &owner, TAutoPtr<TRequestInfo> scheduled, + const TMaybe<TString> &maintenanceTaskId = {}); ITransaction *CreateTxStoreWalleTask(const TTaskInfo &task, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp); ITransaction *CreateTxUpdateConfig(TEvCms::TEvSetConfigRequest::TPtr &ev); ITransaction *CreateTxUpdateConfig(TEvConsole::TEvConfigNotificationRequest::TPtr &ev); @@ -238,6 +240,15 @@ private: HFunc(TEvCms::TEvWalleRemoveTaskRequest, Handle); HFunc(TEvCms::TEvStoreWalleTask, Handle); HFunc(TEvCms::TEvRemoveWalleTask, Handle); + // public api begin + HFunc(TEvCms::TEvListClusterNodesRequest, Handle); + HFunc(TEvCms::TEvCreateMaintenanceTaskRequest, Handle); + HFunc(TEvCms::TEvRefreshMaintenanceTaskRequest, Handle); + HFunc(TEvCms::TEvGetMaintenanceTaskRequest, Handle); + HFunc(TEvCms::TEvListMaintenanceTasksRequest, Handle); + HFunc(TEvCms::TEvDropMaintenanceTaskRequest, Handle); + HFunc(TEvCms::TEvCompleteActionRequest, Handle); + // public api end HFunc(TEvCms::TEvGetConfigRequest, Handle); HFunc(TEvCms::TEvSetConfigRequest, Handle); HFunc(TEvCms::TEvResetMarkerRequest, Handle); @@ -393,6 +404,15 @@ private: void Handle(TEvCms::TEvWalleRemoveTaskRequest::TPtr &ev, const TActorContext &ctx); void Handle(TEvCms::TEvStoreWalleTask::TPtr &ev, const TActorContext &ctx); void Handle(TEvCms::TEvRemoveWalleTask::TPtr &ev, const TActorContext &ctx); + // public api begin + void Handle(TEvCms::TEvListClusterNodesRequest::TPtr &ev, const TActorContext &ctx); + void Handle(TEvCms::TEvCreateMaintenanceTaskRequest::TPtr &ev, const TActorContext &ctx); + void Handle(TEvCms::TEvRefreshMaintenanceTaskRequest::TPtr &ev, const TActorContext &ctx); + void Handle(TEvCms::TEvGetMaintenanceTaskRequest::TPtr &ev, const TActorContext &ctx); + void Handle(TEvCms::TEvListMaintenanceTasksRequest::TPtr &ev, const TActorContext &ctx); + void Handle(TEvCms::TEvDropMaintenanceTaskRequest::TPtr &ev, const TActorContext &ctx); + void Handle(TEvCms::TEvCompleteActionRequest::TPtr &ev, const TActorContext &ctx); + // public api end void Handle(TEvCms::TEvGetConfigRequest::TPtr &ev, const TActorContext &ctx); void Handle(TEvCms::TEvSetConfigRequest::TPtr &ev, const TActorContext &ctx); void Handle(TEvCms::TEvResetMarkerRequest::TPtr &ev, const TActorContext &ctx); diff --git a/ydb/core/cms/cms_state.h b/ydb/core/cms/cms_state.h index 92e4eeea66d..d2581b5936a 100644 --- a/ydb/core/cms/cms_state.h +++ b/ydb/core/cms/cms_state.h @@ -13,12 +13,14 @@ namespace NKikimr::NCms { struct TTaskInfo { TString TaskId; TString RequestId; + TString Owner; TSet<TString> Permissions; TString ToString() const { return TStringBuilder() << "{" << " TaskId: " << TaskId << " RequestId: " << RequestId + << " Owner: " << Owner << " Permissions: [" << JoinSeq(", ", Permissions) << "]" << " }"; } @@ -39,6 +41,9 @@ struct TCmsState : public TAtomicRefCount<TCmsState> { THashMap<TString, TTaskInfo> WalleTasks; THashMap<TString, TString> WalleRequests; + THashMap<TString, TTaskInfo> MaintenanceTasks; + THashMap<TString, TString> MaintenanceRequests; + // CMS config. TCmsConfig Config; // CMS config proto cache diff --git a/ydb/core/cms/cms_tx_load_state.cpp b/ydb/core/cms/cms_tx_load_state.cpp index 17894caa106..663d4746029 100644 --- a/ydb/core/cms/cms_tx_load_state.cpp +++ b/ydb/core/cms/cms_tx_load_state.cpp @@ -31,13 +31,18 @@ public: auto permissionRowset = db.Table<Schema::Permission>().Range().Select<Schema::Permission::TColumns>(); auto requestRowset = db.Table<Schema::Request>().Range().Select<Schema::Request::TColumns>(); auto walleTaskRowset = db.Table<Schema::WalleTask>().Range().Select<Schema::WalleTask::TColumns>(); + auto maintenanceTasksRowset = db.Table<Schema::MaintenanceTasks>().Range().Select<Schema::MaintenanceTasks::TColumns>(); auto notificationRowset = db.Table<Schema::Notification>().Range().Select<Schema::Notification::TColumns>(); auto nodeTenantRowset = db.Table<Schema::NodeTenant>().Range().Select<Schema::NodeTenant::TColumns>(); auto logRowset = db.Table<Schema::LogRecords>().Range().Select<Schema::LogRecords::Timestamp>(); - if (!paramRow.IsReady() || !permissionRowset.IsReady() - || !requestRowset.IsReady() || !walleTaskRowset.IsReady() - || !notificationRowset.IsReady() || !logRowset.IsReady()) + if (!paramRow.IsReady() + || !permissionRowset.IsReady() + || !requestRowset.IsReady() + || !walleTaskRowset.IsReady() + || !maintenanceTasksRowset.IsReady() + || !notificationRowset.IsReady() + || !logRowset.IsReady()) return false; NKikimrCms::TCmsConfig config; @@ -66,6 +71,8 @@ public: state->WalleTasks.clear(); state->WalleRequests.clear(); + state->MaintenanceTasks.clear(); + state->MaintenanceRequests.clear(); state->Permissions.clear(); state->ScheduledRequests.clear(); state->Notifications.clear(); @@ -108,6 +115,25 @@ public: return false; } + while (!maintenanceTasksRowset.EndOfSet()) { + TString taskId = maintenanceTasksRowset.GetValue<Schema::MaintenanceTasks::TaskID>(); + TString requestId = maintenanceTasksRowset.GetValue<Schema::MaintenanceTasks::RequestID>(); + TString owner = maintenanceTasksRowset.GetValue<Schema::MaintenanceTasks::Owner>(); + + state->MaintenanceRequests.emplace(requestId, taskId); + state->MaintenanceTasks.emplace(taskId, TTaskInfo{ + .TaskId = taskId, + .RequestId = requestId, + .Owner = owner, + }); + + LOG_DEBUG(ctx, NKikimrServices::CMS, "Loaded maintenance task %s mapped to request %s", + taskId.data(), requestId.data()); + + if (!maintenanceTasksRowset.Next()) + return false; + } + while (!permissionRowset.EndOfSet()) { TString id = permissionRowset.GetValue<Schema::Permission::ID>(); TString requestId = permissionRowset.GetValue<Schema::Permission::RequestID>(); @@ -135,6 +161,14 @@ public: id.data(), taskId.data()); } + if (state->MaintenanceRequests.contains(requestId)) { + const auto &taskId = state->MaintenanceRequests[requestId]; + state->MaintenanceTasks[taskId].Permissions.insert(id); + + LOG_DEBUG(ctx, NKikimrServices::CMS, "Added permission %s to maintenance task %s", + id.data(), taskId.data()); + } + if (!permissionRowset.Next()) return false; } diff --git a/ydb/core/cms/cms_tx_remove_permissions.cpp b/ydb/core/cms/cms_tx_remove_permissions.cpp index ca617ab7d5f..27a7ffc5736 100644 --- a/ydb/core/cms/cms_tx_remove_permissions.cpp +++ b/ydb/core/cms/cms_tx_remove_permissions.cpp @@ -44,6 +44,11 @@ public: Self->State->WalleTasks.find(taskId)->second.Permissions.erase(id); } + if (Self->State->MaintenanceRequests.contains(requestId)) { + auto taskId = Self->State->MaintenanceRequests.find(requestId)->second; + Self->State->MaintenanceTasks.find(taskId)->second.Permissions.erase(id); + } + Self->AuditLog(ctx, TStringBuilder() << "Remove permission" << ": id# " << id << ", reason# " << (Request ? "explicit remove" : "scheduled cleanup")); diff --git a/ydb/core/cms/cms_tx_remove_task.cpp b/ydb/core/cms/cms_tx_remove_task.cpp index 3f2fc0549b7..96bdb302e4d 100644 --- a/ydb/core/cms/cms_tx_remove_task.cpp +++ b/ydb/core/cms/cms_tx_remove_task.cpp @@ -52,4 +52,8 @@ ITransaction *TCms::CreateTxRemoveWalleTask(const TString &id) { return new TTxRemoveTask<Schema::WalleTask>(this, id, State->WalleTasks, State->WalleRequests); } +ITransaction *TCms::CreateTxRemoveMaintenanceTask(const TString &id) { + return new TTxRemoveTask<Schema::MaintenanceTasks>(this, id, State->MaintenanceTasks, State->MaintenanceRequests); +} + } // namespace NKikimr::NCms diff --git a/ydb/core/cms/cms_tx_store_permissions.cpp b/ydb/core/cms/cms_tx_store_permissions.cpp index db8c2b49b47..1af75c5be6b 100644 --- a/ydb/core/cms/cms_tx_store_permissions.cpp +++ b/ydb/core/cms/cms_tx_store_permissions.cpp @@ -8,12 +8,13 @@ namespace NKikimr::NCms { class TCms::TTxStorePermissions : public TTransactionBase<TCms> { public: TTxStorePermissions(TCms *self, THolder<IEventBase> req, TAutoPtr<IEventHandle> resp, - const TString &owner, TAutoPtr<TRequestInfo> scheduled) + const TString &owner, TAutoPtr<TRequestInfo> scheduled, const TMaybe<TString> &maintenanceTaskId) : TBase(self) , Request(std::move(req)) , Response(std::move(resp)) , Owner(owner) , Scheduled(scheduled) + , MaintenanceTaskId(maintenanceTaskId) , NextPermissionId(self->State->NextPermissionId) , NextRequestId(self->State->NextRequestId) { @@ -29,6 +30,23 @@ public: NIceDb::TUpdate<Schema::Param::NextRequestID>(NextRequestId)); const auto &rec = Response->Get<TEvCms::TEvPermissionResponse>()->Record; + + if (MaintenanceTaskId) { + Y_VERIFY(Scheduled); + + Self->State->MaintenanceRequests.emplace(Scheduled->RequestId, *MaintenanceTaskId); + Self->State->MaintenanceTasks.emplace(*MaintenanceTaskId, TTaskInfo{ + .TaskId = *MaintenanceTaskId, + .RequestId = Scheduled->RequestId, + .Owner = Scheduled->Owner, + }); + + db.Table<Schema::MaintenanceTasks>().Key(*MaintenanceTaskId).Update( + NIceDb::TUpdate<Schema::MaintenanceTasks::RequestID>(Scheduled->RequestId), + NIceDb::TUpdate<Schema::MaintenanceTasks::Owner>(Scheduled->Owner) + ); + } + for (const auto &permission : rec.GetPermissions()) { const auto &id = permission.GetId(); const auto &requestId = Scheduled ? Scheduled->RequestId : ""; @@ -42,6 +60,11 @@ public: NIceDb::TUpdate<Schema::Permission::Deadline>(deadline), NIceDb::TUpdate<Schema::Permission::RequestID>(requestId)); + if (MaintenanceTaskId) { + Y_VERIFY(Self->State->MaintenanceTasks.contains(*MaintenanceTaskId)); + Self->State->MaintenanceTasks.at(*MaintenanceTaskId).Permissions.insert(id); + } + Self->AuditLog(ctx, TStringBuilder() << "Store permission" << ": id# " << id << ", validity# " << TInstant::MicroSeconds(deadline) @@ -92,14 +115,15 @@ private: TAutoPtr<IEventHandle> Response; TString Owner; TAutoPtr<TRequestInfo> Scheduled; + const TMaybe<TString> MaintenanceTaskId; ui64 NextPermissionId; ui64 NextRequestId; }; ITransaction *TCms::CreateTxStorePermissions(THolder<IEventBase> req, TAutoPtr<IEventHandle> resp, - const TString &owner, TAutoPtr<TRequestInfo> scheduled) + const TString &owner, TAutoPtr<TRequestInfo> scheduled, const TMaybe<TString> &maintenanceTaskId) { - return new TTxStorePermissions(this, std::move(req), std::move(resp), owner, std::move(scheduled)); + return new TTxStorePermissions(this, std::move(req), std::move(resp), owner, std::move(scheduled), maintenanceTaskId); } } // namespace NKikimr::NCms diff --git a/ydb/core/cms/scheme.h b/ydb/core/cms/scheme.h index f581ab03790..5bbd7dcc564 100644 --- a/ydb/core/cms/scheme.h +++ b/ydb/core/cms/scheme.h @@ -42,7 +42,6 @@ struct Schema : NIceDb::Schema { using TColumns = TableColumns<ID, Owner, Order, Content>; }; - struct WalleTask : Table<4> { struct TaskID : Column<1, NScheme::NTypeIds::Utf8> {}; struct RequestID : Column<2, NScheme::NTypeIds::Utf8> {}; @@ -130,8 +129,18 @@ struct Schema : NIceDb::Schema { using TColumns = TableColumns<NodeId, DiskId, Downtime>; }; + struct MaintenanceTasks : Table<14> { + struct TaskID : Column<1, NScheme::NTypeIds::Utf8> {}; + struct RequestID : Column<2, NScheme::NTypeIds::Utf8> {}; + struct Owner : Column<3, NScheme::NTypeIds::Utf8> {}; + + using TKey = TableKey<TaskID>; + using TColumns = TableColumns<TaskID, RequestID, Owner>; + }; + using TTables = SchemaTables<Param, Permission, Request, WalleTask, Notification, NodeTenant, - HostMarkers, NodeMarkers, PDiskMarkers, VDiskMarkers, LogRecords, NodeDowntimes, PDiskDowntimes>; + HostMarkers, NodeMarkers, PDiskMarkers, VDiskMarkers, LogRecords, NodeDowntimes, PDiskDowntimes, + MaintenanceTasks>; using TSettings = SchemaSettings<ExecutorLogBatching<true>, ExecutorLogFlushPeriod<TDuration::MicroSeconds(512).GetValue()>>; }; diff --git a/ydb/core/cms/ya.make b/ydb/core/cms/ya.make index 585a77a5272..e374be95321 100644 --- a/ydb/core/cms/ya.make +++ b/ydb/core/cms/ya.make @@ -1,6 +1,7 @@ LIBRARY() SRCS( + api_adapters.cpp audit_log.cpp base_handler.h cluster_info.cpp diff --git a/ydb/core/driver_lib/run/CMakeLists.darwin-x86_64.txt b/ydb/core/driver_lib/run/CMakeLists.darwin-x86_64.txt index e7bbabfbedf..1791870b26e 100644 --- a/ydb/core/driver_lib/run/CMakeLists.darwin-x86_64.txt +++ b/ydb/core/driver_lib/run/CMakeLists.darwin-x86_64.txt @@ -130,6 +130,7 @@ target_link_libraries(run PUBLIC ydb-services-fq ydb-services-kesus ydb-services-local_discovery + ydb-services-maintenance services-metadata-ds_table ydb-services-metadata services-bg_tasks-ds_table diff --git a/ydb/core/driver_lib/run/CMakeLists.linux-aarch64.txt b/ydb/core/driver_lib/run/CMakeLists.linux-aarch64.txt index b1c4e4f1a30..7d71a613048 100644 --- a/ydb/core/driver_lib/run/CMakeLists.linux-aarch64.txt +++ b/ydb/core/driver_lib/run/CMakeLists.linux-aarch64.txt @@ -131,6 +131,7 @@ target_link_libraries(run PUBLIC ydb-services-fq ydb-services-kesus ydb-services-local_discovery + ydb-services-maintenance services-metadata-ds_table ydb-services-metadata services-bg_tasks-ds_table diff --git a/ydb/core/driver_lib/run/CMakeLists.linux-x86_64.txt b/ydb/core/driver_lib/run/CMakeLists.linux-x86_64.txt index b1c4e4f1a30..7d71a613048 100644 --- a/ydb/core/driver_lib/run/CMakeLists.linux-x86_64.txt +++ b/ydb/core/driver_lib/run/CMakeLists.linux-x86_64.txt @@ -131,6 +131,7 @@ target_link_libraries(run PUBLIC ydb-services-fq ydb-services-kesus ydb-services-local_discovery + ydb-services-maintenance services-metadata-ds_table ydb-services-metadata services-bg_tasks-ds_table diff --git a/ydb/core/driver_lib/run/CMakeLists.windows-x86_64.txt b/ydb/core/driver_lib/run/CMakeLists.windows-x86_64.txt index e7bbabfbedf..1791870b26e 100644 --- a/ydb/core/driver_lib/run/CMakeLists.windows-x86_64.txt +++ b/ydb/core/driver_lib/run/CMakeLists.windows-x86_64.txt @@ -130,6 +130,7 @@ target_link_libraries(run PUBLIC ydb-services-fq ydb-services-kesus ydb-services-local_discovery + ydb-services-maintenance services-metadata-ds_table ydb-services-metadata services-bg_tasks-ds_table diff --git a/ydb/core/driver_lib/run/run.cpp b/ydb/core/driver_lib/run/run.cpp index a196f6c7fb7..d8c5c4a25b0 100644 --- a/ydb/core/driver_lib/run/run.cpp +++ b/ydb/core/driver_lib/run/run.cpp @@ -90,6 +90,7 @@ #include <ydb/services/fq/private_grpc.h> #include <ydb/services/kesus/grpc_service.h> #include <ydb/services/local_discovery/grpc_service.h> +#include <ydb/services/maintenance/grpc_service.h> #include <ydb/services/monitoring/grpc_service.h> #include <ydb/services/persqueue_cluster_discovery/grpc_service.h> #include <ydb/services/persqueue_v1/persqueue.h> @@ -566,6 +567,8 @@ void TKikimrRunner::InitializeGRpc(const TKikimrRunConfig& runConfig) { names["scripting"] = &hasScripting; TServiceCfg hasCms = services.empty(); names["cms"] = &hasCms; + TServiceCfg hasMaintenance = services.empty(); + names["maintenance"] = &hasMaintenance; TServiceCfg hasKesus = services.empty(); names["locking"] = names["kesus"] = &hasKesus; TServiceCfg hasMonitoring = services.empty(); @@ -815,6 +818,11 @@ void TKikimrRunner::InitializeGRpc(const TKikimrRunConfig& runConfig) { grpcRequestProxies[0], hasCms.IsRlAllowed())); } + if (hasMaintenance) { + server.AddService(new NGRpcService::TGRpcMaintenanceService(ActorSystem.Get(), Counters, + grpcRequestProxies[0], hasMaintenance.IsRlAllowed())); + } + if (hasDiscovery) { auto discoveryService = new NGRpcService::TGRpcDiscoveryService(ActorSystem.Get(), Counters,grpcRequestProxies[0], hasDiscovery.IsRlAllowed()); if (!opts.SslData.Empty()) { diff --git a/ydb/core/driver_lib/run/ya.make b/ydb/core/driver_lib/run/ya.make index e29db37d9a0..ce57ccfd593 100644 --- a/ydb/core/driver_lib/run/ya.make +++ b/ydb/core/driver_lib/run/ya.make @@ -145,6 +145,7 @@ PEERDIR( ydb/services/fq ydb/services/kesus ydb/services/local_discovery + ydb/services/maintenance ydb/services/metadata/ds_table ydb/services/metadata ydb/services/bg_tasks/ds_table diff --git a/ydb/core/grpc_services/CMakeLists.darwin-x86_64.txt b/ydb/core/grpc_services/CMakeLists.darwin-x86_64.txt index c25d9faed5b..4c2ee86dca7 100644 --- a/ydb/core/grpc_services/CMakeLists.darwin-x86_64.txt +++ b/ydb/core/grpc_services/CMakeLists.darwin-x86_64.txt @@ -117,6 +117,7 @@ target_sources(ydb-core-grpc_services PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_log_store.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_long_tx.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_node_registration.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_maintenance.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_make_directory.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_modify_permissions.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_monitoring.cpp diff --git a/ydb/core/grpc_services/CMakeLists.linux-aarch64.txt b/ydb/core/grpc_services/CMakeLists.linux-aarch64.txt index 8d1841680a4..5f2bd916cff 100644 --- a/ydb/core/grpc_services/CMakeLists.linux-aarch64.txt +++ b/ydb/core/grpc_services/CMakeLists.linux-aarch64.txt @@ -118,6 +118,7 @@ target_sources(ydb-core-grpc_services PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_log_store.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_long_tx.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_node_registration.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_maintenance.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_make_directory.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_modify_permissions.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_monitoring.cpp diff --git a/ydb/core/grpc_services/CMakeLists.linux-x86_64.txt b/ydb/core/grpc_services/CMakeLists.linux-x86_64.txt index 8d1841680a4..5f2bd916cff 100644 --- a/ydb/core/grpc_services/CMakeLists.linux-x86_64.txt +++ b/ydb/core/grpc_services/CMakeLists.linux-x86_64.txt @@ -118,6 +118,7 @@ target_sources(ydb-core-grpc_services PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_log_store.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_long_tx.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_node_registration.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_maintenance.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_make_directory.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_modify_permissions.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_monitoring.cpp diff --git a/ydb/core/grpc_services/CMakeLists.windows-x86_64.txt b/ydb/core/grpc_services/CMakeLists.windows-x86_64.txt index c25d9faed5b..4c2ee86dca7 100644 --- a/ydb/core/grpc_services/CMakeLists.windows-x86_64.txt +++ b/ydb/core/grpc_services/CMakeLists.windows-x86_64.txt @@ -117,6 +117,7 @@ target_sources(ydb-core-grpc_services PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_log_store.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_long_tx.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_node_registration.cpp + ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_maintenance.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_make_directory.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_modify_permissions.cpp ${CMAKE_SOURCE_DIR}/ydb/core/grpc_services/rpc_monitoring.cpp diff --git a/ydb/core/grpc_services/rpc_maintenance.cpp b/ydb/core/grpc_services/rpc_maintenance.cpp new file mode 100644 index 00000000000..4e860eb3b4b --- /dev/null +++ b/ydb/core/grpc_services/rpc_maintenance.cpp @@ -0,0 +1,168 @@ +#include "service_maintenance.h" + +#include <ydb/core/base/tablet_pipe.h> +#include <ydb/core/cms/cms.h> +#include <ydb/core/grpc_services/rpc_request_base.h> +#include <ydb/public/api/protos/draft/ydb_maintenance.pb.h> + +#include <library/cpp/actors/core/actor.h> +#include <library/cpp/actors/core/hfunc.h> + +namespace NKikimr::NGRpcService { + +using namespace Ydb; + +using TEvListClusterNodes = TGrpcRequestOperationCall<Maintenance::ListClusterNodesRequest, Maintenance::ListClusterNodesResponse>; +using TEvCreateMaintenanceTask = TGrpcRequestOperationCall<Maintenance::CreateMaintenanceTaskRequest, Maintenance::MaintenanceTaskResponse>; +using TEvRefreshMaintenanceTask = TGrpcRequestOperationCall<Maintenance::RefreshMaintenanceTaskRequest, Maintenance::MaintenanceTaskResponse>; +using TEvGetMaintenanceTask = TGrpcRequestOperationCall<Maintenance::GetMaintenanceTaskRequest, Maintenance::GetMaintenanceTaskResponse>; +using TEvListMaintenanceTasks = TGrpcRequestOperationCall<Maintenance::ListMaintenanceTasksRequest, Maintenance::ListMaintenanceTasksResponse>; +using TEvDropMaintenanceTask = TGrpcRequestOperationCall<Maintenance::DropMaintenanceTaskRequest, Maintenance::ManageMaintenanceTaskResponse>; +using TEvCompleteAction = TGrpcRequestOperationCall<Maintenance::CompleteActionRequest, Maintenance::ManageActionResponse>; + +template <typename TEvRequest, typename TEvCmsRequest, typename TEvCmsResponse> +class TMaintenanceRPC: public TRpcRequestActor<TMaintenanceRPC<TEvRequest, TEvCmsRequest, TEvCmsResponse>, TEvRequest, true> { + using TThis = TMaintenanceRPC<TEvRequest, TEvCmsRequest, TEvCmsResponse>; + using TBase = TRpcRequestActor<TThis, TEvRequest, true>; + + bool CheckAccess() const { + if (AppData()->AdministrationAllowedSIDs.empty()) { + return true; + } + + if (!this->UserToken) { + return false; + } + + for (const auto& sid : AppData()->AdministrationAllowedSIDs) { + if (this->UserToken->IsExist(sid)) { + return true; + } + } + + return false; + } + + void SendRequest() { + auto ev = MakeHolder<TEvCmsRequest>(); + ev->Record.MutableRequest()->CopyFrom(*this->GetProtoRequest()); + + if constexpr (std::is_same_v<TEvCmsRequest, NCms::TEvCms::TEvCreateMaintenanceTaskRequest>) { + if (this->UserToken) { + ev->Record.SetUserSID(this->UserToken->GetUserSID()); + } + } + + Y_VERIFY(AppData()->DomainsInfo); + Y_VERIFY(AppData()->DomainsInfo->Domains); + const auto group = AppData()->DomainsInfo->Domains.begin()->second->DefaultStateStorageGroup; + + NTabletPipe::TClientConfig pipeConfig; + pipeConfig.RetryPolicy = {.RetryLimitCount = 10}; + CmsPipe = this->RegisterWithSameMailbox(NTabletPipe::CreateClient(this->SelfId(), MakeCmsID(group), pipeConfig)); + + NTabletPipe::SendData(this->SelfId(), CmsPipe, ev.Release()); + } + + void Handle(typename TEvCmsResponse::TPtr& ev) { + const auto& record = ev->Get()->Record; + + Ydb::Operations::Operation operation; + operation.set_ready(true); + operation.set_status(record.GetStatus()); + if (record.IssuesSize()) { + operation.mutable_issues()->CopyFrom(record.GetIssues()); + } + + if constexpr (!std::is_same_v<TEvCmsResponse, NCms::TEvCms::TEvManageMaintenanceTaskResponse>) { + operation.mutable_result()->PackFrom(record.GetResult()); + } + + this->Reply(operation); + } + + void Handle(TEvTabletPipe::TEvClientConnected::TPtr& ev) { + if (ev->Get()->Status != NKikimrProto::OK) { + Unavailable(); + } + } + + void Unavailable() { + this->Reply(Ydb::StatusIds::UNAVAILABLE, "CMS is unavailable"); + } + + void PassAway() override { + NTabletPipe::CloseAndForgetClient(this->SelfId(), CmsPipe); + TBase::PassAway(); + } + +public: + using TBase::TBase; + + void Bootstrap() { + if (!CheckAccess()) { + auto error = TStringBuilder() << "Access denied"; + if (this->UserToken) { + error << ": '" << this->UserToken->GetUserSID() << "' is not an admin"; + } + + this->Reply(Ydb::StatusIds::UNAUTHORIZED, NKikimrIssues::TIssuesIds::ACCESS_DENIED, error); + } else { + SendRequest(); + this->Become(&TThis::StateWork); + } + } + + STRICT_STFUNC(StateWork, + hFunc(TEvCmsResponse, Handle) + hFunc(TEvTabletPipe::TEvClientConnected, Handle) + sFunc(TEvTabletPipe::TEvClientDestroyed, Unavailable) + ) + +private: + TActorId CmsPipe; +}; + +void DoListClusterNodes(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f) { + f.RegisterActor(new TMaintenanceRPC<TEvListClusterNodes, + NCms::TEvCms::TEvListClusterNodesRequest, + NCms::TEvCms::TEvListClusterNodesResponse>(p.release())); +} + +void DoCreateMaintenanceTask(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f) { + f.RegisterActor(new TMaintenanceRPC<TEvCreateMaintenanceTask, + NCms::TEvCms::TEvCreateMaintenanceTaskRequest, + NCms::TEvCms::TEvMaintenanceTaskResponse>(p.release())); +} + +void DoRefreshMaintenanceTask(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f) { + f.RegisterActor(new TMaintenanceRPC<TEvRefreshMaintenanceTask, + NCms::TEvCms::TEvRefreshMaintenanceTaskRequest, + NCms::TEvCms::TEvMaintenanceTaskResponse>(p.release())); +} + +void DoGetMaintenanceTask(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f) { + f.RegisterActor(new TMaintenanceRPC<TEvGetMaintenanceTask, + NCms::TEvCms::TEvGetMaintenanceTaskRequest, + NCms::TEvCms::TEvGetMaintenanceTaskResponse>(p.release())); +} + +void DoListMaintenanceTasks(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f) { + f.RegisterActor(new TMaintenanceRPC<TEvListMaintenanceTasks, + NCms::TEvCms::TEvListMaintenanceTasksRequest, + NCms::TEvCms::TEvListMaintenanceTasksResponse>(p.release())); +} + +void DoDropMaintenanceTask(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f) { + f.RegisterActor(new TMaintenanceRPC<TEvDropMaintenanceTask, + NCms::TEvCms::TEvDropMaintenanceTaskRequest, + NCms::TEvCms::TEvManageMaintenanceTaskResponse>(p.release())); +} + +void DoCompleteAction(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f) { + f.RegisterActor(new TMaintenanceRPC<TEvCompleteAction, + NCms::TEvCms::TEvCompleteActionRequest, + NCms::TEvCms::TEvManageActionResponse>(p.release())); +} + +} diff --git a/ydb/core/grpc_services/service_maintenance.h b/ydb/core/grpc_services/service_maintenance.h new file mode 100644 index 00000000000..3a7009bd370 --- /dev/null +++ b/ydb/core/grpc_services/service_maintenance.h @@ -0,0 +1,18 @@ +#pragma once + +#include <memory> + +namespace NKikimr::NGRpcService { + +class IRequestOpCtx; +class IFacilityProvider; + +void DoListClusterNodes(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f); +void DoCreateMaintenanceTask(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f); +void DoRefreshMaintenanceTask(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f); +void DoGetMaintenanceTask(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f); +void DoListMaintenanceTasks(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f); +void DoDropMaintenanceTask(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f); +void DoCompleteAction(std::unique_ptr<IRequestOpCtx> p, const IFacilityProvider& f); + +} diff --git a/ydb/core/grpc_services/ya.make b/ydb/core/grpc_services/ya.make index d6e169c22a9..454bfdc9ab6 100644 --- a/ydb/core/grpc_services/ya.make +++ b/ydb/core/grpc_services/ya.make @@ -55,6 +55,7 @@ SRCS( rpc_log_store.cpp rpc_long_tx.cpp rpc_node_registration.cpp + rpc_maintenance.cpp rpc_make_directory.cpp rpc_modify_permissions.cpp rpc_monitoring.cpp diff --git a/ydb/core/protos/CMakeLists.darwin-x86_64.txt b/ydb/core/protos/CMakeLists.darwin-x86_64.txt index 2742c4d4e7c..adad3d6aa30 100644 --- a/ydb/core/protos/CMakeLists.darwin-x86_64.txt +++ b/ydb/core/protos/CMakeLists.darwin-x86_64.txt @@ -1502,6 +1502,18 @@ get_built_tool_path( cpp_styleguide ) get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) +get_built_tool_path( TOOL_enum_parser_bin TOOL_enum_parser_dependency tools/enum_parser/enum_parser @@ -1601,6 +1613,7 @@ target_proto_messages(ydb-core-protos PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/protos/load_test.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/local.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/long_tx_service.proto + ${CMAKE_SOURCE_DIR}/ydb/core/protos/maintenance.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/metrics.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/minikql_engine.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/mon.proto diff --git a/ydb/core/protos/CMakeLists.linux-aarch64.txt b/ydb/core/protos/CMakeLists.linux-aarch64.txt index f127d672c5b..fdc2937cd49 100644 --- a/ydb/core/protos/CMakeLists.linux-aarch64.txt +++ b/ydb/core/protos/CMakeLists.linux-aarch64.txt @@ -1502,6 +1502,18 @@ get_built_tool_path( cpp_styleguide ) get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) +get_built_tool_path( TOOL_enum_parser_bin TOOL_enum_parser_dependency tools/enum_parser/enum_parser @@ -1602,6 +1614,7 @@ target_proto_messages(ydb-core-protos PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/protos/load_test.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/local.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/long_tx_service.proto + ${CMAKE_SOURCE_DIR}/ydb/core/protos/maintenance.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/metrics.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/minikql_engine.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/mon.proto diff --git a/ydb/core/protos/CMakeLists.linux-x86_64.txt b/ydb/core/protos/CMakeLists.linux-x86_64.txt index f127d672c5b..fdc2937cd49 100644 --- a/ydb/core/protos/CMakeLists.linux-x86_64.txt +++ b/ydb/core/protos/CMakeLists.linux-x86_64.txt @@ -1502,6 +1502,18 @@ get_built_tool_path( cpp_styleguide ) get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) +get_built_tool_path( TOOL_enum_parser_bin TOOL_enum_parser_dependency tools/enum_parser/enum_parser @@ -1602,6 +1614,7 @@ target_proto_messages(ydb-core-protos PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/protos/load_test.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/local.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/long_tx_service.proto + ${CMAKE_SOURCE_DIR}/ydb/core/protos/maintenance.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/metrics.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/minikql_engine.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/mon.proto diff --git a/ydb/core/protos/CMakeLists.windows-x86_64.txt b/ydb/core/protos/CMakeLists.windows-x86_64.txt index 2742c4d4e7c..adad3d6aa30 100644 --- a/ydb/core/protos/CMakeLists.windows-x86_64.txt +++ b/ydb/core/protos/CMakeLists.windows-x86_64.txt @@ -1502,6 +1502,18 @@ get_built_tool_path( cpp_styleguide ) get_built_tool_path( + TOOL_protoc_bin + TOOL_protoc_dependency + contrib/tools/protoc/bin + protoc +) +get_built_tool_path( + TOOL_cpp_styleguide_bin + TOOL_cpp_styleguide_dependency + contrib/tools/protoc/plugins/cpp_styleguide + cpp_styleguide +) +get_built_tool_path( TOOL_enum_parser_bin TOOL_enum_parser_dependency tools/enum_parser/enum_parser @@ -1601,6 +1613,7 @@ target_proto_messages(ydb-core-protos PRIVATE ${CMAKE_SOURCE_DIR}/ydb/core/protos/load_test.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/local.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/long_tx_service.proto + ${CMAKE_SOURCE_DIR}/ydb/core/protos/maintenance.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/metrics.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/minikql_engine.proto ${CMAKE_SOURCE_DIR}/ydb/core/protos/mon.proto diff --git a/ydb/core/protos/cms.proto b/ydb/core/protos/cms.proto index 7821d58a305..4f0e2c3a2db 100644 --- a/ydb/core/protos/cms.proto +++ b/ydb/core/protos/cms.proto @@ -159,6 +159,7 @@ message TPermissionRequest { optional ETenantPolicy TenantPolicy = 8 [default = DEFAULT]; // Availability mode is not preserved for scheduled events. optional EAvailabilityMode AvailabilityMode = 9 [default = MODE_MAX_AVAILABILITY]; + optional string MaintenanceTaskId = 10; } enum EExtensionType { diff --git a/ydb/core/protos/maintenance.proto b/ydb/core/protos/maintenance.proto new file mode 100644 index 00000000000..b0be9a31983 --- /dev/null +++ b/ydb/core/protos/maintenance.proto @@ -0,0 +1,70 @@ +import "ydb/public/api/protos/draft/ydb_maintenance.proto"; +import "ydb/public/api/protos/ydb_issue_message.proto"; +import "ydb/public/api/protos/ydb_status_codes.proto"; + +package NKikimrMaintenance; +option java_package = "ru.yandex.kikimr.proto"; + +message TListClusterNodesRequest { + optional Ydb.Maintenance.ListClusterNodesRequest Request = 1; +} + +message TListClusterNodesResponse { + optional Ydb.StatusIds.StatusCode Status = 1; + repeated Ydb.Issue.IssueMessage Issues = 2; + optional Ydb.Maintenance.ListClusterNodesResult Result = 3; +} + +message TCreateMaintenanceTaskRequest { + optional Ydb.Maintenance.CreateMaintenanceTaskRequest Request = 1; + optional string UserSID = 2; +} + +message TRefreshMaintenanceTaskRequest { + optional Ydb.Maintenance.RefreshMaintenanceTaskRequest Request = 1; +} + +message TMaintenanceTaskResponse { + optional Ydb.StatusIds.StatusCode Status = 1; + repeated Ydb.Issue.IssueMessage Issues = 2; + optional Ydb.Maintenance.MaintenanceTaskResult Result = 3; +} + +message TGetMaintenanceTaskRequest { + optional Ydb.Maintenance.GetMaintenanceTaskRequest Request = 1; +} + +message TGetMaintenanceTaskResponse { + optional Ydb.StatusIds.StatusCode Status = 1; + repeated Ydb.Issue.IssueMessage Issues = 2; + optional Ydb.Maintenance.GetMaintenanceTaskResult Result = 3; +} + +message TListMaintenanceTasksRequest { + optional Ydb.Maintenance.ListMaintenanceTasksRequest Request = 1; +} + +message TListMaintenanceTasksResponse { + optional Ydb.StatusIds.StatusCode Status = 1; + repeated Ydb.Issue.IssueMessage Issues = 2; + optional Ydb.Maintenance.ListMaintenanceTasksResult Result = 3; +} + +message TDropMaintenanceTaskRequest { + optional Ydb.Maintenance.DropMaintenanceTaskRequest Request = 1; +} + +message TManageMaintenanceTaskResponse { + optional Ydb.StatusIds.StatusCode Status = 1; + repeated Ydb.Issue.IssueMessage Issues = 2; +} + +message TCompleteActionRequest { + optional Ydb.Maintenance.CompleteActionRequest Request = 1; +} + +message TManageActionResponse { + optional Ydb.StatusIds.StatusCode Status = 1; + repeated Ydb.Issue.IssueMessage Issues = 2; + optional Ydb.Maintenance.ManageActionResult Result = 3; +} diff --git a/ydb/core/protos/services.proto b/ydb/core/protos/services.proto index e60f110a0c4..241d0d5091c 100644 --- a/ydb/core/protos/services.proto +++ b/ydb/core/protos/services.proto @@ -987,5 +987,6 @@ message TActivity { KQP_SOURCE_READ_ACTOR = 610; FEDERATION_DISCOVERY = 611; GRPC_REQ_SHARD_WRITER = 612; + CMS_API_ADAPTER = 613; }; }; diff --git a/ydb/core/protos/ya.make b/ydb/core/protos/ya.make index d0918753599..7b4f840f2e3 100644 --- a/ydb/core/protos/ya.make +++ b/ydb/core/protos/ya.make @@ -72,6 +72,7 @@ SRCS( load_test.proto local.proto long_tx_service.proto + maintenance.proto metrics.proto minikql_engine.proto mon.proto diff --git a/ydb/public/api/grpc/draft/ydb_maintenance_v1.proto b/ydb/public/api/grpc/draft/ydb_maintenance_v1.proto index 21832dd2dfe..c308e48d2ab 100644 --- a/ydb/public/api/grpc/draft/ydb_maintenance_v1.proto +++ b/ydb/public/api/grpc/draft/ydb_maintenance_v1.proto @@ -1,36 +1,25 @@ syntax = "proto3"; package Ydb.Maintenance.V1; -option java_package = "com.yandex.ydb.maintenance.service.v1"; +option java_package = "com.yandex.ydb.maintenance.v1"; import "ydb/public/api/protos/draft/ydb_maintenance.proto"; service MaintenanceService { - // List cluster hosts + // List cluster nodes. rpc ListClusterNodes(ListClusterNodesRequest) returns (ListClusterNodesResponse); - // List node devices - rpc ListNodesDevices(ListNodesDevicesRequest) returns (ListNodesDevicesResponse); - // Request for permissions + // Create maintenance task. rpc CreateMaintenanceTask(CreateMaintenanceTaskRequest) returns (MaintenanceTaskResponse); - // Get new results + // Try to perform maintenance task's actions (polling). rpc RefreshMaintenanceTask(RefreshMaintenanceTaskRequest) returns (MaintenanceTaskResponse); - // Get scheduled task - rpc GetMaintenanceTaskDetails(GetMaintenanceTaskRequest) returns (GetMaintenanceTaskResponse); - // List maintenance tasks + // Get detailed task information. + rpc GetMaintenanceTask(GetMaintenanceTaskRequest) returns (GetMaintenanceTaskResponse); + // List maintenance tasks. rpc ListMaintenanceTasks(ListMaintenanceTasksRequest) returns (ListMaintenanceTasksResponse); - - // Drop scheduled task + // Drop maintenance task. rpc DropMaintenanceTask(DropMaintenanceTaskRequest) returns (ManageMaintenanceTaskResponse); - // Update scheduled task deadline - rpc ProlongateMaintenanceTask(ProlongateMaintenanceTaskRequest) returns (ManageMaintenanceTaskResponse); - - // Mark action result as no more needed - rpc ReleaseActionResult(ReleaseActionResultRequest) returns (ManageActionResultResponse); - // Update action result's deadline - rpc ProlongateActionResult(ProlongateActionResultRequest) returns (ManageActionResultResponse); - // Get detailed action state messages. Used for debugging service tasks to find out - // the reason why an action does not get resolution. - rpc GetReadableActionReason(GetReadableActionReasonRequest) returns (GetReadableActionReasonResponse); + // Mark action as completed. + rpc CompleteAction(CompleteActionRequest) returns (ManageActionResponse); } diff --git a/ydb/public/api/protos/draft/ydb_maintenance.proto b/ydb/public/api/protos/draft/ydb_maintenance.proto index df66577709d..14d2c437552 100644 --- a/ydb/public/api/protos/draft/ydb_maintenance.proto +++ b/ydb/public/api/protos/draft/ydb_maintenance.proto @@ -1,304 +1,240 @@ syntax = "proto3"; option cc_enable_arenas = true; -package Ydb.Maintenance; -option java_package = "com.yandex.ydb.maintenance.service"; - +import "ydb/public/api/protos/annotations/validation.proto"; +import "ydb/public/api/protos/ydb_discovery.proto"; +import "ydb/public/api/protos/ydb_operation.proto"; import "ydb/public/api/protos/ydb_status_codes.proto"; import "google/protobuf/duration.proto"; import "google/protobuf/timestamp.proto"; +package Ydb.Maintenance; +option java_package = "com.yandex.ydb.maintenance"; -// Used to describe the scope of a single action -message ActionScope { - message PDiskId { - uint32 node_id = 1; - uint32 pdisk_id = 2; +// Represents state of an abstract item: e.g. node or device. +enum ItemState { + // Item's state couldn't be identified. + ITEM_STATE_UNSPECIFIED = 0; + // Item is up. + ITEM_STATE_UP = 1; + // Item is down due to planned maintenance. + ITEM_STATE_MAINTENANCE = 2; + // Item is down off-schedule. + ITEM_STATE_DOWN = 3; +} + +message Node { + message StorageNode { + } + + message DynamicNode { + string tenant = 1; + } + + uint32 node_id = 1; + string host = 2; + uint32 port = 3; + Ydb.Discovery.NodeLocation location = 4; + ItemState state = 5; + oneof type { + StorageNode storage = 6; + DynamicNode dynamic = 7; } +} + +message ListClusterNodesRequest { + Ydb.Operations.OperationParams operation_params = 1; +} + +message ListClusterNodesResult { + repeated Node nodes = 1; +} + +message ListClusterNodesResponse { + // operation.result = ListClusterNodesResult + Ydb.Operations.Operation operation = 1; +} + +enum AvailabilityMode { + AVAILABILITY_MODE_UNSPECIFIED = 0; + // In this mode allowed: + // - at most 1 unavailable disk in each storage group; + // - at most 1 unavailable state storage ring. + // For nodes tenant and cluster policies are followed. + AVAILABILITY_MODE_STRONG = 1; + + // In this mode: + // - total number of an unavailable disks in each storage group + // shouldn't exceed number of parity parts in that group; + // - it is allowed to disable (n_to_select - 1) / 2 state storage rings. + // Nodes are handled as in strong mode. + AVAILABILITY_MODE_WEAK = 2; + + // Ignore any storage group & state storage checks. + // Using this mode might cause data unavailability. + AVAILABILITY_MODE_FORCE = 3; +} + +message MaintenanceTaskOptions { + // User-defined _unique_ task identifier. + string task_uid = 1 [(length).le = 128]; + // User-defined description. + string description = 2 [(length).le = 128]; + // Availability mode. + AvailabilityMode availability_mode = 3; + bool dry_run = 4; +} + +// Used to describe the scope of a single action. +message ActionScope { oneof scope { - PDiskId pdisk_id = 1; - uint32 node_id = 2; - string host_name = 3; - // string RackName = 3; - // string DataCenter = 4; + uint32 node_id = 1; + string host = 2 [(length).le = 255]; } } -// Taking an exclusive lock to perform maintenance +// Taking an exclusive lock to perform maintenance. message LockAction { - ActionScope action_scope = 1; + ActionScope scope = 1; google.protobuf.Duration duration = 2; } -// Will not be implemented in the 1st version -// Switching to maintenance mode. Maintenance modes -// can overlap with each other -message SetMaintenanceModeAction { - ActionScope action_scope = 1; - bool drain_tablets = 2; - bool evict_vdisks = 3; - google.protobuf.Duration duration = 4; -} - message Action { oneof action { LockAction lock_action = 1; - SetMaintenanceModeAction set_maintainance_mode_action = 2; } } +message ActionGroup { + repeated Action actions = 1; +} + +message CreateMaintenanceTaskRequest { + Ydb.Operations.OperationParams operation_params = 1; + MaintenanceTaskOptions task_options = 2; + repeated ActionGroup action_groups = 3; +} + +message RefreshMaintenanceTaskRequest { + Ydb.Operations.OperationParams operation_params = 1; + string task_uid = 2 [(length).le = 128]; +} + message ActionUid { - string task_uid = 1; - // Unique id within a single task. Defined by cms - uint32 group_id = 2; - uint32 action_id = 3; + string task_uid = 1 [(length).le = 128]; + // Unique ids within a single task, assigned by the server. + string group_id = 2 [(length).le = 128]; + string action_id = 3 [(length).le = 128]; } message ActionState { enum ActionStatus { ACTION_STATUS_UNSPECIFIED = 0; - ACTION_STATUS_CREATED = 1; - ACTION_STATUS_WAITING = 2; - ACTION_STATUS_PENDING = 3; - // Action has granted result. - ACTION_STATUS_RESULT_PROVIDED = 4; - // Result withdrawn due to deadline - ACTION_STATUS_TIMEOUT_EXPIRED = 5; - // The user marked the action as completed - ACTION_STATUS_FINISHED_BY_USER = 6; + // Action can't be performed now. + ACTION_STATUS_PENDING = 1; + // Action performed: e.g. lock is taken. + ACTION_STATUS_PERFORMED = 2; } - // The reason why the state did not update enum ActionReason { ACTION_REASON_UNSPECIFIED = 0; - // Action is ok - ACTION_REASON_OK = 1; + // Everything is ok. + ACTION_REASON_OK = 1; // Affected storage group has too many unavailable (locked or down) vdisks. - // Can't grant another for this availability mode ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS = 2; - // Blob storage group is already broken - ACTION_REASON_STORAGE_GROUP_BROKEN = 3; - // Too many unavailable state storage rings, - // it is impossible to grant node from another ring - ACTION_REASON_TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS = 4; - // State storage broken. Too many (more than (nToSelect - 1) / 2) unavailable rings - ACTION_REASON_STATE_STORAGE_BROKEN = 5; - // Issue in cluster disabled nodes limit - ACTION_REASON_DISABLED_NODES_LIMIT_RICHED = 6; - // Issue in tenant limits - ACTION_REASON_TENANT_DISABLED_NODES_LIMIT_RICHED = 7; - // Wrong request - ACTION_REASON_WRONG_REQUEST = 8; + // Too many unavailable state storage rings. + ACTION_REASON_TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS = 3; + // Too many disabled nodes (storage & dynamic) in cluster. + ACTION_REASON_DISABLED_NODES_LIMIT_REACHED = 4; + // Too many disabled dynamic nodes of specific tenant. + ACTION_REASON_TENANT_DISABLED_NODES_LIMIT_REACHED = 5; + // Wrong request. + ACTION_REASON_WRONG_REQUEST = 6; + // Too many unavailable nodes with system tablets. + ACTION_REASON_SYS_TABLETS_NODE_LIMIT_REACHED = 7; } Action action = 1; - ActionStatus status = 2; - ActionUid action_uid = 3; + ActionUid action_uid = 2; + ActionStatus status = 3; ActionReason reason = 4; - // The time when the state was assigned - google.protobuf.Timestamp state_timestamp = 5; - // Fields specified for RESULT_GRANTED state - google.protobuf.Timestamp deadline = 6; -} - -message ActionGroup { - repeated Action actions = 1; + google.protobuf.Timestamp deadline = 5; } message ActionGroupStates { repeated ActionState action_states = 1; } -enum AvailabilityMode { - AVAILABILITY_MODE_UNSPECIFIED = 0; - // By default CMS tries to guarantee cluster availability - // by allowing at most 1 disabled disk in each storage group. - // For compute nodes tenant and cluster policies are followed. - // In this mode CMS allows at most 1 disable state storage ring - AVAILABILITY_MODE_STRONG = 1; - // This mode allows to move cluster restart/update forward - // in case some nodes are permanently down. In this mode - // CMS allows at most 1 locked (by permission to restart - // node or replace device) disk in a group. But total number - // of disabled disks for a group shouldn't exceed number - // of parity parts in that group. - // Compute nodes are handled as in default mode. - - // In this mode CMS allows (nToSelect - 1) / 2 state storage rings - AVAILABILITY_MODE_WEAK = 2; - // In this mode CMS allows to lock 1 disk in a group, but if it can't - // it waits for 15 minutes at gives 1 more node. - AVAILABILITY_MODE_SMART = 3; - // In this mode CMS allows at most 1 locked disk in a group - // ignoring its parity parts count. Allows to restart nodes - // even if multiple disks of some group are down. Using - // this mode might cause data unavailability. - // For compute nodes CMS follows tenant and cluster policies - // but allows to restart at least one node for tenant or - // cluster. - AVAILABILITY_MODE_FORCE = 4; -} - -enum ItemState { - // Device/node state couldn't be identified. - ITEM_STATE_UNSPECIFIED = 0; - // Device/node is up. - ITEM_STATE_UP = 1; - // Device/node is Up, but permission granded - ITEM_STATE_LOCKED = 2; - // Device/node is down due to planned restart. - ITEM_STATE_RESTART = 3; - // Device/node is down off-schedule. - ITEM_STATE_DOWN = 4; -} - -message ListClusterNodesRequest {} - -message ListClusterNodesResponse { - message Node { - uint32 node_id = 1; - string data_center = 2; - string rack = 3; - string fqdn = 4; - uint32 interconnect_port = 5; - ItemState state = 6; - optional string tenant = 7; - bool is_storage = 8; - bool is_dynamic = 9; - } - - repeated Node nodes = 1; -} - -message ListNodesDevicesRequest { - repeated uint32 node_id = 1; +message MaintenanceTaskResult { + string task_uid = 1; + repeated ActionGroupStates action_group_states = 2; + // Try again after this deadline. Specified if there are no performed actions. + optional google.protobuf.Timestamp retry_after = 3; } -message ListNodesDevicesResponse { - message Device { - string name = 1; - ItemState state = 2; - } - - message NodeDevices { - uint32 node_id = 1; - repeated Device devices = 2; - } - - repeated NodeDevices nodes_devices = 1; +message MaintenanceTaskResponse { + // operation.result = MaintenanceTaskResult + Ydb.Operations.Operation operation = 1; } -message MaintenanceTaskOptions { - // The maximum number of action groups in progress at a time - uint32 in_flight = 1; - bool dry_run = 2; - // Name of a task and some comment. - // Provided for the convenience of the user. - string name = 3; - string comment = 4; - // Availability mode is not preserved for scheduled events. - AvailabilityMode availability_mode = 5; - // User defined GUID - string task_uid = 6; - // Task with largest priority blocks other tasks - // until all actions are completed. Default is 0 - int64 priority = 7; +message GetMaintenanceTaskRequest { + Ydb.Operations.OperationParams operation_params = 1; + string task_uid = 2 [(length).le = 128]; } -message CreateMaintenanceTaskRequest { +message GetMaintenanceTaskResult { MaintenanceTaskOptions task_options = 1; - repeated ActionGroup action_groups = 2; - // Indicates that client is no longer interested in the task after - // the specified duration starting from the time task arrives at the cms. - // If not specified then default duration from CMS config is used. - google.protobuf.Duration task_timeout = 5; - + repeated ActionGroupStates action_group_states = 2; } -// Updated action states and tryes to grand permissions -message RefreshMaintenanceTaskRequest { - string task_uid = 2; -} - -message MaintenanceTaskResponse { - StatusIds.StatusCode status = 1; - string task_uid = 2; - repeated ActionGroupStates actions_states = 3; - // Try again after this deadline. Specified if there are no PERMISSION_GRANDED - // actions after request - optional google.protobuf.Timestamp deadline = 4; +message GetMaintenanceTaskResponse { + // operation.result = GetMaintenanceTaskResult + Ydb.Operations.Operation operation = 1; } message ListMaintenanceTasksRequest { - // If specified, it will return the tasks created by this user. - // Otherwise all tasks will be returned - optional string user = 1; + Ydb.Operations.OperationParams operation_params = 1; + // User SID (Security ID). + // If specified, it will return the tasks created by this user. + // Otherwise all tasks will be returned. + optional string user = 2; } -message ListMaintenanceTasksResponse { +message ListMaintenanceTasksResult { repeated string tasks_uids = 1; } -// Returns specified task -message GetMaintenanceTaskRequest { - string task_uid = 1; -} - -message GetMaintenanceTaskResponse { - MaintenanceTaskOptions task_options = 1; - repeated ActionGroupStates actions_group_states = 2; - google.protobuf.Timestamp task_deadline = 3; +message ListMaintenanceTasksResponse { + // operation.result = ListMaintenanceTasksResult + Ydb.Operations.Operation operation = 1; } -// Drop maintenance task message DropMaintenanceTaskRequest { - string task_uid = 1; -} - -// Extends Request deadline -message ProlongateMaintenanceTaskRequest { - string task_uid = 1; - google.protobuf.Timestamp new_deadline = 2; + Ydb.Operations.OperationParams operation_params = 1; + string task_uid = 2 [(length).le = 128]; } message ManageMaintenanceTaskResponse { - StatusIds.StatusCode status = 1; -} - -// Removes resolved result -message ReleaseActionResultRequest { - repeated ActionUid action_uid = 1; + Ydb.Operations.Operation operation = 1; } -// Extends results deadlines -message ProlongateActionResultRequest { - message ActionDuration { - ActionUid action_uid = 1; - google.protobuf.Timestamp new_deadline = 2; - } - repeated ActionDuration action_durations = 1; +message CompleteActionRequest { + Ydb.Operations.OperationParams operation_params = 1; + repeated ActionUid action_uids = 2; } -message ManageActionResultResponse { - message ResultStatus { +message ManageActionResult { + message Status { ActionUid action_uid = 1; StatusIds.StatusCode status = 2; } - repeated ResultStatus result_statuses = 1; + repeated Status action_statuses = 1; } -// Getting a detailed reason why the action doesn't get a result granted state -message GetReadableActionReasonRequest { - repeated ActionUid action_ids = 1; -} - -message GetReadableActionReasonResponse { - message Reason { - ActionState action_state = 1; - string Reason = 2; - } - repeated Reason reasons = 1; +message ManageActionResponse { + // operation.result = ManageActionResult + Ydb.Operations.Operation operation = 1; } diff --git a/ydb/services/CMakeLists.txt b/ydb/services/CMakeLists.txt index 5822fdab09a..4ec40bc8a3e 100644 --- a/ydb/services/CMakeLists.txt +++ b/ydb/services/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(fq) add_subdirectory(kesus) add_subdirectory(lib) add_subdirectory(local_discovery) +add_subdirectory(maintenance) add_subdirectory(metadata) add_subdirectory(monitoring) add_subdirectory(persqueue_cluster_discovery) diff --git a/ydb/services/maintenance/CMakeLists.darwin-x86_64.txt b/ydb/services/maintenance/CMakeLists.darwin-x86_64.txt new file mode 100644 index 00000000000..0c048850836 --- /dev/null +++ b/ydb/services/maintenance/CMakeLists.darwin-x86_64.txt @@ -0,0 +1,21 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(ydb-services-maintenance) +target_link_libraries(ydb-services-maintenance PUBLIC + contrib-libs-cxxsupp + yutil + ydb-core-grpc_services + api-grpc + cpp-actors-core + cpp-grpc-server +) +target_sources(ydb-services-maintenance PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/services/maintenance/grpc_service.cpp +) diff --git a/ydb/services/maintenance/CMakeLists.linux-aarch64.txt b/ydb/services/maintenance/CMakeLists.linux-aarch64.txt new file mode 100644 index 00000000000..31a81fd3abb --- /dev/null +++ b/ydb/services/maintenance/CMakeLists.linux-aarch64.txt @@ -0,0 +1,22 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(ydb-services-maintenance) +target_link_libraries(ydb-services-maintenance PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + ydb-core-grpc_services + api-grpc + cpp-actors-core + cpp-grpc-server +) +target_sources(ydb-services-maintenance PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/services/maintenance/grpc_service.cpp +) diff --git a/ydb/services/maintenance/CMakeLists.linux-x86_64.txt b/ydb/services/maintenance/CMakeLists.linux-x86_64.txt new file mode 100644 index 00000000000..31a81fd3abb --- /dev/null +++ b/ydb/services/maintenance/CMakeLists.linux-x86_64.txt @@ -0,0 +1,22 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(ydb-services-maintenance) +target_link_libraries(ydb-services-maintenance PUBLIC + contrib-libs-linux-headers + contrib-libs-cxxsupp + yutil + ydb-core-grpc_services + api-grpc + cpp-actors-core + cpp-grpc-server +) +target_sources(ydb-services-maintenance PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/services/maintenance/grpc_service.cpp +) diff --git a/ydb/services/maintenance/CMakeLists.txt b/ydb/services/maintenance/CMakeLists.txt new file mode 100644 index 00000000000..f8b31df0c11 --- /dev/null +++ b/ydb/services/maintenance/CMakeLists.txt @@ -0,0 +1,17 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-aarch64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + include(CMakeLists.darwin-x86_64.txt) +elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA) + include(CMakeLists.windows-x86_64.txt) +elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA) + include(CMakeLists.linux-x86_64.txt) +endif() diff --git a/ydb/services/maintenance/CMakeLists.windows-x86_64.txt b/ydb/services/maintenance/CMakeLists.windows-x86_64.txt new file mode 100644 index 00000000000..0c048850836 --- /dev/null +++ b/ydb/services/maintenance/CMakeLists.windows-x86_64.txt @@ -0,0 +1,21 @@ + +# This file was generated by the build system used internally in the Yandex monorepo. +# Only simple modifications are allowed (adding source-files to targets, adding simple properties +# like target_include_directories). These modifications will be ported to original +# ya.make files by maintainers. Any complex modifications which can't be ported back to the +# original buildsystem will not be accepted. + + + +add_library(ydb-services-maintenance) +target_link_libraries(ydb-services-maintenance PUBLIC + contrib-libs-cxxsupp + yutil + ydb-core-grpc_services + api-grpc + cpp-actors-core + cpp-grpc-server +) +target_sources(ydb-services-maintenance PRIVATE + ${CMAKE_SOURCE_DIR}/ydb/services/maintenance/grpc_service.cpp +) diff --git a/ydb/services/maintenance/grpc_service.cpp b/ydb/services/maintenance/grpc_service.cpp new file mode 100644 index 00000000000..a485c4af4a4 --- /dev/null +++ b/ydb/services/maintenance/grpc_service.cpp @@ -0,0 +1,43 @@ +#include "grpc_service.h" + +#include <ydb/core/grpc_services/base/base.h> +#include <ydb/core/grpc_services/grpc_helper.h> +#include <ydb/core/grpc_services/grpc_request_proxy.h> +#include <ydb/core/grpc_services/rpc_calls.h> +#include <ydb/core/grpc_services/service_maintenance.h> + +namespace NKikimr::NGRpcService { + +void TGRpcMaintenanceService::SetupIncomingRequests(NGrpc::TLoggerPtr logger) { + Y_UNUSED(logger); + + auto getCounterBlock = CreateCounterCb(Counters_, ActorSystem_); + using namespace Ydb; + +#ifdef ADD_REQUEST +#error ADD_REQUEST macro already defined +#endif + +#define ADD_REQUEST(NAME, REQUEST, RESPONSE, CB) \ + MakeIntrusive<TGRpcRequest<Maintenance::REQUEST, Maintenance::RESPONSE, TGRpcMaintenanceService>> \ + (this, &Service_, CQ_, \ + [this](NGrpc::IRequestContextBase *ctx) { \ + NGRpcService::ReportGrpcReqToMon(*ActorSystem_, ctx->GetPeer()); \ + ActorSystem_->Send(GRpcRequestProxyId_, \ + new TGrpcRequestOperationCall<Maintenance::REQUEST, Maintenance::RESPONSE> \ + (ctx, &CB, TRequestAuxSettings{RLSWITCH(TRateLimiterMode::Rps), nullptr})); \ + }, &Maintenance::V1::MaintenanceService::AsyncService::Request ## NAME, \ + #NAME, logger, getCounterBlock("maintenance", #NAME))->Run(); + + ADD_REQUEST(ListClusterNodes, ListClusterNodesRequest, ListClusterNodesResponse, DoListClusterNodes); + ADD_REQUEST(CreateMaintenanceTask, CreateMaintenanceTaskRequest, MaintenanceTaskResponse, DoCreateMaintenanceTask); + ADD_REQUEST(RefreshMaintenanceTask, RefreshMaintenanceTaskRequest, MaintenanceTaskResponse, DoRefreshMaintenanceTask); + ADD_REQUEST(GetMaintenanceTask, GetMaintenanceTaskRequest, GetMaintenanceTaskResponse, DoGetMaintenanceTask); + ADD_REQUEST(ListMaintenanceTasks, ListMaintenanceTasksRequest, ListMaintenanceTasksResponse, DoListMaintenanceTasks); + ADD_REQUEST(DropMaintenanceTask, DropMaintenanceTaskRequest, ManageMaintenanceTaskResponse, DoDropMaintenanceTask); + ADD_REQUEST(CompleteAction, CompleteActionRequest, ManageActionResponse, DoCompleteAction); + +#undef ADD_REQUEST +} + +} diff --git a/ydb/services/maintenance/grpc_service.h b/ydb/services/maintenance/grpc_service.h new file mode 100644 index 00000000000..985572b7f7b --- /dev/null +++ b/ydb/services/maintenance/grpc_service.h @@ -0,0 +1,18 @@ +#pragma once + +#include <ydb/core/grpc_services/base/base_service.h> +#include <ydb/public/api/grpc/draft/ydb_maintenance_v1.grpc.pb.h> + +#include <library/cpp/actors/core/actorsystem.h> +#include <library/cpp/grpc/server/grpc_server.h> + +namespace NKikimr::NGRpcService { + +class TGRpcMaintenanceService: public TGrpcServiceBase<Ydb::Maintenance::V1::MaintenanceService> { +public: + using TGrpcServiceBase<Ydb::Maintenance::V1::MaintenanceService>::TGrpcServiceBase; +private: + void SetupIncomingRequests(NGrpc::TLoggerPtr logger); +}; + +} diff --git a/ydb/services/maintenance/ya.make b/ydb/services/maintenance/ya.make new file mode 100644 index 00000000000..17c28e634bd --- /dev/null +++ b/ydb/services/maintenance/ya.make @@ -0,0 +1,14 @@ +LIBRARY() + +SRCS( + grpc_service.cpp +) + +PEERDIR( + ydb/core/grpc_services + ydb/public/api/grpc + library/cpp/actors/core + library/cpp/grpc/server +) + +END() diff --git a/ydb/services/ya.make b/ydb/services/ya.make index a62562b1134..a89961e2bac 100644 --- a/ydb/services/ya.make +++ b/ydb/services/ya.make @@ -9,6 +9,7 @@ RECURSE( kesus lib local_discovery + maintenance metadata monitoring persqueue_cluster_discovery |
