diff options
author | hcpp <hcpp@ydb.tech> | 2023-09-21 14:27:25 +0300 |
---|---|---|
committer | hcpp <hcpp@ydb.tech> | 2023-09-21 14:56:13 +0300 |
commit | 082987dd3ffbf87d003bc010bac8a0f326836b4c (patch) | |
tree | 43432226e25d93957f79d316f2922771782dc07e | |
parent | 5fbb849336414b062bfcbec38284b9b73890d602 (diff) | |
download | ydb-082987dd3ffbf87d003bc010bac8a0f326836b4c.tar.gz |
unavailable status has been added
5 files changed, 63 insertions, 11 deletions
diff --git a/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp b/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp index a6f5d5e8b1..a8f29302fb 100644 --- a/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp +++ b/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp @@ -50,6 +50,9 @@ TFinalStatusCounters::TFinalStatusCounters(const ::NMonitoring::TDynamicCounterP AbortedByUser = subgroup->GetCounter("ABORTED_BY_USER", true); Failed = subgroup->GetCounter("FAILED", true); Paused = subgroup->GetCounter("PAUSED", true); + + auto subgroupReason = counters->GetSubgroup("subcomponent", "FinalReason"); + Unavailable = subgroup->GetCounter("Unavailable", true); } void TFinalStatusCounters::IncByStatus(FederatedQuery::QueryMeta::ComputeStatus finalStatus) { diff --git a/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h b/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h index 3940950577..9d19efdfd6 100644 --- a/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h +++ b/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h @@ -55,6 +55,9 @@ class TFinalStatusCounters: public virtual TThrRefBase { ::NMonitoring::TDynamicCounters::TCounterPtr Paused; public: + ::NMonitoring::TDynamicCounters::TCounterPtr Unavailable; + +public: TFinalStatusCounters(const ::NMonitoring::TDynamicCounterPtr& counters); void IncByStatus(FederatedQuery::QueryMeta::ComputeStatus finalStatus); diff --git a/ydb/core/fq/libs/control_plane_storage/events/events.h b/ydb/core/fq/libs/control_plane_storage/events/events.h index 88f77b5beb..bcc242e2b5 100644 --- a/ydb/core/fq/libs/control_plane_storage/events/events.h +++ b/ydb/core/fq/libs/control_plane_storage/events/events.h @@ -170,6 +170,7 @@ struct TEvControlPlaneStorage { EvDescribeDatabaseResponse, EvModifyDatabaseRequest, EvModifyDatabaseResponse, + EvFinalStatusReport, EvEnd, }; @@ -791,6 +792,26 @@ struct TEvControlPlaneStorage { NYql::TIssues Issues; TDebugInfoPtr DebugInfo; }; + + struct TEvFinalStatusReport : NActors::TEventLocal<TEvFinalStatusReport, EvFinalStatusReport> { + TEvFinalStatusReport(const TString& queryId, const TString& jobId, const TString& cloudId, const TString& scope, FederatedQuery::QueryMeta::ComputeStatus status, const NYql::TIssues& issues, const NYql::TIssues& transientIssues) + : QueryId(queryId) + , JobId(jobId) + , CloudId(cloudId) + , Scope(scope) + , Status(status) + , Issues(issues) + , TransientIssues(transientIssues) + {} + + TString QueryId; + TString JobId; + TString CloudId; + TString Scope; + FederatedQuery::QueryMeta::ComputeStatus Status = FederatedQuery::QueryMeta::COMPUTE_STATUS_UNSPECIFIED; + NYql::TIssues Issues; + NYql::TIssues TransientIssues; + }; }; } diff --git a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp index 829155c6fd..c91bf9e908 100644 --- a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp +++ b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp @@ -20,11 +20,17 @@ struct TPingTaskParams { std::shared_ptr<std::vector<TString>> MeteringRecords; }; +struct TFinalStatus { + FederatedQuery::QueryMeta::ComputeStatus Status = FederatedQuery::QueryMeta::COMPUTE_STATUS_UNSPECIFIED; + NYql::TIssues Issues; + NYql::TIssues TransientIssues; +}; + TPingTaskParams ConstructHardPingTask( const Fq::Private::PingTaskRequest& request, std::shared_ptr<Fq::Private::PingTaskResult> response, const TString& tablePathPrefix, const TDuration& automaticQueriesTtl, const TDuration& taskLeaseTtl, const THashMap<ui64, TRetryPolicyItem>& retryPolicies, ::NMonitoring::TDynamicCounterPtr rootCounters, - uint64_t maxRequestSize, bool dumpRawStatistics) { + uint64_t maxRequestSize, bool dumpRawStatistics, const std::shared_ptr<TFinalStatus>& finalStatus) { auto scope = request.scope(); auto query_id = request.query_id().value(); @@ -331,6 +337,10 @@ TPingTaskParams ConstructHardPingTask( ythrow TCodeLineException(TIssuesIds::BAD_REQUEST) << "QueryInternal proto exceeded the size limit: " << internal.ByteSizeLong() << " of " << maxRequestSize << " " << TSizeFormatPrinter(internal).ToString(); } + finalStatus->Status = query.meta().status(); + NYql::IssuesFromMessage(query.issue(), finalStatus->Issues); + NYql::IssuesFromMessage(query.transient_issue(), finalStatus->TransientIssues); + TSqlQueryBuilder writeQueryBuilder(tablePathPrefix, "HardPingTask(write)"); writeQueryBuilder.AddString("tenant", request.tenant()); writeQueryBuilder.AddString("scope", request.scope()); @@ -528,19 +538,12 @@ void TYdbControlPlaneStorageActor::Handle(TEvControlPlaneStorage::TEvPingTaskReq } std::shared_ptr<Fq::Private::PingTaskResult> response = std::make_shared<Fq::Private::PingTaskResult>(); - - if (request.status()) { - Counters.GetFinalStatusCounters(cloudId, scope)->IncByStatus(request.status()); - } - - if (IsTerminalStatus(request.status())) { - LOG_YQ_AUDIT_SERVICE_INFO("FinalStatus: cloud id: [" << cloudId << "], scope: [" << scope << "], query id: [" << request.query_id() << "], job id: [" << request.job_id() << "], status: " << FederatedQuery::QueryMeta::ComputeStatus_Name(request.status())); - } + std::shared_ptr<TFinalStatus> finalStatus = std::make_shared<TFinalStatus>(); auto pingTaskParams = DoesPingTaskUpdateQueriesTable(request) ? ConstructHardPingTask(request, response, YdbConnection->TablePathPrefix, Config->AutomaticQueriesTtl, Config->TaskLeaseTtl, Config->RetryPolicies, Counters.Counters, Config->Proto.GetMaxRequestSize(), - Config->Proto.GetDumpRawStatistics()) : + Config->Proto.GetDumpRawStatistics(), finalStatus) : ConstructSoftPingTask(request, response, YdbConnection->TablePathPrefix, Config->TaskLeaseTtl); auto debugInfo = Config->Proto.GetEnableDebugMode() ? std::make_shared<TDebugInfo>() : TDebugInfoPtr{}; auto result = ReadModifyWrite(pingTaskParams.Query, pingTaskParams.Params, pingTaskParams.Prepare, requestCounters, debugInfo); @@ -558,13 +561,32 @@ void TYdbControlPlaneStorageActor::Handle(TEvControlPlaneStorage::TEvPingTaskReq success.Apply([=, actorSystem=NActors::TActivationContext::ActorSystem(), meteringRecords=pingTaskParams.MeteringRecords](const auto& future) { TDuration delta = TInstant::Now() - startTime; - LWPROBE(PingTaskRequest, queryId, delta, future.GetValue()); + const auto success = future.GetValue(); + LWPROBE(PingTaskRequest, queryId, delta, success); if (meteringRecords) { for (const auto& metric : *meteringRecords) { actorSystem->Send(NKikimr::NMetering::MakeMeteringServiceID(), new NKikimr::NMetering::TEvMetering::TEvWriteMeteringJson(metric)); } } + + if (success) { + actorSystem->Send(ControlPlaneStorageServiceActorId(), new TEvControlPlaneStorage::TEvFinalStatusReport(request.query_id().value(), request.job_id().value(), cloudId, scope, finalStatus->Status, finalStatus->Issues, finalStatus->TransientIssues)); + } }); } +void TYdbControlPlaneStorageActor::Handle(TEvControlPlaneStorage::TEvFinalStatusReport::TPtr& ev) { + const auto& event = *ev->Get(); + if (!IsTerminalStatus(event.Status)) { + return; + } + + static const TString unavailablePattern = "Kikimr cluster or one of its subsystems was unavailable"; + if (event.Issues.ToOneLineString().Contains(unavailablePattern) || event.TransientIssues.ToOneLineString().Contains(unavailablePattern)) { + Counters.GetFinalStatusCounters(event.CloudId, event.Scope)->Unavailable->Inc(); + } + Counters.GetFinalStatusCounters(event.CloudId, event.Scope)->IncByStatus(event.Status); + LOG_YQ_AUDIT_SERVICE_INFO("FinalStatus: cloud id: [" << event.CloudId << "], scope: [" << event.Scope << "], query id: [" << event.QueryId << "], job id: [" << event.JobId << "], status: " << FederatedQuery::QueryMeta::ComputeStatus_Name(event.Status)); +} + } // NFq diff --git a/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h b/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h index 6984be80f2..376f35ee4b 100644 --- a/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h +++ b/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h @@ -647,6 +647,7 @@ public: hFunc(TEvControlPlaneStorage::TEvCreateDatabaseRequest, Handle); hFunc(TEvControlPlaneStorage::TEvDescribeDatabaseRequest, Handle); hFunc(TEvControlPlaneStorage::TEvModifyDatabaseRequest, Handle); + hFunc(TEvControlPlaneStorage::TEvFinalStatusReport, Handle); ) void Handle(TEvControlPlaneStorage::TEvCreateQueryRequest::TPtr& ev); @@ -689,6 +690,8 @@ public: void Handle(TEvControlPlaneStorage::TEvDescribeDatabaseRequest::TPtr& ev); void Handle(TEvControlPlaneStorage::TEvModifyDatabaseRequest::TPtr& ev); + void Handle(TEvControlPlaneStorage::TEvFinalStatusReport::TPtr& ev); + template <class TEventPtr, class TRequestActor, ERequestTypeCommon requestType> void HandleRateLimiterImpl(TEventPtr& ev); |