aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author hcpp <hcpp@ydb.tech> 2023-09-21 14:27:25 +0300
committer hcpp <hcpp@ydb.tech> 2023-09-21 14:56:13 +0300
commit 082987dd3ffbf87d003bc010bac8a0f326836b4c (patch)
tree 43432226e25d93957f79d316f2922771782dc07e
parent 5fbb849336414b062bfcbec38284b9b73890d602 (diff)
download ydb-082987dd3ffbf87d003bc010bac8a0f326836b4c.tar.gz
unavailable status has been added
-rw-r--r-- ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp 3
-rw-r--r-- ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h 3
-rw-r--r-- ydb/core/fq/libs/control_plane_storage/events/events.h 21
-rw-r--r-- ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp 44
-rw-r--r-- ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h 3
5 files changed, 63 insertions, 11 deletions
diff --git a/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp b/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp
index a6f5d5e8b1..a8f29302fb 100644
--- a/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp
+++ b/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.cpp
@@ -50,6 +50,9 @@ TFinalStatusCounters::TFinalStatusCounters(const ::NMonitoring::TDynamicCounterP
AbortedByUser = subgroup->GetCounter("ABORTED_BY_USER", true);
Failed = subgroup->GetCounter("FAILED", true);
Paused = subgroup->GetCounter("PAUSED", true);
+
+ // Reason counters are registered under their own "FinalReason" subgroup, created here,
+ // rather than under the subgroup that holds the per-status counters above.
+ auto subgroupReason = counters->GetSubgroup("subcomponent", "FinalReason");
+ Unavailable = subgroupReason->GetCounter("Unavailable", true);
}
void TFinalStatusCounters::IncByStatus(FederatedQuery::QueryMeta::ComputeStatus finalStatus) {
diff --git a/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h b/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h
index 3940950577..9d19efdfd6 100644
--- a/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h
+++ b/ydb/core/fq/libs/control_plane_storage/control_plane_storage_counters.h
@@ -55,6 +55,9 @@ class TFinalStatusCounters: public virtual TThrRefBase {
::NMonitoring::TDynamicCounters::TCounterPtr Paused;
public:
+ ::NMonitoring::TDynamicCounters::TCounterPtr Unavailable;
+
+public:
TFinalStatusCounters(const ::NMonitoring::TDynamicCounterPtr& counters);
void IncByStatus(FederatedQuery::QueryMeta::ComputeStatus finalStatus);
diff --git a/ydb/core/fq/libs/control_plane_storage/events/events.h b/ydb/core/fq/libs/control_plane_storage/events/events.h
index 88f77b5beb..bcc242e2b5 100644
--- a/ydb/core/fq/libs/control_plane_storage/events/events.h
+++ b/ydb/core/fq/libs/control_plane_storage/events/events.h
@@ -170,6 +170,7 @@ struct TEvControlPlaneStorage {
EvDescribeDatabaseResponse,
EvModifyDatabaseRequest,
EvModifyDatabaseResponse,
+ EvFinalStatusReport,
EvEnd,
};
@@ -791,6 +792,26 @@ struct TEvControlPlaneStorage {
NYql::TIssues Issues;
TDebugInfoPtr DebugInfo;
};
+
+    // Event (derived from NActors::TEventLocal) that reports the status of a query
+    // together with the permanent and transient issues attached to it.
+    struct TEvFinalStatusReport : NActors::TEventLocal<TEvFinalStatusReport, EvFinalStatusReport> {
+        // Identification of the query/job the report refers to.
+        TString QueryId;
+        TString JobId;
+        TString CloudId;
+        TString Scope;
+        // Reported compute status; stays COMPUTE_STATUS_UNSPECIFIED until set by the constructor.
+        FederatedQuery::QueryMeta::ComputeStatus Status = FederatedQuery::QueryMeta::COMPUTE_STATUS_UNSPECIFIED;
+        // Issues reported along with the status.
+        NYql::TIssues Issues;
+        NYql::TIssues TransientIssues;
+
+        // Copies every argument into the field of the same name.
+        TEvFinalStatusReport(
+            const TString& queryId,
+            const TString& jobId,
+            const TString& cloudId,
+            const TString& scope,
+            FederatedQuery::QueryMeta::ComputeStatus status,
+            const NYql::TIssues& issues,
+            const NYql::TIssues& transientIssues)
+            : QueryId(queryId)
+            , JobId(jobId)
+            , CloudId(cloudId)
+            , Scope(scope)
+            , Status(status)
+            , Issues(issues)
+            , TransientIssues(transientIssues)
+        {
+        }
+    };
};
}
diff --git a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp
index 829155c6fd..c91bf9e908 100644
--- a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp
+++ b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp
@@ -20,11 +20,17 @@ struct TPingTaskParams {
std::shared_ptr<std::vector<TString>> MeteringRecords;
};
+// Holder for the status and issues of a query; ConstructHardPingTask fills it through a
+// shared pointer so the caller can read the values afterwards.
+struct TFinalStatus {
+    // Compute status of the query; COMPUTE_STATUS_UNSPECIFIED until it is assigned.
+    FederatedQuery::QueryMeta::ComputeStatus Status{FederatedQuery::QueryMeta::COMPUTE_STATUS_UNSPECIFIED};
+    // Issues parsed from the query's `issue` message.
+    NYql::TIssues Issues;
+    // Issues parsed from the query's `transient_issue` message.
+    NYql::TIssues TransientIssues;
+};
+
TPingTaskParams ConstructHardPingTask(
const Fq::Private::PingTaskRequest& request, std::shared_ptr<Fq::Private::PingTaskResult> response,
const TString& tablePathPrefix, const TDuration& automaticQueriesTtl, const TDuration& taskLeaseTtl,
const THashMap<ui64, TRetryPolicyItem>& retryPolicies, ::NMonitoring::TDynamicCounterPtr rootCounters,
- uint64_t maxRequestSize, bool dumpRawStatistics) {
+ uint64_t maxRequestSize, bool dumpRawStatistics, const std::shared_ptr<TFinalStatus>& finalStatus) {
auto scope = request.scope();
auto query_id = request.query_id().value();
@@ -331,6 +337,10 @@ TPingTaskParams ConstructHardPingTask(
ythrow TCodeLineException(TIssuesIds::BAD_REQUEST) << "QueryInternal proto exceeded the size limit: " << internal.ByteSizeLong() << " of " << maxRequestSize << " " << TSizeFormatPrinter(internal).ToString();
}
+ finalStatus->Status = query.meta().status();
+ NYql::IssuesFromMessage(query.issue(), finalStatus->Issues);
+ NYql::IssuesFromMessage(query.transient_issue(), finalStatus->TransientIssues);
+
TSqlQueryBuilder writeQueryBuilder(tablePathPrefix, "HardPingTask(write)");
writeQueryBuilder.AddString("tenant", request.tenant());
writeQueryBuilder.AddString("scope", request.scope());
@@ -528,19 +538,12 @@ void TYdbControlPlaneStorageActor::Handle(TEvControlPlaneStorage::TEvPingTaskReq
}
std::shared_ptr<Fq::Private::PingTaskResult> response = std::make_shared<Fq::Private::PingTaskResult>();
-
- if (request.status()) {
- Counters.GetFinalStatusCounters(cloudId, scope)->IncByStatus(request.status());
- }
-
- if (IsTerminalStatus(request.status())) {
- LOG_YQ_AUDIT_SERVICE_INFO("FinalStatus: cloud id: [" << cloudId << "], scope: [" << scope << "], query id: [" << request.query_id() << "], job id: [" << request.job_id() << "], status: " << FederatedQuery::QueryMeta::ComputeStatus_Name(request.status()));
- }
+ std::shared_ptr<TFinalStatus> finalStatus = std::make_shared<TFinalStatus>();
auto pingTaskParams = DoesPingTaskUpdateQueriesTable(request) ?
ConstructHardPingTask(request, response, YdbConnection->TablePathPrefix, Config->AutomaticQueriesTtl,
Config->TaskLeaseTtl, Config->RetryPolicies, Counters.Counters, Config->Proto.GetMaxRequestSize(),
- Config->Proto.GetDumpRawStatistics()) :
+ Config->Proto.GetDumpRawStatistics(), finalStatus) :
ConstructSoftPingTask(request, response, YdbConnection->TablePathPrefix, Config->TaskLeaseTtl);
auto debugInfo = Config->Proto.GetEnableDebugMode() ? std::make_shared<TDebugInfo>() : TDebugInfoPtr{};
auto result = ReadModifyWrite(pingTaskParams.Query, pingTaskParams.Params, pingTaskParams.Prepare, requestCounters, debugInfo);
@@ -558,13 +561,32 @@ void TYdbControlPlaneStorageActor::Handle(TEvControlPlaneStorage::TEvPingTaskReq
success.Apply([=, actorSystem=NActors::TActivationContext::ActorSystem(), meteringRecords=pingTaskParams.MeteringRecords](const auto& future) {
TDuration delta = TInstant::Now() - startTime;
- LWPROBE(PingTaskRequest, queryId, delta, future.GetValue());
+ const auto success = future.GetValue();
+ LWPROBE(PingTaskRequest, queryId, delta, success);
if (meteringRecords) {
for (const auto& metric : *meteringRecords) {
actorSystem->Send(NKikimr::NMetering::MakeMeteringServiceID(), new NKikimr::NMetering::TEvMetering::TEvWriteMeteringJson(metric));
}
}
+
+ if (success) {
+ actorSystem->Send(ControlPlaneStorageServiceActorId(), new TEvControlPlaneStorage::TEvFinalStatusReport(request.query_id().value(), request.job_id().value(), cloudId, scope, finalStatus->Status, finalStatus->Issues, finalStatus->TransientIssues));
+ }
});
}
+// Processes a final status report. Reports with a non-terminal status are ignored; for a
+// terminal status the per-(cloud id, scope) final status counters are updated and an audit
+// log record is written.
+void TYdbControlPlaneStorageActor::Handle(TEvControlPlaneStorage::TEvFinalStatusReport::TPtr& ev) {
+    const auto& report = *ev->Get();
+    if (IsTerminalStatus(report.Status)) {
+        static const TString unavailablePattern = "Kikimr cluster or one of its subsystems was unavailable";
+
+        // Transient issues are rendered only when the regular issues do not already contain the marker text.
+        const bool clusterWasUnavailable =
+            report.Issues.ToOneLineString().Contains(unavailablePattern)
+            || report.TransientIssues.ToOneLineString().Contains(unavailablePattern);
+        if (clusterWasUnavailable) {
+            Counters.GetFinalStatusCounters(report.CloudId, report.Scope)->Unavailable->Inc();
+        }
+
+        Counters.GetFinalStatusCounters(report.CloudId, report.Scope)->IncByStatus(report.Status);
+        LOG_YQ_AUDIT_SERVICE_INFO("FinalStatus: cloud id: [" << report.CloudId
+            << "], scope: [" << report.Scope
+            << "], query id: [" << report.QueryId
+            << "], job id: [" << report.JobId
+            << "], status: " << FederatedQuery::QueryMeta::ComputeStatus_Name(report.Status));
+    }
+}
+
} // NFq
diff --git a/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h b/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h
index 6984be80f2..376f35ee4b 100644
--- a/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h
+++ b/ydb/core/fq/libs/control_plane_storage/ydb_control_plane_storage_impl.h
@@ -647,6 +647,7 @@ public:
hFunc(TEvControlPlaneStorage::TEvCreateDatabaseRequest, Handle);
hFunc(TEvControlPlaneStorage::TEvDescribeDatabaseRequest, Handle);
hFunc(TEvControlPlaneStorage::TEvModifyDatabaseRequest, Handle);
+ hFunc(TEvControlPlaneStorage::TEvFinalStatusReport, Handle);
)
void Handle(TEvControlPlaneStorage::TEvCreateQueryRequest::TPtr& ev);
@@ -689,6 +690,8 @@ public:
void Handle(TEvControlPlaneStorage::TEvDescribeDatabaseRequest::TPtr& ev);
void Handle(TEvControlPlaneStorage::TEvModifyDatabaseRequest::TPtr& ev);
+ void Handle(TEvControlPlaneStorage::TEvFinalStatusReport::TPtr& ev);
+
template <class TEventPtr, class TRequestActor, ERequestTypeCommon requestType>
void HandleRateLimiterImpl(TEventPtr& ev);