summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPisarenko Grigoriy <[email protected]>2024-07-16 11:33:17 +0300
committerGitHub <[email protected]>2024-07-16 11:33:17 +0300
commit38281ff7272d85bf2e58bf56452cbae18aee3e4b (patch)
tree51d0ca9866fd2506250c386305823da52580f42b
parent637786331a43cdbef68e4a45f5b76e5314b92f6c (diff)
YQ-3405 fixed endless retries for external error (#6626)
-rw-r--r--ydb/core/fq/libs/config/protos/control_plane_storage.proto6
-rw-r--r--ydb/core/fq/libs/control_plane_storage/config.cpp3
-rw-r--r--ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp4
-rw-r--r--ydb/core/fq/libs/control_plane_storage/util.cpp12
-rw-r--r--ydb/core/fq/libs/control_plane_storage/util.h6
5 files changed, 24 insertions, 7 deletions
diff --git a/ydb/core/fq/libs/config/protos/control_plane_storage.proto b/ydb/core/fq/libs/config/protos/control_plane_storage.proto
index 1c4b668d4cc..9cc2d31158d 100644
--- a/ydb/core/fq/libs/config/protos/control_plane_storage.proto
+++ b/ydb/core/fq/libs/config/protos/control_plane_storage.proto
@@ -24,11 +24,15 @@ message TQueryMapping {
// 1. StatusCode(s) are handled with defined policies, non-unique StatusCode(s) across all policies is UB
// 2. RetryCount and RetryPeriodMs are used to calculate actual RetryRate, if it exceeds RetryCount, query is aborted
+// - Because of how RetryRate is calculated, the number of retries during one RetryPeriod is less than 2 * RetryCount
// 3. BackoffPeriodMs is factor of RetryRate to delay query execution before next retry
-// 4. There are no default retry policy, all unhandled statuses are fatal
+// 4. RetryLimit is a hard limit on the total number of query retries; once it is reached, the query is aborted
+// - If RetryLimit = 0, the query can be aborted only by RetryRate
+// 5. There is no default retry policy; all unhandled statuses are fatal
message TRetryPolicy {
uint64 RetryCount = 1;
+ uint64 RetryLimit = 4;
string RetryPeriod = 2;
string BackoffPeriod = 3;
}
diff --git a/ydb/core/fq/libs/control_plane_storage/config.cpp b/ydb/core/fq/libs/control_plane_storage/config.cpp
index 2cd4dd6bbe7..41638b50674 100644
--- a/ydb/core/fq/libs/control_plane_storage/config.cpp
+++ b/ydb/core/fq/libs/control_plane_storage/config.cpp
@@ -50,10 +50,11 @@ TControlPlaneStorageConfig::TControlPlaneStorageConfig(const NConfig::TControlPl
for (const auto& mapping : Proto.GetRetryPolicyMapping()) {
auto& retryPolicy = mapping.GetPolicy();
auto retryCount = retryPolicy.GetRetryCount();
+ auto retryLimit = retryPolicy.GetRetryLimit();
auto retryPeriod = GetDuration(retryPolicy.GetRetryPeriod(), TDuration::Hours(1));
auto backoffPeriod = GetDuration(retryPolicy.GetBackoffPeriod(), TDuration::Zero());
for (const auto statusCode: mapping.GetStatusCode()) {
- RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryPeriod, backoffPeriod));
+ RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryLimit, retryPeriod, backoffPeriod));
}
}
diff --git a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp
index b214e127d44..d4106e6e285 100644
--- a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp
+++ b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp
@@ -173,7 +173,7 @@ TPingTaskParams ConstructHardPingTask(
internal.clear_operation_id();
}
- TRetryPolicyItem policy(0, TDuration::Seconds(1), TDuration::Zero());
+ TRetryPolicyItem policy(0, 0, TDuration::Seconds(1), TDuration::Zero());
auto it = retryPolicies.find(request.status_code());
auto policyFound = it != retryPolicies.end();
if (policyFound) {
@@ -200,7 +200,7 @@ TPingTaskParams ConstructHardPingTask(
TStringBuilder builder;
builder << "Query failed with code " << NYql::NDqProto::StatusIds_StatusCode_Name(request.status_code());
if (policy.RetryCount) {
- builder << " (failure rate " << retryLimiter.RetryRate << " exceeds limit of " << policy.RetryCount << ")";
+ builder << " (" << retryLimiter.LastError << ")";
}
builder << " at " << Now();
diff --git a/ydb/core/fq/libs/control_plane_storage/util.cpp b/ydb/core/fq/libs/control_plane_storage/util.cpp
index db0f310b509..f064ffa58f2 100644
--- a/ydb/core/fq/libs/control_plane_storage/util.cpp
+++ b/ydb/core/fq/libs/control_plane_storage/util.cpp
@@ -28,7 +28,16 @@ bool TRetryLimiter::UpdateOnRetry(const TInstant& lastSeenAt, const TRetryPolicy
RetryRate = 0.0;
}
}
- bool shouldRetry = RetryRate < policy.RetryCount;
+
+ bool shouldRetry = true;
+ if (RetryRate >= policy.RetryCount) {
+ shouldRetry = false;
+ LastError = TStringBuilder() << "failure rate " << RetryRate << " exceeds limit of " << policy.RetryCount;
+ } else if (policy.RetryLimit && RetryCount >= policy.RetryLimit) {
+ shouldRetry = false;
+ LastError = TStringBuilder() << "retry count reached limit of " << policy.RetryLimit;
+ }
+
if (shouldRetry) {
RetryCount++;
RetryCounterUpdatedAt = now;
@@ -145,6 +154,7 @@ NConfig::TControlPlaneStorageConfig FillDefaultParameters(NConfig::TControlPlane
policyMapping.AddStatusCode(NYql::NDqProto::StatusIds::EXTERNAL_ERROR);
auto& policy = *policyMapping.MutablePolicy();
policy.SetRetryCount(10);
+ policy.SetRetryLimit(40);
policy.SetRetryPeriod("1m");
policy.SetBackoffPeriod("1s");
}
diff --git a/ydb/core/fq/libs/control_plane_storage/util.h b/ydb/core/fq/libs/control_plane_storage/util.h
index 2c95b6fe989..8d2b49a6d95 100644
--- a/ydb/core/fq/libs/control_plane_storage/util.h
+++ b/ydb/core/fq/libs/control_plane_storage/util.h
@@ -15,10 +15,11 @@ namespace NFq {
class TRetryPolicyItem {
public:
TRetryPolicyItem() = default;
- TRetryPolicyItem(ui64 retryCount, const TDuration& retryPeriod, const TDuration& backoffPeriod)
- : RetryCount(retryCount), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
+ TRetryPolicyItem(ui64 retryCount, ui64 retryLimit, const TDuration& retryPeriod, const TDuration& backoffPeriod)
+ : RetryCount(retryCount), RetryLimit(retryLimit), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
{ }
ui64 RetryCount = 0;
+ ui64 RetryLimit = 0;
TDuration RetryPeriod = TDuration::Zero();
TDuration BackoffPeriod = TDuration::Zero();
};
@@ -32,6 +33,7 @@ public:
ui64 RetryCount = 0;
TInstant RetryCounterUpdatedAt = TInstant::Zero();
double RetryRate = 0.0;
+ TString LastError;
};
bool IsTerminalStatus(FederatedQuery::QueryMeta::ComputeStatus status);