summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorlucius <[email protected]>2025-10-09 13:01:25 +0300
committerlucius <[email protected]>2025-10-09 13:33:39 +0300
commite7c5ac7fc2f710e8fe0223ec4ed56ecda316b9c4 (patch)
treeb7dc4fb5d2693c5be545e14831c2eb802f8f62df
parent65e378dec032b5dde3099702f121524cf022b9e6 (diff)
YT-25914: [cbo] use column DataWeight + update cost function + add version pragma
Новая кост-функция для cbo. Под флагом чтобы удобнее сравнивать со старой. 2 изменения: - Для оценки размера входных таблиц используется поколоночный dataweight (если есть), а не таблица целиком - В самой кост-функции убран outputSize, чтобы он не учитывался дважды: он неявно учтен в каждом следующем джоине как левый либо правый inputSize, а размер результата последнего джоина не зависит от перестановки джоинов commit_hash:d130848393114b1b4383035381dba7767aac62fb
-rw-r--r--yql/essentials/core/yql_type_annotation.h1
-rw-r--r--yql/essentials/providers/config/yql_config_provider.cpp15
-rw-r--r--yql/essentials/sql/v1/context.h1
-rw-r--r--yql/essentials/sql/v1/query.cpp5
-rw-r--r--yql/essentials/sql/v1/sql_query.cpp16
-rw-r--r--yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp26
-rw-r--r--yt/yql/providers/yt/provider/yql_yt_provider_context.cpp58
-rw-r--r--yt/yql/providers/yt/provider/yql_yt_provider_context.h9
-rw-r--r--yt/yql/tests/sql/suites/join/cbo_4tables_version1.cfg5
-rw-r--r--yt/yql/tests/sql/suites/join/cbo_4tables_version1.yql18
10 files changed, 121 insertions, 33 deletions
diff --git a/yql/essentials/core/yql_type_annotation.h b/yql/essentials/core/yql_type_annotation.h
index b99f6dad33b..9412718ffdc 100644
--- a/yql/essentials/core/yql_type_annotation.h
+++ b/yql/essentials/core/yql_type_annotation.h
@@ -453,6 +453,7 @@ struct TTypeAnnotationContext: public TThrRefBase {
TFileStoragePtr FileStorage;
TQContext QContext;
ECostBasedOptimizerType CostBasedOptimizer = ECostBasedOptimizerType::Disable;
+ ui32 CostBasedOptimizerVersion = 0;
bool MatchRecognize = false;
EMatchRecognizeStreamingMode MatchRecognizeStreaming = EMatchRecognizeStreamingMode::Force;
i64 TimeOrderRecoverDelay = -10'000'000; //microseconds
diff --git a/yql/essentials/providers/config/yql_config_provider.cpp b/yql/essentials/providers/config/yql_config_provider.cpp
index 6d7f916ccb4..acd8d38527d 100644
--- a/yql/essentials/providers/config/yql_config_provider.cpp
+++ b/yql/essentials/providers/config/yql_config_provider.cpp
@@ -874,6 +874,21 @@ private:
ctx.AddError(TIssue(pos, TStringBuilder() << "Expected `disable|pg|native', but got: " << args[0]));
return false;
}
+ } else if (name == "CostBasedOptimizerVersion") {
+ if (args.size() != 1) {
+ ctx.AddError(TIssue(pos, TStringBuilder() << "Expected at most 1 argument, but got " << args.size()));
+ return false;
+ }
+ ui32 version;
+ if (!TryFromString(args[0], version)) {
+ ctx.AddError(TIssue(pos, TStringBuilder() << "Expected integer, but got: " << args[0]));
+ return false;
+ }
+ const ui32 maxCBOVersion = 1;
+ if (version > maxCBOVersion) {
+ ctx.AddError(TIssue(pos, TStringBuilder() << "Expected value <= " << maxCBOVersion << ", but got: " << args[0]));
+ }
+ Types_.CostBasedOptimizerVersion = version;
} else if (name == "_EnableMatchRecognize" || name == "DisableMatchRecognize") {
if (args.size() != 0) {
ctx.AddError(TIssue(pos, TStringBuilder() << "Expected no arguments, but got " << args.size()));
diff --git a/yql/essentials/sql/v1/context.h b/yql/essentials/sql/v1/context.h
index 1f6d660c7fb..49041cd11b2 100644
--- a/yql/essentials/sql/v1/context.h
+++ b/yql/essentials/sql/v1/context.h
@@ -328,6 +328,7 @@ public:
bool DqEngineEnable = false;
bool DqEngineForce = false;
TString CostBasedOptimizer;
+ TMaybe<ui32> CostBasedOptimizerVersion;
TMaybe<bool> JsonQueryReturnsJsonDocument;
TMaybe<bool> AnsiInForEmptyOrNullableItemsCollections;
TMaybe<bool> AnsiRankForNullableKeys = true;
diff --git a/yql/essentials/sql/v1/query.cpp b/yql/essentials/sql/v1/query.cpp
index e88f03b0c3c..f106b9344d4 100644
--- a/yql/essentials/sql/v1/query.cpp
+++ b/yql/essentials/sql/v1/query.cpp
@@ -3324,6 +3324,11 @@ public:
BuildQuotedAtom(Pos_, "CostBasedOptimizer"), BuildQuotedAtom(Pos_, ctx.CostBasedOptimizer))));
}
+ if (ctx.CostBasedOptimizerVersion) {
+ currentWorlds->Add(Y("let", "world", Y(TString(ConfigureName), "world", configSource,
+ BuildQuotedAtom(Pos_, "CostBasedOptimizerVersion"), BuildQuotedAtom(Pos_, ToString(*ctx.CostBasedOptimizerVersion)))));
+ }
+
if (ctx.JsonQueryReturnsJsonDocument.Defined()) {
TString pragmaName = "DisableJsonQueryReturnsJsonDocument";
if (*ctx.JsonQueryReturnsJsonDocument) {
diff --git a/yql/essentials/sql/v1/sql_query.cpp b/yql/essentials/sql/v1/sql_query.cpp
index bb846e93012..813423b0540 100644
--- a/yql/essentials/sql/v1/sql_query.cpp
+++ b/yql/essentials/sql/v1/sql_query.cpp
@@ -3388,6 +3388,22 @@ THashMap<TString, TPragmaDescr> PragmaDescrs{
}
return TNodePtr{};
}),
+ TableElemExt("CostBasedOptimizerVersion", [](CB_SIG) -> TMaybe<TNodePtr> {
+ auto& ctx = query.Context();
+ if (values.size() == 1 && values[0].GetLiteral()) {
+ ui32 version;
+ if (!TryFromString(*values[0].GetLiteral(), version)) {
+ query.Error() << "Expected integer argument for: " << pragma;
+ return {};
+ }
+ const ui32 maxCBOVersion = 1;
+ if (version > maxCBOVersion) {
+ query.Error() << "Expected value <= " << maxCBOVersion << " for: " << pragma;
+ }
+ ctx.CostBasedOptimizerVersion = version;
+ }
+ return TNodePtr{};
+ }),
TableElemExt("DisableCompactNamedExprs", [](CB_SIG) -> TMaybe<TNodePtr> {
auto& ctx = query.Context();
if (!ctx.Warning(ctx.Pos(), TIssuesIds::YQL_DEPRECATED_PRAGMA, [](auto& out) {
diff --git a/yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp b/yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp
index 8bca90d4b71..c2d410ac36e 100644
--- a/yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp
+++ b/yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp
@@ -172,6 +172,20 @@ public:
TYtJoinNodeOp* OriginalOp; // Only for nonReorderable
};
+TMaybe<uint64_t> ColumnsDataWeight(const TVector<TYtColumnStatistic>& columns) {
+ if (columns.empty()) {
+ return Nothing();
+ }
+ uint64_t result = 0;
+ for (const auto& column : columns) {
+ if (!column.DataWeight) {
+ return Nothing();
+ }
+ result += *column.DataWeight;
+ }
+ return result;
+}
+
class TOptimizerTreeBuilder
{
public:
@@ -207,7 +221,7 @@ public:
}
limits.LookupJoinMemLimit = Min(State->Configuration->LookupJoinLimit.Get().GetOrElse(0),
State->Configuration->EvaluationTableSizeLimit.Get().GetOrElse(Max<ui64>()));
- OutProviderCtx = std::make_shared<TYtProviderContext>(limits, std::move(ProviderRelInfo_));
+ OutProviderCtx = std::make_shared<TYtProviderContext>(limits, std::move(ProviderRelInfo_), State->Types->CostBasedOptimizerVersion);
}
private:
@@ -314,6 +328,7 @@ private:
stat->ColumnStatistics = TIntrusivePtr<TOptimizerStatistics::TColumnStatMap>(
new TOptimizerStatistics::TColumnStatMap());
auto providerStats = std::make_unique<TYtProviderStatistic>();
+ bool canUseDataWeight = true;
if (Y_UNLIKELY(!section.Settings().Empty()) && Y_UNLIKELY(section.Settings().Item(0).Name() == "Test")) {
for (const auto& setting : section.Settings()) {
@@ -330,6 +345,10 @@ private:
auto tableStat = TYtTableBaseInfo::GetStat(path.Table());
stat->ByteSize += tableStat->DataSize;
stat->Nrows += tableStat->RecordsCount;
+ const bool hasLookup = pathInfo.Table->Meta && pathInfo.Table->Meta->Attrs.Value("optimize_for", "scan") == "lookup";
+ if (hasLookup) {
+ canUseDataWeight = false;
+ }
}
if (section.Ref().GetState() >= TExprNode::EState::ConstrComplete) {
auto sorted = section.Ref().GetConstraint<TSortedConstraintNode>();
@@ -352,6 +371,11 @@ private:
TVector<TMaybe<IYtGateway::TPathStatResult::TExtendedResult>> extendedStats;
extendedStats = GetStatsFromCache(leaf, keyList);
columnInfo = ExtractColumnInfo(extendedStats);
+ if (canUseDataWeight && State->Types->CostBasedOptimizerVersion >= 1) {
+ if (const auto dataWeight = ColumnsDataWeight(columnInfo)) {
+ stat->ByteSize = *dataWeight;
+ }
+ }
}
TDynBitMap relBitmap;
diff --git a/yt/yql/providers/yt/provider/yql_yt_provider_context.cpp b/yt/yql/providers/yt/provider/yql_yt_provider_context.cpp
index 3c51fc5a729..4953a520f2d 100644
--- a/yt/yql/providers/yt/provider/yql_yt_provider_context.cpp
+++ b/yt/yql/providers/yt/provider/yql_yt_provider_context.cpp
@@ -10,16 +10,13 @@
namespace NYql {
-TMaybe<double> TYtProviderContext::ColumnNumUniqueValues(const TDynBitMap& relMap, const NDq::TJoinColumn& joinColumn) const {
- auto entry = ColumnIndex_.find(joinColumn.AttributeName);
+TMaybe<double> TYtProviderContext::ColumnNumUniqueValues(const TDynBitMap& relMap, const NDq::TJoinColumn& joinColumn) const {
+ const auto entry = ColumnIndex_.find(joinColumn.AttributeName);
if (entry == ColumnIndex_.end()) {
return Nothing();
}
- for (const auto& relColEntry : entry->second) {
- int relIndex = relColEntry.first;
- int colIndex = relColEntry.second;
- if (relMap[relIndex] && RelInfo_[relIndex].Label == joinColumn.RelName &&
- RelInfo_[relIndex].ColumnInfo[colIndex].EstimatedUniqueCount) {
+ for (const auto& [relIndex, colIndex] : entry->second) {
+ if (relMap[relIndex] && RelInfo_[relIndex].Label == joinColumn.RelName) {
return RelInfo_[relIndex].ColumnInfo[colIndex].EstimatedUniqueCount;
}
}
@@ -71,7 +68,7 @@ TString TYtProviderContext::DebugStatistic(const TVector<NDq::TJoinColumn>& colu
TMaybe<double> TYtProviderContext::FindMaxUniqueVals(const TYtProviderStatistic& specific, const TVector<NDq::TJoinColumn>& columns) const {
TMaybe<double> result;
for (const auto& joinColumn : columns) {
- auto val = ColumnNumUniqueValues(specific.RelBitmap, joinColumn);
+ const auto val = ColumnNumUniqueValues(specific.RelBitmap, joinColumn);
if (val && (!result || *val > *result)) {
result = val;
}
@@ -79,6 +76,8 @@ TMaybe<double> TYtProviderContext::FindMaxUniqueVals(const TYtProviderStatistic&
return result;
}
+namespace {
+
double ComputeCardinality(TMaybe<double> maxUniques, double leftCardinality, double rightCardinality) {
if (!maxUniques) {
double result = 0.2 * leftCardinality * rightCardinality;
@@ -91,9 +90,12 @@ double ComputeCardinality(TMaybe<double> maxUniques, double leftCardinality, dou
return result;
}
-TYtProviderContext::TYtProviderContext(TJoinAlgoLimits limits, TVector<TYtProviderRelInfo> relInfo)
+} // namespace
+
+TYtProviderContext::TYtProviderContext(TJoinAlgoLimits limits, TVector<TYtProviderRelInfo> relInfo, ui32 version)
: Limits_(limits)
- , RelInfo_(std::move(relInfo)) {
+ , RelInfo_(std::move(relInfo))
+ , Version_(version) {
for (int relIndex = 0; relIndex < std::ssize(RelInfo_); ++relIndex) {
const auto& rel = RelInfo_[relIndex];
for (int colIndex = 0; colIndex < std::ssize(rel.ColumnInfo); ++colIndex) {
@@ -158,31 +160,31 @@ TOptimizerStatistics TYtProviderContext::ComputeJoinStats(
TMaybe<double> maxUniques;
auto leftMaxUniques = FindMaxUniqueVals(*leftSpecific, leftJoinKeys);
- if (leftMaxUniques && (!maxUniques || *maxUniques < *leftMaxUniques)) {
- maxUniques = *leftMaxUniques;
- }
auto rightMaxUniques = FindMaxUniqueVals(*rightSpecific, rightJoinKeys);
- if (rightMaxUniques && (!maxUniques || *maxUniques < *rightMaxUniques)) {
- maxUniques = *rightMaxUniques;
+ if (!leftMaxUniques) {
+ maxUniques = rightMaxUniques;
+ } else if (!rightMaxUniques) {
+ maxUniques = leftMaxUniques;
+ } else {
+ maxUniques = Max(*leftMaxUniques, *rightMaxUniques);
}
auto resultSpecific = std::make_unique<TYtProviderStatistic>();
-
resultSpecific->RelBitmap = leftSpecific->RelBitmap | rightSpecific->RelBitmap;
resultSpecific->JoinAlgo = joinAlgo;
double outputCardinality = ComputeCardinality(maxUniques, leftStats.Nrows, rightStats.Nrows);
- double leftCardinalityFactor = leftStats.Nrows != 0.0 ? outputCardinality / leftStats.Nrows : 0.0;
- double rightCardinalityFactor = rightStats.Nrows != 0.0 ? outputCardinality / rightStats.Nrows : 0.0;
- double outputByteSize = leftCardinalityFactor * leftStats.ByteSize + rightCardinalityFactor * rightStats.ByteSize;
+ double leftRowSize = leftStats.Nrows != 0.0 ? leftStats.ByteSize / leftStats.Nrows : 0.0;
+ double rightRowSize = rightStats.Nrows != 0.0 ? rightStats.ByteSize / rightStats.Nrows : 0.0;
+ double outputByteSize = outputCardinality * (leftRowSize + rightRowSize);
- auto leftReadBytes = leftStats.ByteSize;
+ double leftReadBytes = leftStats.ByteSize;
double rightReadBytes = rightStats.ByteSize;
if (joinAlgo == EJoinAlgoType::LookupJoin) {
- auto leftJoinUniques = FindMaxUniqueVals(*leftSpecific, TVector<NDq::TJoinColumn>{leftJoinKeys[0]});
- auto rightJoinUniques = FindMaxUniqueVals(*rightSpecific, TVector<NDq::TJoinColumn>{rightJoinKeys[0]});
+ auto leftJoinUniques = ColumnNumUniqueValues(leftSpecific->RelBitmap, leftJoinKeys[0]);
+ auto rightJoinUniques = ColumnNumUniqueValues(rightSpecific->RelBitmap, rightJoinKeys[0]);
if (leftJoinUniques && rightJoinUniques && *rightJoinUniques < *leftJoinUniques) {
leftReadBytes *= *rightJoinUniques / *leftJoinUniques;
}
@@ -195,10 +197,14 @@ TOptimizerStatistics TYtProviderContext::ComputeJoinStats(
leftReadBytes = 0;
}
- double outputCost =
- leftStats.Cost + rightStats.Cost +
- leftReadBytes + rightReadBytes +
- outputByteSize;
+ double outputCost = leftStats.Cost + rightStats.Cost;
+ if (Version_ >= 1) {
+ // Don't add outputByteSize to outputCost because outputByteSize will be accounted as *ReadBytes in subsequent joins.
+ // The output size of the final join can be ignored since it remains constant regardless of the join order.
+ outputCost += leftReadBytes + rightReadBytes;
+ } else {
+ outputCost += leftReadBytes + rightReadBytes + outputByteSize;
+ }
TOptimizerStatistics result(
EStatisticsType::ManyManyJoin,
diff --git a/yt/yql/providers/yt/provider/yql_yt_provider_context.h b/yt/yql/providers/yt/provider/yql_yt_provider_context.h
index fcf1d176b38..8f7bbae14f0 100644
--- a/yt/yql/providers/yt/provider/yql_yt_provider_context.h
+++ b/yt/yql/providers/yt/provider/yql_yt_provider_context.h
@@ -10,7 +10,7 @@ namespace NYql {
struct TYtColumnStatistic {
TString ColumnName;
TMaybe<int64_t> EstimatedUniqueCount;
- TMaybe<int64_t> DataWeight;
+ TMaybe<uint64_t> DataWeight;
};
struct TRelSizeInfo {
@@ -39,7 +39,7 @@ public:
ui64 LookupJoinMaxRows;
};
- TYtProviderContext(TJoinAlgoLimits limits, TVector<TYtProviderRelInfo> relInfo);
+ TYtProviderContext(TJoinAlgoLimits limits, TVector<TYtProviderRelInfo> relInfo, ui32 version);
virtual TOptimizerStatistics ComputeJoinStats(
const TOptimizerStatistics& leftStats,
@@ -63,10 +63,6 @@ private:
bool IsMapJoinApplicable(const TOptimizerStatistics& table) const;
- TDynBitMap ExtractColumnsBitmap(const TDynBitMap& columnBitmap, const TVector<TString>& columns) const;
-
- TVector<TYtColumnStatistic> MergeColumnStatistics(const TYtProviderStatistic& leftSpecific, const TYtProviderStatistic& rightSpecific, const TDynBitMap& outputBitmap) const;
-
TMaybe<double> FindMaxUniqueVals(const TYtProviderStatistic& specific, const TVector<NDq::TJoinColumn>& columns) const;
TMaybe<double> ColumnNumUniqueValues(const TDynBitMap& relMap, const NDq::TJoinColumn& columnName) const;
@@ -76,6 +72,7 @@ private:
const TJoinAlgoLimits Limits_;
TVector<TYtProviderRelInfo> RelInfo_;
THashMap<TString, THashSet<std::pair<int, int>>> ColumnIndex_;
+ ui32 Version_ = 0;
};
} // namespace NYql
diff --git a/yt/yql/tests/sql/suites/join/cbo_4tables_version1.cfg b/yt/yql/tests/sql/suites/join/cbo_4tables_version1.cfg
new file mode 100644
index 00000000000..514082c6e4e
--- /dev/null
+++ b/yt/yql/tests/sql/suites/join/cbo_4tables_version1.cfg
@@ -0,0 +1,5 @@
+providers yt
+in InputA cbo_a.txt
+in InputB cbo_b.txt
+in InputC cbo_c.txt
+in InputD cbo_d.txt
diff --git a/yt/yql/tests/sql/suites/join/cbo_4tables_version1.yql b/yt/yql/tests/sql/suites/join/cbo_4tables_version1.yql
new file mode 100644
index 00000000000..d0218c3785a
--- /dev/null
+++ b/yt/yql/tests/sql/suites/join/cbo_4tables_version1.yql
@@ -0,0 +1,18 @@
+USE plato;
+pragma warning("disable", "8001"); -- CBO_MISSING_TABLE_STATS
+
+pragma CostBasedOptimizer="native";
+pragma CostBasedOptimizerVersion="1";
+pragma yt.MapJoinLimit="1000";
+pragma yt.LookupJoinLimit="1000";
+pragma yt.LookupJoinMaxRows="100";
+pragma yt.ExtendedStatsMaxChunkCount="0";
+pragma yt.JoinMergeTablesLimit="100";
+
+SELECT
+ InputA.Key1, InputA.Key2, InputA.Value, InputB.val, InputC.v, InputD.value as vald
+FROM
+ InputA
+ INNER JOIN InputD ON InputA.Key2 = InputD.k
+ INNER JOIN InputB ON InputA.Fk1 = InputB.k
+ INNER JOIN InputC ON InputA.Key1 = InputC.k