diff options
| author | lucius <[email protected]> | 2025-10-09 13:01:25 +0300 |
|---|---|---|
| committer | lucius <[email protected]> | 2025-10-09 13:33:39 +0300 |
| commit | e7c5ac7fc2f710e8fe0223ec4ed56ecda316b9c4 (patch) | |
| tree | b7dc4fb5d2693c5be545e14831c2eb802f8f62df | |
| parent | 65e378dec032b5dde3099702f121524cf022b9e6 (diff) | |
YT-25914: [cbo] use column DataWeight + update cost function + add version pragma
Новая кост-функция для cbo. Под флагом чтобы удобнее сравнивать со старой.
2 изменения:
- Для оценки размера входных таблиц используется поколоночный dataweight (если есть), а не таблица целиком
- В самой кост-функции убран outputSize, чтобы он не учитывался дважды: он неявно учтен в каждом следующем джоине как левый либо правый inputSize, а размер результата последнего джоина не зависит от перестановки джоинов
commit_hash:d130848393114b1b4383035381dba7767aac62fb
| -rw-r--r-- | yql/essentials/core/yql_type_annotation.h | 1 | ||||
| -rw-r--r-- | yql/essentials/providers/config/yql_config_provider.cpp | 15 | ||||
| -rw-r--r-- | yql/essentials/sql/v1/context.h | 1 | ||||
| -rw-r--r-- | yql/essentials/sql/v1/query.cpp | 5 | ||||
| -rw-r--r-- | yql/essentials/sql/v1/sql_query.cpp | 16 | ||||
| -rw-r--r-- | yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp | 26 | ||||
| -rw-r--r-- | yt/yql/providers/yt/provider/yql_yt_provider_context.cpp | 58 | ||||
| -rw-r--r-- | yt/yql/providers/yt/provider/yql_yt_provider_context.h | 9 | ||||
| -rw-r--r-- | yt/yql/tests/sql/suites/join/cbo_4tables_version1.cfg | 5 | ||||
| -rw-r--r-- | yt/yql/tests/sql/suites/join/cbo_4tables_version1.yql | 18 |
10 files changed, 121 insertions, 33 deletions
diff --git a/yql/essentials/core/yql_type_annotation.h b/yql/essentials/core/yql_type_annotation.h index b99f6dad33b..9412718ffdc 100644 --- a/yql/essentials/core/yql_type_annotation.h +++ b/yql/essentials/core/yql_type_annotation.h @@ -453,6 +453,7 @@ struct TTypeAnnotationContext: public TThrRefBase { TFileStoragePtr FileStorage; TQContext QContext; ECostBasedOptimizerType CostBasedOptimizer = ECostBasedOptimizerType::Disable; + ui32 CostBasedOptimizerVersion = 0; bool MatchRecognize = false; EMatchRecognizeStreamingMode MatchRecognizeStreaming = EMatchRecognizeStreamingMode::Force; i64 TimeOrderRecoverDelay = -10'000'000; //microseconds diff --git a/yql/essentials/providers/config/yql_config_provider.cpp b/yql/essentials/providers/config/yql_config_provider.cpp index 6d7f916ccb4..acd8d38527d 100644 --- a/yql/essentials/providers/config/yql_config_provider.cpp +++ b/yql/essentials/providers/config/yql_config_provider.cpp @@ -874,6 +874,21 @@ private: ctx.AddError(TIssue(pos, TStringBuilder() << "Expected `disable|pg|native', but got: " << args[0])); return false; } + } else if (name == "CostBasedOptimizerVersion") { + if (args.size() != 1) { + ctx.AddError(TIssue(pos, TStringBuilder() << "Expected at most 1 argument, but got " << args.size())); + return false; + } + ui32 version; + if (!TryFromString(args[0], version)) { + ctx.AddError(TIssue(pos, TStringBuilder() << "Expected integer, but got: " << args[0])); + return false; + } + const ui32 maxCBOVersion = 1; + if (version > maxCBOVersion) { + ctx.AddError(TIssue(pos, TStringBuilder() << "Expected value <= " << maxCBOVersion << ", but got: " << args[0])); + } + Types_.CostBasedOptimizerVersion = version; } else if (name == "_EnableMatchRecognize" || name == "DisableMatchRecognize") { if (args.size() != 0) { ctx.AddError(TIssue(pos, TStringBuilder() << "Expected no arguments, but got " << args.size())); diff --git a/yql/essentials/sql/v1/context.h b/yql/essentials/sql/v1/context.h index 1f6d660c7fb..49041cd11b2 100644 --- a/yql/essentials/sql/v1/context.h +++ b/yql/essentials/sql/v1/context.h @@ -328,6 +328,7 @@ public: bool DqEngineEnable = false; bool DqEngineForce = false; TString CostBasedOptimizer; + TMaybe<ui32> CostBasedOptimizerVersion; TMaybe<bool> JsonQueryReturnsJsonDocument; TMaybe<bool> AnsiInForEmptyOrNullableItemsCollections; TMaybe<bool> AnsiRankForNullableKeys = true; diff --git a/yql/essentials/sql/v1/query.cpp b/yql/essentials/sql/v1/query.cpp index e88f03b0c3c..f106b9344d4 100644 --- a/yql/essentials/sql/v1/query.cpp +++ b/yql/essentials/sql/v1/query.cpp @@ -3324,6 +3324,11 @@ public: BuildQuotedAtom(Pos_, "CostBasedOptimizer"), BuildQuotedAtom(Pos_, ctx.CostBasedOptimizer)))); } + if (ctx.CostBasedOptimizerVersion) { + currentWorlds->Add(Y("let", "world", Y(TString(ConfigureName), "world", configSource, + BuildQuotedAtom(Pos_, "CostBasedOptimizerVersion"), BuildQuotedAtom(Pos_, ToString(*ctx.CostBasedOptimizerVersion))))); + } + if (ctx.JsonQueryReturnsJsonDocument.Defined()) { TString pragmaName = "DisableJsonQueryReturnsJsonDocument"; if (*ctx.JsonQueryReturnsJsonDocument) { diff --git a/yql/essentials/sql/v1/sql_query.cpp b/yql/essentials/sql/v1/sql_query.cpp index bb846e93012..813423b0540 100644 --- a/yql/essentials/sql/v1/sql_query.cpp +++ b/yql/essentials/sql/v1/sql_query.cpp @@ -3388,6 +3388,22 @@ THashMap<TString, TPragmaDescr> PragmaDescrs{ } return TNodePtr{}; }), + TableElemExt("CostBasedOptimizerVersion", [](CB_SIG) -> TMaybe<TNodePtr> { + auto& ctx = query.Context(); + if (values.size() == 1 && values[0].GetLiteral()) { + ui32 version; + if (!TryFromString(*values[0].GetLiteral(), version)) { + query.Error() << "Expected integer argument for: " << pragma; + return {}; + } + const ui32 maxCBOVersion = 1; + if (version > maxCBOVersion) { + query.Error() << "Expected value <= " << maxCBOVersion << " for: " << pragma; + } + ctx.CostBasedOptimizerVersion = version; + } + return TNodePtr{}; + }), TableElemExt("DisableCompactNamedExprs", [](CB_SIG) -> TMaybe<TNodePtr> { auto& ctx = query.Context(); if (!ctx.Warning(ctx.Pos(), TIssuesIds::YQL_DEPRECATED_PRAGMA, [](auto& out) { diff --git a/yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp b/yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp index 8bca90d4b71..c2d410ac36e 100644 --- a/yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp +++ b/yt/yql/providers/yt/provider/yql_yt_join_reorder.cpp @@ -172,6 +172,20 @@ public: TYtJoinNodeOp* OriginalOp; // Only for nonReorderable }; +TMaybe<uint64_t> ColumnsDataWeight(const TVector<TYtColumnStatistic>& columns) { + if (columns.empty()) { + return Nothing(); + } + uint64_t result = 0; + for (const auto& column : columns) { + if (!column.DataWeight) { + return Nothing(); + } + result += *column.DataWeight; + } + return result; +} + class TOptimizerTreeBuilder { public: @@ -207,7 +221,7 @@ public: } limits.LookupJoinMemLimit = Min(State->Configuration->LookupJoinLimit.Get().GetOrElse(0), State->Configuration->EvaluationTableSizeLimit.Get().GetOrElse(Max<ui64>())); - OutProviderCtx = std::make_shared<TYtProviderContext>(limits, std::move(ProviderRelInfo_)); + OutProviderCtx = std::make_shared<TYtProviderContext>(limits, std::move(ProviderRelInfo_), State->Types->CostBasedOptimizerVersion); } private: @@ -314,6 +328,7 @@ private: stat->ColumnStatistics = TIntrusivePtr<TOptimizerStatistics::TColumnStatMap>( new TOptimizerStatistics::TColumnStatMap()); auto providerStats = std::make_unique<TYtProviderStatistic>(); + bool canUseDataWeight = true; if (Y_UNLIKELY(!section.Settings().Empty()) && Y_UNLIKELY(section.Settings().Item(0).Name() == "Test")) { for (const auto& setting : section.Settings()) { @@ -330,6 +345,10 @@ private: auto tableStat = TYtTableBaseInfo::GetStat(path.Table()); stat->ByteSize += tableStat->DataSize; stat->Nrows += tableStat->RecordsCount; + const bool hasLookup = pathInfo.Table->Meta && pathInfo.Table->Meta->Attrs.Value("optimize_for", "scan") == "lookup"; + if (hasLookup) { + canUseDataWeight = false; + } } if (section.Ref().GetState() >= TExprNode::EState::ConstrComplete) { auto sorted = section.Ref().GetConstraint<TSortedConstraintNode>(); @@ -352,6 +371,11 @@ private: TVector<TMaybe<IYtGateway::TPathStatResult::TExtendedResult>> extendedStats; extendedStats = GetStatsFromCache(leaf, keyList); columnInfo = ExtractColumnInfo(extendedStats); + if (canUseDataWeight && State->Types->CostBasedOptimizerVersion >= 1) { + if (const auto dataWeight = ColumnsDataWeight(columnInfo)) { + stat->ByteSize = *dataWeight; + } + } } TDynBitMap relBitmap; diff --git a/yt/yql/providers/yt/provider/yql_yt_provider_context.cpp b/yt/yql/providers/yt/provider/yql_yt_provider_context.cpp index 3c51fc5a729..4953a520f2d 100644 --- a/yt/yql/providers/yt/provider/yql_yt_provider_context.cpp +++ b/yt/yql/providers/yt/provider/yql_yt_provider_context.cpp @@ -10,16 +10,13 @@ namespace NYql { -TMaybe<double> TYtProviderContext::ColumnNumUniqueValues(const TDynBitMap& relMap, const NDq::TJoinColumn& joinColumn) const { - auto entry = ColumnIndex_.find(joinColumn.AttributeName); +TMaybe<double> TYtProviderContext::ColumnNumUniqueValues(const TDynBitMap& relMap, const NDq::TJoinColumn& joinColumn) const { + const auto entry = ColumnIndex_.find(joinColumn.AttributeName); if (entry == ColumnIndex_.end()) { return Nothing(); } - for (const auto& relColEntry : entry->second) { - int relIndex = relColEntry.first; - int colIndex = relColEntry.second; - if (relMap[relIndex] && RelInfo_[relIndex].Label == joinColumn.RelName && - RelInfo_[relIndex].ColumnInfo[colIndex].EstimatedUniqueCount) { + for (const auto& [relIndex, colIndex] : entry->second) { + if (relMap[relIndex] && RelInfo_[relIndex].Label == joinColumn.RelName) { return RelInfo_[relIndex].ColumnInfo[colIndex].EstimatedUniqueCount; } } @@ -71,7 +68,7 @@ TString TYtProviderContext::DebugStatistic(const TVector<NDq::TJoinColumn>& colu TMaybe<double> TYtProviderContext::FindMaxUniqueVals(const TYtProviderStatistic& specific, const TVector<NDq::TJoinColumn>& columns) const { TMaybe<double> result; for (const auto& joinColumn : columns) { - auto val = ColumnNumUniqueValues(specific.RelBitmap, joinColumn); + const auto val = ColumnNumUniqueValues(specific.RelBitmap, joinColumn); if (val && (!result || *val > *result)) { result = val; } @@ -79,6 +76,8 @@ TMaybe<double> TYtProviderContext::FindMaxUniqueVals(const TYtProviderStatistic& return result; } +namespace { + double ComputeCardinality(TMaybe<double> maxUniques, double leftCardinality, double rightCardinality) { if (!maxUniques) { double result = 0.2 * leftCardinality * rightCardinality; @@ -91,9 +90,12 @@ double ComputeCardinality(TMaybe<double> maxUniques, double leftCardinality, dou return result; } -TYtProviderContext::TYtProviderContext(TJoinAlgoLimits limits, TVector<TYtProviderRelInfo> relInfo) +} // namespace + +TYtProviderContext::TYtProviderContext(TJoinAlgoLimits limits, TVector<TYtProviderRelInfo> relInfo, ui32 version) : Limits_(limits) - , RelInfo_(std::move(relInfo)) { + , RelInfo_(std::move(relInfo)) + , Version_(version) { for (int relIndex = 0; relIndex < std::ssize(RelInfo_); ++relIndex) { const auto& rel = RelInfo_[relIndex]; for (int colIndex = 0; colIndex < std::ssize(rel.ColumnInfo); ++colIndex) { @@ -158,31 +160,31 @@ TOptimizerStatistics TYtProviderContext::ComputeJoinStats( TMaybe<double> maxUniques; auto leftMaxUniques = FindMaxUniqueVals(*leftSpecific, leftJoinKeys); - if (leftMaxUniques && (!maxUniques || *maxUniques < *leftMaxUniques)) { - maxUniques = *leftMaxUniques; - } auto rightMaxUniques = FindMaxUniqueVals(*rightSpecific, rightJoinKeys); - if (rightMaxUniques && (!maxUniques || *maxUniques < *rightMaxUniques)) { - maxUniques = *rightMaxUniques; + if (!leftMaxUniques) { + maxUniques = rightMaxUniques; + } else if (!rightMaxUniques) { + maxUniques = leftMaxUniques; + } else { + maxUniques = Max(*leftMaxUniques, *rightMaxUniques); } auto resultSpecific = std::make_unique<TYtProviderStatistic>(); - resultSpecific->RelBitmap = leftSpecific->RelBitmap | rightSpecific->RelBitmap; resultSpecific->JoinAlgo = joinAlgo; double outputCardinality = ComputeCardinality(maxUniques, leftStats.Nrows, rightStats.Nrows); - double leftCardinalityFactor = leftStats.Nrows != 0.0 ? outputCardinality / leftStats.Nrows : 0.0; - double rightCardinalityFactor = rightStats.Nrows != 0.0 ? outputCardinality / rightStats.Nrows : 0.0; - double outputByteSize = leftCardinalityFactor * leftStats.ByteSize + rightCardinalityFactor * rightStats.ByteSize; + double leftRowSize = leftStats.Nrows != 0.0 ? leftStats.ByteSize / leftStats.Nrows : 0.0; + double rightRowSize = rightStats.Nrows != 0.0 ? rightStats.ByteSize / rightStats.Nrows : 0.0; + double outputByteSize = outputCardinality * (leftRowSize + rightRowSize); - auto leftReadBytes = leftStats.ByteSize; + double leftReadBytes = leftStats.ByteSize; double rightReadBytes = rightStats.ByteSize; if (joinAlgo == EJoinAlgoType::LookupJoin) { - auto leftJoinUniques = FindMaxUniqueVals(*leftSpecific, TVector<NDq::TJoinColumn>{leftJoinKeys[0]}); - auto rightJoinUniques = FindMaxUniqueVals(*rightSpecific, TVector<NDq::TJoinColumn>{rightJoinKeys[0]}); + auto leftJoinUniques = ColumnNumUniqueValues(leftSpecific->RelBitmap, leftJoinKeys[0]); + auto rightJoinUniques = ColumnNumUniqueValues(rightSpecific->RelBitmap, rightJoinKeys[0]); if (leftJoinUniques && rightJoinUniques && *rightJoinUniques < *leftJoinUniques) { leftReadBytes *= *rightJoinUniques / *leftJoinUniques; } @@ -195,10 +197,14 @@ TOptimizerStatistics TYtProviderContext::ComputeJoinStats( leftReadBytes = 0; } - double outputCost = - leftStats.Cost + rightStats.Cost + - leftReadBytes + rightReadBytes + - outputByteSize; + double outputCost = leftStats.Cost + rightStats.Cost; + if (Version_ >= 1) { + // Don't add outputByteSize to outputCost because outputByteSize will be accounted as *ReadBytes in subsequent joins. + // The output size of the final join can be ignored since it remains constant regardless of the join order. + outputCost += leftReadBytes + rightReadBytes; + } else { + outputCost += leftReadBytes + rightReadBytes + outputByteSize; + } TOptimizerStatistics result( EStatisticsType::ManyManyJoin, diff --git a/yt/yql/providers/yt/provider/yql_yt_provider_context.h b/yt/yql/providers/yt/provider/yql_yt_provider_context.h index fcf1d176b38..8f7bbae14f0 100644 --- a/yt/yql/providers/yt/provider/yql_yt_provider_context.h +++ b/yt/yql/providers/yt/provider/yql_yt_provider_context.h @@ -10,7 +10,7 @@ namespace NYql { struct TYtColumnStatistic { TString ColumnName; TMaybe<int64_t> EstimatedUniqueCount; - TMaybe<int64_t> DataWeight; + TMaybe<uint64_t> DataWeight; }; struct TRelSizeInfo { @@ -39,7 +39,7 @@ public: ui64 LookupJoinMaxRows; }; - TYtProviderContext(TJoinAlgoLimits limits, TVector<TYtProviderRelInfo> relInfo); + TYtProviderContext(TJoinAlgoLimits limits, TVector<TYtProviderRelInfo> relInfo, ui32 version); virtual TOptimizerStatistics ComputeJoinStats( const TOptimizerStatistics& leftStats, @@ -63,10 +63,6 @@ private: bool IsMapJoinApplicable(const TOptimizerStatistics& table) const; - TDynBitMap ExtractColumnsBitmap(const TDynBitMap& columnBitmap, const TVector<TString>& columns) const; - - TVector<TYtColumnStatistic> MergeColumnStatistics(const TYtProviderStatistic& leftSpecific, const TYtProviderStatistic& rightSpecific, const TDynBitMap& outputBitmap) const; - TMaybe<double> FindMaxUniqueVals(const TYtProviderStatistic& specific, const TVector<NDq::TJoinColumn>& columns) const; TMaybe<double> ColumnNumUniqueValues(const TDynBitMap& relMap, const NDq::TJoinColumn& columnName) const; @@ -76,6 +72,7 @@ private: const TJoinAlgoLimits Limits_; TVector<TYtProviderRelInfo> RelInfo_; THashMap<TString, THashSet<std::pair<int, int>>> ColumnIndex_; + ui32 Version_ = 0; }; } // namespace NYql diff --git a/yt/yql/tests/sql/suites/join/cbo_4tables_version1.cfg b/yt/yql/tests/sql/suites/join/cbo_4tables_version1.cfg new file mode 100644 index 00000000000..514082c6e4e --- /dev/null +++ b/yt/yql/tests/sql/suites/join/cbo_4tables_version1.cfg @@ -0,0 +1,5 @@ +providers yt +in InputA cbo_a.txt +in InputB cbo_b.txt +in InputC cbo_c.txt +in InputD cbo_d.txt diff --git a/yt/yql/tests/sql/suites/join/cbo_4tables_version1.yql b/yt/yql/tests/sql/suites/join/cbo_4tables_version1.yql new file mode 100644 index 00000000000..d0218c3785a --- /dev/null +++ b/yt/yql/tests/sql/suites/join/cbo_4tables_version1.yql @@ -0,0 +1,18 @@ +USE plato; +pragma warning("disable", "8001"); -- CBO_MISSING_TABLE_STATS + +pragma CostBasedOptimizer="native"; +pragma CostBasedOptimizerVersion="1"; +pragma yt.MapJoinLimit="1000"; +pragma yt.LookupJoinLimit="1000"; +pragma yt.LookupJoinMaxRows="100"; +pragma yt.ExtendedStatsMaxChunkCount="0"; +pragma yt.JoinMergeTablesLimit="100"; + +SELECT + InputA.Key1, InputA.Key2, InputA.Value, InputB.val, InputC.v, InputD.value as vald +FROM + InputA + INNER JOIN InputD ON InputA.Key2 = InputD.k + INNER JOIN InputB ON InputA.Fk1 = InputB.k + INNER JOIN InputC ON InputA.Key1 = InputC.k |
