diff options
| author | Pavel Velikhov <[email protected]> | 2024-10-14 14:09:35 +0300 |
|---|---|---|
| committer | GitHub <[email protected]> | 2024-10-14 11:09:35 +0000 |
| commit | 5a2282b60e1706cd2c19c4c2860c80befb05ac66 (patch) | |
| tree | 43f843b22cc8c4fe42257b55304a139b36d19449 | |
| parent | 2baa7c5aa8da2f87d559b100adcd36d6c1bd85f8 (diff) | |
Refactored join conditions in CBO (#10366)
19 files changed, 260 insertions, 243 deletions
diff --git a/ydb/core/kqp/opt/logical/kqp_opt_cbo.cpp b/ydb/core/kqp/opt/logical/kqp_opt_cbo.cpp index 240f38fbaff..ebd541bb988 100644 --- a/ydb/core/kqp/opt/logical/kqp_opt_cbo.cpp +++ b/ydb/core/kqp/opt/logical/kqp_opt_cbo.cpp @@ -36,7 +36,7 @@ TMaybeNode<TKqlKeyInc> GetRightTableKeyPrefix(const TKqlKeyRange& range) { /** * KQP specific rule to check if a LookupJoin is applicable */ -bool IsLookupJoinApplicableDetailed(const std::shared_ptr<NYql::TRelOptimizerNode>& node, const TVector<TString>& joinColumns, const TKqpProviderContext& ctx) { +bool IsLookupJoinApplicableDetailed(const std::shared_ptr<NYql::TRelOptimizerNode>& node, const TVector<TJoinColumn>& joinColumns, const TKqpProviderContext& ctx) { auto rel = std::static_pointer_cast<TKqpRelOptimizerNode>(node); auto expr = TExprBase(rel->Node); @@ -45,7 +45,7 @@ bool IsLookupJoinApplicableDetailed(const std::shared_ptr<NYql::TRelOptimizerNod return false; } - if (find_if(joinColumns.begin(), joinColumns.end(), [&] (const TString& s) { return node->Stats->KeyColumns->Data[0] == s;}) != joinColumns.end()) { + if (std::find_if(joinColumns.begin(), joinColumns.end(), [&] (const TJoinColumn& c) { return node->Stats->KeyColumns->Data[0] == c.AttributeName;}) != joinColumns.end()) { return true; } @@ -97,8 +97,8 @@ bool IsLookupJoinApplicableDetailed(const std::shared_ptr<NYql::TRelOptimizerNod return false; } - if (prefixSize < node->Stats->KeyColumns->Data.size() && (find_if(joinColumns.begin(), joinColumns.end(), [&] (const TString& s) { - return node->Stats->KeyColumns->Data[prefixSize] == s; + if (prefixSize < node->Stats->KeyColumns->Data.size() && (std::find_if(joinColumns.begin(), joinColumns.end(), [&] (const TJoinColumn& c) { + return node->Stats->KeyColumns->Data[prefixSize] == c.AttributeName; }) == joinColumns.end())){ return false; } @@ -108,12 +108,11 @@ bool IsLookupJoinApplicableDetailed(const std::shared_ptr<NYql::TRelOptimizerNod bool IsLookupJoinApplicable(std::shared_ptr<IBaseOptimizerNode> left, std::shared_ptr<IBaseOptimizerNode> right, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, TKqpProviderContext& ctx ) { - Y_UNUSED(left, joinConditions, leftJoinKeys); + Y_UNUSED(left, leftJoinKeys); if (!(right->Stats->StorageType == EStorageType::RowStorage)) { return false; @@ -130,7 +129,7 @@ bool IsLookupJoinApplicable(std::shared_ptr<IBaseOptimizerNode> left, } for (auto rightCol : rightJoinKeys) { - if (std::find(rightStats->KeyColumns->Data.begin(), rightStats->KeyColumns->Data.end(), rightCol) == rightStats->KeyColumns->Data.end()) { + if (find(rightStats->KeyColumns->Data.begin(), rightStats->KeyColumns->Data.end(), rightCol.AttributeName) == rightStats->KeyColumns->Data.end()) { return false; } } @@ -142,18 +141,17 @@ bool IsLookupJoinApplicable(std::shared_ptr<IBaseOptimizerNode> left, bool TKqpProviderContext::IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& left, const std::shared_ptr<IBaseOptimizerNode>& right, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, EJoinAlgoType joinAlgo, - EJoinKind joinKind) { + EJoinKind joinKind) { switch( joinAlgo ) { case EJoinAlgoType::LookupJoin: if ((OptLevel != 3) && (left->Stats->Nrows > 1000)) { return false; } - return IsLookupJoinApplicable(left, right, joinConditions, leftJoinKeys, rightJoinKeys, *this); + return IsLookupJoinApplicable(left, right, leftJoinKeys, rightJoinKeys, *this); case EJoinAlgoType::LookupJoinReverse: if (joinKind != EJoinKind::LeftSemi) { @@ -162,7 +160,7 @@ bool TKqpProviderContext::IsJoinApplicable(const std::shared_ptr<IBaseOptimizerN if ((OptLevel != 3) && (right->Stats->Nrows > 1000)) { return false; } - return IsLookupJoinApplicable(right, left, joinConditions, rightJoinKeys, leftJoinKeys, *this); + return IsLookupJoinApplicable(right, left, rightJoinKeys, leftJoinKeys, *this); case EJoinAlgoType::MapJoin: return joinKind != EJoinKind::OuterJoin && joinKind != EJoinKind::Exclusion && right->Stats->ByteSize < 1e6; diff --git a/ydb/core/kqp/opt/logical/kqp_opt_cbo.h b/ydb/core/kqp/opt/logical/kqp_opt_cbo.h index 9df809aaacb..52aa93ef414 100644 --- a/ydb/core/kqp/opt/logical/kqp_opt_cbo.h +++ b/ydb/core/kqp/opt/logical/kqp_opt_cbo.h @@ -25,8 +25,7 @@ struct TKqpProviderContext : public NYql::TBaseProviderContext { virtual bool IsJoinApplicable(const std::shared_ptr<NYql::IBaseOptimizerNode>& left, const std::shared_ptr<NYql::IBaseOptimizerNode>& right, - const std::set<std::pair<NYql::NDq::TJoinColumn, NYql::NDq::TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, const TVector<TString>& rightJoinKeys, + const TVector<NYql::NDq::TJoinColumn>& leftJoinKeys, const TVector<NYql::NDq::TJoinColumn>& rightJoinKeys, NYql::EJoinAlgoType joinAlgo, NYql::EJoinKind joinKind) override; virtual double ComputeJoinCost(const NYql::TOptimizerStatistics& leftStats, const NYql::TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, NYql::EJoinAlgoType joinAlgo) const override; diff --git a/ydb/library/yql/core/cbo/cbo_optimizer_new.cpp b/ydb/library/yql/core/cbo/cbo_optimizer_new.cpp index eb7d07e429d..380f60c26e1 100644 --- a/ydb/library/yql/core/cbo/cbo_optimizer_new.cpp +++ b/ydb/library/yql/core/cbo/cbo_optimizer_new.cpp @@ -77,7 +77,8 @@ void TRelOptimizerNode::Print(std::stringstream& stream, int ntabs) { TJoinOptimizerNode::TJoinOptimizerNode( const std::shared_ptr<IBaseOptimizerNode>& left, const std::shared_ptr<IBaseOptimizerNode>& right, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& joinConditions, + TVector<TJoinColumn> leftKeys, + TVector<TJoinColumn> rightKeys, const EJoinKind joinType, const EJoinAlgoType joinAlgo, bool leftAny, @@ -86,18 +87,14 @@ TJoinOptimizerNode::TJoinOptimizerNode( ) : IBaseOptimizerNode(JoinNodeType) , LeftArg(left) , RightArg(right) - , JoinConditions(joinConditions) + , LeftJoinKeys(leftKeys) + , RightJoinKeys(rightKeys) , JoinType(joinType) , JoinAlgo(joinAlgo) , LeftAny(leftAny) , RightAny(rightAny) , IsReorderable(!nonReorderable) -{ - for (const auto& [l,r] : joinConditions ) { - LeftJoinKeys.push_back(l.AttributeName); - RightJoinKeys.push_back(r.AttributeName); - } -} +{} TVector<TString> TJoinOptimizerNode::Labels() { auto res = LeftArg->Labels(); @@ -120,10 +117,10 @@ void TJoinOptimizerNode::Print(std::stringstream& stream, int ntabs) { } stream << ") "; - for (auto c : JoinConditions){ - stream << c.first.RelName << "." << c.first.AttributeName - << "=" << c.second.RelName << "." - << c.second.AttributeName << ","; + for (size_t i=0; i<LeftJoinKeys.size(); i++){ + stream << LeftJoinKeys[i].RelName << "." << LeftJoinKeys[i].AttributeName + << "=" << RightJoinKeys[i].RelName << "." + << RightJoinKeys[i].AttributeName << ","; } stream << "\n"; @@ -138,13 +135,14 @@ void TJoinOptimizerNode::Print(std::stringstream& stream, int ntabs) { RightArg->Print(stream, ntabs+1); } -bool IsPKJoin(const TOptimizerStatistics& stats, const TVector<TString>& joinKeys) { +bool IsPKJoin(const TOptimizerStatistics& stats, const TVector<TJoinColumn>& joinKeys) { if (!stats.KeyColumns) { return false; } for(size_t i = 0; i < stats.KeyColumns->Data.size(); i++){ - if (std::find(joinKeys.begin(), joinKeys.end(), stats.KeyColumns->Data[i]) == joinKeys.end()) { + if (std::find_if(joinKeys.begin(), joinKeys.end(), + [&] (const TJoinColumn& c) { return c.AttributeName == stats.KeyColumns->Data[i];}) == joinKeys.end()) { return false; } } @@ -153,15 +151,13 @@ bool IsPKJoin(const TOptimizerStatistics& stats, const TVector<TString>& joinKey bool TBaseProviderContext::IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& left, const std::shared_ptr<IBaseOptimizerNode>& right, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, EJoinAlgoType joinAlgo, EJoinKind joinKind) { Y_UNUSED(left); Y_UNUSED(right); - Y_UNUSED(joinConditions); Y_UNUSED(leftJoinKeys); Y_UNUSED(rightJoinKeys); Y_UNUSED(joinKind); @@ -182,30 +178,12 @@ double TBaseProviderContext::ComputeJoinCost(const TOptimizerStatistics& leftSta * * The build is on the right side, so we make the build side a bit more expensive than the probe */ -TOptimizerStatistics TBaseProviderContext::ComputeJoinStats( - const TOptimizerStatistics& leftStats, - const TOptimizerStatistics& rightStats, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, - EJoinAlgoType joinAlgo, - EJoinKind joinKind, - TCardinalityHints::TCardinalityHint* maybeHint) const -{ - TVector<TString> leftJoinKeys; - TVector<TString> rightJoinKeys; - - for (auto c : joinConditions) { - leftJoinKeys.emplace_back(c.first.AttributeName); - rightJoinKeys.emplace_back(c.second.AttributeName); - } - - return ComputeJoinStats(leftStats, rightStats, leftJoinKeys, rightJoinKeys, joinAlgo, joinKind, maybeHint); -} TOptimizerStatistics TBaseProviderContext::ComputeJoinStats( const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, EJoinAlgoType joinAlgo, EJoinKind joinKind, TCardinalityHints::TCardinalityHint* maybeHint) const @@ -265,9 +243,9 @@ TOptimizerStatistics TBaseProviderContext::ComputeJoinStats( std::optional<double> lhsUniqueVals; std::optional<double> rhsUniqueVals; if (leftStats.ColumnStatistics && rightStats.ColumnStatistics && !leftJoinKeys.empty() && !rightJoinKeys.empty()) { - auto lhs = leftJoinKeys[0]; + auto lhs = leftJoinKeys[0].AttributeName; lhsUniqueVals = leftStats.ColumnStatistics->Data[lhs].NumUniqueVals; - auto rhs = rightJoinKeys[0]; + auto rhs = rightJoinKeys[0].AttributeName; rightStats.ColumnStatistics->Data[rhs]; rhsUniqueVals = leftStats.ColumnStatistics->Data[lhs].NumUniqueVals; } diff --git a/ydb/library/yql/core/cbo/cbo_optimizer_new.h b/ydb/library/yql/core/cbo/cbo_optimizer_new.h index 0a564e4c359..af3b9452902 100644 --- a/ydb/library/yql/core/cbo/cbo_optimizer_new.h +++ b/ydb/library/yql/core/cbo/cbo_optimizer_new.h @@ -201,27 +201,18 @@ struct IProviderContext { virtual TOptimizerStatistics ComputeJoinStats( const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, - EJoinAlgoType joinAlgo, - EJoinKind joinKind, - TCardinalityHints::TCardinalityHint* maybeHint = nullptr) const = 0; - - virtual TOptimizerStatistics ComputeJoinStats( - const TOptimizerStatistics& leftStats, - const TOptimizerStatistics& rightStats, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<NDq::TJoinColumn>& leftJoinKeys, + const TVector<NDq::TJoinColumn>& rightJoinKeys, EJoinAlgoType joinAlgo, EJoinKind joinKind, TCardinalityHints::TCardinalityHint* maybeHint = nullptr) const = 0; virtual bool IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& left, const std::shared_ptr<IBaseOptimizerNode>& right, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<NDq::TJoinColumn>& leftJoinKeys, + const TVector<NDq::TJoinColumn>& rightJoinKeys, EJoinAlgoType joinAlgo, - EJoinKind joinKind) = 0; + EJoinKind joinKin) = 0; }; /** @@ -233,27 +224,19 @@ struct TBaseProviderContext : public IProviderContext { double ComputeJoinCost(const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, EJoinAlgoType joinAlgo) const override; - bool IsJoinApplicable(const std::shared_ptr<IBaseOptimizerNode>& left, - const std::shared_ptr<IBaseOptimizerNode>& right, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + bool IsJoinApplicable( + const std::shared_ptr<IBaseOptimizerNode>& leftStats, + const std::shared_ptr<IBaseOptimizerNode>& rightStats, + const TVector<NDq::TJoinColumn>& leftJoinKeys, + const TVector<NDq::TJoinColumn>& rightJoinKeys, EJoinAlgoType joinAlgo, EJoinKind joinKind) override; virtual TOptimizerStatistics ComputeJoinStats( const TOptimizerStatistics& leftStats, const TOptimizerStatistics& rightStats, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, - EJoinAlgoType joinAlgo, - EJoinKind joinKind, - TCardinalityHints::TCardinalityHint* maybeHint = nullptr) const override; - - virtual TOptimizerStatistics ComputeJoinStats( - const TOptimizerStatistics& leftStats, - const TOptimizerStatistics& rightStats, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, + const TVector<NDq::TJoinColumn>& leftJoinKeys, + const TVector<NDq::TJoinColumn>& rightJoinKeys, EJoinAlgoType joinAlgo, EJoinKind joinKind, TCardinalityHints::TCardinalityHint* maybeHint = nullptr) const override; @@ -290,9 +273,8 @@ struct TRelOptimizerNode : public IBaseOptimizerNode { struct TJoinOptimizerNode : public IBaseOptimizerNode { std::shared_ptr<IBaseOptimizerNode> LeftArg; std::shared_ptr<IBaseOptimizerNode> RightArg; - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>> JoinConditions; - TVector<TString> LeftJoinKeys; - TVector<TString> RightJoinKeys; + TVector<NDq::TJoinColumn> LeftJoinKeys; + TVector<NDq::TJoinColumn> RightJoinKeys; EJoinKind JoinType; EJoinAlgoType JoinAlgo; /////////////////// 'ANY' flag means leaving only one row from the join side. @@ -303,7 +285,8 @@ struct TJoinOptimizerNode : public IBaseOptimizerNode { TJoinOptimizerNode(const std::shared_ptr<IBaseOptimizerNode>& left, const std::shared_ptr<IBaseOptimizerNode>& right, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, + TVector<NDq::TJoinColumn> leftKeys, + TVector<NDq::TJoinColumn> rightKeys, const EJoinKind joinType, const EJoinAlgoType joinAlgo, bool leftAny, diff --git a/ydb/library/yql/core/yql_cost_function.h b/ydb/library/yql/core/yql_cost_function.h index cb12f37238b..b69c5941db6 100644 --- a/ydb/library/yql/core/yql_cost_function.h +++ b/ydb/library/yql/core/yql_cost_function.h @@ -38,9 +38,14 @@ namespace NDq { struct TJoinColumn { TString RelName; TString AttributeName; + TString AttributeNameWithAliases; + ui32 EquivalenceClass = 0; + bool IsConstant = false; - TJoinColumn(TString relName, TString attributeName) : RelName(relName), - AttributeName(std::move(attributeName)) {} + TJoinColumn(TString relName, TString attributeName) : + RelName(relName), + AttributeName(attributeName), + AttributeNameWithAliases(attributeName) {} bool operator == (const TJoinColumn& other) const { return RelName == other.RelName && AttributeName == other.AttributeName; diff --git a/ydb/library/yql/dq/opt/dq_cbo_ut.cpp b/ydb/library/yql/dq/opt/dq_cbo_ut.cpp index 3e088a59ded..8973518e0b6 100644 --- a/ydb/library/yql/dq/opt/dq_cbo_ut.cpp +++ b/ydb/library/yql/dq/opt/dq_cbo_ut.cpp @@ -45,15 +45,14 @@ Y_UNIT_TEST(JoinSearch2Rels) { auto rel2 = std::make_shared<TRelOptimizerNode>("b", std::make_shared<TOptimizerStatistics>(BaseTable, 1000000, 1, 0, 9000009)); - std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>> joinConditions; - joinConditions.insert({ - NDq::TJoinColumn("a", "1"), - NDq::TJoinColumn("b", "1") - }); + TVector<NDq::TJoinColumn> leftKeys = {NDq::TJoinColumn("a", "1")}; + TVector<NDq::TJoinColumn> rightKeys ={NDq::TJoinColumn("b", "1")}; + auto op = std::make_shared<TJoinOptimizerNode>( std::static_pointer_cast<IBaseOptimizerNode>(rel1), std::static_pointer_cast<IBaseOptimizerNode>(rel2), - joinConditions, + leftKeys, + rightKeys, InnerJoin, EJoinAlgoType::GraceJoin, true, @@ -86,30 +85,28 @@ Y_UNIT_TEST(JoinSearch3Rels) { auto rel3 = std::make_shared<TRelOptimizerNode>("c", std::make_shared<TOptimizerStatistics>(BaseTable, 10000, 1, 0, 9009)); - std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>> joinConditions; - joinConditions.insert({ - NDq::TJoinColumn("a", "1"), - NDq::TJoinColumn("b", "1") - }); + TVector<NDq::TJoinColumn> leftKeys = {NDq::TJoinColumn("a", "1")}; + TVector<NDq::TJoinColumn> rightKeys ={NDq::TJoinColumn("b", "1")}; + auto op1 = std::make_shared<TJoinOptimizerNode>( std::static_pointer_cast<IBaseOptimizerNode>(rel1), std::static_pointer_cast<IBaseOptimizerNode>(rel2), - joinConditions, + leftKeys, + rightKeys, InnerJoin, EJoinAlgoType::GraceJoin, false, false ); - joinConditions.insert({ - NDq::TJoinColumn("a", "1"), - NDq::TJoinColumn("c", "1") - }); + leftKeys.push_back(NDq::TJoinColumn("a", "1")); + rightKeys.push_back(NDq::TJoinColumn("c", "1")); auto op2 = std::make_shared<TJoinOptimizerNode>( std::static_pointer_cast<IBaseOptimizerNode>(op1), std::static_pointer_cast<IBaseOptimizerNode>(rel3), - joinConditions, + leftKeys, + rightKeys, InnerJoin, EJoinAlgoType::GraceJoin, true, diff --git a/ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h b/ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h index 32ae0fb96fb..09b3a676ffc 100644 --- a/ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h +++ b/ydb/library/yql/dq/opt/dq_opt_dphyp_solver.h @@ -84,10 +84,8 @@ private: bool leftAny, bool rightAny, bool isCommutative, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& joinConditions, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& reversedJoinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, IProviderContext& ctx, TCardinalityHints::TCardinalityHint* maybeCardHint, TJoinAlgoHints::TJoinAlgoHint* maybeJoinHint @@ -414,17 +412,15 @@ template <typename TNodeSet> std::shared_ptr<TJoinOptimizerNodeInternal> TDPHypS bool leftAny, bool rightAny, bool isCommutative, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& joinConditions, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& reversedJoinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, IProviderContext& ctx, TCardinalityHints::TCardinalityHint* maybeCardHint, TJoinAlgoHints::TJoinAlgoHint* maybeJoinAlgoHint ) { if (maybeJoinAlgoHint) { maybeJoinAlgoHint->Applied = true; - return MakeJoinInternal(left, right, joinConditions, leftJoinKeys, rightJoinKeys, joinKind, maybeJoinAlgoHint->Algo, leftAny, rightAny, ctx, maybeCardHint); + return MakeJoinInternal(left, right, leftJoinKeys, rightJoinKeys, joinKind, maybeJoinAlgoHint->Algo, leftAny, rightAny, ctx, maybeCardHint); } double bestCost = std::numeric_limits<double>::infinity(); @@ -432,7 +428,7 @@ template <typename TNodeSet> std::shared_ptr<TJoinOptimizerNodeInternal> TDPHypS bool bestJoinIsReversed = false; for (auto joinAlgo : AllJoinAlgos) { - if (ctx.IsJoinApplicable(left, right, joinConditions, leftJoinKeys, rightJoinKeys, joinAlgo, joinKind)){ + if (ctx.IsJoinApplicable(left, right, leftJoinKeys, rightJoinKeys, joinAlgo, joinKind)){ auto cost = ctx.ComputeJoinStats(*left->Stats, *right->Stats, leftJoinKeys, rightJoinKeys, joinAlgo, joinKind, maybeCardHint).Cost; if (cost < bestCost) { bestCost = cost; @@ -442,7 +438,7 @@ template <typename TNodeSet> std::shared_ptr<TJoinOptimizerNodeInternal> TDPHypS } if (isCommutative) { - if (ctx.IsJoinApplicable(right, left, reversedJoinConditions, rightJoinKeys, leftJoinKeys, joinAlgo, joinKind)){ + if (ctx.IsJoinApplicable(right, left, rightJoinKeys, leftJoinKeys, joinAlgo, joinKind)){ auto cost = ctx.ComputeJoinStats(*right->Stats, *left->Stats, rightJoinKeys, leftJoinKeys, joinAlgo, joinKind, maybeCardHint).Cost; if (cost < bestCost) { bestCost = cost; @@ -456,10 +452,10 @@ template <typename TNodeSet> std::shared_ptr<TJoinOptimizerNodeInternal> TDPHypS Y_ENSURE(bestAlgo != EJoinAlgoType::Undefined, "No join was chosen!"); if (bestJoinIsReversed) { - return MakeJoinInternal(right, left, reversedJoinConditions, rightJoinKeys, leftJoinKeys, joinKind, bestAlgo, rightAny, leftAny, ctx, maybeCardHint); + return MakeJoinInternal(right, left, rightJoinKeys, leftJoinKeys, joinKind, bestAlgo, rightAny, leftAny, ctx, maybeCardHint); } - return MakeJoinInternal(left, right, joinConditions, leftJoinKeys, rightJoinKeys, joinKind, bestAlgo, leftAny, rightAny, ctx, maybeCardHint); + return MakeJoinInternal(left, right, leftJoinKeys, rightJoinKeys, joinKind, bestAlgo, leftAny, rightAny, ctx, maybeCardHint); } /* @@ -493,8 +489,6 @@ template<typename TNodeSet> void TDPHypSolver<TNodeSet>::EmitCsgCmp(const TNodeS csgCmpEdge->LeftAny, csgCmpEdge->RightAny, csgCmpEdge->IsCommutative, - csgCmpEdge->JoinConditions, - reversedEdge->JoinConditions, csgCmpEdge->LeftJoinKeys, csgCmpEdge->RightJoinKeys, Pctx_, diff --git a/ydb/library/yql/dq/opt/dq_opt_hypergraph_ut.cpp b/ydb/library/yql/dq/opt/dq_opt_hypergraph_ut.cpp index 02e43b3bbbe..6a80394d662 100644 --- a/ydb/library/yql/dq/opt/dq_opt_hypergraph_ut.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_hypergraph_ut.cpp @@ -24,11 +24,11 @@ std::shared_ptr<IBaseOptimizerNode> CreateChain(size_t size, TString onAttribute auto ei = std::make_shared<TRelOptimizerNode>(eiStr, std::make_shared<TOptimizerStatistics>()); ei->Stats->Labels = std::make_shared<TVector<TString>>(TVector<TString>{eiStr}); - std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>> joinConditions; - joinConditions.insert({TJoinColumn(eiPrevStr, onAttribute), TJoinColumn(eiStr, onAttribute)}); + TVector<NDq::TJoinColumn> leftKeys = {TJoinColumn(eiPrevStr, onAttribute)}; + TVector<NDq::TJoinColumn> rightKeys = {TJoinColumn(eiStr, onAttribute)}; root = std::make_shared<TJoinOptimizerNode>( - root, ei, joinConditions, EJoinKind::InnerJoin, EJoinAlgoType::Undefined, false, false + root, ei, leftKeys, rightKeys, EJoinKind::InnerJoin, EJoinAlgoType::Undefined, false, false ); } @@ -105,23 +105,26 @@ Y_UNIT_TEST_SUITE(HypergraphBuild) { auto lhs = CreateChain(3, "228", "a"); auto rhs = CreateChain(2, "1337", "b"); - std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>> joinConditions; - joinConditions.insert({TJoinColumn("a3", "1337"), TJoinColumn("b1", "1337")}); + TVector<NDq::TJoinColumn> leftKeys = {TJoinColumn("a3", "1337")}; + TVector<NDq::TJoinColumn> rightKeys = {TJoinColumn("b1", "1337")}; // a1 --228-- a2 --228-- a3 --1337-- b1 --1337-- b2 auto root = std::make_shared<TJoinOptimizerNode>( - lhs, rhs, joinConditions, EJoinKind::InnerJoin, EJoinAlgoType::Undefined, false, false + lhs, rhs, leftKeys, rightKeys, EJoinKind::InnerJoin, EJoinAlgoType::Undefined, false, false ); - joinConditions.clear(); + leftKeys.clear(); + rightKeys.clear(); + + leftKeys.push_back(TJoinColumn("c2", "123")); + rightKeys.push_back(TJoinColumn("b2", "123")); - joinConditions.insert({TJoinColumn("c2", "123"), TJoinColumn("b2", "123")}); rhs = CreateChain(2, "228", "c"); // a1 --228-- a2 --228-- a3 --1337-- b1 --1337-- b2 --123-- c1 --228-- c2 // ^ we don't want to have transitive closure between c and a root = std::make_shared<TJoinOptimizerNode>( - root, rhs, joinConditions, EJoinKind::InnerJoin, EJoinAlgoType::Undefined, false, false + root, rhs, leftKeys, rightKeys, EJoinKind::InnerJoin, EJoinAlgoType::Undefined, false, false ); auto graph = MakeJoinHypergraph<TNodeSet>(root); @@ -184,7 +187,8 @@ Y_UNIT_TEST_SUITE(HypergraphBuild) { TJoinOptimizerNode( GetJoinArg(lhsArg), GetJoinArg(rhsArg), - {{TJoinColumn(lhsCond.c_str(), col), TJoinColumn(rhsCond.c_str(), col)}}, + {TJoinColumn(lhsCond.c_str(), col)}, + {TJoinColumn(rhsCond.c_str(), col)}, EJoinKind::InnerJoin, EJoinAlgoType::Undefined, false, diff --git a/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp b/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp index 32d3fd33f21..e5097ade867 100644 --- a/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_join_cost_based.cpp @@ -82,7 +82,8 @@ std::shared_ptr<TJoinOptimizerNode> ConvertToJoinTree( right = *it; } - std::set<std::pair<TJoinColumn, TJoinColumn>> joinConds; + TVector<TJoinColumn> leftKeys; + TVector<TJoinColumn> rightKeys; size_t joinKeysCount = joinTuple.LeftKeys().Size() / 2; for (size_t i = 0; i < joinKeysCount; ++i) { @@ -90,15 +91,15 @@ std::shared_ptr<TJoinOptimizerNode> ConvertToJoinTree( auto leftScope = joinTuple.LeftKeys().Item(keyIndex).StringValue(); auto leftColumn = joinTuple.LeftKeys().Item(keyIndex + 1).StringValue(); + leftKeys.push_back(TJoinColumn(leftScope, leftColumn)); + auto rightScope = joinTuple.RightKeys().Item(keyIndex).StringValue(); auto rightColumn = joinTuple.RightKeys().Item(keyIndex + 1).StringValue(); - - joinConds.insert( std::make_pair( TJoinColumn(leftScope, leftColumn), - TJoinColumn(rightScope, rightColumn))); + rightKeys.push_back(TJoinColumn(rightScope, rightColumn)); } const auto linkSettings = GetEquiJoinLinkSettings(joinTuple.Options().Ref()); - return std::make_shared<TJoinOptimizerNode>(left, right, joinConds, ConvertToJoinKind(joinTuple.Type().StringValue()), EJoinAlgoType::Undefined, + return std::make_shared<TJoinOptimizerNode>(left, right, leftKeys, rightKeys, ConvertToJoinKind(joinTuple.Type().StringValue()), EJoinAlgoType::Undefined, linkSettings.LeftHints.contains("any"), linkSettings.RightHints.contains("any")); } @@ -138,11 +139,13 @@ TExprBase BuildTree(TExprContext& ctx, const TCoEquiJoin& equiJoin, TVector<TExprBase> rightJoinColumns; // Build join conditions - for( auto pair : reorderResult->JoinConditions) { - leftJoinColumns.push_back(BuildAtom(pair.first.RelName, equiJoin.Pos(), ctx)); - leftJoinColumns.push_back(BuildAtom(pair.first.AttributeName, equiJoin.Pos(), ctx)); - rightJoinColumns.push_back(BuildAtom(pair.second.RelName, equiJoin.Pos(), ctx)); - rightJoinColumns.push_back(BuildAtom(pair.second.AttributeName, equiJoin.Pos(), ctx)); + for( auto leftKey : reorderResult->LeftJoinKeys) { + leftJoinColumns.push_back(BuildAtom(leftKey.RelName, equiJoin.Pos(), ctx)); + leftJoinColumns.push_back(BuildAtom(leftKey.AttributeNameWithAliases, equiJoin.Pos(), ctx)); + } + for( auto rightKey : reorderResult->RightJoinKeys) { + rightJoinColumns.push_back(BuildAtom(rightKey.RelName, equiJoin.Pos(), ctx)); + rightJoinColumns.push_back(BuildAtom(rightKey.AttributeNameWithAliases, equiJoin.Pos(), ctx)); } TExprNode::TListType options(1U, diff --git a/ydb/library/yql/dq/opt/dq_opt_join_hypergraph.h b/ydb/library/yql/dq/opt/dq_opt_join_hypergraph.h index c6f5be64fb2..6a73c7149a6 100644 --- a/ydb/library/yql/dq/opt/dq_opt_join_hypergraph.h +++ b/ydb/library/yql/dq/opt/dq_opt_join_hypergraph.h @@ -31,7 +31,8 @@ public: bool leftAny, bool rightAny, bool isCommutative, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& joinConditions + TVector<TJoinColumn>& leftJoinKeys, + TVector<TJoinColumn>& rightJoinKeys ) : Left(left) , Right(right) @@ -39,14 +40,24 @@ public: , LeftAny(leftAny) , RightAny(rightAny) , IsCommutative(isCommutative) - , JoinConditions(joinConditions) + , LeftJoinKeys(leftJoinKeys) + , RightJoinKeys(rightJoinKeys) , IsReversed(false) { - BuildCondVectors(); + RemoveAttributeAliases(); } bool AreCondVectorEqual() const { - return LeftJoinKeys == RightJoinKeys; + TVector<TString> leftAttrNames; + TVector<TString> rightAttrNames; + for (auto & l : LeftJoinKeys) { + leftAttrNames.push_back(l.AttributeName); + } + for (auto & r : RightJoinKeys) { + rightAttrNames.push_back(r.AttributeName); + } + + return leftAttrNames == rightAttrNames; } inline bool IsSimple() const { @@ -58,32 +69,25 @@ public: EJoinKind JoinKind; bool LeftAny, RightAny; bool IsCommutative; - std::set<std::pair<TJoinColumn, TJoinColumn>> JoinConditions; - TVector<TString> LeftJoinKeys; - TVector<TString> RightJoinKeys; + TVector<TJoinColumn> LeftJoinKeys; + TVector<TJoinColumn> RightJoinKeys; // JoinKind may not be commutative, so we need to know which edge is original and which is reversed. bool IsReversed; int64_t ReversedEdgeId = -1; - void BuildCondVectors() { - LeftJoinKeys.clear(); - RightJoinKeys.clear(); + void RemoveAttributeAliases() { - for (const auto& [left, right] : JoinConditions) { - auto leftKey = left.AttributeName; - auto rightKey = right.AttributeName; - - if (auto idx = leftKey.find_last_of('.'); idx != TString::npos) { - leftKey = leftKey.substr(idx+1); + for (auto& leftKey : LeftJoinKeys ) { + if (auto idx = leftKey.AttributeName.find_last_of('.'); idx != TString::npos) { + leftKey.AttributeName = leftKey.AttributeName.substr(idx+1); } + } - if (auto idx = rightKey.find_last_of('.'); idx != TString::npos) { - rightKey = rightKey.substr(idx+1); + for (auto& rightKey : RightJoinKeys ) { + if (auto idx = rightKey.AttributeName.find_last_of('.'); idx != TString::npos) { + rightKey.AttributeName = rightKey.AttributeName.substr(idx+1); } - - LeftJoinKeys.emplace_back(leftKey); - RightJoinKeys.emplace_back(rightKey); } } }; @@ -133,10 +137,30 @@ public: }; for (const auto& edge: Edges_) { + TString leftKeyStr; + TString rightKeyStr; + + for (auto& l: edge.LeftJoinKeys) { + leftKeyStr.append(l.RelName); + leftKeyStr.append("."); + leftKeyStr.append(l.AttributeName); + leftKeyStr.append(","); + } + + for (auto& r: edge.RightJoinKeys) { + rightKeyStr.append(r.RelName); + rightKeyStr.append("."); + rightKeyStr.append(r.AttributeName); + rightKeyStr.append(","); + } res .append(edgeSideToString(edge.Left)) .append(" -> ") .append(edgeSideToString(edge.Right)) + .append(" on ") + .append(leftKeyStr) + .append("==") + .append(rightKeyStr) .append("\n"); } @@ -164,17 +188,12 @@ public: AddEdgeImpl(edge); - std::set<std::pair<TJoinColumn, TJoinColumn>> reversedJoinConditions; - for (const auto& [lhs, rhs]: edge.JoinConditions) { - reversedJoinConditions.insert({rhs, lhs}); - } - TEdge reversedEdge = std::move(edge); std::swap(reversedEdge.Left, reversedEdge.Right); - reversedEdge.JoinConditions = std::move(reversedJoinConditions); + std::swap(reversedEdge.LeftJoinKeys, reversedEdge.RightJoinKeys); reversedEdge.IsReversed = true; reversedEdge.ReversedEdgeId = edgeId; - reversedEdge.BuildCondVectors(); + reversedEdge.RemoveAttributeAliases(); AddEdgeImpl(reversedEdge); } @@ -404,8 +423,15 @@ public: edges.begin(), edges.end(), [](const THyperedge& lhs, const THyperedge& rhs) { - auto lhsAttributeNames = lhs.LeftJoinKeys; - auto rhsAttributeNames = rhs.LeftJoinKeys; + TVector<TString> lhsAttributeNames; + TVector<TString> rhsAttributeNames; + + for (auto & l : lhs.LeftJoinKeys ) { + lhsAttributeNames.push_back(l.AttributeName); + } + for (auto & r : rhs.LeftJoinKeys ) { + rhsAttributeNames.push_back(r.AttributeName); + } std::sort(lhsAttributeNames.begin(), lhsAttributeNames.end()); std::sort(rhsAttributeNames.begin(), rhsAttributeNames.end()); @@ -439,9 +465,12 @@ private: bool isJoinCommutative = edges[groupBegin].IsCommutative; TVector<TString> groupConditionUsedAttributes; - for (const auto& [lhs, rhs]: edges[groupBegin].JoinConditions) { + for (const auto& lhs: edges[groupBegin].LeftJoinKeys) { groupConditionUsedAttributes.push_back(lhs.AttributeName); } + for (const auto& rhs: edges[groupBegin].RightJoinKeys) { + groupConditionUsedAttributes.push_back(rhs.AttributeName); + } TDisjointSets connectedComponents(nodeSetSize); for (size_t edgeId = groupBegin; edgeId < groupEnd; ++edgeId) { @@ -464,15 +493,15 @@ private: TString lhsRelName = nodes[i].RelationOptimizerNode->Labels()[0]; TString rhsRelName = nodes[j].RelationOptimizerNode->Labels()[0]; - std::set<std::pair<TJoinColumn, TJoinColumn>> joinConditions; + TVector<TJoinColumn> leftKeys; + TVector<TJoinColumn> rightKeys; + for (const auto& attributeName: groupConditionUsedAttributes){ - joinConditions.insert({ - TJoinColumn(lhsRelName, attributeName), - TJoinColumn(rhsRelName, attributeName) - }); + leftKeys.push_back(TJoinColumn(lhsRelName, attributeName)); + rightKeys.push_back(TJoinColumn(rhsRelName, attributeName)); } - auto e = THyperedge(lhs, rhs, groupJoinKind, false, false, isJoinCommutative, joinConditions); + auto e = THyperedge(lhs, rhs, groupJoinKind, false, false, isJoinCommutative, leftKeys, rightKeys); Graph_.AddEdge(std::move(e)); } } @@ -480,8 +509,16 @@ private: } bool HasOneGroup(const THyperedge& lhs, const THyperedge& rhs) { - auto lhsAttributeNames = lhs.LeftJoinKeys; - auto rhsAttributeNames = rhs.LeftJoinKeys; + TVector<TString> lhsAttributeNames; + TVector<TString> rhsAttributeNames; + + for (auto & l : lhs.LeftJoinKeys) { + lhsAttributeNames.push_back(l.AttributeName); + } + + for (auto & r : rhs.LeftJoinKeys) { + rhsAttributeNames.push_back(r.AttributeName); + } std::sort(lhsAttributeNames.begin(), lhsAttributeNames.end()); std::sort(rhsAttributeNames.begin(), rhsAttributeNames.end()); diff --git a/ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp b/ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp index 5b13ee7cbd6..d54b793009b 100644 --- a/ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_join_tree_node.cpp @@ -5,9 +5,8 @@ namespace NYql::NDq { std::shared_ptr<TJoinOptimizerNodeInternal> MakeJoinInternal( std::shared_ptr<IBaseOptimizerNode> left, std::shared_ptr<IBaseOptimizerNode> right, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, EJoinKind joinKind, EJoinAlgoType joinAlgo, bool leftAny, @@ -15,7 +14,7 @@ std::shared_ptr<TJoinOptimizerNodeInternal> MakeJoinInternal( IProviderContext& ctx, TCardinalityHints::TCardinalityHint* maybeHint) { - auto res = std::make_shared<TJoinOptimizerNodeInternal>(left, right, joinConditions, leftJoinKeys, rightJoinKeys, joinKind, joinAlgo, leftAny, rightAny); + auto res = std::make_shared<TJoinOptimizerNodeInternal>(left, right, leftJoinKeys, rightJoinKeys, joinKind, joinAlgo, leftAny, rightAny); res->Stats = std::make_shared<TOptimizerStatistics>(ctx.ComputeJoinStats(*left->Stats, *right->Stats, leftJoinKeys, rightJoinKeys, joinAlgo, joinKind, maybeHint)); return res; } @@ -39,7 +38,7 @@ std::shared_ptr<TJoinOptimizerNode> ConvertFromInternal(const std::shared_ptr<IB right = ConvertFromInternal(right); } - auto newJoin = std::make_shared<TJoinOptimizerNode>(left, right, join->JoinConditions, join->JoinType, join->JoinAlgo, join->LeftAny, join->RightAny); + auto newJoin = std::make_shared<TJoinOptimizerNode>(left, right, join->LeftJoinKeys, join->RightJoinKeys, join->JoinType, join->JoinAlgo, join->LeftAny, join->RightAny); newJoin->Stats = join->Stats; return newJoin; } diff --git a/ydb/library/yql/dq/opt/dq_opt_join_tree_node.h b/ydb/library/yql/dq/opt/dq_opt_join_tree_node.h index 9e626bc356b..f8e50f3b336 100644 --- a/ydb/library/yql/dq/opt/dq_opt_join_tree_node.h +++ b/ydb/library/yql/dq/opt/dq_opt_join_tree_node.h @@ -18,9 +18,8 @@ struct TJoinOptimizerNodeInternal : public IBaseOptimizerNode { TJoinOptimizerNodeInternal( const std::shared_ptr<IBaseOptimizerNode>& left, const std::shared_ptr<IBaseOptimizerNode>& right, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, const EJoinKind joinType, const EJoinAlgoType joinAlgo, const bool leftAny, @@ -29,7 +28,6 @@ struct TJoinOptimizerNodeInternal : public IBaseOptimizerNode { : IBaseOptimizerNode(JoinNodeType) , LeftArg(left) , RightArg(right) - , JoinConditions(joinConditions) , LeftJoinKeys(leftJoinKeys) , RightJoinKeys(rightJoinKeys) , JoinType(joinType) @@ -51,9 +49,8 @@ struct TJoinOptimizerNodeInternal : public IBaseOptimizerNode { std::shared_ptr<IBaseOptimizerNode> LeftArg; std::shared_ptr<IBaseOptimizerNode> RightArg; - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& JoinConditions; - const TVector<TString>& LeftJoinKeys; - const TVector<TString>& RightJoinKeys; + const TVector<TJoinColumn>& LeftJoinKeys; + const TVector<TJoinColumn>& RightJoinKeys; EJoinKind JoinType; EJoinAlgoType JoinAlgo; const bool LeftAny; @@ -66,9 +63,8 @@ struct TJoinOptimizerNodeInternal : public IBaseOptimizerNode { std::shared_ptr<TJoinOptimizerNodeInternal> MakeJoinInternal( std::shared_ptr<IBaseOptimizerNode> left, std::shared_ptr<IBaseOptimizerNode> right, - const std::set<std::pair<TJoinColumn, TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, - const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, + const TVector<TJoinColumn>& rightJoinKeys, EJoinKind joinKind, EJoinAlgoType joinAlgo, bool leftAny, diff --git a/ydb/library/yql/dq/opt/dq_opt_make_join_hypergraph.h b/ydb/library/yql/dq/opt/dq_opt_make_join_hypergraph.h index 4e347ab5973..9d3443621e4 100644 --- a/ydb/library/yql/dq/opt/dq_opt_make_join_hypergraph.h +++ b/ydb/library/yql/dq/opt/dq_opt_make_join_hypergraph.h @@ -21,11 +21,13 @@ namespace NYql::NDq { inline TVector<TString> GetConditionUsedRelationNames(const std::shared_ptr<TJoinOptimizerNode>& joinNode) { TVector<TString> res; - res.reserve(joinNode->JoinConditions.size()); + res.reserve(joinNode->LeftJoinKeys.size()); - for (const auto& [lhsTable, rhsTable]: joinNode->JoinConditions) { - res.push_back(lhsTable.RelName); - res.push_back(rhsTable.RelName); + for (const auto& lhs : joinNode->LeftJoinKeys ) { + res.push_back(lhs.RelName); + } + for (const auto& rhs : joinNode->RightJoinKeys ) { + res.push_back(rhs.RelName); } return res; @@ -57,7 +59,7 @@ typename TJoinHypergraph<TNodeSet>::TEdge MakeHyperedge( TNodeSet right = TES & subtreeNodes[joinNode->RightArg]; bool isCommutative = OperatorIsCommutative(joinNode->JoinType) && (joinNode->IsReorderable); - return typename TJoinHypergraph<TNodeSet>::TEdge(left, right, joinNode->JoinType, joinNode->LeftAny, joinNode->RightAny, isCommutative, joinNode->JoinConditions); + return typename TJoinHypergraph<TNodeSet>::TEdge(left, right, joinNode->JoinType, joinNode->LeftAny, joinNode->RightAny, isCommutative, joinNode->LeftJoinKeys, joinNode->RightJoinKeys); } template<typename TNodeSet> diff --git a/ydb/library/yql/dq/opt/dq_opt_stat.cpp b/ydb/library/yql/dq/opt/dq_opt_stat.cpp index 149f72f79b6..a4b2c1299c3 100644 --- a/ydb/library/yql/dq/opt/dq_opt_stat.cpp +++ b/ydb/library/yql/dq/opt/dq_opt_stat.cpp @@ -30,6 +30,17 @@ namespace { return attributeName; } + TString ExtractAlias(TString attributeName) { + if (auto idx = attributeName.find_last_of('.'); idx != TString::npos) { + auto substr = attributeName.substr(0, idx); + if (auto idx2 = substr.find_last_of('.'); idx != TString::npos) { + substr = substr.substr(idx2+1); + } + return substr; + } + return TString(); + } + TVector<TString> InferLabels(std::shared_ptr<TOptimizerStatistics>& stats, TCoAtomList joinColumns) { if(stats->Labels) { return *stats->Labels; @@ -261,14 +272,18 @@ void InferStatisticsForMapJoin(const TExprNode::TPtr& input, TTypeAnnotationCont leftStats = ApplyCardinalityHints(leftStats, leftLabels, hints); rightStats = ApplyCardinalityHints(rightStats, rightLabels, hints); - TVector<TString> leftJoinKeys; - TVector<TString> rightJoinKeys; + TVector<TJoinColumn> leftJoinKeys; + TVector<TJoinColumn> rightJoinKeys; for (size_t i=0; i<join.LeftKeysColumnNames().Size(); i++) { - leftJoinKeys.push_back(RemoveAliases(join.LeftKeysColumnNames().Item(i).StringValue())); + auto alias = ExtractAlias(join.LeftKeysColumnNames().Item(i).StringValue()); + auto attrName = RemoveAliases(join.LeftKeysColumnNames().Item(i).StringValue()); + leftJoinKeys.push_back(TJoinColumn(alias, attrName)); } for (size_t i=0; i<join.RightKeysColumnNames().Size(); i++) { - rightJoinKeys.push_back(RemoveAliases(join.RightKeysColumnNames().Item(i).StringValue())); + auto alias = ExtractAlias(join.RightKeysColumnNames().Item(i).StringValue()); + auto attrName = RemoveAliases(join.RightKeysColumnNames().Item(i).StringValue()); + rightJoinKeys.push_back(TJoinColumn(alias, attrName)); } auto unionOfLabels = UnionLabels(leftLabels, rightLabels); @@ -312,14 +327,18 @@ void InferStatisticsForGraceJoin(const TExprNode::TPtr& input, TTypeAnnotationCo leftStats = ApplyCardinalityHints(leftStats, leftLabels, hints); rightStats = ApplyCardinalityHints(rightStats, rightLabels, hints); - TVector<TString> leftJoinKeys; - TVector<TString> rightJoinKeys; + TVector<TJoinColumn> leftJoinKeys; + TVector<TJoinColumn> rightJoinKeys; for (size_t i=0; i<join.LeftKeysColumnNames().Size(); i++) { - leftJoinKeys.push_back(RemoveAliases(join.LeftKeysColumnNames().Item(i).StringValue())); + auto alias = ExtractAlias(join.LeftKeysColumnNames().Item(i).StringValue()); + auto attrName = RemoveAliases(join.LeftKeysColumnNames().Item(i).StringValue()); + leftJoinKeys.push_back(TJoinColumn(alias, attrName)); } for (size_t i=0; i<join.RightKeysColumnNames().Size(); i++) { - rightJoinKeys.push_back(RemoveAliases(join.RightKeysColumnNames().Item(i).StringValue())); + auto alias = ExtractAlias(join.RightKeysColumnNames().Item(i).StringValue()); + auto attrName = RemoveAliases(join.RightKeysColumnNames().Item(i).StringValue()); + rightJoinKeys.push_back(TJoinColumn(alias, attrName)); } auto unionOfLabels = UnionLabels(leftLabels, rightLabels); diff --git a/ydb/library/yql/providers/dq/opt/logical_optimize.cpp b/ydb/library/yql/providers/dq/opt/logical_optimize.cpp index 71b478258a4..dfaff2a9115 100644 --- a/ydb/library/yql/providers/dq/opt/logical_optimize.cpp +++ b/ydb/library/yql/providers/dq/opt/logical_optimize.cpp @@ -49,8 +49,7 @@ struct TDqCBOProviderContext : public NYql::TBaseProviderContext { virtual bool IsJoinApplicable(const std::shared_ptr<NYql::IBaseOptimizerNode>& left, const std::shared_ptr<NYql::IBaseOptimizerNode>& right, - const std::set<std::pair<NYql::NDq::TJoinColumn, NYql::NDq::TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, const TVector<TJoinColumn>& rightJoinKeys, NYql::EJoinAlgoType joinAlgo, NYql::EJoinKind joinKind) override; virtual double ComputeJoinCost(const NYql::TOptimizerStatistics& leftStats, const NYql::TOptimizerStatistics& rightStats, const double outputRows, const double outputByteSize, NYql::EJoinAlgoType joinAlgo) const override; @@ -62,12 +61,10 @@ struct TDqCBOProviderContext : public NYql::TBaseProviderContext { bool TDqCBOProviderContext::IsJoinApplicable(const std::shared_ptr<NYql::IBaseOptimizerNode>& left, const std::shared_ptr<NYql::IBaseOptimizerNode>& right, - const std::set<std::pair<NYql::NDq::TJoinColumn, NYql::NDq::TJoinColumn>>& joinConditions, - const TVector<TString>& leftJoinKeys, const TVector<TString>& rightJoinKeys, + const TVector<TJoinColumn>& leftJoinKeys, const TVector<TJoinColumn>& rightJoinKeys, NYql::EJoinAlgoType joinAlgo, NYql::EJoinKind joinKind) { Y_UNUSED(left); Y_UNUSED(right); - Y_UNUSED(joinConditions); Y_UNUSED(leftJoinKeys); Y_UNUSED(rightJoinKeys); diff --git a/ydb/library/yql/providers/yt/provider/ut/yql_yt_cbo_ut.cpp b/ydb/library/yql/providers/yt/provider/ut/yql_yt_cbo_ut.cpp index f4092b08b0b..8ab1dc5962d 100644 --- a/ydb/library/yql/providers/yt/provider/ut/yql_yt_cbo_ut.cpp +++ b/ydb/library/yql/providers/yt/provider/ut/yql_yt_cbo_ut.cpp @@ -78,10 +78,11 @@ Y_UNIT_TEST(NonReordable) { auto left = std::make_shared<TRelOptimizerNode>("a", stat); auto right = std::make_shared<TRelOptimizerNode>("a", stat); - std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>> joinConditions; - joinConditions.insert({NDq::TJoinColumn{"a", "b"}, NDq::TJoinColumn{"a","c"}}); + TVector<NDq::TJoinColumn> leftKeys = {NDq::TJoinColumn{"a", "b"}}; + TVector<NDq::TJoinColumn> rightKeys = {NDq::TJoinColumn{"a","c"}}; + auto root = std::make_shared<TJoinOptimizerNode>( - left, right, joinConditions, EJoinKind::InnerJoin, EJoinAlgoType::GraceJoin, false, false, true); + left, right, leftKeys, rightKeys, EJoinKind::InnerJoin, EJoinAlgoType::GraceJoin, false, false, true); TBaseProviderContext optCtx; std::unique_ptr<IOptimizerNew> opt = std::unique_ptr<IOptimizerNew>(NDq::MakeNativeOptimizerNew(optCtx, 1024)); auto result = opt->JoinSearch(root); diff --git a/ydb/library/yql/providers/yt/provider/yql_yt_join_reorder.cpp b/ydb/library/yql/providers/yt/provider/yql_yt_join_reorder.cpp index 4c15ac7bdab..e09fa86d7b9 100644 --- a/ydb/library/yql/providers/yt/provider/yql_yt_join_reorder.cpp +++ b/ydb/library/yql/providers/yt/provider/yql_yt_join_reorder.cpp @@ -162,11 +162,12 @@ class TYtJoinOptimizerNode: public TJoinOptimizerNode { public: TYtJoinOptimizerNode(const std::shared_ptr<IBaseOptimizerNode>& left, const std::shared_ptr<IBaseOptimizerNode>& right, - const std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>>& joinConditions, + const TVector<NDq::TJoinColumn>& leftKeys, + const TVector<NDq::TJoinColumn>& rightKeys, const EJoinKind joinType, const EJoinAlgoType joinAlgo, TYtJoinNodeOp* originalOp) - : TJoinOptimizerNode(left, right, joinConditions, joinType, joinAlgo, + : TJoinOptimizerNode(left, right, leftKeys, rightKeys, joinType, joinAlgo, originalOp ? originalOp->LinkSettings.LeftHints.contains("any") : false, originalOp ? originalOp->LinkSettings.RightHints.contains("any") : false, originalOp != nullptr) @@ -209,7 +210,8 @@ private: std::shared_ptr<IBaseOptimizerNode> OnOp(TYtJoinNodeOp* op) { auto joinKind = ConvertToJoinKind(TString(op->JoinKind->Content())); YQL_ENSURE(op->LeftLabel->ChildrenSize() == op->RightLabel->ChildrenSize()); - std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>> joinConditions; + TVector<NDq::TJoinColumn> leftKeys; + TVector<NDq::TJoinColumn> rightKeys; for (ui32 i = 0; i < op->LeftLabel->ChildrenSize(); i += 2) { auto ltable = op->LeftLabel->Child(i)->Content(); auto lcolumn = op->LeftLabel->Child(i + 1)->Content(); @@ -219,7 +221,8 @@ private: AddRelJoinColumn(TString(rtable), TString(rcolumn)); NDq::TJoinColumn lcol{TString(ltable), TString(lcolumn)}; NDq::TJoinColumn rcol{TString(rtable), TString(rcolumn)}; - joinConditions.insert({lcol, rcol}); + leftKeys.push_back(lcol); + rightKeys.push_back(rcol); } auto left = ProcessNode(op->Left); auto right = ProcessNode(op->Right); @@ -228,7 +231,7 @@ private: ProviderCtx->HasHints = ProviderCtx->HasHints || !op->LinkSettings.LeftHints.empty() || !op->LinkSettings.RightHints.empty(); return std::make_shared<TYtJoinOptimizerNode>( - left, right, joinConditions, joinKind, EJoinAlgoType::GraceJoin, nonReorderable ? op : nullptr + left, right, leftKeys, rightKeys, joinKind, EJoinAlgoType::GraceJoin, nonReorderable ? op : nullptr ); } @@ -356,12 +359,13 @@ TYtJoinNode::TPtr BuildYtJoinTree(std::shared_ptr<IBaseOptimizerNode> node, TVec ret = MakeIntrusive<TYtJoinNodeOp>(); ret->JoinKind = ctx.NewAtom(pos, ConvertToJoinString(op->JoinType)); TVector<TExprNodePtr> leftLabel, rightLabel; - leftLabel.reserve(op->JoinConditions.size() * 2); - rightLabel.reserve(op->JoinConditions.size() * 2); - for (auto& [left, right] : op->JoinConditions) { + leftLabel.reserve(op->LeftJoinKeys.size() * 2); + rightLabel.reserve(op->RightJoinKeys.size() * 2); + for (auto& left : op->LeftJoinKeys) { leftLabel.emplace_back(ctx.NewAtom(pos, left.RelName)); leftLabel.emplace_back(ctx.NewAtom(pos, left.AttributeName)); - + } + for (auto& right : op->RightJoinKeys) { rightLabel.emplace_back(ctx.NewAtom(pos, right.RelName)); rightLabel.emplace_back(ctx.NewAtom(pos, right.AttributeName)); } diff --git a/ydb/library/yql/sql/pg/optimizer.cpp b/ydb/library/yql/sql/pg/optimizer.cpp index d1134092cbb..0548e7b5d5e 100644 --- a/ydb/library/yql/sql/pg/optimizer.cpp +++ b/ydb/library/yql/sql/pg/optimizer.cpp @@ -491,11 +491,11 @@ struct TPgOptimizerImpl std::vector<std::tuple<int,int,TStringBuf,TStringBuf>>& rightVars, const std::shared_ptr<TJoinOptimizerNode>& op) { - for (auto& [l, r]: op->JoinConditions) { - auto& ltable = l.RelName; - auto& lcol = l.AttributeName; - auto& rtable = r.RelName; - auto& rcol = r.AttributeName; + for (size_t i=0; i<op->LeftJoinKeys.size(); i++ ) { + auto& ltable = op->LeftJoinKeys[i].RelName; + auto& lcol = op->LeftJoinKeys[i].AttributeName; + auto& rtable = op->RightJoinKeys[i].RelName; + auto& rcol = op->RightJoinKeys[i].AttributeName; const auto& lrelIds = Table2RelIds[ltable]; YQL_ENSURE(!lrelIds.empty()); @@ -562,7 +562,7 @@ struct TPgOptimizerImpl MakeEqClasses(EqClasses, leftVars, rightVars); } else if (op->JoinType == LeftJoin || op->JoinType == RightJoin) { - CHECK(op->JoinConditions.size() == 1, "Only 1 var per join supported"); + CHECK(op->LeftJoinKeys.size() == 1 && op->RightJoinKeys.size() == 1, "Only 1 var per join supported"); std::vector<std::tuple<int,int,TStringBuf,TStringBuf>> leftVars, rightVars; ExtractVars(leftVars, rightVars, op); @@ -637,22 +637,23 @@ struct TPgOptimizerImpl YQL_ENSURE(node->LeftVars.size() == node->RightVars.size()); - std::set<std::pair<NDq::TJoinColumn, NDq::TJoinColumn>> joinConditions; + TVector<NDq::TJoinColumn> leftJoinKeys; + TVector<NDq::TJoinColumn> rightJoinKeys; + for (size_t i = 0; i < node->LeftVars.size(); i++) { auto [lrelId, lvarId] = node->LeftVars[i]; auto [rrelId, rvarId] = node->RightVars[i]; auto [ltable, lcolumn] = Var2TableCol[lrelId - 1][lvarId - 1]; auto [rtable, rcolumn] = Var2TableCol[rrelId - 1][rvarId - 1]; - joinConditions.insert({ - NDq::TJoinColumn{TString(ltable), TString(lcolumn)}, - NDq::TJoinColumn{TString(rtable), TString(rcolumn)} - }); + leftJoinKeys.push_back(NDq::TJoinColumn(TString(ltable), TString(lcolumn))); + rightJoinKeys.push_back(NDq::TJoinColumn(TString(rtable), TString(rcolumn))); } return std::make_shared<TJoinOptimizerNode>( left, right, - joinConditions, + leftJoinKeys, + rightJoinKeys, joinKind, EJoinAlgoType::MapJoin, false, diff --git a/ydb/library/yql/tests/sql/dq_file/part17/canondata/result.json b/ydb/library/yql/tests/sql/dq_file/part17/canondata/result.json index a8d3c9d90df..fb3e15ab50f 100644 --- a/ydb/library/yql/tests/sql/dq_file/part17/canondata/result.json +++ b/ydb/library/yql/tests/sql/dq_file/part17/canondata/result.json @@ -765,9 +765,9 @@ ], "test.test[dq-join_cbo_native_3_tables--Debug]": [ { - "checksum": "91570a2f667516ba1f3f28642698441f", - "size": 4802, - "uri": "https://{canondata_backend}/1942278/d3f67196e7e0096e289743f5dbfd5dc2f990f9e6/resource.tar.gz#test.test_dq-join_cbo_native_3_tables--Debug_/opt.yql_patched" + "checksum": "bc4f0d3c80bc05fdb553d9d07ed58fd2", + "size": 4846, + "uri": "https://{canondata_backend}/1597364/aa2251cc1cffd9f5ef1d8d1793ee54509ab8cdfc/resource.tar.gz#test.test_dq-join_cbo_native_3_tables--Debug_/opt.yql_patched" } ], "test.test[dq-join_cbo_native_3_tables--Plan]": [ |
