diff options
author | lucius <lucius@yandex-team.com> | 2025-07-04 10:22:12 +0300 |
---|---|---|
committer | lucius <lucius@yandex-team.com> | 2025-07-04 10:47:46 +0300 |
commit | 94d6d80a1937f3cf2817a2a1d7e04b28eaa7565f (patch) | |
tree | e7a84dc5da80dfde88a9b6c5be9eaca293753667 | |
parent | dba0afc5ce7049668d44c6f5cb1bd0fd5a7fbc63 (diff) | |
download | ydb-94d6d80a1937f3cf2817a2a1d7e04b28eaa7565f.tar.gz |
YQL-20085: fix FilterPushdownOverJoinOptionalSide
Тут чинятся 2 проблемы:
1) Rename settings во вложенных джоинах после преобразования не позволяли применить пропушенный предикат, что-нибудь типа "Unknown table name:… Failed to parse labels of struct…"
2) Если до пушдауна типизатор вывел distinct у джоина, а мы его заменили на union all, то типизатор не мог доказать что distinct сохранился и падал. Типизатор править не стали, решили через KeepUniqueDistinct.
Обе проблемы немного по-разному проявляются и чинятся в двух случаях:
1) Делаем пушдаун в инпут из последнего джоина (верхний/корневой в дереве) — исходный джоин заменяется на union all из двух джоинов
2) Пушдаун в инпут НЕ из последнего джоина (НЕ верхний/корневой в дереве) — union all появляется внутри "исходного" джоина
commit_hash:cc7cfbb973bc7dcabba202ab9a637c3b9f1cb5dd
8 files changed, 277 insertions, 32 deletions
diff --git a/yql/essentials/core/common_opt/yql_flatmap_over_join.cpp b/yql/essentials/core/common_opt/yql_flatmap_over_join.cpp index 1026e8f1b66..b85b0a8178e 100644 --- a/yql/essentials/core/common_opt/yql_flatmap_over_join.cpp +++ b/yql/essentials/core/common_opt/yql_flatmap_over_join.cpp @@ -406,17 +406,6 @@ TExprNode::TPtr CreateLabelList(const THashSet<TString>& labels, TExprContext& c return ctx.NewList(position, std::move(newKeys)); } -TExprNode::TPtr RemoveJoinKeysFromElimination(const TExprNode& settings, TExprContext& ctx) { - TExprNode::TListType updated; - for (auto setting : settings.Children()) { - if (setting->ChildrenSize() == 3 && setting->Child(0)->Content() == "rename" && setting->Child(2)->Content() == "") { - continue; - } - updated.push_back(setting); - } - return ctx.NewList(settings.Pos(), std::move(updated)); -} - TExprNode::TPtr FilterPushdownOverJoinOptionalSide(TExprNode::TPtr equiJoin, TExprNode::TPtr predicate, const TSet<TStringBuf>& usedFields, TExprNode::TPtr args, const TJoinLabels& labels, ui32 inputIndex, const TMap<TStringBuf, TVector<TStringBuf>>& renameMap, bool ordered, bool skipNulls, TExprContext& ctx, @@ -425,7 +414,8 @@ TExprNode::TPtr FilterPushdownOverJoinOptionalSide(TExprNode::TPtr equiJoin, TEx auto inputsCount = equiJoin->ChildrenSize() - 2; auto joinTree = equiJoin->Child(inputsCount); - if (!IsRightSideForLeftJoin(joinTree, labels, inputIndex).first) { + auto [leftJoinTree, parentJoinPtr] = IsRightSideForLeftJoin(joinTree, labels, inputIndex); + if (!leftJoinTree) { return equiJoin; } @@ -478,13 +468,12 @@ TExprNode::TPtr FilterPushdownOverJoinOptionalSide(TExprNode::TPtr equiJoin, TEx THashMap<TString, int> joinLabelCounters; CountLabelsInputUsage(joinTree, joinLabelCounters); - auto [leftJoinTree, parentJoinPtr] = IsRightSideForLeftJoin(joinTree, labels, inputIndex); YQL_ENSURE(leftJoinTree); // Left child of the `leftJoinTree` could be a tree, need to walk and decrement them all, the do not need be at fina EquiJoin. DecrementCountLabelsInputUsage(leftJoinTree, joinLabelCounters); - auto leftJoinSettings = equiJoin->ChildPtr(equiJoin->ChildrenSize() - 1); - auto newLeftJoinSettings = RemoveJoinKeysFromElimination(*leftJoinSettings, ctx); + const auto joinSettings = equiJoin->TailPtr(); + const auto innerSettings = parentJoinPtr ? RemoveSetting(*joinSettings, "rename", ctx) : joinSettings; auto innerJoinTree = ctx.ChangeChild(*leftJoinTree, 0, ctx.NewAtom(leftJoinTree->Pos(), "Inner")); auto leftOnlyJoinTree = ctx.ChangeChild(*leftJoinTree, 0, ctx.NewAtom(leftJoinTree->Pos(), "LeftOnly")); @@ -545,7 +534,7 @@ TExprNode::TPtr FilterPushdownOverJoinOptionalSide(TExprNode::TPtr equiJoin, TEx .Atom(1, innerJoinTree->ChildRef(2)->Content()) .Seal() .Add(i++, innerJoinTree) - .Add(i++, newLeftJoinSettings) + .Add(i++, innerSettings) .Seal() .Build(); @@ -577,7 +566,7 @@ TExprNode::TPtr FilterPushdownOverJoinOptionalSide(TExprNode::TPtr equiJoin, TEx .Atom(1, leftOnlyJoinTree->ChildRef(2)->Content()) .Seal() .Add(i++, leftOnlyJoinTree) - .Add(i++, newLeftJoinSettings) + .Add(i++, innerSettings) .Seal() .Build(); @@ -597,7 +586,8 @@ TExprNode::TPtr FilterPushdownOverJoinOptionalSide(TExprNode::TPtr equiJoin, TEx .Build(); if (!parentJoinPtr) { - return unionAll; + // TODO: Evaluate constraints in UnionAll automatically. See https://st.yandex-team.ru/YQL-20085#685bb01e8a10e760cdd58750. + return KeepUniqueDistinct(unionAll, *equiJoin, ctx); } TExprNode::TPtr remJoinKeys; @@ -636,18 +626,6 @@ TExprNode::TPtr FilterPushdownOverJoinOptionalSide(TExprNode::TPtr equiJoin, TEx auto newJoinTree = ctx.ReplaceNode(std::move(joinTree), *parentJoinPtr, newParentJoin); - i = 0; - auto newJoinSettings = ctx.Builder(pos) - .List() - .Do([&](TExprNodeBuilder& parent) -> TExprNodeBuilder& { - for (const auto& child : equiJoin->TailPtr()->ChildrenList()) { - parent.Add(i++, child); - } - return parent; - }) - .Seal() - .Build(); - // Combine join labels from left tree and associate them with result of `EquiJoin` from above. auto combinedLabelList = CombineLabels(leftJoinLabelsFull); auto combinedJoinLabels = CreateLabelList(combinedLabelList, ctx, pos); @@ -672,11 +650,11 @@ TExprNode::TPtr FilterPushdownOverJoinOptionalSide(TExprNode::TPtr equiJoin, TEx .Add(1, combinedJoinLabels) .Seal() .Add(i++, newJoinTree) - .Add(i++, newJoinSettings) + .Add(i++, joinSettings) .Seal() .Build(); - return newEquiJoin; + return KeepUniqueDistinct(newEquiJoin, *equiJoin, ctx); } class TJoinTreeRebuilder { diff --git a/yql/essentials/tests/sql/minirun/part2/canondata/result.json b/yql/essentials/tests/sql/minirun/part2/canondata/result.json index 28b1ecec99c..5c5de71f88e 100644 --- a/yql/essentials/tests/sql/minirun/part2/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part2/canondata/result.json @@ -814,6 +814,20 @@ "uri": "https://{canondata_backend}/995452/fe026cd31998bf6072028e6429713a6b1158be3d/resource.tar.gz#test.test_join-inmem_with_set_key_any-default.txt-Results_/results.txt" } ], + "test.test[join-yql-20085-distinct-default.txt-Debug]": [ + { + "checksum": "2226cab144ed961ea53e2e9e6f1d881e", + "size": 2555, + "uri": "https://{canondata_backend}/1936947/6e2695e9df81b811b7a80094733bbd1c0188ffdf/resource.tar.gz#test.test_join-yql-20085-distinct-default.txt-Debug_/opt.yql" + } + ], + "test.test[join-yql-20085-distinct-default.txt-Results]": [ + { + "checksum": "fc8289bfacb089191cf58d9e789043f8", + "size": 3023, + "uri": "https://{canondata_backend}/1936947/6e2695e9df81b811b7a80094733bbd1c0188ffdf/resource.tar.gz#test.test_join-yql-20085-distinct-default.txt-Results_/results.txt" + } + ], "test.test[json-json_query/on_empty-default.txt-Debug]": [ { "checksum": "8a7975441c14dc194368455e9b6b828a", diff --git a/yql/essentials/tests/sql/minirun/part7/canondata/result.json b/yql/essentials/tests/sql/minirun/part7/canondata/result.json index 8083b15dac8..c6cd82da8cf 100644 --- a/yql/essentials/tests/sql/minirun/part7/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part7/canondata/result.json @@ -597,6 +597,20 @@ "uri": "https://{canondata_backend}/1784117/2ea0b35b312aa96a64af8f612c875f1585849c84/resource.tar.gz#test.test_join-inmem_with_set_key-default.txt-Results_/results.txt" } ], + "test.test[join-yql-20085-rename-default.txt-Debug]": [ + { + "checksum": "b42cb95f29a207e98f21a3d590dd53e0", + "size": 1768, + "uri": "https://{canondata_backend}/1924537/0b69b467f37bc6c3cbfab22057a8d8e7bc979c15/resource.tar.gz#test.test_join-yql-20085-rename-default.txt-Debug_/opt.yql" + } + ], + "test.test[join-yql-20085-rename-default.txt-Results]": [ + { + "checksum": "9b46576e462043da56d489c4ad432580", + "size": 2130, + "uri": "https://{canondata_backend}/1924537/0b69b467f37bc6c3cbfab22057a8d8e7bc979c15/resource.tar.gz#test.test_join-yql-20085-rename-default.txt-Results_/results.txt" + } + ], "test.test[join-yql-6199-default.txt-Debug]": [ { "checksum": "8917f654d003e330ee7c8346505596cc", diff --git a/yql/essentials/tests/sql/sql2yql/canondata/result.json b/yql/essentials/tests/sql/sql2yql/canondata/result.json index 8562333d3e8..665f0364892 100644 --- a/yql/essentials/tests/sql/sql2yql/canondata/result.json +++ b/yql/essentials/tests/sql/sql2yql/canondata/result.json @@ -4108,6 +4108,20 @@ "uri": "https://{canondata_backend}/212715/e102f70d9232d825c746de10ca1c287eb99f76ad/resource.tar.gz#test_sql2yql.test_join-yql-19731_/sql.yql" } ], + "test_sql2yql.test[join-yql-20085-distinct]": [ + { + "checksum": "d63befb2bb098cc289dd5f13b4c6f034", + "size": 5007, + "uri": "https://{canondata_backend}/1916746/a9aae45c5a3c55697c94425d0d63b059f9129cca/resource.tar.gz#test_sql2yql.test_join-yql-20085-distinct_/sql.yql" + } + ], + "test_sql2yql.test[join-yql-20085-rename]": [ + { + "checksum": "b5b17a49b19430c9c7f65fb6580734d2", + "size": 3143, + "uri": "https://{canondata_backend}/1924537/ff990ab7def5cabc141ccc68eee760fe4f19ab8b/resource.tar.gz#test_sql2yql.test_join-yql-20085-rename_/sql.yql" + } + ], "test_sql2yql.test[join-yql-6199]": [ { "checksum": "5421f92ec269e5d76133e1ca54b6d0d0", @@ -10578,6 +10592,16 @@ "uri": "file://test_sql_format.test_join-yql-19731_/formatted.sql" } ], + "test_sql_format.test[join-yql-20085-distinct]": [ + { + "uri": "file://test_sql_format.test_join-yql-20085-distinct_/formatted.sql" + } + ], + "test_sql_format.test[join-yql-20085-rename]": [ + { + "uri": "file://test_sql_format.test_join-yql-20085-rename_/formatted.sql" + } + ], "test_sql_format.test[join-yql-6199]": [ { "uri": "file://test_sql_format.test_join-yql-6199_/formatted.sql" diff --git a/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_join-yql-20085-distinct_/formatted.sql b/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_join-yql-20085-distinct_/formatted.sql new file mode 100644 index 00000000000..0ca355a0148 --- /dev/null +++ b/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_join-yql-20085-distinct_/formatted.sql @@ -0,0 +1,66 @@ +PRAGMA FilterPushdownOverJoinOptionalSide; + +$t1 = ( + SELECT DISTINCT + k1 + FROM + AS_TABLE(AsList( + AsStruct(0 AS k1), + AsStruct(1 AS k1), + AsStruct(2 AS k1), + AsStruct(3 AS k1), + AsStruct(4 AS k1), + AsStruct(5 AS k1), + )) +); + +$t2 = ( + SELECT + k2, + SUM(v2) AS v2 + FROM + AS_TABLE(AsList( + AsStruct(2 AS k2, 1 AS v2), + AsStruct(3 AS k2, 1 AS v2), + AsStruct(4 AS k2, 1 AS v2), + AsStruct(5 AS k2, 1 AS v2), + AsStruct(6 AS k2, 1 AS v2), + AsStruct(7 AS k2, 1 AS v2), + )) + GROUP BY + k2 +); + +$t3 = ( + SELECT + k3, + SUM(v3) AS v3 + FROM + AS_TABLE(AsList( + AsStruct(4 AS k3, 1 AS v3), + AsStruct(5 AS k3, 1 AS v3), + AsStruct(6 AS k3, 1 AS v3), + AsStruct(7 AS k3, 1 AS v3), + AsStruct(8 AS k3, 1 AS v3), + AsStruct(9 AS k3, 1 AS v3), + )) + GROUP BY + k3 +); + +SELECT + * +FROM + $t1 AS t1 +LEFT JOIN + $t2 AS t2 +ON + t1.k1 == t2.k2 +LEFT JOIN + $t3 AS t3 +ON + t1.k1 == t3.k3 +WHERE + t2.v2 < 10 + AND t3.v3 < 10 +; diff --git a/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_join-yql-20085-rename_/formatted.sql b/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_join-yql-20085-rename_/formatted.sql new file mode 100644 index 00000000000..d8c53e4b505 --- /dev/null +++ b/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_join-yql-20085-rename_/formatted.sql @@ -0,0 +1,45 @@ +PRAGMA FilterPushdownOverJoinOptionalSide; + +$t1_data = AsList( + AsStruct(0 AS m1), + AsStruct(1 AS m1), + AsStruct(2 AS m1), + AsStruct(3 AS m1), + AsStruct(4 AS m1), + AsStruct(5 AS m1), +); + +$t2_data = AsList( + AsStruct(2 AS m2, 2 AS m3, 2 AS m4), + AsStruct(3 AS m2, 3 AS m3, 3 AS m4), + AsStruct(4 AS m2, 4 AS m3, 4 AS m4), + AsStruct(5 AS m2, 5 AS m3, 5 AS m4), + AsStruct(6 AS m2, 6 AS m3, 6 AS m4), + AsStruct(7 AS m2, 7 AS m3, 7 AS m4), +); + +$t3_data = AsList( + AsStruct(4 AS m5), + AsStruct(5 AS m5), + AsStruct(6 AS m5), +); + +SELECT + * +FROM ( + SELECT + * + FROM + AS_TABLE($t1_data) AS t1 + LEFT JOIN + AS_TABLE($t2_data) AS t2 + ON + t1.m1 == t2.m2 +) AS t_joined +LEFT SEMI JOIN + AS_TABLE($t3_data) AS t3 +ON + t_joined.m3 == t3.m5 +WHERE + t_joined.m4 < 5 +; diff --git a/yql/essentials/tests/sql/suites/join/yql-20085-distinct.sql b/yql/essentials/tests/sql/suites/join/yql-20085-distinct.sql new file mode 100644 index 00000000000..0ca355a0148 --- /dev/null +++ b/yql/essentials/tests/sql/suites/join/yql-20085-distinct.sql @@ -0,0 +1,66 @@ +PRAGMA FilterPushdownOverJoinOptionalSide; + +$t1 = ( + SELECT DISTINCT + k1 + FROM + AS_TABLE(AsList( + AsStruct(0 AS k1), + AsStruct(1 AS k1), + AsStruct(2 AS k1), + AsStruct(3 AS k1), + AsStruct(4 AS k1), + AsStruct(5 AS k1), + )) +); + +$t2 = ( + SELECT + k2, + SUM(v2) AS v2 + FROM + AS_TABLE(AsList( + AsStruct(2 AS k2, 1 AS v2), + AsStruct(3 AS k2, 1 AS v2), + AsStruct(4 AS k2, 1 AS v2), + AsStruct(5 AS k2, 1 AS v2), + AsStruct(6 AS k2, 1 AS v2), + AsStruct(7 AS k2, 1 AS v2), + )) + GROUP BY + k2 +); + +$t3 = ( + SELECT + k3, + SUM(v3) AS v3 + FROM + AS_TABLE(AsList( + AsStruct(4 AS k3, 1 AS v3), + AsStruct(5 AS k3, 1 AS v3), + AsStruct(6 AS k3, 1 AS v3), + AsStruct(7 AS k3, 1 AS v3), + AsStruct(8 AS k3, 1 AS v3), + AsStruct(9 AS k3, 1 AS v3), + )) + GROUP BY + k3 +); + +SELECT + * +FROM + $t1 AS t1 +LEFT JOIN + $t2 AS t2 +ON + t1.k1 == t2.k2 +LEFT JOIN + $t3 AS t3 +ON + t1.k1 == t3.k3 +WHERE + t2.v2 < 10 + AND t3.v3 < 10 +; diff --git a/yql/essentials/tests/sql/suites/join/yql-20085-rename.sql b/yql/essentials/tests/sql/suites/join/yql-20085-rename.sql new file mode 100644 index 00000000000..51700e18278 --- /dev/null +++ b/yql/essentials/tests/sql/suites/join/yql-20085-rename.sql @@ -0,0 +1,38 @@ +PRAGMA FilterPushdownOverJoinOptionalSide; + +$t1_data = AsList( + AsStruct(0 AS m1), + AsStruct(1 AS m1), + AsStruct(2 AS m1), + AsStruct(3 AS m1), + AsStruct(4 AS m1), + AsStruct(5 AS m1), +); + +$t2_data = AsList( + AsStruct(2 AS m2, 2 AS m3, 2 AS m4), + AsStruct(3 AS m2, 3 AS m3, 3 AS m4), + AsStruct(4 AS m2, 4 AS m3, 4 AS m4), + AsStruct(5 AS m2, 5 AS m3, 5 AS m4), + AsStruct(6 AS m2, 6 AS m3, 6 AS m4), + AsStruct(7 AS m2, 7 AS m3, 7 AS m4), +); + +$t3_data = AsList( + AsStruct(4 AS m5), + AsStruct(5 AS m5), + AsStruct(6 AS m5), +); + + +SELECT * +FROM ( + SELECT * + FROM AS_TABLE($t1_data) AS t1 + LEFT JOIN AS_TABLE($t2_data) AS t2 + ON t1.m1 == t2.m2 +) AS t_joined +LEFT SEMI JOIN AS_TABLE($t3_data) AS t3 +ON t_joined.m3 == t3.m5 +WHERE t_joined.m4 < 5 +; |