diff options
author | aidarsamer <[email protected]> | 2023-03-30 12:56:27 +0300 |
---|---|---|
committer | aidarsamer <[email protected]> | 2023-03-30 12:56:27 +0300 |
commit | 58206a8c5c05c6fd7af5136f77778b580ff93a88 (patch) | |
tree | f40c675c669689c7a76ec9cb7470126772084ea0 | |
parent | 4b564f145511952509eb3a175dd240c91681cc65 (diff) |
Apply LIKE filters on column shards after simple filters
5 files changed, 255 insertions, 127 deletions
diff --git a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp index b90a45a2918..30d5a0f88c3 100644 --- a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp +++ b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp @@ -6,6 +6,8 @@ #include <ydb/library/yql/core/extract_predicate/extract_predicate.h> #include <ydb/library/yql/core/yql_opt_utils.h> +#include <unordered_set> + namespace NKikimr::NKqp::NOpt { using namespace NYql; @@ -15,6 +17,63 @@ namespace { static TMaybeNode<TExprBase> NullNode = TMaybeNode<TExprBase>(); +static const std::unordered_set<std::string> SecondLevelFilters = { + "string_contains", + "starts_with", + "ends_with" +}; + +struct TFilterOpsLevels { + TFilterOpsLevels(const TMaybeNode<TExprBase>& firstLevel, const TMaybeNode<TExprBase>& secondLevel) + : FirstLevelOps(firstLevel) + , SecondLevelOps(secondLevel) + {} + + TFilterOpsLevels(const TMaybeNode<TExprBase>& predicate) + : FirstLevelOps(predicate) + , SecondLevelOps(NullNode) + { + if (IsSecondLevelOp(predicate)) { + FirstLevelOps = NullNode; + SecondLevelOps = predicate; + } + } + + bool IsValid() { + return FirstLevelOps.IsValid() || SecondLevelOps.IsValid(); + } + + bool IsSecondLevelOp(const TMaybeNode<TExprBase>& predicate) { + if (auto maybeCompare = predicate.Maybe<TKqpOlapFilterCompare>()) { + auto op = maybeCompare.Cast().Operator().StringValue(); + if (SecondLevelFilters.find(op) != SecondLevelFilters.end()) { + return true; + } + } + return false; + } + + void WrapToNotOp(TExprContext& ctx, TPositionHandle pos) { + if (FirstLevelOps.IsValid()) { + FirstLevelOps = Build<TKqpOlapNot>(ctx, pos) + .Value(FirstLevelOps.Cast()) + .Done(); + } + + if (SecondLevelOps.IsValid()) { + SecondLevelOps = Build<TKqpOlapNot>(ctx, pos) + .Value(SecondLevelOps.Cast()) + .Done(); + } + } + + + TMaybeNode<TExprBase> FirstLevelOps; + TMaybeNode<TExprBase> SecondLevelOps; +}; + +static TFilterOpsLevels NullFilterOpsLevels = TFilterOpsLevels(NullNode, NullNode); + bool IsFalseLiteral(TExprBase node) { return node.Maybe<TCoBool>() && !FromString<bool>(node.Cast<TCoBool>().Literal().Value()); } @@ -329,70 +388,88 @@ TMaybeNode<TExprBase> CoalescePushdown(const TCoCoalesce& coalesce, TExprContext return NullNode; } -TMaybeNode<TExprBase> PredicatePushdown(const TExprBase& predicate, TExprContext& ctx, TPositionHandle pos) +TFilterOpsLevels PredicatePushdown(const TExprBase& predicate, TExprContext& ctx, TPositionHandle pos) { auto maybeCoalesce = predicate.Maybe<TCoCoalesce>(); if (maybeCoalesce.IsValid()) { - return CoalescePushdown(maybeCoalesce.Cast(), ctx, pos); + auto coalescePred = CoalescePushdown(maybeCoalesce.Cast(), ctx, pos); + return TFilterOpsLevels(coalescePred); } auto maybeExists = predicate.Maybe<TCoExists>(); if (maybeExists.IsValid()) { - return ExistsPushdown(maybeExists.Cast(), ctx, pos); + auto existsPred = ExistsPushdown(maybeExists.Cast(), ctx, pos); + return TFilterOpsLevels(existsPred); } auto maybePredicate = predicate.Maybe<TCoCompare>(); if (maybePredicate.IsValid()) { - return SimplePredicatePushdown(maybePredicate.Cast(), ctx, pos); + auto pred = SimplePredicatePushdown(maybePredicate.Cast(), ctx, pos); + return TFilterOpsLevels(pred); } if (predicate.Maybe<TCoNot>()) { auto notNode = predicate.Cast<TCoNot>(); - auto pushedNot = PredicatePushdown(notNode.Value(), ctx, pos); - - if (!pushedNot.IsValid()) { - return NullNode; - } - - return Build<TKqpOlapNot>(ctx, pos) - .Value(pushedNot.Cast()) - .Done(); + auto pushedFilters = PredicatePushdown(notNode.Value(), ctx, pos); + pushedFilters.WrapToNotOp(ctx, pos); + return pushedFilters; } if (!predicate.Maybe<TCoAnd>() && !predicate.Maybe<TCoOr>() && !predicate.Maybe<TCoXor>()) { - return NullNode; + return NullFilterOpsLevels; } - TVector<TExprBase> pushedOps; - pushedOps.reserve(predicate.Ptr()->ChildrenSize()); + TVector<TExprBase> firstLvlOps; + TVector<TExprBase> secondLvlOps; + firstLvlOps.reserve(predicate.Ptr()->ChildrenSize()); + secondLvlOps.reserve(predicate.Ptr()->ChildrenSize()); for (auto& child: predicate.Ptr()->Children()) { auto pushedChild = PredicatePushdown(TExprBase(child), ctx, pos); if (!pushedChild.IsValid()) { - return NullNode; + return NullFilterOpsLevels; } - pushedOps.emplace_back(pushedChild.Cast()); + if (pushedChild.FirstLevelOps.IsValid()) { + firstLvlOps.emplace_back(pushedChild.FirstLevelOps.Cast()); + } + if (pushedChild.SecondLevelOps.IsValid()) { + secondLvlOps.emplace_back(pushedChild.SecondLevelOps.Cast()); + } } if (predicate.Maybe<TCoAnd>()) { - return Build<TKqpOlapAnd>(ctx, pos) - .Add(pushedOps) - .Done(); + auto firstLvl = NullNode; + if (!firstLvlOps.empty()) { + firstLvl = Build<TKqpOlapAnd>(ctx, pos) + .Add(firstLvlOps) + .Done(); + } + auto secondLvl = NullNode; + if (!secondLvlOps.empty()) { + secondLvl = Build<TKqpOlapAnd>(ctx, pos) + .Add(secondLvlOps) + .Done(); + } + return TFilterOpsLevels(firstLvl, secondLvl); } if (predicate.Maybe<TCoOr>()) { - return Build<TKqpOlapOr>(ctx, pos) - .Add(pushedOps) + auto ops = Build<TKqpOlapOr>(ctx, pos) + .Add(firstLvlOps) + .Add(secondLvlOps) .Done(); + return TFilterOpsLevels(ops, NullNode); } Y_VERIFY_DEBUG(predicate.Maybe<TCoXor>()); - return Build<TKqpOlapXor>(ctx, pos) - .Add(pushedOps) + auto ops = Build<TKqpOlapXor>(ctx, pos) + .Add(firstLvlOps) + .Add(secondLvlOps) .Done(); + return TFilterOpsLevels(ops, NullNode); } void SplitForPartialPushdown(const TPredicateNode& predicateTree, TPredicateNode& predicatesToPush, TPredicateNode& remainingPredicates, @@ -476,18 +553,28 @@ TExprBase KqpPushOlapFilter(TExprBase node, TExprContext& ctx, const TKqpOptimiz YQL_ENSURE(predicatesToPush.IsValid(), "Predicates to push is invalid"); YQL_ENSURE(remainingPredicates.IsValid(), "Remaining predicates is invalid"); - auto pushedPredicate = PredicatePushdown(predicatesToPush.ExprNode.Cast(), ctx, node.Pos()); - YQL_ENSURE(pushedPredicate.IsValid(), "Pushed predicate should be always valid!"); + auto pushedFilters = PredicatePushdown(predicatesToPush.ExprNode.Cast(), ctx, node.Pos()); + YQL_ENSURE(pushedFilters.IsValid(), "Pushed predicate should be always valid!"); - auto olapFilter = Build<TKqpOlapFilter>(ctx, node.Pos()) - .Input(read.Process().Body()) - .Condition(pushedPredicate.Cast()) - .Done(); + TMaybeNode<TExprBase> olapFilter; + if (pushedFilters.FirstLevelOps.IsValid()) { + olapFilter = Build<TKqpOlapFilter>(ctx, node.Pos()) + .Input(read.Process().Body()) + .Condition(pushedFilters.FirstLevelOps.Cast()) + .Done(); + } + + if (pushedFilters.SecondLevelOps.IsValid()) { + olapFilter = Build<TKqpOlapFilter>(ctx, node.Pos()) + .Input(olapFilter.IsValid() ? olapFilter.Cast() : read.Process().Body()) + .Condition(pushedFilters.SecondLevelOps.Cast()) + .Done(); + } auto newProcessLambda = Build<TCoLambda>(ctx, node.Pos()) .Args({"olap_filter_row"}) .Body<TExprApplier>() - .Apply(olapFilter) + .Apply(olapFilter.Cast()) .With(read.Process().Args().Arg(0), "olap_filter_row") .Build() .Done(); diff --git a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp index ba7e736cdb6..8fb80d889c1 100644 --- a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp +++ b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp @@ -485,6 +485,14 @@ void CompileOlapProgramImpl(TExprBase operation, TKqpOlapCompileContext& ctx) { if (auto maybeOlapOperation = operation.Maybe<TKqpOlapOperationBase>()) { CompileOlapProgramImpl(maybeOlapOperation.Cast().Input(), ctx); if (auto maybeFilter = operation.Maybe<TKqpOlapFilter>()) { + // On the first level of filters we apply fast and light filters: <, >, !=, == etc. + // On the second level we apply high-cost filters (LIKE operation) on top of filtered data from the 1st level. + if (maybeFilter.Cast().Input().Maybe<TKqpOlapFilter>()) { + // The 2nd level of filters use the result of 1st level as input. + // We create an empty projection to run first level apply. + // Because real execution of filters is done on Projection and GroupBy steps. + ctx.CreateProjection(); + } CompileFilter(maybeFilter.Cast(), ctx); } else if (auto maybeAgg = operation.Maybe<TKqpOlapAgg>()) { CompileAggregates(maybeAgg.Cast(), ctx); diff --git a/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp b/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp index 3ba59719c99..e17f7667ff1 100644 --- a/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp +++ b/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp @@ -1318,6 +1318,63 @@ Y_UNIT_TEST_SUITE(KqpOlap) { } } +#if SSA_RUNTIME_VERSION >= 2U + Y_UNIT_TEST(PredicatePushdown_DifferentLvlOfFilters) { + auto settings = TKikimrSettings() + .SetWithSampleTables(false); + TKikimrRunner kikimr(settings); + + TStreamExecScanQuerySettings scanSettings; + scanSettings.Explain(true); + + TLocalHelper(kikimr).CreateTestOlapTable(); + WriteTestData(kikimr, "/Root/olapStore/olapTable", 10000, 3000000, 5); + EnableDebugLogging(kikimr); + + auto tableClient = kikimr.GetTableClient(); + + std::vector< std::pair<TString, TString> > secondLvlFilters = { + { R"(`uid` LIKE "%30000%")", "TableFullScan" }, + { R"(`uid` NOT LIKE "%30000%")", "TableFullScan" }, + { R"(`uid` LIKE "uid%")", "TableFullScan" }, + { R"(`uid` LIKE "%001")", "TableFullScan" }, + { R"(`uid` LIKE "uid%001")", "Filter-TableFullScan" }, // We have filter (Size >= 6) + }; + std::string query = R"( + SELECT `timestamp` FROM `/Root/olapStore/olapTable` WHERE + `level` >= 1 AND + )"; + + for (auto filter : secondLvlFilters) { + auto it = tableClient.StreamExecuteScanQuery(query + filter.first, scanSettings).GetValueSync(); + UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString()); + + auto result = CollectStreamResult(it); + NJson::TJsonValue plan; + NJson::ReadJsonTree(*result.PlanJson, &plan, true); + + auto readNode = FindPlanNodeByKv(plan, "Node Type", filter.second); + UNIT_ASSERT(readNode.IsDefined()); + + auto& operators = readNode.GetMapSafe().at("Operators").GetArraySafe(); + for (auto& op : operators) { + if (op.GetMapSafe().at("Name") == "TableFullScan") { + UNIT_ASSERT(op.GetMapSafe().at("SsaProgram").IsDefined()); + auto ssa = op.GetMapSafe().at("SsaProgram").GetStringRobust(); + int filterCmdCount = 0; + std::string::size_type pos = 0; + std::string filterCmd = R"("Filter":{)"; + while ((pos = ssa.find(filterCmd, pos)) != std::string::npos) { + ++filterCmdCount; + pos += filterCmd.size(); + } + UNIT_ASSERT_EQUAL(filterCmdCount, 2); + } + } + } + } +#endif + Y_UNIT_TEST(PredicatePushdown_MixStrictAndNotStrict) { auto settings = TKikimrSettings() .SetWithSampleTables(false); diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 index d2182e183c4..59bac5b20b7 100644 --- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 +++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 @@ -80,7 +80,7 @@ "Id": 106 }, "Constant": { - "Text": "google" + "Text": "" } } }, @@ -92,66 +92,58 @@ "Function": { "Arguments": [ { - "Id": 14 + "Id": 40 }, { "Id": 106 } ], - "Id": 9 + "Id": 2 } } }, { - "Assign": { - "Column": { - "Id": 108 - }, - "Constant": { - "Text": "" + "Filter": { + "Predicate": { + "Id": 107 } } }, { + "Projection": {} + }, + { "Assign": { "Column": { - "Id": 109 + "Id": 108 }, - "Function": { - "Arguments": [ - { - "Id": 40 - }, - { - "Id": 108 - } - ], - "Id": 2 + "Constant": { + "Text": "google" } } }, { "Assign": { "Column": { - "Id": 110 + "Id": 109 }, "Function": { "Arguments": [ { - "Id": 107 + "Id": 14 }, { - "Id": 109 + "Id": 108 } ], - "Id": 11 + "Id": 9 } } }, { "Filter": { "Predicate": { - "Id": 110 + "Id": 109 } } }, diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 index fcf79ea2007..6cfbd7f4bee 100644 --- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 +++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 @@ -96,7 +96,7 @@ "Id": 106 }, "Constant": { - "Text": "Google" + "Text": "" } } }, @@ -108,23 +108,33 @@ "Function": { "Arguments": [ { - "Id": 3 + "Id": 40 }, { "Id": 106 } ], - "Id": 9 + "Id": 2 } } }, { + "Filter": { + "Predicate": { + "Id": 107 + } + } + }, + { + "Projection": {} + }, + { "Assign": { "Column": { "Id": 108 }, "Constant": { - "Text": ".google." + "Text": "Google" } } }, @@ -136,7 +146,7 @@ "Function": { "Arguments": [ { - "Id": 14 + "Id": 3 }, { "Id": 108 @@ -151,74 +161,56 @@ "Column": { "Id": 110 }, - "Function": { - "Arguments": [ - { - "Id": 109 - } - ], - "Id": 10 - } - } - }, - { - "Assign": { - "Column": { - "Id": 111 - }, "Constant": { - "Text": "" + "Text": ".google." } } }, { "Assign": { "Column": { - "Id": 112 + "Id": 111 }, "Function": { "Arguments": [ { - "Id": 40 + "Id": 14 }, { - "Id": 111 + "Id": 110 } ], - "Id": 2 + "Id": 9 } } }, { "Assign": { "Column": { - "Id": 113 + "Id": 112 }, "Function": { "Arguments": [ { - "Id": 110 - }, - { - "Id": 112 + "Id": 111 } ], - "Id": 11 + "Id": 10 } } }, { "Assign": { "Column": { - "Id": 114 + "Id": 113 }, "Function": { "Arguments": [ { - "Id": 107 + "Id": 109 }, { - "Id": 113 + "Id": 112 } ], "Id": 11 @@ -228,7 +220,7 @@ { "Filter": { "Predicate": { - "Id": 114 + "Id": 113 } } }, @@ -325,7 +317,7 @@ "Id": 106 }, "Constant": { - "Text": "Google" + "Text": "" } } }, @@ -337,23 +329,33 @@ "Function": { "Arguments": [ { - "Id": 3 + "Id": 40 }, { "Id": 106 } ], - "Id": 9 + "Id": 2 } } }, { + "Filter": { + "Predicate": { + "Id": 107 + } + } + }, + { + "Projection": {} + }, + { "Assign": { "Column": { "Id": 108 }, "Constant": { - "Text": ".google." + "Text": "Google" } } }, @@ -365,7 +367,7 @@ "Function": { "Arguments": [ { - "Id": 14 + "Id": 3 }, { "Id": 108 @@ -380,74 +382,56 @@ "Column": { "Id": 110 }, - "Function": { - "Arguments": [ - { - "Id": 109 - } - ], - "Id": 10 - } - } - }, - { - "Assign": { - "Column": { - "Id": 111 - }, "Constant": { - "Text": "" + "Text": ".google." } } }, { "Assign": { "Column": { - "Id": 112 + "Id": 111 }, "Function": { "Arguments": [ { - "Id": 40 + "Id": 14 }, { - "Id": 111 + "Id": 110 } ], - "Id": 2 + "Id": 9 } } }, { "Assign": { "Column": { - "Id": 113 + "Id": 112 }, "Function": { "Arguments": [ { - "Id": 110 - }, - { - "Id": 112 + "Id": 111 } ], - "Id": 11 + "Id": 10 } } }, { "Assign": { "Column": { - "Id": 114 + "Id": 113 }, "Function": { "Arguments": [ { - "Id": 107 + "Id": 109 }, { - "Id": 113 + "Id": 112 } ], "Id": 11 @@ -457,7 +441,7 @@ { "Filter": { "Predicate": { - "Id": 114 + "Id": 113 } } }, |