summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoraidarsamer <[email protected]>2023-03-30 12:56:27 +0300
committeraidarsamer <[email protected]>2023-03-30 12:56:27 +0300
commit58206a8c5c05c6fd7af5136f77778b580ff93a88 (patch)
treef40c675c669689c7a76ec9cb7470126772084ea0
parent4b564f145511952509eb3a175dd240c91681cc65 (diff)
Apply LIKE filters on column shards after simple filters
-rw-r--r--ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp151
-rw-r--r--ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp8
-rw-r--r--ydb/core/kqp/ut/olap/kqp_olap_ut.cpp57
-rw-r--r--ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-2142
-rw-r--r--ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22124
5 files changed, 255 insertions, 127 deletions
diff --git a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp
index b90a45a2918..30d5a0f88c3 100644
--- a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp
+++ b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp
@@ -6,6 +6,8 @@
#include <ydb/library/yql/core/extract_predicate/extract_predicate.h>
#include <ydb/library/yql/core/yql_opt_utils.h>
+#include <unordered_set>
+
namespace NKikimr::NKqp::NOpt {
using namespace NYql;
@@ -15,6 +17,63 @@ namespace {
static TMaybeNode<TExprBase> NullNode = TMaybeNode<TExprBase>();
+static const std::unordered_set<std::string> SecondLevelFilters = {
+ "string_contains",
+ "starts_with",
+ "ends_with"
+};
+
+struct TFilterOpsLevels {
+ TFilterOpsLevels(const TMaybeNode<TExprBase>& firstLevel, const TMaybeNode<TExprBase>& secondLevel)
+ : FirstLevelOps(firstLevel)
+ , SecondLevelOps(secondLevel)
+ {}
+
+ TFilterOpsLevels(const TMaybeNode<TExprBase>& predicate)
+ : FirstLevelOps(predicate)
+ , SecondLevelOps(NullNode)
+ {
+ if (IsSecondLevelOp(predicate)) {
+ FirstLevelOps = NullNode;
+ SecondLevelOps = predicate;
+ }
+ }
+
+ bool IsValid() {
+ return FirstLevelOps.IsValid() || SecondLevelOps.IsValid();
+ }
+
+ bool IsSecondLevelOp(const TMaybeNode<TExprBase>& predicate) {
+ if (auto maybeCompare = predicate.Maybe<TKqpOlapFilterCompare>()) {
+ auto op = maybeCompare.Cast().Operator().StringValue();
+ if (SecondLevelFilters.find(op) != SecondLevelFilters.end()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void WrapToNotOp(TExprContext& ctx, TPositionHandle pos) {
+ if (FirstLevelOps.IsValid()) {
+ FirstLevelOps = Build<TKqpOlapNot>(ctx, pos)
+ .Value(FirstLevelOps.Cast())
+ .Done();
+ }
+
+ if (SecondLevelOps.IsValid()) {
+ SecondLevelOps = Build<TKqpOlapNot>(ctx, pos)
+ .Value(SecondLevelOps.Cast())
+ .Done();
+ }
+ }
+
+
+ TMaybeNode<TExprBase> FirstLevelOps;
+ TMaybeNode<TExprBase> SecondLevelOps;
+};
+
+static TFilterOpsLevels NullFilterOpsLevels = TFilterOpsLevels(NullNode, NullNode);
+
bool IsFalseLiteral(TExprBase node) {
return node.Maybe<TCoBool>() && !FromString<bool>(node.Cast<TCoBool>().Literal().Value());
}
@@ -329,70 +388,88 @@ TMaybeNode<TExprBase> CoalescePushdown(const TCoCoalesce& coalesce, TExprContext
return NullNode;
}
-TMaybeNode<TExprBase> PredicatePushdown(const TExprBase& predicate, TExprContext& ctx, TPositionHandle pos)
+TFilterOpsLevels PredicatePushdown(const TExprBase& predicate, TExprContext& ctx, TPositionHandle pos)
{
auto maybeCoalesce = predicate.Maybe<TCoCoalesce>();
if (maybeCoalesce.IsValid()) {
- return CoalescePushdown(maybeCoalesce.Cast(), ctx, pos);
+ auto coalescePred = CoalescePushdown(maybeCoalesce.Cast(), ctx, pos);
+ return TFilterOpsLevels(coalescePred);
}
auto maybeExists = predicate.Maybe<TCoExists>();
if (maybeExists.IsValid()) {
- return ExistsPushdown(maybeExists.Cast(), ctx, pos);
+ auto existsPred = ExistsPushdown(maybeExists.Cast(), ctx, pos);
+ return TFilterOpsLevels(existsPred);
}
auto maybePredicate = predicate.Maybe<TCoCompare>();
if (maybePredicate.IsValid()) {
- return SimplePredicatePushdown(maybePredicate.Cast(), ctx, pos);
+ auto pred = SimplePredicatePushdown(maybePredicate.Cast(), ctx, pos);
+ return TFilterOpsLevels(pred);
}
if (predicate.Maybe<TCoNot>()) {
auto notNode = predicate.Cast<TCoNot>();
- auto pushedNot = PredicatePushdown(notNode.Value(), ctx, pos);
-
- if (!pushedNot.IsValid()) {
- return NullNode;
- }
-
- return Build<TKqpOlapNot>(ctx, pos)
- .Value(pushedNot.Cast())
- .Done();
+ auto pushedFilters = PredicatePushdown(notNode.Value(), ctx, pos);
+ pushedFilters.WrapToNotOp(ctx, pos);
+ return pushedFilters;
}
if (!predicate.Maybe<TCoAnd>() && !predicate.Maybe<TCoOr>() && !predicate.Maybe<TCoXor>()) {
- return NullNode;
+ return NullFilterOpsLevels;
}
- TVector<TExprBase> pushedOps;
- pushedOps.reserve(predicate.Ptr()->ChildrenSize());
+ TVector<TExprBase> firstLvlOps;
+ TVector<TExprBase> secondLvlOps;
+ firstLvlOps.reserve(predicate.Ptr()->ChildrenSize());
+ secondLvlOps.reserve(predicate.Ptr()->ChildrenSize());
for (auto& child: predicate.Ptr()->Children()) {
auto pushedChild = PredicatePushdown(TExprBase(child), ctx, pos);
if (!pushedChild.IsValid()) {
- return NullNode;
+ return NullFilterOpsLevels;
}
- pushedOps.emplace_back(pushedChild.Cast());
+ if (pushedChild.FirstLevelOps.IsValid()) {
+ firstLvlOps.emplace_back(pushedChild.FirstLevelOps.Cast());
+ }
+ if (pushedChild.SecondLevelOps.IsValid()) {
+ secondLvlOps.emplace_back(pushedChild.SecondLevelOps.Cast());
+ }
}
if (predicate.Maybe<TCoAnd>()) {
- return Build<TKqpOlapAnd>(ctx, pos)
- .Add(pushedOps)
- .Done();
+ auto firstLvl = NullNode;
+ if (!firstLvlOps.empty()) {
+ firstLvl = Build<TKqpOlapAnd>(ctx, pos)
+ .Add(firstLvlOps)
+ .Done();
+ }
+ auto secondLvl = NullNode;
+ if (!secondLvlOps.empty()) {
+ secondLvl = Build<TKqpOlapAnd>(ctx, pos)
+ .Add(secondLvlOps)
+ .Done();
+ }
+ return TFilterOpsLevels(firstLvl, secondLvl);
}
if (predicate.Maybe<TCoOr>()) {
- return Build<TKqpOlapOr>(ctx, pos)
- .Add(pushedOps)
+ auto ops = Build<TKqpOlapOr>(ctx, pos)
+ .Add(firstLvlOps)
+ .Add(secondLvlOps)
.Done();
+ return TFilterOpsLevels(ops, NullNode);
}
Y_VERIFY_DEBUG(predicate.Maybe<TCoXor>());
- return Build<TKqpOlapXor>(ctx, pos)
- .Add(pushedOps)
+ auto ops = Build<TKqpOlapXor>(ctx, pos)
+ .Add(firstLvlOps)
+ .Add(secondLvlOps)
.Done();
+ return TFilterOpsLevels(ops, NullNode);
}
void SplitForPartialPushdown(const TPredicateNode& predicateTree, TPredicateNode& predicatesToPush, TPredicateNode& remainingPredicates,
@@ -476,18 +553,28 @@ TExprBase KqpPushOlapFilter(TExprBase node, TExprContext& ctx, const TKqpOptimiz
YQL_ENSURE(predicatesToPush.IsValid(), "Predicates to push is invalid");
YQL_ENSURE(remainingPredicates.IsValid(), "Remaining predicates is invalid");
- auto pushedPredicate = PredicatePushdown(predicatesToPush.ExprNode.Cast(), ctx, node.Pos());
- YQL_ENSURE(pushedPredicate.IsValid(), "Pushed predicate should be always valid!");
+ auto pushedFilters = PredicatePushdown(predicatesToPush.ExprNode.Cast(), ctx, node.Pos());
+ YQL_ENSURE(pushedFilters.IsValid(), "Pushed predicate should be always valid!");
- auto olapFilter = Build<TKqpOlapFilter>(ctx, node.Pos())
- .Input(read.Process().Body())
- .Condition(pushedPredicate.Cast())
- .Done();
+ TMaybeNode<TExprBase> olapFilter;
+ if (pushedFilters.FirstLevelOps.IsValid()) {
+ olapFilter = Build<TKqpOlapFilter>(ctx, node.Pos())
+ .Input(read.Process().Body())
+ .Condition(pushedFilters.FirstLevelOps.Cast())
+ .Done();
+ }
+
+ if (pushedFilters.SecondLevelOps.IsValid()) {
+ olapFilter = Build<TKqpOlapFilter>(ctx, node.Pos())
+ .Input(olapFilter.IsValid() ? olapFilter.Cast() : read.Process().Body())
+ .Condition(pushedFilters.SecondLevelOps.Cast())
+ .Done();
+ }
auto newProcessLambda = Build<TCoLambda>(ctx, node.Pos())
.Args({"olap_filter_row"})
.Body<TExprApplier>()
- .Apply(olapFilter)
+ .Apply(olapFilter.Cast())
.With(read.Process().Args().Arg(0), "olap_filter_row")
.Build()
.Done();
diff --git a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp
index ba7e736cdb6..8fb80d889c1 100644
--- a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp
+++ b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp
@@ -485,6 +485,14 @@ void CompileOlapProgramImpl(TExprBase operation, TKqpOlapCompileContext& ctx) {
if (auto maybeOlapOperation = operation.Maybe<TKqpOlapOperationBase>()) {
CompileOlapProgramImpl(maybeOlapOperation.Cast().Input(), ctx);
if (auto maybeFilter = operation.Maybe<TKqpOlapFilter>()) {
+ // On the first level of filters we apply fast and light filters: <, >, !=, == etc.
+ // On the second level we apply high-cost filters (LIKE operation) on top of filtered data from the 1st level.
+ if (maybeFilter.Cast().Input().Maybe<TKqpOlapFilter>()) {
+ // The 2nd level of filters use the result of 1st level as input.
+ // We create an empty projection to run first level apply.
+ // Because real execution of filters is done on Projection and GroupBy steps.
+ ctx.CreateProjection();
+ }
CompileFilter(maybeFilter.Cast(), ctx);
} else if (auto maybeAgg = operation.Maybe<TKqpOlapAgg>()) {
CompileAggregates(maybeAgg.Cast(), ctx);
diff --git a/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp b/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp
index 3ba59719c99..e17f7667ff1 100644
--- a/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp
+++ b/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp
@@ -1318,6 +1318,63 @@ Y_UNIT_TEST_SUITE(KqpOlap) {
}
}
+#if SSA_RUNTIME_VERSION >= 2U
+ Y_UNIT_TEST(PredicatePushdown_DifferentLvlOfFilters) {
+ auto settings = TKikimrSettings()
+ .SetWithSampleTables(false);
+ TKikimrRunner kikimr(settings);
+
+ TStreamExecScanQuerySettings scanSettings;
+ scanSettings.Explain(true);
+
+ TLocalHelper(kikimr).CreateTestOlapTable();
+ WriteTestData(kikimr, "/Root/olapStore/olapTable", 10000, 3000000, 5);
+ EnableDebugLogging(kikimr);
+
+ auto tableClient = kikimr.GetTableClient();
+
+ std::vector< std::pair<TString, TString> > secondLvlFilters = {
+ { R"(`uid` LIKE "%30000%")", "TableFullScan" },
+ { R"(`uid` NOT LIKE "%30000%")", "TableFullScan" },
+ { R"(`uid` LIKE "uid%")", "TableFullScan" },
+ { R"(`uid` LIKE "%001")", "TableFullScan" },
+ { R"(`uid` LIKE "uid%001")", "Filter-TableFullScan" }, // We have filter (Size >= 6)
+ };
+ std::string query = R"(
+ SELECT `timestamp` FROM `/Root/olapStore/olapTable` WHERE
+ `level` >= 1 AND
+ )";
+
+ for (auto filter : secondLvlFilters) {
+ auto it = tableClient.StreamExecuteScanQuery(query + filter.first, scanSettings).GetValueSync();
+ UNIT_ASSERT_C(it.IsSuccess(), it.GetIssues().ToString());
+
+ auto result = CollectStreamResult(it);
+ NJson::TJsonValue plan;
+ NJson::ReadJsonTree(*result.PlanJson, &plan, true);
+
+ auto readNode = FindPlanNodeByKv(plan, "Node Type", filter.second);
+ UNIT_ASSERT(readNode.IsDefined());
+
+ auto& operators = readNode.GetMapSafe().at("Operators").GetArraySafe();
+ for (auto& op : operators) {
+ if (op.GetMapSafe().at("Name") == "TableFullScan") {
+ UNIT_ASSERT(op.GetMapSafe().at("SsaProgram").IsDefined());
+ auto ssa = op.GetMapSafe().at("SsaProgram").GetStringRobust();
+ int filterCmdCount = 0;
+ std::string::size_type pos = 0;
+ std::string filterCmd = R"("Filter":{)";
+ while ((pos = ssa.find(filterCmd, pos)) != std::string::npos) {
+ ++filterCmdCount;
+ pos += filterCmd.size();
+ }
+ UNIT_ASSERT_EQUAL(filterCmdCount, 2);
+ }
+ }
+ }
+ }
+#endif
+
Y_UNIT_TEST(PredicatePushdown_MixStrictAndNotStrict) {
auto settings = TKikimrSettings()
.SetWithSampleTables(false);
diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21
index d2182e183c4..59bac5b20b7 100644
--- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21
+++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21
@@ -80,7 +80,7 @@
"Id": 106
},
"Constant": {
- "Text": "google"
+ "Text": ""
}
}
},
@@ -92,66 +92,58 @@
"Function": {
"Arguments": [
{
- "Id": 14
+ "Id": 40
},
{
"Id": 106
}
],
- "Id": 9
+ "Id": 2
}
}
},
{
- "Assign": {
- "Column": {
- "Id": 108
- },
- "Constant": {
- "Text": ""
+ "Filter": {
+ "Predicate": {
+ "Id": 107
}
}
},
{
+ "Projection": {}
+ },
+ {
"Assign": {
"Column": {
- "Id": 109
+ "Id": 108
},
- "Function": {
- "Arguments": [
- {
- "Id": 40
- },
- {
- "Id": 108
- }
- ],
- "Id": 2
+ "Constant": {
+ "Text": "google"
}
}
},
{
"Assign": {
"Column": {
- "Id": 110
+ "Id": 109
},
"Function": {
"Arguments": [
{
- "Id": 107
+ "Id": 14
},
{
- "Id": 109
+ "Id": 108
}
],
- "Id": 11
+ "Id": 9
}
}
},
{
"Filter": {
"Predicate": {
- "Id": 110
+ "Id": 109
}
}
},
diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22
index fcf79ea2007..6cfbd7f4bee 100644
--- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22
+++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22
@@ -96,7 +96,7 @@
"Id": 106
},
"Constant": {
- "Text": "Google"
+ "Text": ""
}
}
},
@@ -108,23 +108,33 @@
"Function": {
"Arguments": [
{
- "Id": 3
+ "Id": 40
},
{
"Id": 106
}
],
- "Id": 9
+ "Id": 2
}
}
},
{
+ "Filter": {
+ "Predicate": {
+ "Id": 107
+ }
+ }
+ },
+ {
+ "Projection": {}
+ },
+ {
"Assign": {
"Column": {
"Id": 108
},
"Constant": {
- "Text": ".google."
+ "Text": "Google"
}
}
},
@@ -136,7 +146,7 @@
"Function": {
"Arguments": [
{
- "Id": 14
+ "Id": 3
},
{
"Id": 108
@@ -151,74 +161,56 @@
"Column": {
"Id": 110
},
- "Function": {
- "Arguments": [
- {
- "Id": 109
- }
- ],
- "Id": 10
- }
- }
- },
- {
- "Assign": {
- "Column": {
- "Id": 111
- },
"Constant": {
- "Text": ""
+ "Text": ".google."
}
}
},
{
"Assign": {
"Column": {
- "Id": 112
+ "Id": 111
},
"Function": {
"Arguments": [
{
- "Id": 40
+ "Id": 14
},
{
- "Id": 111
+ "Id": 110
}
],
- "Id": 2
+ "Id": 9
}
}
},
{
"Assign": {
"Column": {
- "Id": 113
+ "Id": 112
},
"Function": {
"Arguments": [
{
- "Id": 110
- },
- {
- "Id": 112
+ "Id": 111
}
],
- "Id": 11
+ "Id": 10
}
}
},
{
"Assign": {
"Column": {
- "Id": 114
+ "Id": 113
},
"Function": {
"Arguments": [
{
- "Id": 107
+ "Id": 109
},
{
- "Id": 113
+ "Id": 112
}
],
"Id": 11
@@ -228,7 +220,7 @@
{
"Filter": {
"Predicate": {
- "Id": 114
+ "Id": 113
}
}
},
@@ -325,7 +317,7 @@
"Id": 106
},
"Constant": {
- "Text": "Google"
+ "Text": ""
}
}
},
@@ -337,23 +329,33 @@
"Function": {
"Arguments": [
{
- "Id": 3
+ "Id": 40
},
{
"Id": 106
}
],
- "Id": 9
+ "Id": 2
}
}
},
{
+ "Filter": {
+ "Predicate": {
+ "Id": 107
+ }
+ }
+ },
+ {
+ "Projection": {}
+ },
+ {
"Assign": {
"Column": {
"Id": 108
},
"Constant": {
- "Text": ".google."
+ "Text": "Google"
}
}
},
@@ -365,7 +367,7 @@
"Function": {
"Arguments": [
{
- "Id": 14
+ "Id": 3
},
{
"Id": 108
@@ -380,74 +382,56 @@
"Column": {
"Id": 110
},
- "Function": {
- "Arguments": [
- {
- "Id": 109
- }
- ],
- "Id": 10
- }
- }
- },
- {
- "Assign": {
- "Column": {
- "Id": 111
- },
"Constant": {
- "Text": ""
+ "Text": ".google."
}
}
},
{
"Assign": {
"Column": {
- "Id": 112
+ "Id": 111
},
"Function": {
"Arguments": [
{
- "Id": 40
+ "Id": 14
},
{
- "Id": 111
+ "Id": 110
}
],
- "Id": 2
+ "Id": 9
}
}
},
{
"Assign": {
"Column": {
- "Id": 113
+ "Id": 112
},
"Function": {
"Arguments": [
{
- "Id": 110
- },
- {
- "Id": 112
+ "Id": 111
}
],
- "Id": 11
+ "Id": 10
}
}
},
{
"Assign": {
"Column": {
- "Id": 114
+ "Id": 113
},
"Function": {
"Arguments": [
{
- "Id": 107
+ "Id": 109
},
{
- "Id": 113
+ "Id": 112
}
],
"Id": 11
@@ -457,7 +441,7 @@
{
"Filter": {
"Predicate": {
- "Id": 114
+ "Id": 113
}
}
},