diff options
| author | aidarsamer <[email protected]> | 2023-03-15 21:25:17 +0300 |
|---|---|---|
| committer | aidarsamer <[email protected]> | 2023-03-15 21:25:17 +0300 |
| commit | ed5086c2dbf543b595a317eb344d87a72fa0aa94 (patch) | |
| tree | 49864e3d598e53b9e2960aadba9953c576a0a09e | |
| parent | 57f60f69ebe787311842d359a466d9195ab7fc8a (diff) | |
Implement LIKE filter pushdown to Column Shards
12 files changed, 394 insertions, 64 deletions
diff --git a/ydb/core/kqp/host/kqp_host.cpp b/ydb/core/kqp/host/kqp_host.cpp index 4a41b209f8a..788d8645a2e 100644 --- a/ydb/core/kqp/host/kqp_host.cpp +++ b/ydb/core/kqp/host/kqp_host.cpp @@ -1071,11 +1071,12 @@ private: settings.PathPrefix = tablePathPrefix; } settings.EndOfQueryCommit = sqlAutoCommit; - settings.Flags.insert("DisableEmitStartsWith"); settings.Flags.insert("FlexibleTypes"); if (SessionCtx->Query().Type == EKikimrQueryType::Scan) { // We enable EmitAggApply for aggregate pushdowns to Column Shards which are accessed by Scan query only settings.Flags.insert("EmitAggApply"); + } else { + settings.Flags.insert("DisableEmitStartsWith"); } ui16 actualSyntaxVersion = 0; diff --git a/ydb/core/kqp/host/kqp_type_ann.cpp b/ydb/core/kqp/host/kqp_type_ann.cpp index d5ad48d7fa4..8f34bb57492 100644 --- a/ydb/core/kqp/host/kqp_type_ann.cpp +++ b/ydb/core/kqp/host/kqp_type_ann.cpp @@ -798,10 +798,11 @@ bool ValidateOlapFilterConditions(const TExprNode* node, const TStructExprType* )); return false; } + static const std::unordered_set<std::string> FilterOps = {"eq", "neq", "lt", "lte", "gt", "gte", "string_contains", "starts_with", "ends_with"}; auto opStr = op->Content(); - if (opStr != "eq"sv && opStr != "neq"sv && opStr != "lt"sv && opStr != "lte"sv && opStr != "gt"sv && opStr != "gte"sv) { + if (FilterOps.find(TString(opStr)) == FilterOps.end()) { ctx.AddError(TIssue(ctx.GetPosition(node->Pos()), - TStringBuilder() << "Expected one of eq/neq/lt/lte/gt/gte operators in OLAP comparison filter, got: " << op->Content() + TStringBuilder() << "Expected one of eq/neq/lt/lte/gt/gte/string_contains/starts_with/ends_with operators in OLAP comparison filter, got: " << op->Content() )); return false; } diff --git a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp index 1b102843829..9abfe14e885 100644 --- a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp +++ b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp @@ -136,6 +136,12 @@ TExprBase BuildOneElementComparison(const std::pair<TExprBase, TExprBase>& param compareOperator = "gt"; } else if (predicate.Maybe<TCoCmpGreaterOrEqual>() && !forceStrictComparison) { compareOperator = "gte"; + } else if (predicate.Maybe<TCoCmpStringContains>()) { + compareOperator = "string_contains"; + } else if (predicate.Maybe<TCoCmpStartsWith>()) { + compareOperator = "starts_with"; + } else if (predicate.Maybe<TCoCmpEndsWith>()) { + compareOperator = "ends_with"; } YQL_ENSURE(!compareOperator.empty(), "Unsupported comparison node: " << predicate.Ptr()->Content()); diff --git a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp index cf2c08b5324..b9b3f5c8757 100644 --- a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp +++ b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp @@ -15,25 +15,23 @@ namespace { bool IsSupportedPredicate(const TCoCompare& predicate) { if (predicate.Maybe<TCoCmpEqual>()) { return true; - } - - if (predicate.Maybe<TCoCmpLess>()) { + } else if (predicate.Maybe<TCoCmpLess>()) { return true; - } - - if (predicate.Maybe<TCoCmpGreater>()) { + } else if (predicate.Maybe<TCoCmpGreater>()) { return true; - } - - if (predicate.Maybe<TCoCmpNotEqual>()) { + } else if (predicate.Maybe<TCoCmpNotEqual>()) { return true; - } - - if (predicate.Maybe<TCoCmpGreaterOrEqual>()) { + } else if (predicate.Maybe<TCoCmpGreaterOrEqual>()) { return true; - } - - if (predicate.Maybe<TCoCmpLessOrEqual>()) { + } else if (predicate.Maybe<TCoCmpLessOrEqual>()) { + return true; + } else if (predicate.Maybe<TCoCmpLessOrEqual>()) { + return true; + } else if (predicate.Maybe<TCoCmpStringContains>()) { + return true; + } else if (predicate.Maybe<TCoCmpStartsWith>()) { + return true; + } else if (predicate.Maybe<TCoCmpEndsWith>()) { return true; } diff --git a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp index 1234213c364..ba7e736cdb6 100644 --- a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp +++ b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp @@ -295,6 +295,12 @@ TProgram::TAssignment* CompileComparison(const TKqpOlapFilterCompare& comparison function = TProgram::TAssignment::FUNC_CMP_GREATER; } else if (comparison.Operator() == "gte") { function = TProgram::TAssignment::FUNC_CMP_GREATER_EQUAL; + } else if (comparison.Operator() == "string_contains") { + function = TProgram::TAssignment::FUNC_STR_MATCH; + } else if (comparison.Operator() == "starts_with") { + function = TProgram::TAssignment::FUNC_STR_STARTS_WITH; + } else if (comparison.Operator() == "ends_with") { + function = TProgram::TAssignment::FUNC_STR_ENDS_WITH; } cmpFunc->SetId(function); diff --git a/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp b/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp index 34871b51924..579dc8076d7 100644 --- a/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp +++ b/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp @@ -1171,6 +1171,10 @@ Y_UNIT_TEST_SUITE(KqpOlap) { R"(CAST("2" As Int32) >= `level`)", R"(`timestamp` >= CAST(3000001u AS Timestamp))", R"((`timestamp`, `level`) >= (CAST(3000001u AS Timestamp), 3))", + R"(`uid` LIKE "%30000%")", + R"(`uid` LIKE "uid%")", + R"(`uid` LIKE "%001")", + R"(`uid` LIKE "uid%001")", }; std::vector<TString> testDataNoPush = { @@ -1211,6 +1215,7 @@ Y_UNIT_TEST_SUITE(KqpOlap) { } qBuilder << R"(PRAGMA Kikimr.OptEnablePredicateExtract = "false";)" << Endl; + qBuilder << R"(PRAGMA AnsiLike;)" << Endl; qBuilder << "SELECT `timestamp` FROM `/Root/olapStore/olapTable` WHERE "; qBuilder << predicate; qBuilder << " ORDER BY `timestamp`"; diff --git a/ydb/library/yql/core/expr_nodes/yql_expr_nodes.json b/ydb/library/yql/core/expr_nodes/yql_expr_nodes.json index 363b331e0b9..fe1e3e9d6f1 100644 --- a/ydb/library/yql/core/expr_nodes/yql_expr_nodes.json +++ b/ydb/library/yql/core/expr_nodes/yql_expr_nodes.json @@ -1198,6 +1198,11 @@ "Match": {"Type": "Callable", "Name": "EndsWith"} }, { + "Name": "TCoCmpStringContains", + "Base": "TCoCompare", + "Match": {"Type": "Callable", "Name": "StringContains"} + }, + { "Name": "TCoInc", "Base": "TCallable", "Match": {"Type": "Callable", "Name": "Inc"}, diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20 index 5c14a9d69a0..f65e0d2f5c8 100644 --- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20 +++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20 @@ -55,16 +55,12 @@ "PlanNodeType": "Connection", "Plans": [ { - "Node Type": "Aggregate-Filter-TableFullScan", + "Node Type": "Aggregate-TableFullScan", "Operators": [ { "Name": "Aggregate" }, { - "Name": "Filter", - "Predicate": "Apply" - }, - { "Name": "TableFullScan", "ReadColumns": [ "URL" @@ -79,12 +75,38 @@ "SsaProgram": { "Command": [ { - "Projection": { - "Columns": [ - { - "Id": 14 - } - ] + "Assign": { + "Column": { + "Id": 106 + }, + "Constant": { + "Text": "google" + } + } + }, + { + "Assign": { + "Column": { + "Id": 107 + }, + "Function": { + "Arguments": [ + { + "Id": 14 + }, + { + "Id": 106 + } + ], + "Id": 9 + } + } + }, + { + "Filter": { + "Predicate": { + "Id": 107 + } } } ], diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 index f6f9f083364..a844de76086 100644 --- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 +++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 @@ -35,7 +35,7 @@ { "Limit": "10", "Name": "TopSort", - "TopSortBy": "$35.c" + "TopSortBy": "$23.c" }, { "Name": "Aggregate" @@ -52,7 +52,7 @@ "PlanNodeType": "Connection", "Plans": [ { - "Node Type": "Aggregate-Filter-TableFullScan", + "Node Type": "Aggregate-TableFullScan", "Operators": [ { "Aggregation": "{_yql_agg_0: MIN(item.URL),_yql_agg_1: Inc(state._yql_agg_1)}", @@ -60,10 +60,6 @@ "Name": "Aggregate" }, { - "Name": "Filter", - "Predicate": "Apply" - }, - { "Name": "TableFullScan", "ReadColumns": [ "SearchPhrase", @@ -84,7 +80,7 @@ "Id": 106 }, "Constant": { - "Text": "" + "Text": "google" } } }, @@ -96,20 +92,66 @@ "Function": { "Arguments": [ { - "Id": 40 + "Id": 14 }, { "Id": 106 } ], + "Id": 9 + } + } + }, + { + "Assign": { + "Column": { + "Id": 108 + }, + "Constant": { + "Text": "" + } + } + }, + { + "Assign": { + "Column": { + "Id": 109 + }, + "Function": { + "Arguments": [ + { + "Id": 40 + }, + { + "Id": 108 + } + ], "Id": 2 } } }, { + "Assign": { + "Column": { + "Id": 110 + }, + "Function": { + "Arguments": [ + { + "Id": 107 + }, + { + "Id": 109 + } + ], + "Id": 11 + } + } + }, + { "Filter": { "Predicate": { - "Id": 107 + "Id": 110 } } }, diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 index 07324fc556e..f28aae83869 100644 --- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 +++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 @@ -35,7 +35,7 @@ { "Limit": "10", "Name": "TopSort", - "TopSortBy": "$54.c" + "TopSortBy": "$40.c" }, { "Name": "Aggregate" @@ -66,7 +66,7 @@ "PlanNodeType": "Connection", "Plans": [ { - "Node Type": "Aggregate-Filter-TableFullScan", + "Node Type": "Aggregate-TableFullScan", "Operators": [ { "Aggregation": "{_yql_agg_0: MIN(item.URL),_yql_agg_1: MIN(item.Title),_yql_agg_2: Inc(state._yql_agg_2)}", @@ -74,10 +74,6 @@ "Name": "Aggregate" }, { - "Name": "Filter", - "Predicate": "Apply And Not" - }, - { "Name": "TableFullScan", "ReadColumns": [ "SearchPhrase", @@ -100,7 +96,7 @@ "Id": 106 }, "Constant": { - "Text": "" + "Text": "Google" } } }, @@ -112,20 +108,127 @@ "Function": { "Arguments": [ { - "Id": 40 + "Id": 3 }, { "Id": 106 } ], + "Id": 9 + } + } + }, + { + "Assign": { + "Column": { + "Id": 108 + }, + "Constant": { + "Text": ".google." + } + } + }, + { + "Assign": { + "Column": { + "Id": 109 + }, + "Function": { + "Arguments": [ + { + "Id": 14 + }, + { + "Id": 108 + } + ], + "Id": 9 + } + } + }, + { + "Assign": { + "Column": { + "Id": 110 + }, + "Function": { + "Arguments": [ + { + "Id": 109 + } + ], + "Id": 10 + } + } + }, + { + "Assign": { + "Column": { + "Id": 111 + }, + "Constant": { + "Text": "" + } + } + }, + { + "Assign": { + "Column": { + "Id": 112 + }, + "Function": { + "Arguments": [ + { + "Id": 40 + }, + { + "Id": 111 + } + ], "Id": 2 } } }, { + "Assign": { + "Column": { + "Id": 113 + }, + "Function": { + "Arguments": [ + { + "Id": 110 + }, + { + "Id": 112 + } + ], + "Id": 11 + } + } + }, + { + "Assign": { + "Column": { + "Id": 114 + }, + "Function": { + "Arguments": [ + { + "Id": 107 + }, + { + "Id": 113 + } + ], + "Id": 11 + } + } + }, + { "Filter": { "Predicate": { - "Id": 107 + "Id": 114 } } }, @@ -192,7 +295,7 @@ "PlanNodeType": "Connection", "Plans": [ { - "Node Type": "Aggregate-Filter-TableFullScan", + "Node Type": "Aggregate-TableFullScan", "Operators": [ { "Aggregation": "state", @@ -200,10 +303,6 @@ "Name": "Aggregate" }, { - "Name": "Filter", - "Predicate": "Apply And Not" - }, - { "Name": "TableFullScan", "ReadColumns": [ "SearchPhrase", @@ -226,7 +325,7 @@ "Id": 106 }, "Constant": { - "Text": "" + "Text": "Google" } } }, @@ -238,20 +337,127 @@ "Function": { "Arguments": [ { - "Id": 40 + "Id": 3 }, { "Id": 106 } ], + "Id": 9 + } + } + }, + { + "Assign": { + "Column": { + "Id": 108 + }, + "Constant": { + "Text": ".google." + } + } + }, + { + "Assign": { + "Column": { + "Id": 109 + }, + "Function": { + "Arguments": [ + { + "Id": 14 + }, + { + "Id": 108 + } + ], + "Id": 9 + } + } + }, + { + "Assign": { + "Column": { + "Id": 110 + }, + "Function": { + "Arguments": [ + { + "Id": 109 + } + ], + "Id": 10 + } + } + }, + { + "Assign": { + "Column": { + "Id": 111 + }, + "Constant": { + "Text": "" + } + } + }, + { + "Assign": { + "Column": { + "Id": 112 + }, + "Function": { + "Arguments": [ + { + "Id": 40 + }, + { + "Id": 111 + } + ], "Id": 2 } } }, { + "Assign": { + "Column": { + "Id": 113 + }, + "Function": { + "Arguments": [ + { + "Id": 110 + }, + { + "Id": 112 + } + ], + "Id": 11 + } + } + }, + { + "Assign": { + "Column": { + "Id": 114 + }, + "Function": { + "Arguments": [ + { + "Id": 107 + }, + { + "Id": 113 + } + ], + "Id": 11 + } + } + }, + { "Filter": { "Predicate": { - "Id": 107 + "Id": 114 } } }, diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23 index fd301e5418a..2658d48f60e 100644 --- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23 +++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23 @@ -30,17 +30,13 @@ "PlanNodeType": "Connection", "Plans": [ { - "Node Type": "Limit-Filter-TableFullScan", + "Node Type": "Limit-TableFullScan", "Operators": [ { "Limit": "10", "Name": "Limit" }, { - "Name": "Filter", - "Predicate": "Apply" - }, - { "Name": "TableFullScan", "ReadColumns": [ "AdvEngineID", @@ -149,6 +145,7 @@ "WindowName", "WithHash" ], + "ReadLimit": "10", "ReadRanges": [ "EventTime (-\u221e, +\u221e)", "CounterID (-\u221e, +\u221e)", @@ -159,6 +156,41 @@ "SsaProgram": { "Command": [ { + "Assign": { + "Column": { + "Id": 106 + }, + "Constant": { + "Text": "google" + } + } + }, + { + "Assign": { + "Column": { + "Id": 107 + }, + "Function": { + "Arguments": [ + { + "Id": 14 + }, + { + "Id": 106 + } + ], + "Id": 9 + } + } + }, + { + "Filter": { + "Predicate": { + "Id": 107 + } + } + }, + { "Projection": { "Columns": [ { @@ -613,6 +645,7 @@ "WindowName", "WithHash" ], + "limit": "10", "scan_by": [ "EventTime (-\u221e, +\u221e)", "CounterID (-\u221e, +\u221e)", diff --git a/ydb/tests/functional/clickbench/test.py b/ydb/tests/functional/clickbench/test.py index 843b79eaa02..1b94a6e0210 100644 --- a/ydb/tests/functional/clickbench/test.py +++ b/ydb/tests/functional/clickbench/test.py @@ -21,13 +21,15 @@ def run_cli(argv): ) -def get_queries(filename): +def get_queries(filename, isColumnStore): path = os.path.join(yatest.common.source_path("ydb/tests/functional/clickbench"), filename) with open(path, "r") as r: data = r.read() for query in data.split('\n'): if not query: continue + if isColumnStore: + query = "PRAGMA AnsiLike;\n" + query yield query @@ -107,7 +109,10 @@ def test_run_benchmark(store): # just validating that benchmark can be executed successfully on this data. out_fpath = os.path.join(yatest.common.output_path(), 'click_bench.{}.results'.format(store)) - ret = run_cli(["workload", "clickbench", "run", "--output", out_fpath, "--table", path]) + querySettings = "" + if store == "column": + querySettings = "PRAGMA AnsiLike;" + ret = run_cli(["workload", "clickbench", "run", "--output", out_fpath, "--table", path, "--query-settings", querySettings]) assert_that(ret.exit_code, is_(0)) @@ -135,7 +140,7 @@ def test_run_determentistic(store): driver.wait(5) final_results = {} - for query_id, query in enumerate(get_queries("data/queries-deterministic.sql")): + for query_id, query in enumerate(get_queries("data/queries-deterministic.sql", store == "column")): results_to_canonize = execute_scan_query(driver, query, "`/local/clickbench/determentistic/{}/hits`".format(store)) key = "queries-deterministic-results-%s" % str(query_id) final_results[key] = save_canonical_data(results_to_canonize, key) @@ -160,7 +165,7 @@ def test_plans(store): final_results = {} - for query_id, query in enumerate(get_queries("data/queries-original.sql")): + for query_id, query in enumerate(get_queries("data/queries-original.sql", store == "column")): plan = explain_scan_query(driver, query, "`/local/clickbench/plans/{}/hits`".format(store)) key = "queries-original-plan-{}-{}".format(store, str(query_id)) final_results[key] = save_canonical_data(plan, key) |
