summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: aidarsamer <[email protected]> 2023-03-15 21:25:17 +0300
committer: aidarsamer <[email protected]> 2023-03-15 21:25:17 +0300
commit: ed5086c2dbf543b595a317eb344d87a72fa0aa94 (patch)
tree: 49864e3d598e53b9e2960aadba9953c576a0a09e
parent: 57f60f69ebe787311842d359a466d9195ab7fc8a (diff)
Implement LIKE filter pushdown to Column Shards
-rw-r--r-- ydb/core/kqp/host/kqp_host.cpp | 3
-rw-r--r-- ydb/core/kqp/host/kqp_type_ann.cpp | 5
-rw-r--r-- ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp | 6
-rw-r--r-- ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp | 28
-rw-r--r-- ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp | 6
-rw-r--r-- ydb/core/kqp/ut/olap/kqp_olap_ut.cpp | 5
-rw-r--r-- ydb/library/yql/core/expr_nodes/yql_expr_nodes.json | 5
-rw-r--r-- ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20 | 44
-rw-r--r-- ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 | 60
-rw-r--r-- ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 | 240
-rw-r--r-- ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23 | 43
-rw-r--r-- ydb/tests/functional/clickbench/test.py | 13
12 files changed, 394 insertions, 64 deletions
diff --git a/ydb/core/kqp/host/kqp_host.cpp b/ydb/core/kqp/host/kqp_host.cpp
index 4a41b209f8a..788d8645a2e 100644
--- a/ydb/core/kqp/host/kqp_host.cpp
+++ b/ydb/core/kqp/host/kqp_host.cpp
@@ -1071,11 +1071,12 @@ private:
settings.PathPrefix = tablePathPrefix;
}
settings.EndOfQueryCommit = sqlAutoCommit;
- settings.Flags.insert("DisableEmitStartsWith");
settings.Flags.insert("FlexibleTypes");
if (SessionCtx->Query().Type == EKikimrQueryType::Scan) {
// We enable EmitAggApply for aggregate pushdowns to Column Shards which are accessed by Scan query only
settings.Flags.insert("EmitAggApply");
+ } else {
+ settings.Flags.insert("DisableEmitStartsWith");
}
ui16 actualSyntaxVersion = 0;
diff --git a/ydb/core/kqp/host/kqp_type_ann.cpp b/ydb/core/kqp/host/kqp_type_ann.cpp
index d5ad48d7fa4..8f34bb57492 100644
--- a/ydb/core/kqp/host/kqp_type_ann.cpp
+++ b/ydb/core/kqp/host/kqp_type_ann.cpp
@@ -798,10 +798,11 @@ bool ValidateOlapFilterConditions(const TExprNode* node, const TStructExprType*
));
return false;
}
+ static const std::unordered_set<std::string> FilterOps = {"eq", "neq", "lt", "lte", "gt", "gte", "string_contains", "starts_with", "ends_with"};
auto opStr = op->Content();
- if (opStr != "eq"sv && opStr != "neq"sv && opStr != "lt"sv && opStr != "lte"sv && opStr != "gt"sv && opStr != "gte"sv) {
+ if (FilterOps.find(TString(opStr)) == FilterOps.end()) {
ctx.AddError(TIssue(ctx.GetPosition(node->Pos()),
- TStringBuilder() << "Expected one of eq/neq/lt/lte/gt/gte operators in OLAP comparison filter, got: " << op->Content()
+ TStringBuilder() << "Expected one of eq/neq/lt/lte/gt/gte/string_contains/starts_with/ends_with operators in OLAP comparison filter, got: " << op->Content()
));
return false;
}
diff --git a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp
index 1b102843829..9abfe14e885 100644
--- a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp
+++ b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter.cpp
@@ -136,6 +136,12 @@ TExprBase BuildOneElementComparison(const std::pair<TExprBase, TExprBase>& param
compareOperator = "gt";
} else if (predicate.Maybe<TCoCmpGreaterOrEqual>() && !forceStrictComparison) {
compareOperator = "gte";
+ } else if (predicate.Maybe<TCoCmpStringContains>()) {
+ compareOperator = "string_contains";
+ } else if (predicate.Maybe<TCoCmpStartsWith>()) {
+ compareOperator = "starts_with";
+ } else if (predicate.Maybe<TCoCmpEndsWith>()) {
+ compareOperator = "ends_with";
}
YQL_ENSURE(!compareOperator.empty(), "Unsupported comparison node: " << predicate.Ptr()->Content());
diff --git a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp
index cf2c08b5324..b9b3f5c8757 100644
--- a/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp
+++ b/ydb/core/kqp/opt/physical/kqp_opt_phy_olap_filter_collection.cpp
@@ -15,25 +15,23 @@ namespace {
bool IsSupportedPredicate(const TCoCompare& predicate) {
if (predicate.Maybe<TCoCmpEqual>()) {
return true;
- }
-
- if (predicate.Maybe<TCoCmpLess>()) {
+ } else if (predicate.Maybe<TCoCmpLess>()) {
return true;
- }
-
- if (predicate.Maybe<TCoCmpGreater>()) {
+ } else if (predicate.Maybe<TCoCmpGreater>()) {
return true;
- }
-
- if (predicate.Maybe<TCoCmpNotEqual>()) {
+ } else if (predicate.Maybe<TCoCmpNotEqual>()) {
return true;
- }
-
- if (predicate.Maybe<TCoCmpGreaterOrEqual>()) {
+ } else if (predicate.Maybe<TCoCmpGreaterOrEqual>()) {
return true;
- }
-
- if (predicate.Maybe<TCoCmpLessOrEqual>()) {
+ } else if (predicate.Maybe<TCoCmpLessOrEqual>()) {
+ return true;
+ } else if (predicate.Maybe<TCoCmpLessOrEqual>()) {
+ return true;
+ } else if (predicate.Maybe<TCoCmpStringContains>()) {
+ return true;
+ } else if (predicate.Maybe<TCoCmpStartsWith>()) {
+ return true;
+ } else if (predicate.Maybe<TCoCmpEndsWith>()) {
return true;
}
diff --git a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp
index 1234213c364..ba7e736cdb6 100644
--- a/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp
+++ b/ydb/core/kqp/query_compiler/kqp_olap_compiler.cpp
@@ -295,6 +295,12 @@ TProgram::TAssignment* CompileComparison(const TKqpOlapFilterCompare& comparison
function = TProgram::TAssignment::FUNC_CMP_GREATER;
} else if (comparison.Operator() == "gte") {
function = TProgram::TAssignment::FUNC_CMP_GREATER_EQUAL;
+ } else if (comparison.Operator() == "string_contains") {
+ function = TProgram::TAssignment::FUNC_STR_MATCH;
+ } else if (comparison.Operator() == "starts_with") {
+ function = TProgram::TAssignment::FUNC_STR_STARTS_WITH;
+ } else if (comparison.Operator() == "ends_with") {
+ function = TProgram::TAssignment::FUNC_STR_ENDS_WITH;
}
cmpFunc->SetId(function);
diff --git a/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp b/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp
index 34871b51924..579dc8076d7 100644
--- a/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp
+++ b/ydb/core/kqp/ut/olap/kqp_olap_ut.cpp
@@ -1171,6 +1171,10 @@ Y_UNIT_TEST_SUITE(KqpOlap) {
R"(CAST("2" As Int32) >= `level`)",
R"(`timestamp` >= CAST(3000001u AS Timestamp))",
R"((`timestamp`, `level`) >= (CAST(3000001u AS Timestamp), 3))",
+ R"(`uid` LIKE "%30000%")",
+ R"(`uid` LIKE "uid%")",
+ R"(`uid` LIKE "%001")",
+ R"(`uid` LIKE "uid%001")",
};
std::vector<TString> testDataNoPush = {
@@ -1211,6 +1215,7 @@ Y_UNIT_TEST_SUITE(KqpOlap) {
}
qBuilder << R"(PRAGMA Kikimr.OptEnablePredicateExtract = "false";)" << Endl;
+ qBuilder << R"(PRAGMA AnsiLike;)" << Endl;
qBuilder << "SELECT `timestamp` FROM `/Root/olapStore/olapTable` WHERE ";
qBuilder << predicate;
qBuilder << " ORDER BY `timestamp`";
diff --git a/ydb/library/yql/core/expr_nodes/yql_expr_nodes.json b/ydb/library/yql/core/expr_nodes/yql_expr_nodes.json
index 363b331e0b9..fe1e3e9d6f1 100644
--- a/ydb/library/yql/core/expr_nodes/yql_expr_nodes.json
+++ b/ydb/library/yql/core/expr_nodes/yql_expr_nodes.json
@@ -1198,6 +1198,11 @@
"Match": {"Type": "Callable", "Name": "EndsWith"}
},
{
+ "Name": "TCoCmpStringContains",
+ "Base": "TCoCompare",
+ "Match": {"Type": "Callable", "Name": "StringContains"}
+ },
+ {
"Name": "TCoInc",
"Base": "TCallable",
"Match": {"Type": "Callable", "Name": "Inc"},
diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20
index 5c14a9d69a0..f65e0d2f5c8 100644
--- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20
+++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-20
@@ -55,16 +55,12 @@
"PlanNodeType": "Connection",
"Plans": [
{
- "Node Type": "Aggregate-Filter-TableFullScan",
+ "Node Type": "Aggregate-TableFullScan",
"Operators": [
{
"Name": "Aggregate"
},
{
- "Name": "Filter",
- "Predicate": "Apply"
- },
- {
"Name": "TableFullScan",
"ReadColumns": [
"URL"
@@ -79,12 +75,38 @@
"SsaProgram": {
"Command": [
{
- "Projection": {
- "Columns": [
- {
- "Id": 14
- }
- ]
+ "Assign": {
+ "Column": {
+ "Id": 106
+ },
+ "Constant": {
+ "Text": "google"
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 107
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 14
+ },
+ {
+ "Id": 106
+ }
+ ],
+ "Id": 9
+ }
+ }
+ },
+ {
+ "Filter": {
+ "Predicate": {
+ "Id": 107
+ }
}
}
],
diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21
index f6f9f083364..a844de76086 100644
--- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21
+++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-21
@@ -35,7 +35,7 @@
{
"Limit": "10",
"Name": "TopSort",
- "TopSortBy": "$35.c"
+ "TopSortBy": "$23.c"
},
{
"Name": "Aggregate"
@@ -52,7 +52,7 @@
"PlanNodeType": "Connection",
"Plans": [
{
- "Node Type": "Aggregate-Filter-TableFullScan",
+ "Node Type": "Aggregate-TableFullScan",
"Operators": [
{
"Aggregation": "{_yql_agg_0: MIN(item.URL),_yql_agg_1: Inc(state._yql_agg_1)}",
@@ -60,10 +60,6 @@
"Name": "Aggregate"
},
{
- "Name": "Filter",
- "Predicate": "Apply"
- },
- {
"Name": "TableFullScan",
"ReadColumns": [
"SearchPhrase",
@@ -84,7 +80,7 @@
"Id": 106
},
"Constant": {
- "Text": ""
+ "Text": "google"
}
}
},
@@ -96,20 +92,66 @@
"Function": {
"Arguments": [
{
- "Id": 40
+ "Id": 14
},
{
"Id": 106
}
],
+ "Id": 9
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 108
+ },
+ "Constant": {
+ "Text": ""
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 109
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 40
+ },
+ {
+ "Id": 108
+ }
+ ],
"Id": 2
}
}
},
{
+ "Assign": {
+ "Column": {
+ "Id": 110
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 107
+ },
+ {
+ "Id": 109
+ }
+ ],
+ "Id": 11
+ }
+ }
+ },
+ {
"Filter": {
"Predicate": {
- "Id": 107
+ "Id": 110
}
}
},
diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22
index 07324fc556e..f28aae83869 100644
--- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22
+++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-22
@@ -35,7 +35,7 @@
{
"Limit": "10",
"Name": "TopSort",
- "TopSortBy": "$54.c"
+ "TopSortBy": "$40.c"
},
{
"Name": "Aggregate"
@@ -66,7 +66,7 @@
"PlanNodeType": "Connection",
"Plans": [
{
- "Node Type": "Aggregate-Filter-TableFullScan",
+ "Node Type": "Aggregate-TableFullScan",
"Operators": [
{
"Aggregation": "{_yql_agg_0: MIN(item.URL),_yql_agg_1: MIN(item.Title),_yql_agg_2: Inc(state._yql_agg_2)}",
@@ -74,10 +74,6 @@
"Name": "Aggregate"
},
{
- "Name": "Filter",
- "Predicate": "Apply And Not"
- },
- {
"Name": "TableFullScan",
"ReadColumns": [
"SearchPhrase",
@@ -100,7 +96,7 @@
"Id": 106
},
"Constant": {
- "Text": ""
+ "Text": "Google"
}
}
},
@@ -112,20 +108,127 @@
"Function": {
"Arguments": [
{
- "Id": 40
+ "Id": 3
},
{
"Id": 106
}
],
+ "Id": 9
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 108
+ },
+ "Constant": {
+ "Text": ".google."
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 109
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 14
+ },
+ {
+ "Id": 108
+ }
+ ],
+ "Id": 9
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 110
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 109
+ }
+ ],
+ "Id": 10
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 111
+ },
+ "Constant": {
+ "Text": ""
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 112
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 40
+ },
+ {
+ "Id": 111
+ }
+ ],
"Id": 2
}
}
},
{
+ "Assign": {
+ "Column": {
+ "Id": 113
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 110
+ },
+ {
+ "Id": 112
+ }
+ ],
+ "Id": 11
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 114
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 107
+ },
+ {
+ "Id": 113
+ }
+ ],
+ "Id": 11
+ }
+ }
+ },
+ {
"Filter": {
"Predicate": {
- "Id": 107
+ "Id": 114
}
}
},
@@ -192,7 +295,7 @@
"PlanNodeType": "Connection",
"Plans": [
{
- "Node Type": "Aggregate-Filter-TableFullScan",
+ "Node Type": "Aggregate-TableFullScan",
"Operators": [
{
"Aggregation": "state",
@@ -200,10 +303,6 @@
"Name": "Aggregate"
},
{
- "Name": "Filter",
- "Predicate": "Apply And Not"
- },
- {
"Name": "TableFullScan",
"ReadColumns": [
"SearchPhrase",
@@ -226,7 +325,7 @@
"Id": 106
},
"Constant": {
- "Text": ""
+ "Text": "Google"
}
}
},
@@ -238,20 +337,127 @@
"Function": {
"Arguments": [
{
- "Id": 40
+ "Id": 3
},
{
"Id": 106
}
],
+ "Id": 9
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 108
+ },
+ "Constant": {
+ "Text": ".google."
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 109
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 14
+ },
+ {
+ "Id": 108
+ }
+ ],
+ "Id": 9
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 110
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 109
+ }
+ ],
+ "Id": 10
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 111
+ },
+ "Constant": {
+ "Text": ""
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 112
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 40
+ },
+ {
+ "Id": 111
+ }
+ ],
"Id": 2
}
}
},
{
+ "Assign": {
+ "Column": {
+ "Id": 113
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 110
+ },
+ {
+ "Id": 112
+ }
+ ],
+ "Id": 11
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 114
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 107
+ },
+ {
+ "Id": 113
+ }
+ ],
+ "Id": 11
+ }
+ }
+ },
+ {
"Filter": {
"Predicate": {
- "Id": 107
+ "Id": 114
}
}
},
diff --git a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23 b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23
index fd301e5418a..2658d48f60e 100644
--- a/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23
+++ b/ydb/tests/functional/clickbench/canondata/test.test_plans_column_/queries-original-plan-column-23
@@ -30,17 +30,13 @@
"PlanNodeType": "Connection",
"Plans": [
{
- "Node Type": "Limit-Filter-TableFullScan",
+ "Node Type": "Limit-TableFullScan",
"Operators": [
{
"Limit": "10",
"Name": "Limit"
},
{
- "Name": "Filter",
- "Predicate": "Apply"
- },
- {
"Name": "TableFullScan",
"ReadColumns": [
"AdvEngineID",
@@ -149,6 +145,7 @@
"WindowName",
"WithHash"
],
+ "ReadLimit": "10",
"ReadRanges": [
"EventTime (-\u221e, +\u221e)",
"CounterID (-\u221e, +\u221e)",
@@ -159,6 +156,41 @@
"SsaProgram": {
"Command": [
{
+ "Assign": {
+ "Column": {
+ "Id": 106
+ },
+ "Constant": {
+ "Text": "google"
+ }
+ }
+ },
+ {
+ "Assign": {
+ "Column": {
+ "Id": 107
+ },
+ "Function": {
+ "Arguments": [
+ {
+ "Id": 14
+ },
+ {
+ "Id": 106
+ }
+ ],
+ "Id": 9
+ }
+ }
+ },
+ {
+ "Filter": {
+ "Predicate": {
+ "Id": 107
+ }
+ }
+ },
+ {
"Projection": {
"Columns": [
{
@@ -613,6 +645,7 @@
"WindowName",
"WithHash"
],
+ "limit": "10",
"scan_by": [
"EventTime (-\u221e, +\u221e)",
"CounterID (-\u221e, +\u221e)",
diff --git a/ydb/tests/functional/clickbench/test.py b/ydb/tests/functional/clickbench/test.py
index 843b79eaa02..1b94a6e0210 100644
--- a/ydb/tests/functional/clickbench/test.py
+++ b/ydb/tests/functional/clickbench/test.py
@@ -21,13 +21,15 @@ def run_cli(argv):
)
-def get_queries(filename):
+def get_queries(filename, isColumnStore):
path = os.path.join(yatest.common.source_path("ydb/tests/functional/clickbench"), filename)
with open(path, "r") as r:
data = r.read()
for query in data.split('\n'):
if not query:
continue
+ if isColumnStore:
+ query = "PRAGMA AnsiLike;\n" + query
yield query
@@ -107,7 +109,10 @@ def test_run_benchmark(store):
# just validating that benchmark can be executed successfully on this data.
out_fpath = os.path.join(yatest.common.output_path(), 'click_bench.{}.results'.format(store))
- ret = run_cli(["workload", "clickbench", "run", "--output", out_fpath, "--table", path])
+ querySettings = ""
+ if store == "column":
+ querySettings = "PRAGMA AnsiLike;"
+ ret = run_cli(["workload", "clickbench", "run", "--output", out_fpath, "--table", path, "--query-settings", querySettings])
assert_that(ret.exit_code, is_(0))
@@ -135,7 +140,7 @@ def test_run_determentistic(store):
driver.wait(5)
final_results = {}
- for query_id, query in enumerate(get_queries("data/queries-deterministic.sql")):
+ for query_id, query in enumerate(get_queries("data/queries-deterministic.sql", store == "column")):
results_to_canonize = execute_scan_query(driver, query, "`/local/clickbench/determentistic/{}/hits`".format(store))
key = "queries-deterministic-results-%s" % str(query_id)
final_results[key] = save_canonical_data(results_to_canonize, key)
@@ -160,7 +165,7 @@ def test_plans(store):
final_results = {}
- for query_id, query in enumerate(get_queries("data/queries-original.sql")):
+ for query_id, query in enumerate(get_queries("data/queries-original.sql", store == "column")):
plan = explain_scan_query(driver, query, "`/local/clickbench/plans/{}/hits`".format(store))
key = "queries-original-plan-{}-{}".format(store, str(query_id))
final_results[key] = save_canonical_data(plan, key)