aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: chertus <azuikov@ydb.tech> 2023-03-14 11:34:19 +0300
committer: chertus <azuikov@ydb.tech> 2023-03-14 11:34:19 +0300
commit: 9b9393c30c6fd29ba2bb4a0ba0ef2dd7ce0e1e72 (patch)
tree: b44d84999048c54905506b2f99a84f2cf0dbccda
parent: 905e6db573bae8dca6b071c468c29b8d3cd7a242 (diff)
download: ydb-9b9393c30c6fd29ba2bb4a0ba0ef2dd7ce0e1e72.tar.gz
fix and tests for LIKEs in SSA
-rw-r--r-- ydb/core/formats/program.cpp 8
-rw-r--r-- ydb/core/formats/ut_program_step.cpp 59
-rw-r--r-- ydb/core/tx/columnshard/columnshard_common.cpp 47
-rw-r--r-- ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp 121
4 files changed, 223 insertions, 12 deletions
diff --git a/ydb/core/formats/program.cpp b/ydb/core/formats/program.cpp
index 23ea3d120bd..420955a610c 100644
--- a/ydb/core/formats/program.cpp
+++ b/ydb/core/formats/program.cpp
@@ -192,10 +192,6 @@ EOperation ValidateOperation(EOperation op, ui32 argsSize) {
case EOperation::LessEqual:
case EOperation::Greater:
case EOperation::GreaterEqual:
- case EOperation::MatchSubstring:
- case EOperation::MatchLike:
- case EOperation::StartsWith:
- case EOperation::EndsWith:
case EOperation::And:
case EOperation::Or:
case EOperation::Xor:
@@ -237,6 +233,10 @@ EOperation ValidateOperation(EOperation op, ui32 argsSize) {
case EOperation::Invert:
case EOperation::Abs:
case EOperation::Negate:
+ case EOperation::StartsWith:
+ case EOperation::EndsWith:
+ case EOperation::MatchSubstring:
+ case EOperation::MatchLike:
if (argsSize == 1) {
return op;
}
diff --git a/ydb/core/formats/ut_program_step.cpp b/ydb/core/formats/ut_program_step.cpp
index f89b9f0f7a3..1f2ae885214 100644
--- a/ydb/core/formats/ut_program_step.cpp
+++ b/ydb/core/formats/ut_program_step.cpp
@@ -56,6 +56,38 @@ size_t FilterTestUnary(std::vector<std::shared_ptr<arrow::Array>> args, EOperati
return batch->num_rows();
}
+// Test helper: builds a one-column utf8 batch "x" from `data`, runs the single-assignment
+// program `res = op(x)` with MatchSubstringOptions(pattern, ignoreCase) and returns the
+// boolean "res" column as a vector with one element per input row.
+// Any failure (building the batch, running the program, unexpected result column) fails
+// the current unit test through UNIT_ASSERT.
+std::vector<bool> LikeTest(const std::vector<std::string>& data,
+                           EOperation op, const std::string& pattern, bool ignoreCase = false)
+{
+    auto schema = std::make_shared<arrow::Schema>(std::vector{
+        std::make_shared<arrow::Field>("x", arrow::utf8())});
+    arrow::StringBuilder sb;
+    // Do not drop builder errors: a failed append or finish would otherwise surface later
+    // as a confusing batch validation failure or a dereference of an errored Result.
+    UNIT_ASSERT(sb.AppendValues(data).ok());
+    auto column = sb.Finish();
+    UNIT_ASSERT(column.ok());
+    auto batch = arrow::RecordBatch::Make(schema, data.size(), {*column});
+    UNIT_ASSERT(batch->ValidateFull().ok());
+
+    auto step = std::make_shared<TProgramStep>();
+    step->Assignes = {
+        TAssign("res", op, {"x"}, std::make_shared<arrow::compute::MatchSubstringOptions>(pattern, ignoreCase))
+    };
+    step->Projection = {"res"};
+    auto status = ApplyProgram(batch, TProgram({step}), GetCustomExecContext());
+    if (!status.ok()) {
+        Cerr << status.ToString() << "\n";
+    }
+    UNIT_ASSERT(status.ok());
+    UNIT_ASSERT(batch->ValidateFull().ok());
+    UNIT_ASSERT_VALUES_EQUAL(batch->num_columns(), 1);
+
+    // Check the column exists and is boolean before the unchecked downcast below.
+    auto resArray = batch->GetColumnByName("res");
+    UNIT_ASSERT(resArray);
+    UNIT_ASSERT(resArray->type_id() == arrow::Type::BOOL);
+    auto& resColumn = static_cast<const arrow::BooleanArray&>(*resArray);
+
+    std::vector<bool> vec;
+    vec.reserve(resColumn.length());
+    for (int64_t i = 0; i < resColumn.length(); ++i) {
+        UNIT_ASSERT(!resColumn.IsNull(i)); // TODO: null results are not handled by this helper
+        vec.push_back(resColumn.Value(i));
+    }
+    return vec;
+}
+
enum class ETest {
DEFAULT,
EMPTY,
@@ -361,6 +393,33 @@ Y_UNIT_TEST_SUITE(ProgramStep) {
UNIT_ASSERT(FilterTestUnary({x, z->make_array()}, EOperation::Invert, EOperation::Equal) == 3);
}
+    // StartsWith("aa"): only the row that begins with the pattern matches.
+    Y_UNIT_TEST(StartsWith) {
+        const std::vector<bool> expected = {true, false, false, false};
+        const std::vector<bool> actual = LikeTest({"aa", "abaaba", "baa", ""}, EOperation::StartsWith, "aa");
+        UNIT_ASSERT_VALUES_EQUAL(actual.size(), expected.size());
+        for (size_t row = 0; row < expected.size(); ++row) {
+            UNIT_ASSERT_VALUES_EQUAL(actual[row], expected[row]);
+        }
+    }
+
+    // EndsWith("aa"): rows "aa" and "baa" end with the pattern.
+    Y_UNIT_TEST(EndsWith) {
+        const std::vector<bool> expected = {true, false, true, false};
+        const std::vector<bool> actual = LikeTest({"aa", "abaaba", "baa", ""}, EOperation::EndsWith, "aa");
+        UNIT_ASSERT_VALUES_EQUAL(actual.size(), expected.size());
+        for (size_t row = 0; row < expected.size(); ++row) {
+            UNIT_ASSERT_VALUES_EQUAL(actual[row], expected[row]);
+        }
+    }
+
+    // MatchSubstring("aa"): every non-empty row contains the pattern somewhere.
+    Y_UNIT_TEST(MatchSubstring) {
+        const std::vector<bool> expected = {true, true, true, false};
+        const std::vector<bool> actual = LikeTest({"aa", "abaaba", "baa", ""}, EOperation::MatchSubstring, "aa");
+        UNIT_ASSERT_VALUES_EQUAL(actual.size(), expected.size());
+        for (size_t row = 0; row < expected.size(); ++row) {
+            UNIT_ASSERT_VALUES_EQUAL(actual[row], expected[row]);
+        }
+    }
+
Y_UNIT_TEST(ScalarTest) {
auto schema = std::make_shared<arrow::Schema>(std::vector{
std::make_shared<arrow::Field>("x", arrow::int64()),
diff --git a/ydb/core/tx/columnshard/columnshard_common.cpp b/ydb/core/tx/columnshard/columnshard_common.cpp
index d1a15b9b028..ff7ab88e9a5 100644
--- a/ydb/core/tx/columnshard/columnshard_common.cpp
+++ b/ydb/core/tx/columnshard/columnshard_common.cpp
@@ -46,6 +46,7 @@ TString FromCells(const TConstArrayRef<TCell>& cells, const TVector<std::pair<TS
struct TContext {
const IColumnResolver& ColumnResolver;
mutable THashMap<ui32, TString> Sources;
+ mutable THashMap<TString, std::shared_ptr<arrow::Scalar>> Constants;
explicit TContext(const IColumnResolver& columnResolver)
: ColumnResolver(columnResolver)
@@ -89,6 +90,19 @@ TAssign MakeFunction(const TContext& info, const std::string& name,
return castOpts;
};
+ auto mkLikeOptions = [&]() {
+ if (arguments.size() != 2 || !info.Constants.count(arguments[1])) {
+ return std::shared_ptr<arrow::compute::MatchSubstringOptions>();
+ }
+ auto patternScalar = info.Constants[arguments[1]];
+ if (!arrow::is_base_binary_like(patternScalar->type->id())) {
+ return std::shared_ptr<arrow::compute::MatchSubstringOptions>();
+ }
+ arguments.resize(1);
+ auto& pattern = static_cast<arrow::BaseBinaryScalar&>(*patternScalar).value;
+ return std::make_shared<arrow::compute::MatchSubstringOptions>(pattern->ToString()); // TODO: case-insensitive
+ };
+
switch (func.GetId()) {
case TId::FUNC_CMP_EQUAL:
return TAssign(name, EOperation::Equal, std::move(arguments));
@@ -106,14 +120,30 @@ TAssign MakeFunction(const TContext& info, const std::string& name,
return TAssign(name, EOperation::IsNull, std::move(arguments));
case TId::FUNC_STR_LENGTH:
return TAssign(name, EOperation::BinaryLength, std::move(arguments));
- case TId::FUNC_STR_MATCH:
- return TAssign(name, EOperation::MatchSubstring, std::move(arguments));
- case TId::FUNC_STR_MATCH_LIKE:
- return TAssign(name, EOperation::MatchLike, std::move(arguments));
- case TId::FUNC_STR_STARTS_WITH:
- return TAssign(name, EOperation::StartsWith, std::move(arguments));
- case TId::FUNC_STR_ENDS_WITH:
- return TAssign(name, EOperation::EndsWith, std::move(arguments));
+ case TId::FUNC_STR_MATCH: {
+ if (auto opts = mkLikeOptions()) {
+ return TAssign(name, EOperation::MatchSubstring, std::move(arguments), opts);
+ }
+ break;
+ }
+ case TId::FUNC_STR_MATCH_LIKE: {
+ if (auto opts = mkLikeOptions()) {
+ return TAssign(name, EOperation::MatchLike, std::move(arguments), opts);
+ }
+ break;
+ }
+ case TId::FUNC_STR_STARTS_WITH: {
+ if (auto opts = mkLikeOptions()) {
+ return TAssign(name, EOperation::StartsWith, std::move(arguments), opts);
+ }
+ break;
+ }
+ case TId::FUNC_STR_ENDS_WITH: {
+ if (auto opts = mkLikeOptions()) {
+ return TAssign(name, EOperation::EndsWith, std::move(arguments), opts);
+ }
+ break;
+ }
case TId::FUNC_BINARY_NOT:
return TAssign(name, EOperation::Invert, std::move(arguments));
case TId::FUNC_BINARY_AND:
@@ -283,6 +313,7 @@ bool ExtractAssign(const TContext& info, NSsa::TProgramStep& step, const NKikimr
if (!cnst.IsConstant()) {
return false;
}
+ info.Constants[columnName] = cnst.GetConstant();
step.Assignes.emplace_back(std::move(cnst));
break;
}
diff --git a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp
index af33c4f1a84..bc226e43a4a 100644
--- a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp
+++ b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp
@@ -1164,6 +1164,34 @@ static NKikimrSSA::TProgram MakeSelect(TAssignment::EFunction compareId = TAssig
return ssa;
}
+// SELECT message FROM t WHERE likeFunc(message, pattern)
+// likeId: FUNC_STR_MATCH, FUNC_STR_STARTS_WITH, FUNC_STR_ENDS_WITH
+//
+// The program has four commands:
+//   1. assign the text constant `pattern` to a temporary column;
+//   2. assign likeId(message, <pattern column>) to a second temporary column;
+//   3. filter by that second column;
+//   4. project the message column.
+static NKikimrSSA::TProgram MakeSelectLike(TAssignment::EFunction likeId, const TString& pattern) {
+    const ui32 messageColumnId = 6;     // message
+    const ui32 patternColumnId = 100;   // temporary column holding the pattern constant
+    const ui32 predicateColumnId = 101; // temporary column holding the function result
+
+    NKikimrSSA::TProgram ssa;
+
+    auto* patternAssign = ssa.AddCommand()->MutableAssign();
+    patternAssign->MutableColumn()->SetId(patternColumnId);
+    patternAssign->MutableConstant()->SetText(pattern);
+
+    auto* predicateAssign = ssa.AddCommand()->MutableAssign();
+    predicateAssign->MutableColumn()->SetId(predicateColumnId);
+    auto* likeFunc = predicateAssign->MutableFunction();
+    likeFunc->SetId(likeId);
+    likeFunc->AddArguments()->SetId(messageColumnId);
+    likeFunc->AddArguments()->SetId(patternColumnId);
+
+    ssa.AddCommand()->MutableFilter()->MutablePredicate()->SetId(predicateColumnId);
+
+    ssa.AddCommand()->MutableProjection()->AddColumns()->SetId(messageColumnId);
+    return ssa;
+}
+
// SELECT min(x), max(x), some(x), count(x) FROM t [GROUP BY key[0], key[1], ...]
NKikimrSSA::TProgram MakeSelectAggregates(ui32 columnId, const std::vector<ui32>& keys = {},
bool addProjection = true)
@@ -1408,6 +1436,95 @@ void TestReadWithProgram(const TestTableDescription& table = {})
}
}
+// Boots a column shard, writes and commits one test blob, then reads it back once per
+// LIKE-style SSA program (match, starts-with, ends-with) built for pattern "1" and checks
+// the number of rows returned in the "message" column for each program.
+void TestReadWithProgramLike(const TestTableDescription& table = {}) {
+    TTestBasicRuntime runtime;
+    TTester::Setup(runtime);
+
+    TActorId sender = runtime.AllocateEdgeActor();
+    CreateTestBootstrapper(runtime,
+        CreateTestTabletInfo(TTestTxConfig::TxTablet0, TTabletTypes::ColumnShard), &CreateColumnShard);
+
+    // Wait until the tablet has booted.
+    TDispatchOptions bootOptions;
+    bootOptions.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvTablet::EvBoot));
+    runtime.DispatchEvents(bootOptions);
+
+    ui64 metaShard = TTestTxConfig::TxTablet1;
+    ui64 writeId = 0;
+    ui64 tableId = 1;
+    ui64 planStep = 100;
+    ui64 txId = 100;
+
+    SetupSchema(runtime, sender, tableId, table.Schema);
+
+    // Write some data and commit it.
+    {
+        bool written = WriteData(runtime, sender, metaShard, writeId, tableId, MakeTestBlob({0, 100}, table.Schema));
+        UNIT_ASSERT(written);
+
+        ProposeCommit(runtime, sender, metaShard, txId, {writeId});
+        PlanCommit(runtime, sender, planStep, txId);
+    }
+
+    // Each program is paired with the row count expected in its result.
+    // NOTE(review): the counts presumably follow from the message values produced by
+    // MakeTestBlob({0, 100}, ...) - confirm against that helper before changing them.
+    TString pattern = "1";
+    const std::vector<std::pair<NKikimrSSA::TProgram, ui32>> cases = {
+        {MakeSelectLike(TAssignment::FUNC_STR_MATCH, pattern), 19},
+        {MakeSelectLike(TAssignment::FUNC_STR_STARTS_WITH, pattern), 11},
+        {MakeSelectLike(TAssignment::FUNC_STR_ENDS_WITH, pattern), 10}
+    };
+
+    for (const auto& [ssa, expectedRows] : cases) {
+        // The SSA program is serialized and wrapped into a serialized TOlapProgram.
+        TString programText;
+        {
+            TString serialized;
+            UNIT_ASSERT(ssa.SerializeToString(&serialized));
+            NKikimrSSA::TOlapProgram program;
+            program.SetProgram(serialized);
+            UNIT_ASSERT(program.SerializeToString(&programText));
+        }
+
+        auto* readEvent = new TEvColumnShard::TEvRead(sender, metaShard, planStep, txId, tableId);
+        auto& readProto = Proto(readEvent);
+        readProto.SetOlapProgramType(::NKikimrSchemeOp::EOlapProgramType::OLAP_PROGRAM_SSA_PROGRAM_WITH_PARAMETERS);
+        readProto.SetOlapProgram(programText);
+
+        ForwardToTablet(runtime, TTestTxConfig::TxTablet0, sender, readEvent);
+
+        TAutoPtr<IEventHandle> handle;
+        auto result = runtime.GrabEdgeEvent<TEvColumnShard::TEvReadResult>(handle);
+        UNIT_ASSERT(result);
+
+        auto& resRead = Proto(result);
+        UNIT_ASSERT_EQUAL(resRead.GetOrigin(), TTestTxConfig::TxTablet0);
+        UNIT_ASSERT_EQUAL(resRead.GetTxInitiator(), metaShard);
+
+        // The whole answer is expected in a single, final, non-empty batch.
+        UNIT_ASSERT_EQUAL(resRead.GetStatus(), NKikimrTxColumnShard::EResultStatus::SUCCESS);
+        UNIT_ASSERT_EQUAL(resRead.GetBatch(), 0);
+        UNIT_ASSERT_EQUAL(resRead.GetFinished(), true);
+        UNIT_ASSERT(resRead.GetData().size() > 0);
+
+        auto& meta = resRead.GetMeta();
+        TString readData = resRead.GetData();
+        UNIT_ASSERT(CheckColumns(readData, meta, {"message"}, expectedRows));
+    }
+}
+
void TestSomePrograms(const TestTableDescription& table) {
TTestBasicRuntime runtime;
TTester::Setup(runtime);
@@ -1785,6 +1902,10 @@ Y_UNIT_TEST_SUITE(TColumnShardTestReadWrite) {
TestReadWithProgram();
}
+    // Runs the LIKE-program read scenario against the default test table description.
+    Y_UNIT_TEST(ReadWithProgramLike) {
+        TestReadWithProgramLike(TestTableDescription{});
+    }
+
Y_UNIT_TEST(ReadSomePrograms) {
TestTableDescription table;
table.Schema = {