diff options
author | chertus <azuikov@ydb.tech> | 2023-03-14 11:34:19 +0300 |
---|---|---|
committer | chertus <azuikov@ydb.tech> | 2023-03-14 11:34:19 +0300 |
commit | 9b9393c30c6fd29ba2bb4a0ba0ef2dd7ce0e1e72 (patch) | |
tree | b44d84999048c54905506b2f99a84f2cf0dbccda | |
parent | 905e6db573bae8dca6b071c468c29b8d3cd7a242 (diff) | |
download | ydb-9b9393c30c6fd29ba2bb4a0ba0ef2dd7ce0e1e72.tar.gz |
fix and tests for LIKEs in SSA
-rw-r--r-- | ydb/core/formats/program.cpp | 8 | ||||
-rw-r--r-- | ydb/core/formats/ut_program_step.cpp | 59 | ||||
-rw-r--r-- | ydb/core/tx/columnshard/columnshard_common.cpp | 47 | ||||
-rw-r--r-- | ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp | 121 |
4 files changed, 223 insertions, 12 deletions
diff --git a/ydb/core/formats/program.cpp b/ydb/core/formats/program.cpp index 23ea3d120bd..420955a610c 100644 --- a/ydb/core/formats/program.cpp +++ b/ydb/core/formats/program.cpp @@ -192,10 +192,6 @@ EOperation ValidateOperation(EOperation op, ui32 argsSize) { case EOperation::LessEqual: case EOperation::Greater: case EOperation::GreaterEqual: - case EOperation::MatchSubstring: - case EOperation::MatchLike: - case EOperation::StartsWith: - case EOperation::EndsWith: case EOperation::And: case EOperation::Or: case EOperation::Xor: @@ -237,6 +233,10 @@ EOperation ValidateOperation(EOperation op, ui32 argsSize) { case EOperation::Invert: case EOperation::Abs: case EOperation::Negate: + case EOperation::StartsWith: + case EOperation::EndsWith: + case EOperation::MatchSubstring: + case EOperation::MatchLike: if (argsSize == 1) { return op; } diff --git a/ydb/core/formats/ut_program_step.cpp b/ydb/core/formats/ut_program_step.cpp index f89b9f0f7a3..1f2ae885214 100644 --- a/ydb/core/formats/ut_program_step.cpp +++ b/ydb/core/formats/ut_program_step.cpp @@ -56,6 +56,38 @@ size_t FilterTestUnary(std::vector<std::shared_ptr<arrow::Array>> args, EOperati return batch->num_rows(); } +std::vector<bool> LikeTest(const std::vector<std::string>& data, + EOperation op, const std::string& pattern, bool ignoreCase = false) +{ + auto schema = std::make_shared<arrow::Schema>(std::vector{ + std::make_shared<arrow::Field>("x", arrow::utf8())}); + arrow::StringBuilder sb; + sb.AppendValues(data).ok(); + auto batch = arrow::RecordBatch::Make(schema, data.size(), {*sb.Finish()}); + UNIT_ASSERT(batch->ValidateFull().ok()); + + auto step = std::make_shared<TProgramStep>(); + step->Assignes = { + TAssign("res", op, {"x"}, std::make_shared<arrow::compute::MatchSubstringOptions>(pattern, ignoreCase)) + }; + step->Projection = {"res"}; + auto status = ApplyProgram(batch, TProgram({step}), GetCustomExecContext()); + if (!status.ok()) { + Cerr << status.ToString() << "\n"; + } + UNIT_ASSERT(status.ok()); + UNIT_ASSERT(batch->ValidateFull().ok()); + UNIT_ASSERT_VALUES_EQUAL(batch->num_columns(), 1); + + auto& resColumn = static_cast<const arrow::BooleanArray&>(*batch->GetColumnByName("res")); + std::vector<bool> vec; + for (int i = 0; i < resColumn.length(); ++i) { + UNIT_ASSERT(!resColumn.IsNull(i)); // TODO + vec.push_back(resColumn.Value(i)); + } + return vec; +} + enum class ETest { DEFAULT, EMPTY, @@ -361,6 +393,33 @@ Y_UNIT_TEST_SUITE(ProgramStep) { UNIT_ASSERT(FilterTestUnary({x, z->make_array()}, EOperation::Invert, EOperation::Equal) == 3); } + Y_UNIT_TEST(StartsWith) { + std::vector<bool> res = LikeTest({"aa", "abaaba", "baa", ""}, EOperation::StartsWith, "aa"); + UNIT_ASSERT_VALUES_EQUAL(res.size(), 4); + UNIT_ASSERT_VALUES_EQUAL(res[0], true); + UNIT_ASSERT_VALUES_EQUAL(res[1], false); + UNIT_ASSERT_VALUES_EQUAL(res[2], false); + UNIT_ASSERT_VALUES_EQUAL(res[3], false); + } + + Y_UNIT_TEST(EndsWith) { + std::vector<bool> res = LikeTest({"aa", "abaaba", "baa", ""}, EOperation::EndsWith, "aa"); + UNIT_ASSERT_VALUES_EQUAL(res.size(), 4); + UNIT_ASSERT_VALUES_EQUAL(res[0], true); + UNIT_ASSERT_VALUES_EQUAL(res[1], false); + UNIT_ASSERT_VALUES_EQUAL(res[2], true); + UNIT_ASSERT_VALUES_EQUAL(res[3], false); + } + + Y_UNIT_TEST(MatchSubstring) { + std::vector<bool> res = LikeTest({"aa", "abaaba", "baa", ""}, EOperation::MatchSubstring, "aa"); + UNIT_ASSERT_VALUES_EQUAL(res.size(), 4); + UNIT_ASSERT_VALUES_EQUAL(res[0], true); + UNIT_ASSERT_VALUES_EQUAL(res[1], true); + UNIT_ASSERT_VALUES_EQUAL(res[2], true); + UNIT_ASSERT_VALUES_EQUAL(res[3], false); + } + Y_UNIT_TEST(ScalarTest) { auto schema = std::make_shared<arrow::Schema>(std::vector{ std::make_shared<arrow::Field>("x", arrow::int64()), diff --git a/ydb/core/tx/columnshard/columnshard_common.cpp b/ydb/core/tx/columnshard/columnshard_common.cpp index d1a15b9b028..ff7ab88e9a5 100644 --- a/ydb/core/tx/columnshard/columnshard_common.cpp +++ b/ydb/core/tx/columnshard/columnshard_common.cpp @@ -46,6 +46,7 @@ TString FromCells(const TConstArrayRef<TCell>& cells, const TVector<std::pair<TS struct TContext { const IColumnResolver& ColumnResolver; mutable THashMap<ui32, TString> Sources; + mutable THashMap<TString, std::shared_ptr<arrow::Scalar>> Constants; explicit TContext(const IColumnResolver& columnResolver) : ColumnResolver(columnResolver) @@ -89,6 +90,19 @@ TAssign MakeFunction(const TContext& info, const std::string& name, return castOpts; }; + auto mkLikeOptions = [&]() { + if (arguments.size() != 2 || !info.Constants.count(arguments[1])) { + return std::shared_ptr<arrow::compute::MatchSubstringOptions>(); + } + auto patternScalar = info.Constants[arguments[1]]; + if (!arrow::is_base_binary_like(patternScalar->type->id())) { + return std::shared_ptr<arrow::compute::MatchSubstringOptions>(); + } + arguments.resize(1); + auto& pattern = static_cast<arrow::BaseBinaryScalar&>(*patternScalar).value; + return std::make_shared<arrow::compute::MatchSubstringOptions>(pattern->ToString()); // TODO: case-insensitive + }; + switch (func.GetId()) { case TId::FUNC_CMP_EQUAL: return TAssign(name, EOperation::Equal, std::move(arguments)); @@ -106,14 +120,30 @@ TAssign MakeFunction(const TContext& info, const std::string& name, return TAssign(name, EOperation::IsNull, std::move(arguments)); case TId::FUNC_STR_LENGTH: return TAssign(name, EOperation::BinaryLength, std::move(arguments)); - case TId::FUNC_STR_MATCH: - return TAssign(name, EOperation::MatchSubstring, std::move(arguments)); - case TId::FUNC_STR_MATCH_LIKE: - return TAssign(name, EOperation::MatchLike, std::move(arguments)); - case TId::FUNC_STR_STARTS_WITH: - return TAssign(name, EOperation::StartsWith, std::move(arguments)); - case TId::FUNC_STR_ENDS_WITH: - return TAssign(name, EOperation::EndsWith, std::move(arguments)); + case TId::FUNC_STR_MATCH: { + if (auto opts = mkLikeOptions()) { + return TAssign(name, EOperation::MatchSubstring, std::move(arguments), opts); + } + break; + } + case TId::FUNC_STR_MATCH_LIKE: { + if (auto opts = mkLikeOptions()) { + return TAssign(name, EOperation::MatchLike, std::move(arguments), opts); + } + break; + } + case TId::FUNC_STR_STARTS_WITH: { + if (auto opts = mkLikeOptions()) { + return TAssign(name, EOperation::StartsWith, std::move(arguments), opts); + } + break; + } + case TId::FUNC_STR_ENDS_WITH: { + if (auto opts = mkLikeOptions()) { + return TAssign(name, EOperation::EndsWith, std::move(arguments), opts); + } + break; + } case TId::FUNC_BINARY_NOT: return TAssign(name, EOperation::Invert, std::move(arguments)); case TId::FUNC_BINARY_AND: @@ -283,6 +313,7 @@ bool ExtractAssign(const TContext& info, NSsa::TProgramStep& step, const NKikimr if (!cnst.IsConstant()) { return false; } + info.Constants[columnName] = cnst.GetConstant(); step.Assignes.emplace_back(std::move(cnst)); break; } diff --git a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp index af33c4f1a84..bc226e43a4a 100644 --- a/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp +++ b/ydb/core/tx/columnshard/ut_rw/ut_columnshard_read_write.cpp @@ -1164,6 +1164,34 @@ static NKikimrSSA::TProgram MakeSelect(TAssignment::EFunction compareId = TAssig return ssa; } +// SELECT level, timestamp FROM t WHERE likeFunc(timestamp, pattern) +// FUNC_STR_MATCH, FUNC_STR_STARTS_WITH, FUNC_STR_ENDS_WITH +static NKikimrSSA::TProgram MakeSelectLike(TAssignment::EFunction likeId, const TString& pattern) { + NKikimrSSA::TProgram ssa; + + std::vector<ui32> columnIds = {6}; // message + + auto* line1 = ssa.AddCommand(); + auto* l1_assign = line1->MutableAssign(); + l1_assign->MutableColumn()->SetId(100); + l1_assign->MutableConstant()->SetText(pattern); + + auto* line2 = ssa.AddCommand(); + auto* l2_assign = line2->MutableAssign(); + l2_assign->MutableColumn()->SetId(101); + auto* l2_func = l2_assign->MutableFunction(); + l2_func->SetId(likeId); + l2_func->AddArguments()->SetId(columnIds[0]); + l2_func->AddArguments()->SetId(100); + + auto* line3 = ssa.AddCommand(); + line3->MutableFilter()->MutablePredicate()->SetId(101); + + auto* line4 = ssa.AddCommand(); + line4->MutableProjection()->AddColumns()->SetId(columnIds[0]); + return ssa; +} + // SELECT min(x), max(x), some(x), count(x) FROM t [GROUP BY key[0], key[1], ...] NKikimrSSA::TProgram MakeSelectAggregates(ui32 columnId, const std::vector<ui32>& keys = {}, bool addProjection = true) @@ -1408,6 +1436,95 @@ void TestReadWithProgram(const TestTableDescription& table = {}) } } +void TestReadWithProgramLike(const TestTableDescription& table = {}) { + TTestBasicRuntime runtime; + TTester::Setup(runtime); + + TActorId sender = runtime.AllocateEdgeActor(); + CreateTestBootstrapper(runtime, + CreateTestTabletInfo(TTestTxConfig::TxTablet0, TTabletTypes::ColumnShard), &CreateColumnShard); + + TDispatchOptions options; + options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvTablet::EvBoot)); + runtime.DispatchEvents(options); + + ui64 metaShard = TTestTxConfig::TxTablet1; + ui64 writeId = 0; + ui64 tableId = 1; + ui64 planStep = 100; + ui64 txId = 100; + + SetupSchema(runtime, sender, tableId, table.Schema); + + { // write some data + bool ok = WriteData(runtime, sender, metaShard, writeId, tableId, MakeTestBlob({0, 100}, table.Schema)); + UNIT_ASSERT(ok); + + ProposeCommit(runtime, sender, metaShard, txId, {writeId}); + PlanCommit(runtime, sender, planStep, txId); + } + + TString pattern = "1"; + std::vector<NKikimrSSA::TProgram> ssas = { + MakeSelectLike(TAssignment::FUNC_STR_MATCH, pattern), + MakeSelectLike(TAssignment::FUNC_STR_STARTS_WITH, pattern), + MakeSelectLike(TAssignment::FUNC_STR_ENDS_WITH, pattern) + }; + + ui32 i = 0; + for (auto& ssa : ssas) { + TString programText; + { + TString serialized; + UNIT_ASSERT(ssa.SerializeToString(&serialized)); + NKikimrSSA::TOlapProgram program; + program.SetProgram(serialized); + UNIT_ASSERT(program.SerializeToString(&programText)); + } + + auto* readEvent = new TEvColumnShard::TEvRead(sender, metaShard, planStep, txId, tableId); + auto& readProto = Proto(readEvent); + + readProto.SetOlapProgramType(::NKikimrSchemeOp::EOlapProgramType::OLAP_PROGRAM_SSA_PROGRAM_WITH_PARAMETERS); + readProto.SetOlapProgram(programText); + + ForwardToTablet(runtime, TTestTxConfig::TxTablet0, sender, readEvent); + + TAutoPtr<IEventHandle> handle; + auto result = runtime.GrabEdgeEvent<TEvColumnShard::TEvReadResult>(handle); + UNIT_ASSERT(result); + + auto& resRead = Proto(result); + UNIT_ASSERT_EQUAL(resRead.GetOrigin(), TTestTxConfig::TxTablet0); + UNIT_ASSERT_EQUAL(resRead.GetTxInitiator(), metaShard); + { + UNIT_ASSERT_EQUAL(resRead.GetStatus(), NKikimrTxColumnShard::EResultStatus::SUCCESS); + UNIT_ASSERT_EQUAL(resRead.GetBatch(), 0); + UNIT_ASSERT_EQUAL(resRead.GetFinished(), true); + UNIT_ASSERT(resRead.GetData().size() > 0); + + auto& meta = resRead.GetMeta(); + //auto& schema = meta.GetSchema(); + TString readData = resRead.GetData(); + + switch (i) { + case 0: + UNIT_ASSERT(CheckColumns(readData, meta, {"message"}, 19)); + break; + case 1: + UNIT_ASSERT(CheckColumns(readData, meta, {"message"}, 11)); + break; + case 2: + UNIT_ASSERT(CheckColumns(readData, meta, {"message"}, 10)); + break; + default: + break; + } + } + ++i; + } +} + void TestSomePrograms(const TestTableDescription& table) { TTestBasicRuntime runtime; TTester::Setup(runtime); @@ -1785,6 +1902,10 @@ Y_UNIT_TEST_SUITE(TColumnShardTestReadWrite) { TestReadWithProgram(); } + Y_UNIT_TEST(ReadWithProgramLike) { + TestReadWithProgramLike(); + } + Y_UNIT_TEST(ReadSomePrograms) { TestTableDescription table; table.Schema = { |