aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorzverevgeny <zverevgeny@ydb.tech>2023-10-18 16:28:34 +0300
committerzverevgeny <zverevgeny@ydb.tech>2023-10-18 17:26:46 +0300
commitbf423fc3881e747996a364ef1766065204484578 (patch)
treeb4b8eb5b40e16fa9da4f7c4ab8f83ab80d020d7d
parentcaec94c7e0365f0c519af6ede6427f6079f7fbf2 (diff)
downloadydb-bf423fc3881e747996a364ef1766065204484578.tar.gz
YQL-16869 ut for Kleene star in match_recognize
-rw-r--r--ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h4
-rw-r--r--ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp175
2 files changed, 179 insertions, 0 deletions
diff --git a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h
index a93faa517c..1de9d34764 100644
--- a/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h
+++ b/ydb/library/yql/minikql/comp_nodes/mkql_match_recognize_nfa.h
@@ -194,6 +194,10 @@ public:
return std::nullopt;
}
+ size_t GetActiveStatesCount() const {
+ return ActiveStates.size();
+ }
+
private:
//TODO (zverevgeny): Consider to change to std::vector for the sake of perf
using TStateSet = std::set<TState>;
diff --git a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp
index f9b96bc88c..7b541599df 100644
--- a/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp
+++ b/ydb/library/yql/minikql/comp_nodes/ut/mkql_match_recognize_nfa_ut.cpp
@@ -244,6 +244,181 @@ Y_UNIT_TEST_SUITE(MatchRecognizeNfa) {
UNIT_ASSERT_VALUES_EQUAL(1, setup.GetMatchedCount());
}
}
+
+ //Match every contiguous subset (n*n of input size)
+ //Pattern: Any*
+ //Input any event matches Any
+ Y_UNIT_TEST(AnyStar) {
+ TScopedAlloc alloc(__LOCATION__);
+ THolderFactory holderFactory(alloc.Ref(), memUsage);
+ //"Any*"
+ const TRowPattern pattern{{
+ TRowPatternFactor{"A", 1, 1000000000, false, false},
+ }};
+ TNfaSetup setup{pattern};
+ auto& defineA = setup.Defines.at(0);
+ auto& ctx = setup.Ctx();
+ defineA->SetValue(ctx, NUdf::TUnboxedValuePod{true});
+ TSparseList list;
+ const size_t inputSize = 100;
+ size_t totalMatches = 0;
+ for (size_t i = 0; i != inputSize; ++i) {
+ setup.Nfa.ProcessRow(list.Append(NUdf::TUnboxedValue{}), ctx);
+ const auto matches = setup.GetMatchedCount();
+ totalMatches += matches;
+ UNIT_ASSERT_VALUES_EQUAL(i + 1, matches);
+ UNIT_ASSERT_VALUES_EQUAL(i + 1, setup.Nfa.GetActiveStatesCount());
+ }
+ UNIT_ASSERT_VALUES_EQUAL(inputSize * (inputSize + 1) / 2, totalMatches);
+ }
+
+ //Pattern: A*
+ //Input: intermittent series events that match A
+ Y_UNIT_TEST(AStar) {
+ TScopedAlloc alloc(__LOCATION__);
+ THolderFactory holderFactory(alloc.Ref(), memUsage);
+ //"A*"
+ const TRowPattern pattern{{
+ TRowPatternFactor{"A", 1, 1000000000, false, false},
+ }};
+ TNfaSetup setup{pattern};
+ auto& defineA = setup.Defines.at(0);
+ auto& ctx = setup.Ctx();
+ TSparseList list;
+ const size_t inputSize = 100;
+ const size_t seriesPeriod = 10;
+ const size_t seriesLength = 3;
+ size_t totalMatches = 0;
+ //Intermittent series of matched events
+ for (size_t i = 0; i != inputSize; ++i) {
+ defineA->SetValue(ctx, NUdf::TUnboxedValuePod{i % seriesPeriod < seriesLength});
+ setup.Nfa.ProcessRow(list.Append(NUdf::TUnboxedValue{}), ctx);
+ const auto matches = setup.GetMatchedCount();
+ totalMatches += matches;
+ if (i % seriesPeriod < seriesLength) {
+ UNIT_ASSERT_VALUES_EQUAL(i % seriesPeriod + 1, matches);
+ UNIT_ASSERT(setup.Nfa.GetActiveStatesCount() <= 2 * seriesLength);
+ } else {
+ UNIT_ASSERT_VALUES_EQUAL(0, matches);
+ UNIT_ASSERT_VALUES_EQUAL(0, setup.Nfa.GetActiveStatesCount());
+ }
+ }
+ UNIT_ASSERT_VALUES_EQUAL(
+ (inputSize / seriesPeriod) * (seriesLength * (seriesLength + 1)) / 2,
+ totalMatches)
+ ;
+ }
+
+ //Pattern: A ANY* B
+ //Input: x x x A x x x B x x x
+ Y_UNIT_TEST(A_AnyStar_B_SingleMatch) {
+ TScopedAlloc alloc(__LOCATION__);
+ THolderFactory holderFactory(alloc.Ref(), memUsage);
+ //"A ANY* B"
+ const TRowPattern pattern{{
+ TRowPatternFactor{"A", 1, 1, false, false},
+ TRowPatternFactor{"ANY", 1, 1000000000, false, false},
+ TRowPatternFactor{"B", 1, 1, false, false},
+ }};
+ TNfaSetup setup{pattern};
+ auto& defineA = setup.Defines.at(0);
+ auto& defineAny = setup.Defines.at(1);
+ auto& defineB = setup.Defines.at(2);
+ auto& ctx = setup.Ctx();
+ TSparseList list;
+ defineAny->SetValue(ctx, NUdf::TUnboxedValuePod{true});
+ const size_t size = 100;
+ //Number of active states doesn't depend on input size for a single match
+ for (size_t i = 0; i != size; ++i) {
+ defineA->SetValue(ctx, NUdf::TUnboxedValuePod{i == 10});
+ defineB->SetValue(ctx, NUdf::TUnboxedValuePod{i == 60});
+ setup.Nfa.ProcessRow(list.Append(NUdf::TUnboxedValue{}), ctx);
+ UNIT_ASSERT_VALUES_EQUAL(i == 60 ? 1 : 0, setup.GetMatchedCount());
+ UNIT_ASSERT(setup.Nfa.GetActiveStatesCount() <= 3);
+ }
+ }
+
+ //Pattern: A ANY* B
+ //Input: x x x A x x x B x x x A x x x B ...
+ Y_UNIT_TEST(A_AnyStar_B_Series) {
+ TScopedAlloc alloc(__LOCATION__);
+ THolderFactory holderFactory(alloc.Ref(), memUsage);
+ //"A ANY* B"
+ const TRowPattern pattern{{
+ TRowPatternFactor{"A", 1, 1, false, false},
+ TRowPatternFactor{"ANY", 1, 1000000000, false, false},
+ TRowPatternFactor{"B", 1, 1, false, false},
+ }};
+ TNfaSetup setup{pattern};
+ auto& defineA = setup.Defines.at(0);
+ auto& defineAny = setup.Defines.at(1);
+ auto& defineB = setup.Defines.at(2);
+ auto& ctx = setup.Ctx();
+ TSparseList list;
+ defineAny->SetValue(ctx, NUdf::TUnboxedValuePod{true});
+ const size_t inputSize = 100;
+ const size_t seriesPeriod = 10;
+ const size_t offsetA = 2;
+ const size_t offsetB = 7;
+ size_t totalMatches = 0;
+ for (size_t i = 0; i != inputSize; ++i) {
+ defineA->SetValue(ctx, NUdf::TUnboxedValuePod{i % seriesPeriod == offsetA});
+ defineB->SetValue(ctx, NUdf::TUnboxedValuePod{i % seriesPeriod == offsetB});
+ setup.Nfa.ProcessRow(list.Append(NUdf::TUnboxedValue{}), ctx);
+ const auto expectedMatches = (i / seriesPeriod + 1);
+ if (i % seriesPeriod == offsetB) {
+ //Any matched A is a part for every subsequent B, because B matches ANY
+ const auto matches = setup.GetMatchedCount();
+ totalMatches += expectedMatches;
+ UNIT_ASSERT_VALUES_EQUAL(expectedMatches, matches);
+ } else {
+ UNIT_ASSERT_VALUES_EQUAL(0, setup.GetMatchedCount());
+ const auto expectedStates = (i / seriesPeriod + 1) * 2 + 2;
+ UNIT_ASSERT(setup.Nfa.GetActiveStatesCount() <= expectedStates);
+ }
+ }
+ const auto seriesCount = inputSize / seriesPeriod;
+ UNIT_ASSERT_VALUES_EQUAL(
+ seriesCount * (seriesCount + 1) / 2,
+ totalMatches
+ );
+ }
+
+ //Pattern: A ANY* B ANY* C
+ //Input: x x x A x x x B x x x C x x x
+ Y_UNIT_TEST(A_AnyStar_B_AnyStar_C_SingleMatch) {
+ TScopedAlloc alloc(__LOCATION__);
+ THolderFactory holderFactory(alloc.Ref(), memUsage);
+ //"A ANY* B ANY* C"
+ const TRowPattern pattern{{
+ TRowPatternFactor{"A", 1, 1, false, false},
+ TRowPatternFactor{"ANY", 1, 1000000000, false, false},
+ TRowPatternFactor{"B", 1, 1, false, false},
+ TRowPatternFactor{"ANY", 1, 1000000000, false, false},
+ TRowPatternFactor{"C", 1, 1, false, false},
+ }};
+ TNfaSetup setup{pattern};
+ auto& defineA = setup.Defines.at(0);
+ auto& defineAny = setup.Defines.at(1);
+ auto& defineB = setup.Defines.at(2);
+ auto& defineC = setup.Defines.at(3);
+ auto& ctx = setup.Ctx();
+ TSparseList list;
+ defineAny->SetValue(ctx, NUdf::TUnboxedValuePod{true});
+ const size_t inputSize = 100;
+ const size_t seriesPeriod = 100;
+ const size_t offsetA = 20;
+ const size_t offsetB = 50;
+ const size_t offsetC = 80;
+ for (size_t i = 0; i != inputSize; ++i) {
+ defineA->SetValue(ctx, NUdf::TUnboxedValuePod{i % seriesPeriod == offsetA});
+ defineB->SetValue(ctx, NUdf::TUnboxedValuePod{i % seriesPeriod == offsetB});
+ defineC->SetValue(ctx, NUdf::TUnboxedValuePod{i % seriesPeriod == offsetC});
+ setup.Nfa.ProcessRow(list.Append(NUdf::TUnboxedValue{}), ctx);
+ const auto matches = setup.GetMatchedCount();
+ UNIT_ASSERT_VALUES_EQUAL(i == 80 ? 1: 0, matches);
+ }
+ }
}
} //namespace NKikimr::NMiniKQL::NMatchRecognize