diff options
45 files changed, 1976 insertions, 1147 deletions
diff --git a/yql/essentials/core/sql_types/match_recognize.h b/yql/essentials/core/sql_types/match_recognize.h index e142587890..f64fe5e7fa 100644 --- a/yql/essentials/core/sql_types/match_recognize.h +++ b/yql/essentials/core/sql_types/match_recognize.h @@ -33,9 +33,6 @@ enum class EOutputColumnSource { Other, }; -constexpr size_t MaxPatternNesting = 20; //Limit recursion for patterns -constexpr size_t MaxPermutedItems = 6; - //Mixin columns for calculating measures enum class EMeasureInputDataSpecialColumns { Classifier = 0, diff --git a/yql/essentials/core/type_ann/type_ann_core.cpp b/yql/essentials/core/type_ann/type_ann_core.cpp index 4ec574876f..6b87ac4343 100644 --- a/yql/essentials/core/type_ann/type_ann_core.cpp +++ b/yql/essentials/core/type_ann/type_ann_core.cpp @@ -12982,6 +12982,7 @@ template <NKikimr::NUdf::EDataSlot DataSlot> Functions["NextValue"] = &NextValueWrapper; Functions["MatchRecognize"] = &MatchRecognizeWrapper; + Functions["MatchRecognizeMeasuresAggregates"] = &MatchRecognizeMeasuresAggregatesWrapper; Functions["MatchRecognizeParams"] = &MatchRecognizeParamsWrapper; Functions["MatchRecognizeMeasures"] = &MatchRecognizeMeasuresWrapper; Functions["MatchRecognizePattern"] = &MatchRecognizePatternWrapper; diff --git a/yql/essentials/core/type_ann/type_ann_match_recognize.cpp b/yql/essentials/core/type_ann/type_ann_match_recognize.cpp index 4a6d61192e..696435cd31 100644 --- a/yql/essentials/core/type_ann/type_ann_match_recognize.cpp +++ b/yql/essentials/core/type_ann/type_ann_match_recognize.cpp @@ -1,12 +1,31 @@ #include "type_ann_match_recognize.h" + #include <yql/essentials/core/sql_types/match_recognize.h> #include <yql/essentials/core/yql_match_recognize.h> namespace NYql::NTypeAnnImpl { -IGraphTransformer::TStatus -MatchRecognizeWrapper(const TExprNode::TPtr &input, TExprNode::TPtr &output, TContext &ctx) { - Y_UNUSED(output); +namespace { + +const TStructExprType* GetMatchedRowsRangesType(const TExprNode::TPtr& patternVars, TContext &ctx) { + const auto itemType = ctx.Expr.MakeType<TStructExprType>(TVector{ + ctx.Expr.MakeType<TItemExprType>("From", ctx.Expr.MakeType<TDataExprType>(EDataSlot::Uint64)), + ctx.Expr.MakeType<TItemExprType>("To", ctx.Expr.MakeType<TDataExprType>(EDataSlot::Uint64)) + }); + + TVector<const TItemExprType*> items; + for (const auto& var : patternVars->Children()) { + items.push_back(ctx.Expr.MakeType<TItemExprType>( + var->Content(), + ctx.Expr.MakeType<TListExprType>(itemType) + )); + } + return ctx.Expr.MakeType<TStructExprType>(items); +} + +} // anonymous namespace + +IGraphTransformer::TStatus MatchRecognizeWrapper(const TExprNode::TPtr& input, TExprNode::TPtr&, TContext& ctx) { if (!EnsureArgsCount(*input, 5, ctx.Expr)) { return IGraphTransformer::TStatus::Error; } @@ -15,7 +34,7 @@ MatchRecognizeWrapper(const TExprNode::TPtr &input, TExprNode::TPtr &output, TCo const auto partitionColumns = input->Child(2); const auto sortTraits = input->Child(3); const auto params = input->Child(4); - Y_UNUSED(source, sortTraits); + Y_UNUSED(sortTraits); auto status = ConvertToLambda(partitionKeySelector, ctx.Expr, 1, 1); if (status.Level != IGraphTransformer::TStatus::Ok) { return status; @@ -53,47 +72,89 @@ MatchRecognizeWrapper(const TExprNode::TPtr &input, TExprNode::TPtr &output, TCo return IGraphTransformer::TStatus::Ok; } -IGraphTransformer::TStatus -MatchRecognizeParamsWrapper(const TExprNode::TPtr &input, TExprNode::TPtr &output, TContext &ctx) { - Y_UNUSED(output); - if (!EnsureArgsCount(*input, 5, ctx.Expr)) { +IGraphTransformer::TStatus MatchRecognizeMeasuresAggregatesWrapper(const TExprNode::TPtr& input, TExprNode::TPtr&, TContext& ctx) { + if (!EnsureMinArgsCount(*input, 4, ctx.Expr)) { return IGraphTransformer::TStatus::Error; } - const auto measures = input->Child(0); - input->SetTypeAnn(measures->GetTypeAnn()); - return IGraphTransformer::TStatus::Ok; -} + const auto inputRowType = input->Child(0); + const auto patternVars = input->Child(1); + const auto names = input->Child(2); + const auto vars = input->Child(3); + constexpr size_t FirstLambdaIndex = 4; -namespace { + if (!EnsureType(*inputRowType, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureTupleOfAtoms(*patternVars, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureTupleOfAtoms(*names, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureTupleOfAtoms(*vars, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureArgsCount(*vars, names->ChildrenSize(), ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureArgsCount(*input, FirstLambdaIndex + names->ChildrenSize(), ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } -const TStructExprType* GetMatchedRowsRangesType(const TExprNode::TPtr& pattern, TContext &ctx) { - auto vars = GetPatternVars(NYql::NMatchRecognize::ConvertPattern(pattern, ctx.Expr, 0)); TVector<const TItemExprType*> items; - for (const auto& var: vars) { - const auto& item = ctx.Expr.MakeType<TStructExprType>(TVector<const TItemExprType*>{ - ctx.Expr.MakeType<TItemExprType>("From", ctx.Expr.MakeType<TDataExprType>(EDataSlot::Uint64)), - ctx.Expr.MakeType<TItemExprType>("To", ctx.Expr.MakeType<TDataExprType>(EDataSlot::Uint64)) - }); - items.push_back(ctx.Expr.MakeType<TItemExprType>(var, ctx.Expr.MakeType<TListExprType>(item))); + for (size_t i = 0; i < names->ChildrenSize(); ++i) { + auto lambda = input->Child(FirstLambdaIndex + i); + if (const auto varName = vars->Child(i)->Content()) { + const auto traits = input->Child(FirstLambdaIndex + i); + if (!EnsureTupleMinSize(*traits, 2, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureTupleMaxSize(*traits, 3, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureAtom(traits->Head(), ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + lambda = traits->Child(1)->Child(NNodes::TCoAggregationTraits::idx_FinishHandler); + } + if (auto type = lambda->GetTypeAnn()) { + if (type->GetKind() != ETypeAnnotationKind::Optional) { + type = ctx.Expr.MakeType<TOptionalExprType>(type); + } + items.push_back(ctx.Expr.MakeType<TItemExprType>(names->Child(i)->Content(), type)); + } else { + return IGraphTransformer::TStatus::Repeat; + } } - return ctx.Expr.MakeType<TStructExprType>(items); + input->SetTypeAnn(ctx.Expr.MakeType<TStructExprType>(items)); + return IGraphTransformer::TStatus::Ok; } -}//namespace { +IGraphTransformer::TStatus MatchRecognizeParamsWrapper(const TExprNode::TPtr& input, TExprNode::TPtr&, TContext& ctx) { + if (!EnsureArgsCount(*input, 5, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + const auto measures = input->Child(0); + input->SetTypeAnn(measures->GetTypeAnn()); + return IGraphTransformer::TStatus::Ok; +} -IGraphTransformer::TStatus -MatchRecognizeMeasuresWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, - TContext &ctx) { - Y_UNUSED(output); +IGraphTransformer::TStatus MatchRecognizeMeasuresWrapper(const TExprNode::TPtr& input, TExprNode::TPtr&, TContext& ctx) { if (!EnsureMinArgsCount(*input, 3, ctx.Expr)) { return IGraphTransformer::TStatus::Error; } const auto inputRowType = input->Child(0); - const auto pattern = input->Child(1); + const auto patternVars = input->Child(1); const auto names = input->Child(2); constexpr size_t FirstLambdaIndex = 3; - if (!EnsureTupleOfAtoms(*names, ctx.Expr)) { + if (!EnsureType(*inputRowType, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureTupleOfAtoms(*patternVars, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureTupleOfAtoms(*names, ctx.Expr)) { return IGraphTransformer::TStatus::Error; } @@ -111,7 +172,7 @@ MatchRecognizeMeasuresWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& out MeasureInputDataSpecialColumnName(EMeasureInputDataSpecialColumns::MatchNumber), ctx.Expr.MakeType<TDataExprType>(EDataSlot::Uint64))); auto lambdaInputRowType = ctx.Expr.MakeType<TStructExprType>(lambdaInputRowColumns); - const auto& matchedRowsRanges = GetMatchedRowsRangesType(pattern, ctx); + const auto& matchedRowsRanges = GetMatchedRowsRangesType(patternVars, ctx); YQL_ENSURE(matchedRowsRanges); TVector<const TItemExprType*> items; for (size_t i = 0; i != names->ChildrenSize(); ++i) { @@ -139,27 +200,27 @@ MatchRecognizeMeasuresWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& out return IGraphTransformer::TStatus::Ok; } -IGraphTransformer::TStatus -MatchRecognizePatternWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, - TContext &ctx) { - Y_UNUSED(output); +IGraphTransformer::TStatus MatchRecognizePatternWrapper(const TExprNode::TPtr& input, TExprNode::TPtr&, TContext& ctx) { input->SetTypeAnn(ctx.Expr.MakeType<TVoidExprType>()); return IGraphTransformer::TStatus::Ok; } -IGraphTransformer::TStatus -MatchRecognizeDefinesWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, - TContext &ctx) { - Y_UNUSED(output); +IGraphTransformer::TStatus MatchRecognizeDefinesWrapper(const TExprNode::TPtr& input, TExprNode::TPtr&, TContext &ctx) { if (!EnsureMinArgsCount(*input, 3, ctx.Expr)) { return IGraphTransformer::TStatus::Error; } const auto inputRowType = input->Child(0); - const auto pattern = input->Child(1); + const auto patternVars = input->Child(1); const auto names = input->Child(2); - const size_t FirstLambdaIndex = 3; + constexpr size_t FirstLambdaIndex = 3; - if (!EnsureTupleOfAtoms(*names, ctx.Expr)) { + if (!EnsureType(*inputRowType, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureTupleOfAtoms(*patternVars, ctx.Expr)) { + return IGraphTransformer::TStatus::Error; + } + if (!EnsureTupleOfAtoms(*names, ctx.Expr)) { return IGraphTransformer::TStatus::Error; } @@ -167,7 +228,7 @@ MatchRecognizeDefinesWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& outp return IGraphTransformer::TStatus::Error; } - const auto matchedRowsRanges = GetMatchedRowsRangesType(pattern, ctx); + const auto matchedRowsRanges = GetMatchedRowsRangesType(patternVars, ctx); TVector<const TItemExprType*> items; for (size_t i = 0; i != names->ChildrenSize(); ++i) { auto& lambda = input->ChildRef(FirstLambdaIndex + i); @@ -230,11 +291,9 @@ bool ValidateSettings(const TExprNode::TPtr& settings, TExprContext& ctx) { return true; } -} //namespace +} // anonymous namespace -IGraphTransformer::TStatus -MatchRecognizeCoreWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TExtContext& ctx) { - Y_UNUSED(output); +IGraphTransformer::TStatus MatchRecognizeCoreWrapper(const TExprNode::TPtr& input, TExprNode::TPtr&, TExtContext& ctx) { if (not ctx.Types.MatchRecognize) { ctx.Expr.AddError(TIssue(ctx.Expr.GetPosition(input->Pos()), "MATCH_RECOGNIZE is disabled")); return IGraphTransformer::TStatus::Error; @@ -253,7 +312,7 @@ MatchRecognizeCoreWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, } if (not ValidateSettings(settings, ctx.Expr)) { - return IGraphTransformer::TStatus::Error;; + return IGraphTransformer::TStatus::Error; } if (!EnsureFlowType(*source, ctx.Expr)) { diff --git a/yql/essentials/core/type_ann/type_ann_match_recognize.h b/yql/essentials/core/type_ann/type_ann_match_recognize.h index 65a2d4d5b3..d21b7f36b4 100644 --- a/yql/essentials/core/type_ann/type_ann_match_recognize.h +++ b/yql/essentials/core/type_ann/type_ann_match_recognize.h @@ -1,19 +1,18 @@ #pragma once -#include "type_ann_core.h" #include "type_ann_impl.h" #include <yql/essentials/ast/yql_expr.h> -#include <yql/essentials/ast/yql_expr_types.h> +#include <yql/essentials/core/yql_graph_transformer.h> +namespace NYql::NTypeAnnImpl { -namespace NYql { - namespace NTypeAnnImpl { - IGraphTransformer::TStatus MatchRecognizeWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); - IGraphTransformer::TStatus MatchRecognizeParamsWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); - IGraphTransformer::TStatus MatchRecognizeMeasuresWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); - IGraphTransformer::TStatus MatchRecognizePatternWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); - IGraphTransformer::TStatus MatchRecognizeDefinesWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); - IGraphTransformer::TStatus MatchRecognizeCoreWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TExtContext& ctx); - } -} +IGraphTransformer::TStatus MatchRecognizeWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); +IGraphTransformer::TStatus MatchRecognizeMeasuresAggregatesWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); +IGraphTransformer::TStatus MatchRecognizeParamsWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); +IGraphTransformer::TStatus MatchRecognizeMeasuresWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); +IGraphTransformer::TStatus MatchRecognizePatternWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); +IGraphTransformer::TStatus MatchRecognizeDefinesWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TContext& ctx); +IGraphTransformer::TStatus MatchRecognizeCoreWrapper(const TExprNode::TPtr& input, TExprNode::TPtr& output, TExtContext& ctx); + +} // namespace NYql::NTypeAnnImpl diff --git a/yql/essentials/core/yql_match_recognize.h b/yql/essentials/core/yql_match_recognize.h index ec38a837bc..6b741f5237 100644 --- a/yql/essentials/core/yql_match_recognize.h +++ b/yql/essentials/core/yql_match_recognize.h @@ -4,8 +4,7 @@ namespace NYql::NMatchRecognize { -inline TRowPattern ConvertPattern(const TExprNode::TPtr& pattern, TExprContext &ctx, size_t nestingLevel = 0) { - YQL_ENSURE(nestingLevel <= MaxPatternNesting, "To big nesting level in the pattern"); +inline TRowPattern ConvertPattern(const TExprNode::TPtr& pattern, TExprContext &ctx) { TRowPattern result; for (const auto& term: pattern->Children()) { result.push_back(TRowPatternTerm{}); @@ -14,7 +13,7 @@ inline TRowPattern ConvertPattern(const TExprNode::TPtr& pattern, TExprContext & result.back().push_back(TRowPatternFactor{ factor->Child(0)->IsAtom() ? TRowPatternPrimary(TString(factor->Child(0)->Content())) : - ConvertPattern(factor->Child(0), ctx, nestingLevel + 1), + ConvertPattern(factor->Child(0), ctx), FromString<ui64>(factor->Child(1)->Content()), FromString<ui64>(factor->Child(2)->Content()), FromString<bool>(factor->Child(3)->Content()), diff --git a/yql/essentials/core/yql_opt_match_recognize.cpp b/yql/essentials/core/yql_opt_match_recognize.cpp index f2bef7d7d4..3efae9e42b 100644 --- a/yql/essentials/core/yql_opt_match_recognize.cpp +++ b/yql/essentials/core/yql_opt_match_recognize.cpp @@ -9,6 +9,7 @@ namespace NYql { using namespace NNodes; namespace { + bool IsStreaming(const TExprNode::TPtr& input, const TTypeAnnotationContext& typeAnnCtx) { if (EMatchRecognizeStreamingMode::Disable == typeAnnCtx.MatchRecognizeStreaming){ return false; @@ -29,65 +30,147 @@ bool IsStreaming(const TExprNode::TPtr& input, const TTypeAnnotationContext& typ }); return hasPq; } -} //namespace - -// returns std::nullopt if all vars could be used -std::optional<TSet<TStringBuf>> FindUsedVars(const TExprNode::TPtr& params) { - TSet<TStringBuf> usedVars; - bool allVarsUsed = false; - - const auto createVisitor = [&usedVars, &allVarsUsed](const TExprNode::TPtr& varsArg) { - return [&varsArg, &usedVars, &allVarsUsed](const TExprNode::TPtr& node) -> bool { - if (node->IsCallable("Member")) { - if (node->Child(0) == varsArg) { - usedVars.insert(node->Child(1)->Content()); - return false; + +TExprNode::TPtr ExpandMatchRecognizeMeasuresAggregates(const TExprNode::TPtr& node, TExprContext& ctx, TTypeAnnotationContext& /* typeAnnCtx */) { + const auto pos = node->Pos(); + const auto vars = node->Child(3); + static constexpr size_t AggregatesLambdasStartPos = 4; + static constexpr size_t MeasuresLambdasStartPos = 3; + + return ctx.Builder(pos) + .Callable("MatchRecognizeMeasures") + .Add(0, node->ChildPtr(0)) + .Add(1, node->ChildPtr(1)) + .Add(2, node->ChildPtr(2)) + .Do([&](TExprNodeBuilder& parent) -> TExprNodeBuilder& { + for (size_t i = 0; i < vars->ChildrenSize(); ++i) { + const auto var = vars->Child(i)->Content(); + const auto handler = node->ChildPtr(AggregatesLambdasStartPos + i); + if (!var) { + auto value = handler->GetTypeAnn()->GetKind() != ETypeAnnotationKind::Optional + ? ctx.Builder(pos).Callable("Just").Add(0, handler).Seal().Build() + : handler; + parent.Add( + MeasuresLambdasStartPos + i, + ctx.Builder(pos) + .Lambda() + .Param("data") + .Param("vars") + .Add(0, std::move(value)) + .Seal() + .Build() + ); + continue; + } + parent.Add( + MeasuresLambdasStartPos + i, + ctx.Builder(pos) + .Lambda() + .Param("data") + .Param("vars") + .Callable(0, "Member") + .Callable(0, "Head") + .Callable(0, "Aggregate") + .Callable(0, "OrderedMap") + .Callable(0, "OrderedFlatMap") + .Callable(0, "Member") + .Arg(0, "vars") + .Atom(1, var) + .Seal() + .Lambda(1) + .Param("item") + .Callable(0, "ListFromRange") + .Callable(0, "Member") + .Arg(0, "item") + .Atom(1, "From") + .Seal() + .Callable(1, "+MayWarn") + .Callable(0, "Member") + .Arg(0, "item") + .Atom(1, "To") + .Seal() + .Callable(1, "Uint64") + .Atom(0, "1") + .Seal() + .Seal() + .Seal() + .Seal() + .Seal() + .Lambda(1) + .Param("index") + .Callable(0, "Unwrap") + .Callable(0, "Lookup") + .Callable(0, "ToIndexDict") + .Arg(0, "data") + .Seal() + .Arg(1, "index") + .Seal() + .Seal() + .Seal() + .Seal() + .List(1).Seal() + .List(2) + .Add(0, handler) + .Seal() + .List(3).Seal() + .Seal() + .Seal() + .Atom(1, handler->Child(0)->Content()) + .Seal() + .Seal() + .Build() + ); } - } - if (node == varsArg) { - allVarsUsed = true; - } - return true; - }; - }; + return parent; + }) + .Seal() + .Build(); +} + +THashSet<TStringBuf> FindUsedVars(const TExprNode::TPtr& params) { + THashSet<TStringBuf> result; const auto measures = params->Child(0); - static constexpr size_t measureLambdasStartPos = 3; - for (size_t pos = measureLambdasStartPos; pos != measures->ChildrenSize(); pos++) { - const auto lambda = measures->Child(pos); - const auto lambdaArgs = lambda->Child(0); - const auto lambdaBody = lambda->ChildPtr(1); - const auto varsArg = lambdaArgs->ChildPtr(1); - NYql::VisitExpr(lambdaBody, createVisitor(varsArg)); + const auto measuresVars = measures->Child(3); + for (const auto& var : measuresVars->Children()) { + result.insert(var->Content()); } const auto defines = params->Child(4); static constexpr size_t defineLambdasStartPos = 3; - for (size_t pos = defineLambdasStartPos; pos != defines->ChildrenSize(); pos++) { - const auto lambda = defines->Child(pos); + for (auto i = defineLambdasStartPos; i < defines->ChildrenSize(); ++i) { + const auto lambda = defines->Child(i); const auto lambdaArgs = lambda->Child(0); const auto lambdaBody = lambda->ChildPtr(1); - const auto varsArg = lambdaArgs->ChildPtr(1); - NYql::VisitExpr(lambdaBody, createVisitor(varsArg)); + const auto varsArg = lambdaArgs->Child(1); + NYql::VisitExpr( + lambdaBody, + [varsArg, &result](const TExprNode::TPtr& node) { + if (node->IsCallable("Member") && node->Child(0) == varsArg) { + result.insert(node->Child(1)->Content()); + return false; + } + return true; + } + ); } - return allVarsUsed ? std::nullopt : std::make_optional(usedVars); + return result; } -// usedVars can be std::nullopt if all vars could probably be used -TExprNode::TPtr MarkUnusedPatternVars(const TExprNode::TPtr& node, TExprContext& ctx, const std::optional<TSet<TStringBuf>> &usedVars, TStringBuf rowsPerMatch) { +TExprNode::TPtr MarkUnusedPatternVars(const TExprNode::TPtr& node, TExprContext& ctx, const THashSet<TStringBuf>& usedVars, TStringBuf rowsPerMatch) { const auto pos = node->Pos(); if (node->ChildrenSize() != 0 && node->Child(0)->IsAtom()) { const auto varName = node->Child(0)->Content(); - const auto output = node->Child(4); - const auto varUnused = ("RowsPerMatch_AllRows" != rowsPerMatch || !output) && usedVars && !usedVars->contains(varName); + const auto output = FromString<bool>(node->Child(4)->Content()); + const auto varUnused = ("RowsPerMatch_AllRows" != rowsPerMatch || !output) && !usedVars.contains(varName); return ctx.Builder(pos) .List() .Add(0, node->ChildPtr(0)) .Add(1, node->ChildPtr(1)) .Add(2, node->ChildPtr(2)) .Add(3, node->ChildPtr(3)) - .Add(4, output) + .Add(4, node->ChildPtr(4)) .Add(5, ctx.NewAtom(pos, ToString(varUnused))) .Seal() .Build(); @@ -105,13 +188,15 @@ TExprNode::TPtr MarkUnusedPatternVars(const TExprNode::TPtr& node, TExprContext& } } +} // anonymous namespace + TExprNode::TPtr ExpandMatchRecognize(const TExprNode::TPtr& node, TExprContext& ctx, TTypeAnnotationContext& typeAnnCtx) { - YQL_ENSURE(node->IsCallable({"MatchRecognize"})); + YQL_ENSURE(node->IsCallable("MatchRecognize")); const auto input = node->Child(0); const auto partitionKeySelector = node->Child(1); const auto partitionColumns = node->Child(2); const auto sortTraits = node->Child(3); - const auto params = node->ChildPtr(4); + const auto params = node->Child(4); const auto pos = node->Pos(); const bool isStreaming = IsStreaming(input, typeAnnCtx); @@ -120,6 +205,7 @@ TExprNode::TPtr ExpandMatchRecognize(const TExprNode::TPtr& node, TExprContext& "Streaming", ctx.NewAtom(pos, ToString(isStreaming)), ctx); const auto rowsPerMatch = params->Child(1)->Content(); + auto measures = ExpandMatchRecognizeMeasuresAggregates(params->ChildPtr(0), ctx, typeAnnCtx); const auto matchRecognize = ctx.Builder(pos) .Lambda() .Param("sortedPartition") @@ -131,7 +217,7 @@ TExprNode::TPtr ExpandMatchRecognize(const TExprNode::TPtr& node, TExprContext& .Add(1, partitionKeySelector) .Add(2, partitionColumns) .Callable(3, params->Content()) - .Add(0, params->ChildPtr(0)) + .Add(0, std::move(measures)) .Add(1, params->ChildPtr(1)) .Add(2, params->ChildPtr(2)) .Add(3, MarkUnusedPatternVars(params->ChildPtr(3), ctx, FindUsedVars(params), rowsPerMatch)) diff --git a/yql/essentials/minikql/comp_nodes/mkql_match_recognize.cpp b/yql/essentials/minikql/comp_nodes/mkql_match_recognize.cpp index d59ebee70f..5b0e60afa8 100644 --- a/yql/essentials/minikql/comp_nodes/mkql_match_recognize.cpp +++ b/yql/essentials/minikql/comp_nodes/mkql_match_recognize.cpp @@ -48,8 +48,8 @@ public: {} bool ProcessInputRow(NUdf::TUnboxedValue&& row, TComputationContext& ctx) { - Parameters.InputDataArg->SetValue(ctx, ctx.HolderFactory.Create<TListValue<TSparseList>>(Rows)); - Parameters.CurrentRowIndexArg->SetValue(ctx, NUdf::TUnboxedValuePod(Rows.Size())); + Parameters.InputDataArg->SetValue(ctx, ctx.HolderFactory.Create<TListValue>(Rows)); + Parameters.CurrentRowIndexArg->SetValue(ctx, NUdf::TUnboxedValuePod(Rows.LastRowIndex())); Nfa.ProcessRow(Rows.Append(std::move(row)), ctx); return HasMatched(); } @@ -68,7 +68,7 @@ public: } Parameters.MatchedVarsArg->SetValue(ctx, ctx.HolderFactory.Create<TMatchedVarsValue<TSparseList::TRange>>(ctx.HolderFactory, match->Vars)); Parameters.MeasureInputDataArg->SetValue(ctx, ctx.HolderFactory.Create<TMeasureInputDataValue>( - ctx.HolderFactory.Create<TListValue<TSparseList>>(Rows), + ctx.HolderFactory.Create<TListValue>(Rows), Parameters.MeasureInputColumnOrder, Parameters.MatchedVarsArg->GetValue(ctx), Parameters.VarNames, diff --git a/yql/essentials/minikql/comp_nodes/mkql_match_recognize_list.cpp b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_list.cpp new file mode 100644 index 0000000000..ae8a04eb76 --- /dev/null +++ b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_list.cpp @@ -0,0 +1,124 @@ +#include "mkql_match_recognize_list.h" + +namespace NKikimr::NMiniKQL::NMatchRecognize { + +namespace { + +class TIterator : public TTemporaryComputationValue<TIterator> { +public: + TIterator(TMemoryUsageInfo* memUsage, const TSparseList& parent) + : TTemporaryComputationValue<TIterator>(memUsage) + , Parent(parent) + , Current(Parent.Begin()) + {} + +private: + bool Skip() final { + return ++Current != Parent.End(); + } + + bool Next(NUdf::TUnboxedValue& value) final { + if (!Skip()) { + return false; + } + value = Current->second.Value; + return true; + } + + bool NextPair(NUdf::TUnboxedValue& key, NUdf::TUnboxedValue& payload) final { + if (!Next(payload)) { + return false; + } + key = NUdf::TUnboxedValuePod(Current->first); + return true; + } + + const TSparseList& Parent; + TSparseList::iterator Current; +}; + +class TKeysIterator : public TTemporaryComputationValue<TKeysIterator> { +public: + TKeysIterator(TMemoryUsageInfo* memUsage, const TSparseList& parent) + : TTemporaryComputationValue<TKeysIterator>(memUsage) + , Parent(parent) + , Current(Parent.Begin()) + {} +private: + bool Skip() final { + return ++Current != Parent.End(); + } + + bool Next(NUdf::TUnboxedValue& key) final { + if (!Skip()) { + return false; + } + key = NUdf::TUnboxedValuePod(Current->first); + return true; + } + + const TSparseList& Parent; + TSparseList::iterator Current; +}; + +} // anonymous namespace + +TListValue::TListValue(TMemoryUsageInfo* memUsage, const TSparseList& list) + : TComputationValue<TListValue>(memUsage) + , List(list) +{} + +bool TListValue::HasFastListLength() const { + return true; +} + +ui64 TListValue::GetListLength() const { + return GetDictLength(); +} + +ui64 TListValue::GetEstimatedListLength() const { + return GetListLength(); +} + +NUdf::TUnboxedValue TListValue::GetListIterator() const { + return GetPayloadsIterator(); +} + +bool TListValue::HasListItems() const { + return HasDictItems(); +} + +NUdf::IBoxedValuePtr TListValue::ToIndexDictImpl(const NUdf::IValueBuilder& builder) const { + Y_UNUSED(builder); + return const_cast<TListValue*>(this); +} + +ui64 TListValue::GetDictLength() const { + return List.Size(); +} + +NUdf::TUnboxedValue TListValue::GetDictIterator() const { + return NUdf::TUnboxedValuePod(new TIterator(GetMemInfo(), List)); +} + +NUdf::TUnboxedValue TListValue::GetKeysIterator() const { + return NUdf::TUnboxedValuePod(new TKeysIterator(GetMemInfo(), List)); +} + +NUdf::TUnboxedValue TListValue::GetPayloadsIterator() const { + return NUdf::TUnboxedValuePod(new TIterator(GetMemInfo(), List)); +} + +bool TListValue::Contains(const NUdf::TUnboxedValuePod& key) const { + return List.Contains(key.Get<ui64>()); +} + +NUdf::TUnboxedValue TListValue::Lookup(const NUdf::TUnboxedValuePod& key) const { + return List.Get(key.Get<ui64>()); +} + +bool TListValue::HasDictItems() const { + return !List.Empty(); +} + +} // namespace NKikimr::NMiniKQL::NMatchRecognize diff --git a/yql/essentials/minikql/comp_nodes/mkql_match_recognize_list.h b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_list.h index e60f5465ef..c3569cc4be 100644 --- a/yql/essentials/minikql/comp_nodes/mkql_match_recognize_list.h +++ b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_list.h @@ -1,101 +1,15 @@ #pragma once #include "mkql_match_recognize_save_load.h" -#include "mkql_match_recognize_version.h" #include <yql/essentials/minikql/defs.h> -#include <yql/essentials/minikql/computation/mkql_computation_node_impl.h> #include <yql/essentials/minikql/computation/mkql_computation_node_holders.h> -#include <yql/essentials/minikql/comp_nodes/mkql_saveload.h> +#include <yql/essentials/minikql/computation/mkql_computation_node_impl.h> #include <yql/essentials/public/udf/udf_value.h> #include <unordered_map> namespace NKikimr::NMiniKQL::NMatchRecognize { -class TSimpleList { -public: - ///Range that includes starting and ending points - ///Can not be empty - class TRange { - public: - TRange() - : FromIndex(Max()) - , ToIndex(Max()) - , NfaIndex_(Max()) - { - } - - explicit TRange(ui64 index) - : FromIndex(index) - , ToIndex(index) - , NfaIndex_(Max()) - { - } - - TRange(ui64 from, ui64 to) - : FromIndex(from) - , ToIndex(to) - , NfaIndex_(Max()) - { - MKQL_ENSURE(FromIndex <= ToIndex, "Internal logic error"); - } - - bool IsValid() const { - return FromIndex != Max<size_t>() && ToIndex != Max<size_t>(); - } - - size_t From() const { - MKQL_ENSURE(IsValid(), "Internal logic error"); - return FromIndex; - } - - size_t To() const { - MKQL_ENSURE(IsValid(), "Internal logic error"); - return ToIndex; - } - - [[nodiscard]] size_t NfaIndex() const { - MKQL_ENSURE(IsValid(), "Internal logic error"); - return NfaIndex_; - } - - size_t Size() const { - MKQL_ENSURE(IsValid(), "Internal logic error"); - return ToIndex - FromIndex + 1; - } - - void Extend() { - MKQL_ENSURE(IsValid(), "Internal logic error"); - ++ToIndex; - } - - private: - size_t FromIndex; - size_t ToIndex; - size_t NfaIndex_; - }; - - TRange Append(NUdf::TUnboxedValue&& value) { - TRange result(Rows.size()); - Rows.push_back(std::move(value)); - return result; - } - - size_t Size() const { - return Rows.size(); - } - - bool Empty() const { - return Size() == 0; - } - - NUdf::TUnboxedValue Get(size_t i) const { - return Rows.at(i); - } -private: - TUnboxedValueVector Rows; -}; - ///Stores only locked items ///Locks are holds by TRange ///When all locks on an item are released, the item is removed from the list @@ -108,17 +22,31 @@ class TSparseList { class TContainer: public TSimpleRefCount<TContainer> { public: using TPtr = TIntrusivePtr<TContainer>; + //TODO consider to replace hash table with contiguous chunks + using TStorage = TMKQLHashMap<size_t, TItem>; + using iterator = TStorage::const_iterator; - void Add(size_t index, NUdf::TUnboxedValue&& value) { - const auto& [iter, newOne] = Storage.emplace(index, TItem{std::move(value), 1}); - MKQL_ENSURE(newOne, "Internal logic error"); + [[nodiscard]] iterator Begin() const noexcept { + return Storage.begin(); } - size_t Size() const { + [[nodiscard]] iterator End() const noexcept { + return Storage.end(); + } + + [[nodiscard]] size_t Size() const noexcept { return Storage.size(); } - NUdf::TUnboxedValue Get(size_t i) const { + [[nodiscard]] size_t Empty() const noexcept { + return Storage.empty(); + } + + [[nodiscard]] bool Contains(size_t i) const noexcept { + return Storage.find(i) != Storage.cend(); + } + + [[nodiscard]] NUdf::TUnboxedValue Get(size_t i) const { if (const auto it = Storage.find(i); it != Storage.cend()) { return it->second.Value; } else { @@ -126,6 +54,11 @@ class TSparseList { } } + void Add(size_t index, NUdf::TUnboxedValue&& value) { + const auto& [iter, newOne] = Storage.emplace(index, TItem{std::move(value), 1}); + MKQL_ENSURE(newOne, "Internal logic error"); + } + void LockRange(size_t from, size_t to) { for (auto i = from; i <= to; ++i) { const auto it = Storage.find(i); @@ -165,17 +98,8 @@ class TSparseList { } private: - //TODO consider to replace hash table with contiguous chunks - using TStorage = std::unordered_map< - size_t, - TItem, - std::hash<size_t>, - std::equal_to<size_t>, - TMKQLAllocator<std::pair<const size_t, TItem>, EMemorySubPool::Temporary>>; - TStorage Storage; }; - using TContainerPtr = TContainer::TPtr; public: ///Range that includes starting and ending points @@ -303,7 +227,7 @@ public: } private: - TRange(TContainerPtr container, size_t index) + TRange(TContainer::TPtr container, size_t index) : Container(container) , FromIndex(index) , ToIndex(index) @@ -329,35 +253,48 @@ public: NfaIndex_ = Max(); } - TContainerPtr Container; + TContainer::TPtr Container; size_t FromIndex; size_t ToIndex; size_t NfaIndex_; }; -public: TRange Append(NUdf::TUnboxedValue&& value) { const auto index = ListSize++; Container->Add(index, std::move(value)); return TRange(Container, index); } - NUdf::TUnboxedValue Get(size_t i) const { - return Container->Get(i); + using iterator = TContainer::iterator; + + [[nodiscard]] iterator Begin() const noexcept { + return Container->Begin(); + } + + [[nodiscard]] iterator End() const noexcept { + return Container->End(); } ///Return total size of sparse list including absent values - size_t Size() const { + size_t LastRowIndex() const noexcept { return ListSize; } ///Return number of present values in sparse list - size_t Filled() const { + size_t Size() const noexcept { return Container->Size(); } - bool Empty() const { - return Size() == 0; + [[nodiscard]] bool Empty() const noexcept { + return Container->Empty(); + } + + [[nodiscard]] bool Contains(size_t i) const noexcept { + return Container->Contains(i); + } + + [[nodiscard]] NUdf::TUnboxedValue Get(size_t i) const { + return Container->Get(i); } void Save(TMrOutputSerializer& serializer) const { @@ -369,46 +306,32 @@ public: } private: - TContainerPtr Container = MakeIntrusive<TContainer>(); + TContainer::TPtr Container = MakeIntrusive<TContainer>(); size_t ListSize = 0; //impl: max index ever stored + 1 }; -template<typename L> -class TListValue: public TComputationValue<TListValue<L>> { +class TListValue final : public TComputationValue<TListValue> { public: - TListValue(TMemoryUsageInfo* memUsage, const L& list) - : TComputationValue<TListValue<L>>(memUsage) - , List(list) - { - } - - //TODO https://st.yandex-team.ru/YQL-16508 - //NUdf::TUnboxedValue GetListIterator() const override; - - bool HasFastListLength() const override { - return !List.Empty(); - } + TListValue(TMemoryUsageInfo* memUsage, const TSparseList& list); - ui64 GetListLength() const override { - return List.Size(); - } - - bool HasListItems() const override { - return !List.Empty(); - } + bool HasFastListLength() const final; + ui64 GetListLength() const final; + ui64 GetEstimatedListLength() const final; + NUdf::TUnboxedValue GetListIterator() const final; + bool HasListItems() const final; - NUdf::IBoxedValuePtr ToIndexDictImpl(const NUdf::IValueBuilder& builder) const override { - Y_UNUSED(builder); - return const_cast<TListValue*>(this); - } + NUdf::IBoxedValuePtr ToIndexDictImpl(const NUdf::IValueBuilder& builder) const final; - NUdf::TUnboxedValue Lookup(const NUdf::TUnboxedValuePod& key) const override { - return List.Get(key.Get<ui64>()); - } + ui64 GetDictLength() const final; + NUdf::TUnboxedValue GetDictIterator() const final; + NUdf::TUnboxedValue GetKeysIterator() const final; + NUdf::TUnboxedValue GetPayloadsIterator() const final; + bool Contains(const NUdf::TUnboxedValuePod& key) const final; + NUdf::TUnboxedValue Lookup(const NUdf::TUnboxedValuePod& key) const final; + bool HasDictItems() const final; private: - L List; + TSparseList List; }; -}//namespace NKikimr::NMiniKQL::NMatchRecognize - +} // namespace NKikimr::NMiniKQL::NMatchRecognize diff --git a/yql/essentials/minikql/comp_nodes/mkql_match_recognize_measure_arg.cpp b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_measure_arg.cpp new file mode 100644 index 0000000000..9a152dfcec --- /dev/null +++ b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_measure_arg.cpp @@ -0,0 +1,128 @@ +#include "mkql_match_recognize_measure_arg.h" + +namespace NKikimr::NMiniKQL::NMatchRecognize { + +TRowForMeasureValue::TRowForMeasureValue( + TMemoryUsageInfo* memInfo, + NUdf::TUnboxedValue inputRow, + ui64 rowIndex, + const TMeasureInputColumnOrder& columnOrder, + const NUdf::TUnboxedValue& matchedVars, + const TUnboxedValueVector& varNames, + ui64 matchNumber) +: TComputationValue<TRowForMeasureValue>(memInfo) +, InputRow(inputRow) +, RowIndex(rowIndex) +, ColumnOrder(columnOrder) +, MatchedVars(matchedVars) +, VarNames(varNames) +, MatchNumber(matchNumber) +{} + +NUdf::TUnboxedValue TRowForMeasureValue::GetElement(ui32 index) const { + switch(ColumnOrder[index].first) { + case NYql::NMatchRecognize::EMeasureInputDataSpecialColumns::Classifier: { + auto varIterator = MatchedVars.GetListIterator(); + MKQL_ENSURE(varIterator, "Internal logic error"); + NUdf::TUnboxedValue var; + size_t varIndex = 0; + while(varIterator.Next(var)) { + auto rangeIterator = var.GetListIterator(); + MKQL_ENSURE(varIterator, "Internal logic error"); + NUdf::TUnboxedValue range; + while(rangeIterator.Next(range)) { + const auto from = range.GetElement(0).Get<ui64>(); + const auto to = range.GetElement(1).Get<ui64>(); + if (RowIndex >= from and RowIndex <= to) { + return VarNames[varIndex]; + } + } + ++varIndex; + } + MKQL_ENSURE(MatchedVars.GetListLength() == varIndex, "Internal logic error"); + return MakeString(""); + } + case NYql::NMatchRecognize::EMeasureInputDataSpecialColumns::MatchNumber: + return NUdf::TUnboxedValuePod(MatchNumber); + case NYql::NMatchRecognize::EMeasureInputDataSpecialColumns::Last: //Last corresponds to columns from the input table row + return InputRow.GetElement(ColumnOrder[index].second); + } +} + +TMeasureInputDataValue::TMeasureInputDataValue( + TMemoryUsageInfo* memInfo, + const NUdf::TUnboxedValue& inputData, + const TMeasureInputColumnOrder& columnOrder, + const NUdf::TUnboxedValue& matchedVars, + const TUnboxedValueVector& varNames, + ui64 matchNumber) +: TComputationValue<TMeasureInputDataValue>(memInfo) +, InputData(inputData) +, ColumnOrder(columnOrder) +, MatchedVars(matchedVars) +, VarNames(varNames) +, MatchNumber(matchNumber) +{} + +bool TMeasureInputDataValue::HasFastListLength() const { + return true; +} + +ui64 TMeasureInputDataValue::GetListLength() const { + return GetDictLength(); +} + +ui64 TMeasureInputDataValue::GetEstimatedListLength() const { + return GetListLength(); +} + +NUdf::TUnboxedValue TMeasureInputDataValue::GetListIterator() const { + return GetPayloadsIterator(); +} + +bool TMeasureInputDataValue::HasListItems() const { + return HasDictItems(); +} + +NUdf::IBoxedValuePtr TMeasureInputDataValue::ToIndexDictImpl(const NUdf::IValueBuilder& builder) const { + Y_UNUSED(builder); + return const_cast<TMeasureInputDataValue*>(this); +} + +ui64 TMeasureInputDataValue::GetDictLength() const { + return InputData.GetDictLength(); +} + +NUdf::TUnboxedValue TMeasureInputDataValue::GetDictIterator() const { + return InputData.GetDictIterator(); +} + +NUdf::TUnboxedValue TMeasureInputDataValue::GetKeysIterator() const { + return InputData.GetKeysIterator(); +} + +NUdf::TUnboxedValue TMeasureInputDataValue::GetPayloadsIterator() const { + return InputData.GetPayloadsIterator(); +} + +bool TMeasureInputDataValue::Contains(const NUdf::TUnboxedValuePod& key) const { + return InputData.Contains(key); +} + +NUdf::TUnboxedValue TMeasureInputDataValue::Lookup(const NUdf::TUnboxedValuePod& key) const { + return NUdf::TUnboxedValuePod{new TRowForMeasureValue( + GetMemInfo(), + InputData.Lookup(key), + key.Get<ui64>(), + ColumnOrder, + MatchedVars, + VarNames, + MatchNumber + )}; +} + +bool TMeasureInputDataValue::HasDictItems() const { + return InputData.HasDictItems(); +} + +} // namespace NKikimr::NMiniKQL::NMatchRecognize diff --git a/yql/essentials/minikql/comp_nodes/mkql_match_recognize_measure_arg.h b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_measure_arg.h index e738bbe9fb..db143e9b30 100644 --- a/yql/essentials/minikql/comp_nodes/mkql_match_recognize_measure_arg.h +++ b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_measure_arg.h @@ -1,67 +1,27 @@ #pragma once -#include "mkql_match_recognize_matched_vars.h" -#include <yql/essentials/minikql/computation/mkql_computation_node_impl.h> #include <yql/essentials/core/sql_types/match_recognize.h> +#include <yql/essentials/minikql/computation/mkql_computation_node_holders.h> +#include <yql/essentials/minikql/computation/mkql_computation_node_impl.h> #include <yql/essentials/minikql/mkql_string_util.h> namespace NKikimr::NMiniKQL::NMatchRecognize { -using NYql::NMatchRecognize::EMeasureInputDataSpecialColumns; - -using TMeasureInputColumnOrder = std::vector<std::pair<EMeasureInputDataSpecialColumns, size_t>, TMKQLAllocator<std::pair<EMeasureInputDataSpecialColumns, size_t>>>; +using TMeasureInputColumnOrder = TMKQLVector<std::pair<NYql::NMatchRecognize::EMeasureInputDataSpecialColumns, size_t>>; -//Input row augmented with lightweight special columns for calculating MEASURE lambdas -class TRowForMeasureValue: public TComputationValue<TRowForMeasureValue> -{ +class TRowForMeasureValue final : public TComputationValue<TRowForMeasureValue> { public: TRowForMeasureValue( - TMemoryUsageInfo* memInfo, - NUdf::TUnboxedValue inputRow, - ui64 rowIndex, - const TMeasureInputColumnOrder& columnOrder, - const NUdf::TUnboxedValue& matchedVars, - const TUnboxedValueVector& varNames, - ui64 matchNumber - ) - : TComputationValue<TRowForMeasureValue>(memInfo) - , InputRow(inputRow) - , RowIndex(rowIndex) - , ColumnOrder(columnOrder) - , MatchedVars(matchedVars) - , VarNames(varNames) - , MatchNumber(matchNumber) - {} + TMemoryUsageInfo* memInfo, + NUdf::TUnboxedValue inputRow, + ui64 rowIndex, + const TMeasureInputColumnOrder& columnOrder, + const NUdf::TUnboxedValue& matchedVars, + const TUnboxedValueVector& varNames, + ui64 matchNumber); + + NUdf::TUnboxedValue GetElement(ui32 index) const final; - NUdf::TUnboxedValue GetElement(ui32 index) const override { - switch(ColumnOrder[index].first) { - case EMeasureInputDataSpecialColumns::Classifier: { - auto varIterator = MatchedVars.GetListIterator(); - MKQL_ENSURE(varIterator, "Internal logic error"); - NUdf::TUnboxedValue var; - size_t varIndex = 0; - while(varIterator.Next(var)) { - auto rangeIterator = var.GetListIterator(); - MKQL_ENSURE(varIterator, "Internal logic error"); - NUdf::TUnboxedValue range; - while(rangeIterator.Next(range)) { - const auto from = range.GetElement(0).Get<ui64>(); - const auto to = range.GetElement(1).Get<ui64>(); - if (RowIndex >= from and RowIndex <= to) { - return VarNames[varIndex]; - } - } - ++varIndex; - } - MKQL_ENSURE(MatchedVars.GetListLength() == varIndex, "Internal logic error"); - return MakeString(""); - } - case EMeasureInputDataSpecialColumns::MatchNumber: - return NUdf::TUnboxedValuePod(MatchNumber); - case EMeasureInputDataSpecialColumns::Last: //Last corresponds to columns from the input table row - return InputRow.GetElement(ColumnOrder[index].second); - } - } private: const NUdf::TUnboxedValue InputRow; const ui64 RowIndex; @@ -71,51 +31,32 @@ private: ui64 MatchNumber; }; -class TMeasureInputDataValue: public TComputationValue<TMeasureInputDataValue> { - using Base = TComputationValue<TMeasureInputDataValue>; +class TMeasureInputDataValue final : public TComputationValue<TMeasureInputDataValue> { public: - TMeasureInputDataValue(TMemoryUsageInfo* memInfo, - const NUdf::TUnboxedValue& inputData, - const TMeasureInputColumnOrder& columnOrder, - const NUdf::TUnboxedValue& matchedVars, - const TUnboxedValueVector& varNames, - ui64 matchNumber) - : Base(memInfo) - , InputData(inputData) - , ColumnOrder(columnOrder) - , MatchedVars(matchedVars) - , VarNames(varNames) - , MatchNumber(matchNumber) - {} - - bool HasFastListLength() const override { - return InputData.HasFastListLength(); - } + TMeasureInputDataValue( + TMemoryUsageInfo* memInfo, + const NUdf::TUnboxedValue& inputData, + const TMeasureInputColumnOrder& columnOrder, + const NUdf::TUnboxedValue& matchedVars, + const TUnboxedValueVector& varNames, + ui64 matchNumber); - ui64 GetListLength() const override { - return InputData.GetListLength(); - } + bool HasFastListLength() const final; + ui64 GetListLength() const final; + ui64 GetEstimatedListLength() const final; + NUdf::TUnboxedValue GetListIterator() const final; + bool HasListItems() const final; - //TODO https://st.yandex-team.ru/YQL-16508 - //NUdf::TUnboxedValue GetListIterator() const override; + NUdf::IBoxedValuePtr ToIndexDictImpl(const NUdf::IValueBuilder& builder) const final; - NUdf::IBoxedValuePtr ToIndexDictImpl(const NUdf::IValueBuilder& builder) const override { - Y_UNUSED(builder); - return const_cast<TMeasureInputDataValue*>(this); - } + ui64 GetDictLength() const final; + NUdf::TUnboxedValue GetDictIterator() const final; + NUdf::TUnboxedValue GetKeysIterator() const final; + NUdf::TUnboxedValue GetPayloadsIterator() const final; + bool Contains(const NUdf::TUnboxedValuePod& key) const final; + NUdf::TUnboxedValue Lookup(const NUdf::TUnboxedValuePod& key) const final; + bool HasDictItems() const final; - NUdf::TUnboxedValue Lookup(const NUdf::TUnboxedValuePod& key) const override { - auto inputRow = InputData.Lookup(key); - return NUdf::TUnboxedValuePod{new TRowForMeasureValue( - GetMemInfo(), - inputRow, - key.Get<ui64>(), - ColumnOrder, - MatchedVars, - VarNames, - MatchNumber - )}; - } private: const NUdf::TUnboxedValue InputData; const TMeasureInputColumnOrder& ColumnOrder; @@ -124,5 +65,4 @@ private: const ui64 MatchNumber; }; -}//namespace NKikimr::NMiniKQL::NMatchRecognize - +} // namespace NKikimr::NMiniKQL::NMatchRecognize diff --git a/yql/essentials/minikql/comp_nodes/mkql_match_recognize_rows_formatter.cpp b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_rows_formatter.cpp index 74b28b5ab9..9178a9fbb2 100644 --- a/yql/essentials/minikql/comp_nodes/mkql_match_recognize_rows_formatter.cpp +++ b/yql/essentials/minikql/comp_nodes/mkql_match_recognize_rows_formatter.cpp @@ -63,6 +63,9 @@ public: const TSparseList& rows, const NUdf::TUnboxedValue& partitionKey, const TNfaTransitionGraph& graph) override { + if (Max<size_t>() == CurrentRowIndex_) { + return NUdf::TUnboxedValue{}; + } return GetMatchRow(ctx, rows, partitionKey, graph); } @@ -93,6 +96,8 @@ private: if (auto iter = ToIndexToMatchRangeLookup_.lower_bound(CurrentRowIndex_); iter == ToIndexToMatchRangeLookup_.end()) { MKQL_ENSURE(false, "Internal logic error"); + } else if (CurrentRowIndex_ < iter->second.From()) { + ++CurrentRowIndex_; } else if (auto transition = std::get_if<TMatchedVarTransition>(&graph.Transitions.at(iter->second.NfaIndex())); !transition) { MKQL_ENSURE(false, "Internal logic error"); @@ -106,9 +111,10 @@ private: return NUdf::TUnboxedValue{}; } const auto result = DoGetMatchRow(ctx, rows, partitionKey, graph); - ++CurrentRowIndex_; if (CurrentRowIndex_ == Match_.EndIndex) { Clear(); + } else { + ++CurrentRowIndex_; } return result; } diff --git a/yql/essentials/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp b/yql/essentials/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp index 7578cc2ebe..d3be80cfe6 100644 --- a/yql/essentials/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp +++ b/yql/essentials/minikql/comp_nodes/ut/mkql_match_recognize_list_ut.cpp @@ -5,73 +5,49 @@ namespace NKikimr::NMiniKQL::NMatchRecognize { -template<class L> -void CommonForSimpleAndSparse(const THolderFactory& holderFactory) { - using TList = L; - using TRange = typename L::TRange; - TList list; - TRange r; - for (ui64 i = 0; i != 10; ++i) { - r = list.Append(NUdf::TUnboxedValuePod{i}); - UNIT_ASSERT_VALUES_EQUAL(1, r.Size()); - NUdf::TUnboxedValue v = list.Get(i); - UNIT_ASSERT_VALUES_EQUAL(i, v.Get<ui64>()); - } - UNIT_ASSERT_VALUES_EQUAL(10, list.Size()); - { - auto r2 = list.Append(NUdf::TUnboxedValuePod{10}); - Y_UNUSED(r2); - r.Extend(); - } - UNIT_ASSERT_VALUES_EQUAL(11, list.Size()); - { - const NUdf::TUnboxedValue& v = list.Get(10); - UNIT_ASSERT_VALUES_EQUAL(10, v.Get<ui64>()); - } - //Test access via value - const NUdf::TUnboxedValue& listValue = holderFactory.Create<TListValue<L>>(list); - UNIT_ASSERT(listValue); - UNIT_ASSERT(listValue.HasValue()); - UNIT_ASSERT(listValue.HasListItems()); - UNIT_ASSERT(listValue.HasFastListLength()); - UNIT_ASSERT_VALUES_EQUAL(11, listValue.GetListLength()); - TDefaultValueBuilder valueBuilder(holderFactory); - auto listValueAsDict = NUdf::TBoxedValueAccessor::ToIndexDictImpl(*listValue.AsBoxed(), TDefaultValueBuilder(holderFactory)); - { - const NUdf::TUnboxedValue &v = NUdf::TBoxedValueAccessor::Lookup(*listValueAsDict, NUdf::TUnboxedValuePod{9}); - UNIT_ASSERT_VALUES_EQUAL(9, v.Get<ui64>()); - } - { - const NUdf::TUnboxedValue &v = NUdf::TBoxedValueAccessor::Lookup(*listValueAsDict, NUdf::TUnboxedValuePod{10}); - UNIT_ASSERT_VALUES_EQUAL(10, v.Get<ui64>()); - } -} - Y_UNIT_TEST_SUITE(MatchRecognizeList) { TMemoryUsageInfo memUsage("MatchRecognizeListTest"); - Y_UNIT_TEST(SimpleListCommon) { - TScopedAlloc alloc(__LOCATION__); - THolderFactory holderFactory(alloc.Ref(), memUsage); - CommonForSimpleAndSparse<TSimpleList>(holderFactory); - } Y_UNIT_TEST(SparseListCommon) { TScopedAlloc alloc(__LOCATION__); THolderFactory holderFactory(alloc.Ref(), memUsage); - CommonForSimpleAndSparse<TSparseList>(holderFactory); - } - Y_UNIT_TEST(SimpleListSpecific) { - TScopedAlloc alloc(__LOCATION__); - THolderFactory holderFactory(alloc.Ref(), memUsage); - TSimpleList list; - for (ui64 i = 0; i != 10; ++i) { - list.Append(NUdf::TUnboxedValuePod{i}); - } - //All added items are accessible regardless of held ranges(locks) + TSparseList list; + TSparseList::TRange r; for (ui64 i = 0; i != 10; ++i) { + r = list.Append(NUdf::TUnboxedValuePod{i}); + UNIT_ASSERT_VALUES_EQUAL(1, r.Size()); NUdf::TUnboxedValue v = list.Get(i); UNIT_ASSERT_VALUES_EQUAL(i, v.Get<ui64>()); } + UNIT_ASSERT_VALUES_EQUAL(10, list.LastRowIndex()); + { + auto r2 = list.Append(NUdf::TUnboxedValuePod{10}); + Y_UNUSED(r2); + r.Extend(); + } + UNIT_ASSERT_VALUES_EQUAL(11, list.LastRowIndex()); + { + const NUdf::TUnboxedValue& v = list.Get(10); + UNIT_ASSERT_VALUES_EQUAL(10, v.Get<ui64>()); + } + //Test access via value + const NUdf::TUnboxedValue& listValue = holderFactory.Create<TListValue>(list); + UNIT_ASSERT(listValue); + UNIT_ASSERT(listValue.HasValue()); + UNIT_ASSERT(listValue.HasListItems()); + UNIT_ASSERT(listValue.HasFastListLength()); + UNIT_ASSERT_VALUES_EQUAL(2, listValue.GetListLength()); + TDefaultValueBuilder valueBuilder(holderFactory); + auto listValueAsDict = NUdf::TBoxedValueAccessor::ToIndexDictImpl(*listValue.AsBoxed(), TDefaultValueBuilder(holderFactory)); + { + const NUdf::TUnboxedValue &v = NUdf::TBoxedValueAccessor::Lookup(*listValueAsDict, NUdf::TUnboxedValuePod{9}); + UNIT_ASSERT_VALUES_EQUAL(9, v.Get<ui64>()); + } + { + const NUdf::TUnboxedValue &v = NUdf::TBoxedValueAccessor::Lookup(*listValueAsDict, NUdf::TUnboxedValuePod{10}); + UNIT_ASSERT_VALUES_EQUAL(10, v.Get<ui64>()); + } } + Y_UNIT_TEST(SparseListSpecific) { TScopedAlloc alloc(__LOCATION__); THolderFactory holderFactory(alloc.Ref(), memUsage); @@ -81,7 +57,7 @@ Y_UNIT_TEST_SUITE(MatchRecognizeList) { list.Append(NUdf::TUnboxedValuePod{i}); } //Check no one is stored - UNIT_ASSERT_VALUES_EQUAL(0, list.Filled()); + UNIT_ASSERT_VALUES_EQUAL(0, list.Size()); for (ui64 i = 0; i != 10; ++i) { NUdf::TUnboxedValue v = list.Get(i); UNIT_ASSERT(!v); @@ -92,7 +68,7 @@ Y_UNIT_TEST_SUITE(MatchRecognizeList) { r = list.Append(NUdf::TUnboxedValuePod{i}); } //Check that only the last is stored - UNIT_ASSERT_VALUES_EQUAL(1, list.Filled()); + UNIT_ASSERT_VALUES_EQUAL(1, list.Size()); for (ui64 i = 0; i != 19; ++i) { NUdf::TUnboxedValue v = list.Get(i); UNIT_ASSERT(!v); @@ -106,26 +82,26 @@ Y_UNIT_TEST_SUITE(MatchRecognizeList) { TSparseList::TRange copiedRange{r}; TSparseList::TRange assignedRange{r}; assignedRange = copiedRange; - UNIT_ASSERT_VALUES_EQUAL(1, list.Filled()); + UNIT_ASSERT_VALUES_EQUAL(1, list.Size()); { NUdf::TUnboxedValue v = list.Get(19); UNIT_ASSERT_VALUES_EQUAL(19, v.Get<ui64>()); } r.Release(); - UNIT_ASSERT_VALUES_EQUAL(1, list.Filled()); + UNIT_ASSERT_VALUES_EQUAL(1, list.Size()); { NUdf::TUnboxedValue v = list.Get(19); UNIT_ASSERT_VALUES_EQUAL(19, v.Get<ui64>()); } - UNIT_ASSERT_VALUES_EQUAL(1, list.Filled()); + UNIT_ASSERT_VALUES_EQUAL(1, list.Size()); copiedRange.Release(); - UNIT_ASSERT_VALUES_EQUAL(1, list.Filled()); + UNIT_ASSERT_VALUES_EQUAL(1, list.Size()); { NUdf::TUnboxedValue v = list.Get(19); UNIT_ASSERT_VALUES_EQUAL(19, v.Get<ui64>()); } assignedRange.Release(); - UNIT_ASSERT_VALUES_EQUAL(0, list.Filled()); + UNIT_ASSERT_VALUES_EQUAL(0, list.Size()); { NUdf::TUnboxedValue v = list.Get(19); UNIT_ASSERT(!v); @@ -133,4 +109,4 @@ Y_UNIT_TEST_SUITE(MatchRecognizeList) { } } -}//namespace NKikimr::NMiniKQL::TMatchRecognize +} // namespace NKikimr::NMiniKQL::NMatchRecognize diff --git a/yql/essentials/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp b/yql/essentials/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp index b744e3d53b..368dedfa01 100644 --- a/yql/essentials/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp +++ b/yql/essentials/minikql/comp_nodes/ut/mkql_match_recognize_matched_vars_ut.cpp @@ -4,6 +4,105 @@ namespace NKikimr::NMiniKQL::NMatchRecognize { +class TSimpleList { +public: + using iterator = TUnboxedValueVector::const_iterator; + + [[nodiscard]] iterator Begin() const noexcept { + return Rows.begin(); + } + + [[nodiscard]] iterator End() const noexcept { + return Rows.end(); + } + + [[nodiscard]] size_t Size() const noexcept { + return Rows.size(); + } + + [[nodiscard]] bool Empty() const noexcept { + return Rows.empty(); + } + + [[nodiscard]] bool Contains(size_t i) const noexcept { + return i < Size(); + } + + [[nodiscard]] NUdf::TUnboxedValue Get(size_t i) const { + return Rows.at(i); + } + + ///Range that includes starting and ending points + ///Can not be empty + class TRange { + public: + TRange() + : FromIndex(Max()) + , ToIndex(Max()) + , NfaIndex_(Max()) + { + } + + explicit TRange(ui64 index) + : FromIndex(index) + , ToIndex(index) + , NfaIndex_(Max()) + { + } + + TRange(ui64 from, ui64 to) + : FromIndex(from) + , ToIndex(to) + , NfaIndex_(Max()) + { + MKQL_ENSURE(FromIndex <= ToIndex, "Internal logic error"); + } + + bool IsValid() const { + return FromIndex != Max<size_t>() && ToIndex != Max<size_t>(); + } + + size_t From() const { + MKQL_ENSURE(IsValid(), "Internal logic error"); + return FromIndex; + } + + size_t To() const { + MKQL_ENSURE(IsValid(), "Internal logic error"); + return ToIndex; + } + + [[nodiscard]] size_t NfaIndex() const { + MKQL_ENSURE(IsValid(), "Internal logic error"); + return NfaIndex_; + } + + size_t Size() const { + MKQL_ENSURE(IsValid(), "Internal logic error"); + return ToIndex - FromIndex + 1; + } + + void Extend() { + MKQL_ENSURE(IsValid(), "Internal logic error"); + ++ToIndex; + } + + private: + size_t FromIndex; + size_t ToIndex; + size_t NfaIndex_; + }; + + TRange Append(NUdf::TUnboxedValue&& value) { + TRange result(Rows.size()); + Rows.push_back(std::move(value)); + return result; + } + +private: + TUnboxedValueVector Rows; +}; + Y_UNIT_TEST_SUITE(MatchRecognizeMatchedVarExtend) { using TRange = TSimpleList::TRange; using TMatchedVar = TMatchedVar<TRange>; @@ -230,4 +329,4 @@ Y_UNIT_TEST_SUITE(MatchRecognizeMatchedVarsToValueByRef) { } } } -}//namespace NKikimr::NMiniKQL::TMatchRecognize +} // namespace NKikimr::NMiniKQL::NMatchRecognize diff --git a/yql/essentials/minikql/comp_nodes/ya.make.inc b/yql/essentials/minikql/comp_nodes/ya.make.inc index fbf5b04108..29656cf128 100644 --- a/yql/essentials/minikql/comp_nodes/ya.make.inc +++ b/yql/essentials/minikql/comp_nodes/ya.make.inc @@ -80,6 +80,8 @@ SET(ORIG_SOURCES mkql_mapnext.cpp mkql_map_join.cpp mkql_match_recognize.cpp + mkql_match_recognize_list.cpp + mkql_match_recognize_measure_arg.cpp mkql_match_recognize_rows_formatter.cpp mkql_multihopping.cpp mkql_multimap.cpp diff --git a/yql/essentials/minikql/computation/mkql_computation_node_holders.h b/yql/essentials/minikql/computation/mkql_computation_node_holders.h index f2bb09006d..23219cc5aa 100644 --- a/yql/essentials/minikql/computation/mkql_computation_node_holders.h +++ b/yql/essentials/minikql/computation/mkql_computation_node_holders.h @@ -28,6 +28,11 @@ class TMemoryUsageInfo; const ui32 CodegenArraysFallbackLimit = 1000u; +template <typename Type, EMemorySubPool MemoryPool = EMemorySubPool::Default> +using TMKQLVector = std::vector<Type, TMKQLAllocator<Type, MemoryPool>>; +template<typename Key, typename T, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>, EMemorySubPool MemoryPool = EMemorySubPool::Default> +using TMKQLHashMap = std::unordered_map<Key, T, Hash, KeyEqual, TMKQLAllocator<std::pair<const Key, T>, MemoryPool>>; + using TKeyTypes = std::vector<std::pair<NUdf::EDataSlot, bool>>; using TUnboxedValueVector = std::vector<NUdf::TUnboxedValue, TMKQLAllocator<NUdf::TUnboxedValue>>; using TTemporaryUnboxedValueVector = std::vector<NUdf::TUnboxedValue, TMKQLAllocator<NUdf::TUnboxedValue, EMemorySubPool::Temporary>>; @@ -1074,7 +1079,7 @@ public: //unavailable getters may be eliminated at compile time, but it'd make c private: TKeyTypes KeyTypes; bool IsTuple = false; - + //unsused pointers may be eliminated at compile time, but it'd make code much less readable NUdf::IEquate::TPtr Equate; NUdf::IHash::TPtr Hash; diff --git a/yql/essentials/mount/lib/yql/aggregate.yqls b/yql/essentials/mount/lib/yql/aggregate.yqls index b2450bf239..e687adbf80 100755 --- a/yql/essentials/mount/lib/yql/aggregate.yqls +++ b/yql/essentials/mount/lib/yql/aggregate.yqls @@ -34,6 +34,8 @@ (let bit_and_traits_factory_raw (lambda '(list_type) (Apply simple_traits_factory list_type (lambda '(one two) (BitAnd one two))))) (let bit_or_traits_factory_raw (lambda '(list_type) (Apply simple_traits_factory list_type (lambda '(one two) (BitOr one two))))) (let bit_xor_traits_factory_raw (lambda '(list_type) (Apply simple_traits_factory list_type (lambda '(one two) (BitXor one two))))) +(let first_traits_factory_raw (lambda '(list_type) (Apply simple_traits_factory list_type (lambda '(lhs rhs) (Coalesce rhs lhs))))) +(let last_traits_factory_raw (lambda '(list_type) (Apply simple_traits_factory list_type (lambda '(lhs rhs) (Coalesce lhs rhs))))) # list_type:type # support optional values @@ -592,7 +594,7 @@ (let max_traits_factory_opt (lambda '(list_type) (Apply simple_traits_factory list_type (lambda '(one two) (AggrMax one two))))) (let sum_traits_factory_opt (lambda '(list_type) (block '( (let item_type (Apply remove_optional_type (ListItemType list_type))) - (return (Apply simple_traits_factory_map list_type + (return (Apply simple_traits_factory_map list_type (lambda '(one two) (AggrAdd one two)) (lambda '(value) (MatchType item_type 'Interval (lambda '() (Apply convert_interval_to_decimal value)) (lambda '() (WidenIntegral value)))) (lambda '(value) (MatchType item_type 'Interval (lambda '() (Apply convert_decimal_to_interval value)) (lambda '() value))) @@ -601,6 +603,8 @@ (let bit_and_traits_factory_opt (lambda '(list_type) (Apply optional_traits_factory list_type bit_and_traits_factory_raw))) (let bit_or_traits_factory_opt (lambda '(list_type) (Apply optional_traits_factory list_type bit_or_traits_factory_raw))) (let bit_xor_traits_factory_opt (lambda '(list_type) (Apply optional_traits_factory list_type bit_xor_traits_factory_raw))) +(let first_traits_factory_opt (lambda '(list_type) (Apply optional_traits_factory list_type first_traits_factory_raw))) +(let last_traits_factory_opt (lambda '(list_type) (Apply optional_traits_factory list_type last_traits_factory_raw))) (let avg_traits_factory_opt (lambda '(list_type) (Apply optional_traits_factory list_type avg_traits_factory_raw))) (let variance_0_0_traits_factory_opt (lambda '(list_type) (Apply optional_traits_factory list_type variance_0_0_traits_factory_raw))) (let variance_1_0_traits_factory_opt (lambda '(list_type) (Apply optional_traits_factory list_type variance_1_0_traits_factory_raw))) @@ -674,6 +678,8 @@ (let bit_and_traits_factory (lambda '(list_type extractor) (Apply extractor_traits_factory list_type extractor bit_and_traits_factory_opt))) (let bit_or_traits_factory (lambda '(list_type extractor) (Apply extractor_traits_factory list_type extractor bit_or_traits_factory_opt))) (let bit_xor_traits_factory (lambda '(list_type extractor) (Apply extractor_traits_factory list_type extractor bit_xor_traits_factory_opt))) +(let first_traits_factory (lambda '(list_type extractor) (Apply extractor_traits_factory list_type extractor first_traits_factory_raw))) +(let last_traits_factory (lambda '(list_type extractor) (Apply extractor_traits_factory list_type extractor last_traits_factory_raw))) (let and_traits_factory (lambda '(list_type extractor) (Apply extractor_traits_factory list_type (lambda '(v) (SafeCast (Apply extractor v) (DataType 'Bool))) bool_and_traits_factory_opt))) (let or_traits_factory (lambda '(list_type extractor) (Apply extractor_traits_factory list_type (lambda '(v) (SafeCast (Apply extractor v) (DataType 'Bool))) bool_or_traits_factory_opt))) (let xor_traits_factory (lambda '(list_type extractor) (Apply extractor_traits_factory list_type (lambda '(v) (SafeCast (Apply extractor v) (DataType 'Bool))) bool_xor_traits_factory_opt))) @@ -963,6 +969,8 @@ (export bit_and_traits_factory) (export bit_or_traits_factory) (export bit_xor_traits_factory) +(export first_traits_factory) +(export last_traits_factory) (export and_traits_factory) (export or_traits_factory) (export xor_traits_factory) diff --git a/yql/essentials/sql/v1/builtin.cpp b/yql/essentials/sql/v1/builtin.cpp index c9953c6e0d..8e49ba39f2 100644 --- a/yql/essentials/sql/v1/builtin.cpp +++ b/yql/essentials/sql/v1/builtin.cpp @@ -3149,10 +3149,6 @@ struct TBuiltinFuncData { // Hopping intervals time functions {"hopstart", BuildSimpleBuiltinFactoryCallback<THoppingTime<true>>()}, {"hopend", BuildSimpleBuiltinFactoryCallback<THoppingTime<false>>()}, - - //MatchRecognize navigation functions - {"first", BuildNamedBuiltinFactoryCallback<TMatchRecognizeNavigate>("FIRST")}, - {"last", BuildNamedBuiltinFactoryCallback<TMatchRecognizeNavigate>("LAST")}, }; return builtinFuncs; } @@ -3269,6 +3265,10 @@ struct TBuiltinFuncData { {"firstvalueignorenulls", BuildAggrFuncFactoryCallback("FirstValueIgnoreNulls", "first_value_ignore_nulls_traits_factory", {OverWindow})}, {"lastvalueignorenulls", BuildAggrFuncFactoryCallback("LastValueIgnoreNulls", "last_value_ignore_nulls_traits_factory", {OverWindow})}, {"nthvalueignorenulls", BuildAggrFuncFactoryCallback("NthValueIgnoreNulls", "nth_value_ignore_nulls_traits_factory", {OverWindow}, NTH_VALUE)}, + + // MatchRecognize navigation functions + {"first", BuildAggrFuncFactoryCallback("First", "first_traits_factory")}, + {"last", BuildAggrFuncFactoryCallback("Last", "last_traits_factory")}, }; return aggrFuncs; } @@ -3633,7 +3633,17 @@ TNodePtr BuildBuiltinFunc(TContext& ctx, TPosition pos, TString name, const TVec return new TInvalidBuiltin(pos, TStringBuilder() << "Unknown aggregation function: " << *args[0]->GetLiteral("String")); } - return (*aggrCallback).second(pos, args, aggMode, true).Release(); + switch (ctx.GetColumnReferenceState()) { + case EColumnRefState::MatchRecognizeMeasures: + [[fallthrough]]; + case EColumnRefState::MatchRecognizeDefine: + return new TInvalidBuiltin(pos, "Cannot use aggregation factory inside the MATCH_RECOGNIZE context"); + default: + if ("first" == aggNormalizedName || "last" == aggNormalizedName) { + return new TInvalidBuiltin(pos, "Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context"); + } + return (*aggrCallback).second(pos, args, aggMode, true); + } } if (normalizedName == "aggregateby" || normalizedName == "multiaggregateby") { @@ -3651,7 +3661,19 @@ TNodePtr BuildBuiltinFunc(TContext& ctx, TPosition pos, TString name, const TVec auto aggrCallback = aggrFuncs.find(normalizedName); if (aggrCallback != aggrFuncs.end()) { - return (*aggrCallback).second(pos, args, aggMode, false).Release(); + switch (ctx.GetColumnReferenceState()) { + case EColumnRefState::MatchRecognizeMeasures: { + auto result = (*aggrCallback).second(pos, args, aggMode, false); + return BuildMatchRecognizeVarAccess(pos, std::move(result)); + } + case EColumnRefState::MatchRecognizeDefine: + return BuildMatchRecognizeDefineAggregate(ctx.Pos(), normalizedName, args); + default: + if ("first" == normalizedName || "last" == normalizedName) { + return new TInvalidBuiltin(pos, "Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context"); + } + return (*aggrCallback).second(pos, args, aggMode, false); + } } if (aggMode == EAggregateMode::Distinct || aggMode == EAggregateMode::OverWindowDistinct) { return new TInvalidBuiltin(pos, "DISTINCT can only be used in aggregation functions"); diff --git a/yql/essentials/sql/v1/context.h b/yql/essentials/sql/v1/context.h index 85a4739eaf..31185c0d77 100644 --- a/yql/essentials/sql/v1/context.h +++ b/yql/essentials/sql/v1/context.h @@ -85,7 +85,9 @@ namespace NSQLTranslationV1 { Allow, AsStringLiteral, AsPgType, - MatchRecognize, + MatchRecognizeMeasures, + MatchRecognizeDefine, + MatchRecognizeDefineAggregate, }; class TContext { @@ -204,12 +206,36 @@ namespace NSQLTranslationV1 { return TopLevelColumnReferenceState; } - TStringBuf GetMatchRecognizeDefineVar() const { - YQL_ENSURE(EColumnRefState::MatchRecognize == ColumnReferenceState, - "DefineVar can only be accessed within processing of MATCH_RECOGNIZE lambdas"); + [[nodiscard]] TString GetMatchRecognizeDefineVar() const { + YQL_ENSURE(EColumnRefState::MatchRecognizeMeasures == ColumnReferenceState + || EColumnRefState::MatchRecognizeDefine == ColumnReferenceState + || EColumnRefState::MatchRecognizeDefineAggregate == ColumnReferenceState, + "MATCH_RECOGNIZE Var can only be accessed within processing of MATCH_RECOGNIZE lambdas"); return MatchRecognizeDefineVar; } + TString ExtractMatchRecognizeAggrVar() { + YQL_ENSURE(EColumnRefState::MatchRecognizeMeasures == ColumnReferenceState + || EColumnRefState::MatchRecognizeDefine == ColumnReferenceState + || EColumnRefState::MatchRecognizeDefineAggregate == ColumnReferenceState, + "MATCH_RECOGNIZE Var can only be accessed within processing of MATCH_RECOGNIZE lambdas"); + return std::exchange(MatchRecognizeAggrVar, ""); + } + + [[nodiscard]] bool SetMatchRecognizeAggrVar(TString var) { + YQL_ENSURE(EColumnRefState::MatchRecognizeMeasures == ColumnReferenceState + || EColumnRefState::MatchRecognizeDefine == ColumnReferenceState + || EColumnRefState::MatchRecognizeDefineAggregate == ColumnReferenceState, + "MATCH_RECOGNIZE Var can only be accessed within processing of MATCH_RECOGNIZE lambdas"); + if (MatchRecognizeAggrVar.empty()) { + MatchRecognizeAggrVar = std::move(var); + } else if (MatchRecognizeAggrVar != var) { + Error() << "Illegal use of aggregates or navigation operators in MATCH_RECOGNIZE"; + return false; + } + return true; + } + TVector<NSQLTranslation::TSQLHint> PullHintForToken(NYql::TPosition tokenPos); void WarnUnusedHints(); @@ -231,6 +257,7 @@ namespace NSQLTranslationV1 { EColumnRefState ColumnReferenceState = EColumnRefState::Deny; EColumnRefState TopLevelColumnReferenceState = EColumnRefState::Deny; TString MatchRecognizeDefineVar; + TString MatchRecognizeAggrVar; TString NoColumnErrorContext = "in current scope"; TVector<TBlocks*> CurrentBlocks; @@ -347,7 +374,13 @@ namespace NSQLTranslationV1 { } else { Ctx.ColumnReferenceState = state; } - YQL_ENSURE(defineVar.empty() || EColumnRefState::MatchRecognize == state, "Internal logic error"); + YQL_ENSURE( + defineVar.empty() + || EColumnRefState::MatchRecognizeMeasures == state + || EColumnRefState::MatchRecognizeDefine == state + || EColumnRefState::MatchRecognizeDefineAggregate == state, + "Internal logic error" + ); ctx.MatchRecognizeDefineVar = defineVar; } diff --git a/yql/essentials/sql/v1/match_recognize.cpp b/yql/essentials/sql/v1/match_recognize.cpp index 84a20ae273..be2f508e87 100644 --- a/yql/essentials/sql/v1/match_recognize.cpp +++ b/yql/essentials/sql/v1/match_recognize.cpp @@ -2,111 +2,302 @@ #include "source.h" #include "context.h" +#include <util/generic/overloaded.h> + namespace NSQLTranslationV1 { namespace { -const auto VarDataName = "data"; -const auto VarMatchedVarsName = "vars"; -const auto VarLastRowIndexName = "lri"; +constexpr auto VarDataName = "data"; +constexpr auto VarMatchedVarsName = "vars"; +constexpr auto VarLastRowIndexName = "lri"; + +class TMatchRecognizeColumnAccessNode final : public TAstListNode { +public: + TMatchRecognizeColumnAccessNode(TPosition pos, TString var, TString column) + : TAstListNode(pos) + , Var(std::move(var)) + , Column(std::move(column)) { + } + + const TString* GetColumnName() const override { + return std::addressof(Column); + } + + bool DoInit(TContext& ctx, ISource* /* src */) override { + switch (ctx.GetColumnReferenceState()) { + case EColumnRefState::MatchRecognizeMeasures: + if (!ctx.SetMatchRecognizeAggrVar(Var)) { + return false; + } + Add( + "Member", + BuildAtom(Pos, "row"), + Q(Column) + ); + break; + case EColumnRefState::MatchRecognizeDefine: + if (ctx.GetMatchRecognizeDefineVar() != Var) { + ctx.Error() << "Row pattern navigation function is required"; + return false; + } + BuildLookup(VarLastRowIndexName); + break; + case EColumnRefState::MatchRecognizeDefineAggregate: + if (!ctx.SetMatchRecognizeAggrVar(Var)) { + return false; + } + BuildLookup("index"); + break; + default: + Y_ABORT("Unexpected column reference state"); + } + return true; + } + + TNodePtr DoClone() const override { + return new TMatchRecognizeColumnAccessNode(Pos, Var, Column); + } + +private: + void BuildLookup(TString varKeyName) { + Add( + "Member", + Y( + "Lookup", + Y( + "ToIndexDict", + BuildAtom(Pos, VarDataName) + ), + BuildAtom(Pos, std::move(varKeyName)) + ), + Q(Column) + ); + } + +private: + TString Var; + TString Column; +}; + +class TMatchRecognizeDefineAggregate final : public TAstListNode { +public: + TMatchRecognizeDefineAggregate(TPosition pos, TString name, TVector<TNodePtr> args) + : TAstListNode(pos) + , Name(std::move(name)) + , Args(std::move(args)) { + } + + bool DoInit(TContext& ctx, ISource* src) override { + Y_DEBUG_ABORT_UNLESS(ctx.GetColumnReferenceState() == EColumnRefState::MatchRecognizeDefine); + TColumnRefScope scope(ctx, EColumnRefState::MatchRecognizeDefineAggregate, false, ctx.GetMatchRecognizeDefineVar()); + if (Args.size() != 1) { + ctx.Error() << "Exactly one argument is required in MATCH_RECOGNIZE navigation function"; + return false; + } + const auto arg = Args[0]; + if (!arg->Init(ctx, src)) { + return false; + } + + const auto body = [&]() -> TNodePtr { + if ("first" == Name) { + return Y("Member", Y("Head", "item"), Q("From")); + } else if ("last" == Name) { + return Y("Member", Y("Last", "item"), Q("To")); + } else { + ctx.Error() << "Unknown row pattern navigation function: " << Name; + return {}; + } + }(); + if (!body) { + return false; + } + Add("Apply", BuildLambda(Pos, Y("index"), arg), body); + return true; + } -} //namespace { + TNodePtr DoClone() const override { + return new TMatchRecognizeDefineAggregate(Pos, Name, Args); + } -class TMatchRecognize: public TAstListNode { +private: + TString Name; + TVector<TNodePtr> Args; +}; + +class TMatchRecognizeVarAccessNode final : public INode { +public: + TMatchRecognizeVarAccessNode(TPosition pos, TNodePtr arg) + : INode(pos) + , Arg(std::move(arg)) { + } + + [[nodiscard]] const TString& GetVar() const noexcept { + return Var; + } + + TAggregationPtr GetAggregation() const override { + return Arg->GetAggregation(); + } + + bool DoInit(TContext& ctx, ISource* src) override { + if (!Arg->Init(ctx, src)) { + return false; + } + Var = ctx.ExtractMatchRecognizeAggrVar(); + Expr = [&]() -> TNodePtr { + switch (ctx.GetColumnReferenceState()) { + case EColumnRefState::MatchRecognizeMeasures: + return Arg; + case EColumnRefState::MatchRecognizeDefine: + return Y( + "Apply", + BuildLambda(Pos, Y("item"), Arg), + Y( + "Member", + BuildAtom(ctx.Pos(), VarMatchedVarsName), + Q(Var) + ) + ); + default: + Y_ABORT("Unexpected column reference state"); + } + }(); + return Expr->Init(ctx, src); + } + + TNodePtr DoClone() const override { + return new TMatchRecognizeVarAccessNode(Pos, Arg); + } + + TAstNode* Translate(TContext& ctx) const override { + return Expr->Translate(ctx); + } + +private: + TString Var; + TNodePtr Arg; + TNodePtr Expr; +}; + +class TMatchRecognize final : public TAstListNode { public: TMatchRecognize( - TPosition pos, - ISource* source, - const TString& inputTable, - std::pair<TPosition, TVector<TNamedFunction>>&& partitioners, - std::pair<TPosition, TVector<TSortSpecificationPtr>>&& sortSpecs, - std::pair<TPosition, TVector<TNamedFunction>>&& measures, - std::pair<TPosition, NYql::NMatchRecognize::ERowsPerMatch>&& rowsPerMatch, - std::pair<TPosition, NYql::NMatchRecognize::TAfterMatchSkipTo>&& skipTo, - std::pair<TPosition, NYql::NMatchRecognize::TRowPattern>&& pattern, - std::pair<TPosition, TNodePtr>&& subset, - std::pair<TPosition, TVector<TNamedFunction>>&& definitions - ): TAstListNode(pos, {BuildAtom(pos, "block")}) - { - Add(BuildBlockStatements( - pos, - source, - inputTable, - std::move(partitioners), - std::move(sortSpecs), - std::move(measures), - std::move(rowsPerMatch), - std::move(skipTo), - std::move(pattern), - std::move(subset), - std::move(definitions) - )); + TPosition pos, + TString label, + TNodePtr partitionKeySelector, + TNodePtr partitionColumns, + TVector<TSortSpecificationPtr> sortSpecs, + TVector<TNamedFunction> measures, + TNodePtr rowsPerMatch, + TNodePtr skipTo, + TNodePtr pattern, + TNodePtr patternVars, + TNodePtr subset, + TVector<TNamedFunction> definitions) + : TAstListNode(pos) + , Label(std::move(label)) + , PartitionKeySelector(std::move(partitionKeySelector)) + , PartitionColumns(std::move(partitionColumns)) + , SortSpecs(std::move(sortSpecs)) + , Measures(std::move(measures)) + , RowsPerMatch(std::move(rowsPerMatch)) + , SkipTo(std::move(skipTo)) + , Pattern(std::move(pattern)) + , PatternVars(std::move(patternVars)) + , Subset(std::move(subset)) + , Definitions(std::move(definitions)) { } + private: - TMatchRecognize(const TMatchRecognize& other) - : TAstListNode(other.Pos) - { - Nodes = CloneContainer(other.Nodes); - } - - TNodePtr BuildBlockStatements( - TPosition pos, - ISource* source, - const TString& inputTable, - std::pair<TPosition, TVector<TNamedFunction>>&& partitioners, - std::pair<TPosition, TVector<TSortSpecificationPtr>>&& sortSpecs, - std::pair<TPosition, TVector<TNamedFunction>>&& measures, - std::pair<TPosition, NYql::NMatchRecognize::ERowsPerMatch>&& rowsPerMatch, - std::pair<TPosition, NYql::NMatchRecognize::TAfterMatchSkipTo>&& skipTo, - std::pair<TPosition, NYql::NMatchRecognize::TRowPattern>&& pattern, - std::pair<TPosition, TNodePtr>&& subset, - std::pair<TPosition, TVector<TNamedFunction>>&& definitions - ) { - Y_UNUSED(pos); - - auto inputRowType = Y("ListItemType",Y("TypeOf", inputTable)); - - auto patternNode = Pattern(pattern.first, pattern.second); - - auto partitionColumns = Y(); - for (const auto& p: partitioners.second){ - partitionColumns->Add(BuildQuotedAtom(p.callable->GetPos(), p.name)); - } - partitionColumns = Q(partitionColumns); - auto partitionKeySelector = Y(); - for (const auto& p: partitioners.second){ - partitionKeySelector->Add(p.callable); - } - partitionKeySelector = BuildLambda(partitioners.first, Y("row"), Q(partitionKeySelector)); + bool DoInit(TContext& ctx, ISource* src) override { + auto inputRowType = Y("ListItemType", Y("TypeOf", Label)); + + if (!PartitionKeySelector->Init(ctx, src)) { + return false; + } + if (!PartitionColumns->Init(ctx, src)) { + return false; + } + + const auto sortTraits = SortSpecs.empty() ? Y("Void") : src->BuildSortSpec(SortSpecs, Label, true, false); + if (!sortTraits->Init(ctx, src)) { + return false; + } auto measureNames = Y(); - for (const auto& m: measures.second){ - measureNames->Add(BuildQuotedAtom(m.callable->GetPos(), m.name)); + for (const auto& m: Measures) { + measureNames->Add(BuildQuotedAtom(m.Callable->GetPos(), m.Name)); } - TNodePtr measuresNode = Y("MatchRecognizeMeasures", inputRowType, patternNode, Q(measureNames)); - for (const auto& m: measures.second){ - measuresNode->Add(BuildLambda(m.callable->GetPos(), Y(VarDataName, VarMatchedVarsName), m.callable)); + auto measureVars = Y(); + for (const auto& m: Measures) { + TColumnRefScope scope(ctx, EColumnRefState::MatchRecognizeMeasures); + if (!m.Callable->Init(ctx, src)) { + return false; + } + const auto varAccess = dynamic_cast<TMatchRecognizeVarAccessNode*>(m.Callable.Get()); + auto var = varAccess ? varAccess->GetVar() : ""; + measureVars->Add(BuildQuotedAtom(m.Callable->GetPos(), std::move(var))); } - auto defineNames = Y(); - for (const auto& d: definitions.second) { - defineNames->Add(BuildQuotedAtom(d.callable->GetPos(), d.name)); + auto measuresNode = Y("MatchRecognizeMeasuresAggregates", inputRowType, Q(PatternVars), Q(measureNames), Q(measureVars)); + for (const auto& m: Measures) { + auto aggr = m.Callable->GetAggregation(); + if (!aggr) { + // TODO(YQL-16508): support aggregations inside expressions + // ctx.Error(m.Callable->GetPos()) << "Cannot use aggregations inside expression"; + // return false; + measuresNode->Add(m.Callable); + } else { + const auto [traits, result] = aggr->AggregationTraits(Y("TypeOf", Label), false, false, false, ctx); + if (!result) { + return false; + } + measuresNode->Add(traits); + } + } + + if (!RowsPerMatch->Init(ctx, src)) { + return false; } - TNodePtr defineNode = Y("MatchRecognizeDefines", inputRowType, patternNode, Q(defineNames)); - for (const auto& d: definitions.second) { - defineNode->Add(BuildLambda(d.callable->GetPos(), Y(VarDataName, VarMatchedVarsName, VarLastRowIndexName), d.callable)); + if (!SkipTo->Init(ctx, src)) { + return false; + } + + if (!Pattern->Init(ctx, src)) { + return false; + } + + if (!PatternVars->Init(ctx, src)) { + return false; + } + + auto defineNames = Y(); + for (const auto& d: Definitions) { + defineNames->Add(BuildQuotedAtom(d.Callable->GetPos(), d.Name)); + } + auto defineNode = Y("MatchRecognizeDefines", inputRowType, Q(PatternVars), Q(defineNames)); + for (const auto& d: Definitions) { + TColumnRefScope scope(ctx, EColumnRefState::MatchRecognizeDefine, true, d.Name); + if (!d.Callable->Init(ctx, src)) { + return false; + } + defineNode->Add(BuildLambda(d.Callable->GetPos(), Y(VarDataName, VarMatchedVarsName, VarLastRowIndexName), d.Callable)); } - return Q(Y( - Y("let", "input", inputTable), - Y("let", "partitionKeySelector", partitionKeySelector), - Y("let", "partitionColumns", partitionColumns), - Y("let", "sortTraits", sortSpecs.second.empty()? Y("Void") : source->BuildSortSpec(sortSpecs.second, inputTable, true, false)), + Add( + "block", + Q(Y( + Y("let", "input", Label), + Y("let", "partitionKeySelector", PartitionKeySelector), + Y("let", "partitionColumns", PartitionColumns), + Y("let", "sortTraits", sortTraits), Y("let", "measures", measuresNode), - Y("let", "rowsPerMatch", BuildQuotedAtom(rowsPerMatch.first, "RowsPerMatch_" + ToString(rowsPerMatch.second))), - Y("let", "skipTo", BuildTuple(skipTo.first, {Q("AfterMatchSkip_" + ToString(skipTo.second.To)), Q(ToString(skipTo.second.Var))})), - Y("let", "pattern", patternNode), - Y("let", "subset", subset.second ? subset.second : Q("")), + Y("let", "rowsPerMatch", RowsPerMatch), + Y("let", "skipTo", SkipTo), + Y("let", "pattern", Pattern), + Y("let", "subset", Subset ? Subset : Q("")), Y("let", "define", defineNode), Y("let", "res", Y("MatchRecognize", "input", @@ -122,133 +313,76 @@ private: ) )), Y("return", "res") - )); - } - - TPtr PatternFactor(const TPosition& pos, const NYql::NMatchRecognize::TRowPatternFactor& factor) { - return BuildTuple(pos, { - factor.Primary.index() == 0 ? - BuildQuotedAtom(pos, std::get<0>(factor.Primary)) : - Pattern(pos, std::get<1>(factor.Primary)), - BuildQuotedAtom(pos, ToString(factor.QuantityMin)), - BuildQuotedAtom(pos, ToString(factor.QuantityMax)), - BuildQuotedAtom(pos, ToString(factor.Greedy)), - BuildQuotedAtom(pos, ToString(factor.Output)), - BuildQuotedAtom(pos, ToString(factor.Unused)) - }); - } - - - TPtr PatternTerm(const TPosition& pos, const NYql::NMatchRecognize::TRowPatternTerm& term) { - auto factors = Y(); - for (const auto& f: term) - factors->Add(PatternFactor(pos, f)); - return Q(std::move(factors)); + )) + ); + return true; } - TPtr Pattern(const TPosition& pos, const NYql::NMatchRecognize::TRowPattern& pattern) { - TNodePtr patternNode = Y("MatchRecognizePattern"); - for (const auto& t: pattern) { - patternNode->Add(PatternTerm(pos, t)); - } - return patternNode; + TNodePtr DoClone() const override { + return new TMatchRecognize( + Pos, + Label, + PartitionKeySelector, + PartitionColumns, + SortSpecs, + Measures, + RowsPerMatch, + SkipTo, + Pattern, + PatternVars, + Subset, + Definitions + ); } - TPtr DoClone() const final{ - return new TMatchRecognize(*this); - } +private: + TString Label; + TNodePtr PartitionKeySelector; + TNodePtr PartitionColumns; + TVector<TSortSpecificationPtr> SortSpecs; + TVector<TNamedFunction> Measures; + TNodePtr RowsPerMatch; + TNodePtr SkipTo; + TNodePtr Pattern; + TNodePtr PatternVars; + TNodePtr Subset; + TVector<TNamedFunction> Definitions; }; -TNodePtr TMatchRecognizeBuilder::Build(TContext& ctx, TString&& inputTable, ISource* source){ +} // anonymous namespace + +TNodePtr TMatchRecognizeBuilder::Build(TContext& ctx, TString label, ISource* src) { TNodePtr node = new TMatchRecognize( - Pos, - source, - std::move(inputTable), - std::move(Partitioners), - std::move(SortSpecs), - std::move(Measures), - std::move(RowsPerMatch), - std::move(SkipTo), - std::move(Pattern), - std::move(Subset), - std::move(Definitions) + Pos, + std::move(label), + std::move(PartitionKeySelector), + std::move(PartitionColumns), + std::move(SortSpecs), + std::move(Measures), + std::move(RowsPerMatch), + std::move(SkipTo), + std::move(Pattern), + std::move(PatternVars), + std::move(Subset), + std::move(Definitions) ); - if (!node->Init(ctx, source)) - return nullptr; + if (!node->Init(ctx, src)) { + return {}; + } return node; } -namespace { -const auto DefaultNavigatingFunction = "MatchRecognizeDefaultNavigating"; +TNodePtr BuildMatchRecognizeColumnAccess(TPosition pos, TString var, TString column) { + return MakeIntrusive<TMatchRecognizeColumnAccessNode>(pos, std::move(var), std::move(column)); } -bool TMatchRecognizeVarAccessNode::DoInit(TContext& ctx, ISource* src) { - //If referenced var is the var that is currently being defined - //then it's a reference to the last row in a partition - Node = new TMatchRecognizeNavigate(ctx.Pos(), DefaultNavigatingFunction, TVector<TNodePtr>{this->Clone()}); - return Node->Init(ctx, src); +TNodePtr BuildMatchRecognizeDefineAggregate(TPosition pos, TString name, TVector<TNodePtr> args) { + const auto result = MakeIntrusive<TMatchRecognizeDefineAggregate>(pos, std::move(name), std::move(args)); + return BuildMatchRecognizeVarAccess(pos, std::move(result)); } -bool TMatchRecognizeNavigate::DoInit(TContext& ctx, ISource* src) { - Y_UNUSED(src); - if (Args.size() != 1) { - ctx.Error(Pos) << "Exactly one argument is required in MATCH_RECOGNIZE navigation function"; - return false; - } - const auto varColumn = dynamic_cast<TMatchRecognizeVarAccessNode *>(Args[0].Get()); - if (not varColumn) { - ctx.Error(Pos) << "Row pattern navigation operations are applicable to row pattern variable only"; - return false; - } - const auto varData = BuildAtom(ctx.Pos(), VarDataName); - const auto varMatchedVars = BuildAtom(ctx.Pos(), VarMatchedVarsName); - const auto varLastRowIndex = BuildAtom(ctx.Pos(), VarLastRowIndexName); - - const auto matchedRanges = Y("Member", varMatchedVars, Q(varColumn->GetVar())); - TNodePtr navigatedRowIndex; - if (DefaultNavigatingFunction == Name) { - if (not varColumn->IsTheSameVar()) { - ctx.Error(Pos) << "Row pattern navigation function is required"; - return false; - } - navigatedRowIndex = varLastRowIndex; - } - else if ("PREV" == Name) { - if (not varColumn->IsTheSameVar()) { - ctx.Error(Pos) << "PREV relative to matched vars is not implemented yet"; - return false; - } - navigatedRowIndex = Y( - "-", - varLastRowIndex, - Y("Uint64", Q("1")) - ); - } else if ("FIRST" == Name) { - navigatedRowIndex = Y( - "Member", - Y("Head", matchedRanges), - Q("From") - ); - } else if ("LAST" == Name) { - navigatedRowIndex = Y( - "Member", - Y("Last", matchedRanges), - Q("To") - ); - } else { - ctx.Error(Pos) << "Internal logic error"; - return false; - } - Add("Member"); - Add( - Y( - "Lookup", - Y("ToIndexDict", varData), - navigatedRowIndex - ) - ), - Add(Q(varColumn->GetColumn())); - return true; +TNodePtr BuildMatchRecognizeVarAccess(TPosition pos, TNodePtr extractor) { + return MakeIntrusive<TMatchRecognizeVarAccessNode>(pos, std::move(extractor)); } } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/match_recognize.h b/yql/essentials/sql/v1/match_recognize.h index 4b0e98b9b7..818bd6e600 100644 --- a/yql/essentials/sql/v1/match_recognize.h +++ b/yql/essentials/sql/v1/match_recognize.h @@ -1,125 +1,64 @@ #pragma once + #include "node.h" + #include <yql/essentials/core/sql_types/match_recognize.h> #include <util/generic/ptr.h> namespace NSQLTranslationV1 { struct TNamedFunction { - TNodePtr callable; //Callable with some free args - TString name; + TNodePtr Callable; + TString Name; }; class TMatchRecognizeBuilder: public TSimpleRefCount<TMatchRecognizeBuilder> { public: TMatchRecognizeBuilder( - TPosition clausePos, - std::pair<TPosition, TVector<TNamedFunction>>&& partitioners, - std::pair<TPosition, TVector<TSortSpecificationPtr>>&& sortSpecs, - std::pair<TPosition, TVector<TNamedFunction>>&& measures, - std::pair<TPosition, NYql::NMatchRecognize::ERowsPerMatch>&& rowsPerMatch, - std::pair<TPosition, NYql::NMatchRecognize::TAfterMatchSkipTo>&& skipTo, - std::pair<TPosition, NYql::NMatchRecognize::TRowPattern>&& pattern, - std::pair<TPosition, TNodePtr>&& subset, - std::pair<TPosition, TVector<TNamedFunction>>&& definitions - ) - : Pos(clausePos) - , Partitioners(std::move(partitioners)) - , SortSpecs(std::move(sortSpecs)) - , Measures(std::move(measures)) - , RowsPerMatch(std::move(rowsPerMatch)) - , SkipTo(std::move(skipTo)) - , Pattern(std::move(pattern)) - , Subset(std::move(subset)) - , Definitions(definitions) - + TPosition pos, + TNodePtr partitionKeySelector, + TNodePtr partitionColumns, + TVector<TSortSpecificationPtr> sortSpecs, + TVector<TNamedFunction> measures, + TNodePtr rowsPerMatch, + TNodePtr skipTo, + TNodePtr pattern, + TNodePtr patternVars, + TNodePtr subset, + TVector<TNamedFunction> definitions) + : Pos(pos) + , PartitionKeySelector(std::move(partitionKeySelector)) + , PartitionColumns(std::move(partitionColumns)) + , SortSpecs(std::move(sortSpecs)) + , Measures(std::move(measures)) + , RowsPerMatch(std::move(rowsPerMatch)) + , SkipTo(std::move(skipTo)) + , Pattern(std::move(pattern)) + , PatternVars(std::move(patternVars)) + , Subset(std::move(subset)) + , Definitions(std::move(definitions)) {} - TNodePtr Build(TContext& ctx, TString&& inputTable, ISource* source); -private: - TPosition Pos; - std::pair<TPosition, TVector<TNamedFunction>> Partitioners; - std::pair<TPosition, TVector<TSortSpecificationPtr>> SortSpecs; - std::pair<TPosition, TVector<TNamedFunction>> Measures; - std::pair<TPosition, NYql::NMatchRecognize::ERowsPerMatch> RowsPerMatch; - std::pair<TPosition, NYql::NMatchRecognize::TAfterMatchSkipTo> SkipTo; - std::pair<TPosition, NYql::NMatchRecognize::TRowPattern> Pattern; - std::pair<TPosition, TNodePtr> Subset; - std::pair<TPosition, TVector<TNamedFunction>> Definitions; -}; - -using TMatchRecognizeBuilderPtr=TIntrusivePtr<TMatchRecognizeBuilder> ; - -class TMatchRecognizeVarAccessNode: public INode { -public: - TMatchRecognizeVarAccessNode(TPosition pos, const TString& var, const TString& column, bool theSameVar) - : INode(pos) - , Var(var) - , TheSameVar(theSameVar) - , Column(column) - { - } - - TString GetVar() const { - return Var; - } - - bool IsTheSameVar() const { - return TheSameVar; - } - - TString GetColumn() const { - return Column; - } - bool DoInit(TContext& ctx, ISource* src) override; - - TAstNode* Translate(TContext& ctx) const override { - return Node->Translate(ctx); - } - - TPtr DoClone() const override { - YQL_ENSURE(!Node, "TMatchRecognizeVarAccessNode::Clone: Node must not be initialized"); - auto copy = new TMatchRecognizeVarAccessNode(Pos, Var, Column, TheSameVar); - return copy; - } - -protected: - void DoUpdateState() const override { - YQL_ENSURE(Node); - } - - void DoVisitChildren(const TVisitFunc& func, TVisitNodeSet& visited) const final { - Y_DEBUG_ABORT_UNLESS(Node); - Node->VisitTree(func, visited); - } + TNodePtr Build(TContext& ctx, TString label, ISource* source); private: - TNodePtr Node; - const TString Var; - const bool TheSameVar; //reference the same var as being defined by this expression; - const TString Column; + TPosition Pos; + TNodePtr PartitionKeySelector; + TNodePtr PartitionColumns; + TVector<TSortSpecificationPtr> SortSpecs; + TVector<TNamedFunction> Measures; + TNodePtr RowsPerMatch; + TNodePtr SkipTo; + TNodePtr Pattern; + TNodePtr PatternVars; + TNodePtr Subset; + TVector<TNamedFunction> Definitions; }; -class TMatchRecognizeNavigate: public TAstListNode { -public: - TMatchRecognizeNavigate(TPosition pos, const TString& name, const TVector<TNodePtr>& args) - : TAstListNode(pos) - , Name(name) - , Args(args) - { - } - -private: - TNodePtr DoClone() const override { - return new TMatchRecognizeNavigate(GetPos(), Name, CloneContainer(Args)); - } - - bool DoInit(TContext& ctx, ISource* src) override; +using TMatchRecognizeBuilderPtr = TIntrusivePtr<TMatchRecognizeBuilder>; -private: - const TString Name; - const TVector<TNodePtr> Args; -}; +TNodePtr BuildMatchRecognizeColumnAccess(TPosition pos, TString var, TString column); +TNodePtr BuildMatchRecognizeDefineAggregate(TPosition pos, TString name, TVector<TNodePtr> args); +TNodePtr BuildMatchRecognizeVarAccess(TPosition pos, TNodePtr extractor); } // namespace NSQLTranslationV1 - diff --git a/yql/essentials/sql/v1/node.cpp b/yql/essentials/sql/v1/node.cpp index 2339fa894e..eea9069abf 100644 --- a/yql/essentials/sql/v1/node.cpp +++ b/yql/essentials/sql/v1/node.cpp @@ -706,6 +706,14 @@ TAstNode* TAstDirectNode::Translate(TContext& ctx) const { return Node; } +TNodePtr BuildList(TPosition pos, TVector<TNodePtr> nodes) { + return new TAstListNodeImpl(pos, std::move(nodes)); +} + +TNodePtr BuildQuote(TPosition pos, TNodePtr expr) { + return BuildList(pos, {BuildAtom(pos, "quote", TNodeFlags::Default), expr}); +} + TNodePtr BuildAtom(TPosition pos, const TString& content, ui32 flags, bool isOptionalArg) { return new TAstAtomNodeImpl(pos, content, flags, isOptionalArg); } @@ -2669,10 +2677,6 @@ TNodePtr BuildAccess(TPosition pos, const TVector<INode::TIdPart>& ids, bool isL return new TAccessNode(pos, ids, isLookup); } -TNodePtr BuildMatchRecognizeVarAccess(TPosition pos, const TString& var, const TString& column, bool theSameVar) { - return new TMatchRecognizeVarAccessNode(pos, var, column, theSameVar); -} - void WarnIfAliasFromSelectIsUsedInGroupBy(TContext& ctx, const TVector<TNodePtr>& selectTerms, const TVector<TNodePtr>& groupByTerms, const TVector<TNodePtr>& groupByExprTerms) { diff --git a/yql/essentials/sql/v1/node.h b/yql/essentials/sql/v1/node.h index c9402a2f97..c197f7ac90 100644 --- a/yql/essentials/sql/v1/node.h +++ b/yql/essentials/sql/v1/node.h @@ -1437,6 +1437,8 @@ namespace NSQLTranslationV1 { TString TypeByAlias(const TString& alias, bool normalize = true); + TNodePtr BuildList(TPosition pos, TVector<TNodePtr> nodes = {}); + TNodePtr BuildQuote(TPosition pos, TNodePtr expr); TNodePtr BuildAtom(TPosition pos, const TString& content, ui32 flags = NYql::TNodeFlags::ArbitraryContent, bool isOptionalArg = false); TNodePtr BuildQuotedAtom(TPosition pos, const TString& content, ui32 flags = NYql::TNodeFlags::ArbitraryContent); @@ -1471,7 +1473,6 @@ namespace NSQLTranslationV1 { TNodePtr BuildColumn(TPosition pos, const TDeferredAtom& column, const TString& source = TString()); TNodePtr BuildColumnOrType(TPosition pos, const TString& column = TString()); TNodePtr BuildAccess(TPosition pos, const TVector<INode::TIdPart>& ids, bool isLookup); - TNodePtr BuildMatchRecognizeVarAccess(TPosition pos, const TString& var, const TString& column, bool theSameVar); TNodePtr BuildBind(TPosition pos, const TString& module, const TString& alias); TNodePtr BuildLambda(TPosition pos, TNodePtr params, TNodePtr body, const TString& resName = TString()); TNodePtr BuildLambda(TPosition pos, TNodePtr params, const TVector<TNodePtr>& bodies); diff --git a/yql/essentials/sql/v1/source.cpp b/yql/essentials/sql/v1/source.cpp index 4231a9d370..c2fb13b103 100644 --- a/yql/essentials/sql/v1/source.cpp +++ b/yql/essentials/sql/v1/source.cpp @@ -288,7 +288,7 @@ inline TNodePtr ISource::AliasOrColumn(const TNodePtr& node, bool withSource) { bool ISource::AddAggregationOverWindow(TContext& ctx, const TString& windowName, TAggregationPtr func) { if (ctx.DistinctOverWindow) { - YQL_ENSURE(func->IsOverWindow() || func->IsOverWindowDistinct()); + YQL_ENSURE(func->IsOverWindow() || func->IsOverWindowDistinct()); } else { YQL_ENSURE(func->IsOverWindow()); if (func->IsDistinct()) { diff --git a/yql/essentials/sql/v1/sql_expression.cpp b/yql/essentials/sql/v1/sql_expression.cpp index c2074cb50d..be401949c7 100644 --- a/yql/essentials/sql/v1/sql_expression.cpp +++ b/yql/essentials/sql/v1/sql_expression.cpp @@ -860,24 +860,7 @@ TNodePtr TSqlExpression::JsonApiExpr(const TRule_json_api_expr& node) { return result; } -TNodePtr MatchRecognizeVarAccess(TTranslation& ctx, const TString& var, const TRule_an_id_or_type& suffix, bool theSameVar) { - switch (suffix.GetAltCase()) { - case TRule_an_id_or_type::kAltAnIdOrType1: - break; - case TRule_an_id_or_type::kAltAnIdOrType2: - break; - case TRule_an_id_or_type::ALT_NOT_SET: - break; - } - const auto& column = Id( - suffix.GetAlt_an_id_or_type1() - .GetRule_id_or_type1().GetAlt_id_or_type1().GetRule_id1(), - ctx - ); - return BuildMatchRecognizeVarAccess(TPosition{}, var, column, theSameVar); -} - -TNodePtr TSqlExpression::RowPatternVarAccess(const TString& alias, const TRule_unary_subexpr_suffix_TBlock1_TBlock1_TAlt3_TBlock2 block) { +TNodePtr TSqlExpression::RowPatternVarAccess(TString var, const TRule_unary_subexpr_suffix_TBlock1_TBlock1_TAlt3_TBlock2 block) { switch (block.GetAltCase()) { case TRule_unary_subexpr_suffix_TBlock1_TBlock1_TAlt3_TBlock2::kAlt1: break; @@ -888,13 +871,10 @@ TNodePtr TSqlExpression::RowPatternVarAccess(const TString& alias, const TRule_u case TRule_an_id_or_type::kAltAnIdOrType1: { const auto &idOrType = block.GetAlt3().GetRule_an_id_or_type1().GetAlt_an_id_or_type1().GetRule_id_or_type1(); switch(idOrType.GetAltCase()) { - case TRule_id_or_type::kAltIdOrType1: - return BuildMatchRecognizeVarAccess( - Ctx.Pos(), - alias, - Id(idOrType.GetAlt_id_or_type1().GetRule_id1(), *this), - Ctx.GetMatchRecognizeDefineVar() == alias - ); + case TRule_id_or_type::kAltIdOrType1: { + const auto column = Id(idOrType.GetAlt_id_or_type1().GetRule_id1(), *this); + return BuildMatchRecognizeColumnAccess(Ctx.Pos(), std::move(var), std::move(column)); + } case TRule_id_or_type::kAltIdOrType2: break; case TRule_id_or_type::ALT_NOT_SET: @@ -911,7 +891,7 @@ TNodePtr TSqlExpression::RowPatternVarAccess(const TString& alias, const TRule_u case TRule_unary_subexpr_suffix_TBlock1_TBlock1_TAlt3_TBlock2::ALT_NOT_SET: Y_ABORT("You should change implementation according to grammar changes"); } - return TNodePtr{}; + return {}; } template<typename TUnaryCasualExprRule> @@ -991,13 +971,12 @@ TNodePtr TSqlExpression::UnaryCasualExpr(const TUnaryCasualExprRule& node, const case TRule_unary_subexpr_suffix::TBlock1::TBlock1::kAlt3: { // In case of MATCH_RECOGNIZE lambdas // X.Y is treated as Var.Column access - if (isColumnRef && EColumnRefState::MatchRecognize == Ctx.GetColumnReferenceState()) { - if (auto rowPatternVarAccess = RowPatternVarAccess( - name, - b.GetAlt3().GetBlock2()) - ) { - return rowPatternVarAccess; - } + if (isColumnRef && ( + EColumnRefState::MatchRecognizeMeasures == Ctx.GetColumnReferenceState() || + EColumnRefState::MatchRecognizeDefine == Ctx.GetColumnReferenceState() || + EColumnRefState::MatchRecognizeDefineAggregate == Ctx.GetColumnReferenceState() + )) { + return RowPatternVarAccess(std::move(name), b.GetAlt3().GetBlock2()); } break; } diff --git a/yql/essentials/sql/v1/sql_expression.h b/yql/essentials/sql/v1/sql_expression.h index 4f9100722c..adcf44b11b 100644 --- a/yql/essentials/sql/v1/sql_expression.h +++ b/yql/essentials/sql/v1/sql_expression.h @@ -109,7 +109,7 @@ private: TNodePtr BinOperList(const TString& opName, TVector<TNodePtr>::const_iterator begin, TVector<TNodePtr>::const_iterator end) const; - TNodePtr RowPatternVarAccess(const TString& alias, const TRule_unary_subexpr_suffix_TBlock1_TBlock1_TAlt3_TBlock2 block); + TNodePtr RowPatternVarAccess(TString var, const TRule_unary_subexpr_suffix_TBlock1_TBlock1_TAlt3_TBlock2 block); struct TCaseBranch { TNodePtr Pred; diff --git a/yql/essentials/sql/v1/sql_match_recognize.cpp b/yql/essentials/sql/v1/sql_match_recognize.cpp index 41415b7f23..6fcd56f4d6 100644 --- a/yql/essentials/sql/v1/sql_match_recognize.cpp +++ b/yql/essentials/sql/v1/sql_match_recognize.cpp @@ -1,367 +1,467 @@ #include "sql_match_recognize.h" + #include "node.h" #include "sql_expression.h" + #include <yql/essentials/core/sql_types/match_recognize.h> -#include <algorithm> namespace NSQLTranslationV1 { -using namespace NSQLv1Generated; - -namespace { - -TPosition TokenPosition(const TToken& token){ - return TPosition{token.GetColumn(), token.GetLine()}; -} - -TString PatternVar(const TRule_row_pattern_variable_name& node, TSqlMatchRecognizeClause& ctx){ - return Id(node.GetRule_identifier1(), ctx); -} - -} //namespace +TSqlMatchRecognizeClause::TSqlMatchRecognizeClause(TContext& ctx, NSQLTranslation::ESqlMode mode) : TSqlTranslation(ctx, mode) {} TMatchRecognizeBuilderPtr TSqlMatchRecognizeClause::CreateBuilder(const NSQLv1Generated::TRule_row_pattern_recognition_clause &matchRecognizeClause) { - TPosition pos(matchRecognizeClause.GetToken1().GetColumn(), matchRecognizeClause.GetToken1().GetLine()); + auto pos = GetPos(matchRecognizeClause.GetToken1()); if (!Ctx.FeatureR010) { Ctx.Error(pos, TIssuesIds::CORE) << "Unexpected MATCH_RECOGNIZE"; return {}; } - TVector<TNamedFunction> partitioners; - TPosition partitionsPos = pos; - if (matchRecognizeClause.HasBlock3()) { - const auto& partitionClause = matchRecognizeClause.GetBlock3().GetRule_window_partition_clause1(); - partitionsPos = TokenPosition(partitionClause.GetToken1()); - partitioners = ParsePartitionBy(partitionClause); - if (!partitioners) - return {}; - } - TVector<TSortSpecificationPtr> sortSpecs; - TPosition orderByPos = pos; - if (matchRecognizeClause.HasBlock4()) { - const auto& orderByClause = matchRecognizeClause.GetBlock4().GetRule_order_by_clause1(); - orderByPos = TokenPosition(orderByClause.GetToken1()); - if (!OrderByClause(orderByClause, sortSpecs)) { - return {}; - } - } - TPosition measuresPos = pos; - TVector<TNamedFunction> measures; - if (matchRecognizeClause.HasBlock5()) { - const auto& measuresClause = matchRecognizeClause.GetBlock5().GetRule_row_pattern_measures1(); - measuresPos = TokenPosition(measuresClause.GetToken1()); - measures = ParseMeasures(measuresClause.GetRule_row_pattern_measure_list2()); + auto [partitionKeySelector, partitionColumns] = ParsePartitionBy( + pos, + matchRecognizeClause.HasBlock3() + ? std::addressof(matchRecognizeClause.GetBlock3().GetRule_window_partition_clause1()) + : nullptr + ); + + auto sortSpecs = ParseOrderBy( + matchRecognizeClause.HasBlock4() + ? std::addressof(matchRecognizeClause.GetBlock4().GetRule_order_by_clause1()) + : nullptr + ); + if (!sortSpecs) { + return {}; } - auto rowsPerMatch = std::pair {pos, NYql::NMatchRecognize::ERowsPerMatch::OneRow}; - if (matchRecognizeClause.HasBlock6()) { - rowsPerMatch = ParseRowsPerMatch(matchRecognizeClause.GetBlock6().GetRule_row_pattern_rows_per_match1()); + auto measures = ParseMeasures( + matchRecognizeClause.HasBlock5() + ? std::addressof(matchRecognizeClause.GetBlock5().GetRule_row_pattern_measures1().GetRule_row_pattern_measure_list2()) + : nullptr + ); + + auto rowsPerMatch = ParseRowsPerMatch( + pos, + matchRecognizeClause.HasBlock6() + ? std::addressof(matchRecognizeClause.GetBlock6().GetRule_row_pattern_rows_per_match1()) + : nullptr + ); + if (!rowsPerMatch) { + return {}; } const auto& commonSyntax = matchRecognizeClause.GetRule_row_pattern_common_syntax7(); - if (commonSyntax.HasBlock2()) { const auto& initialOrSeek = commonSyntax.GetBlock2().GetRule_row_pattern_initial_or_seek1(); - Ctx.Error(TokenPosition(initialOrSeek.GetToken1())) << "InitialOrSeek subclause is not allowed in FROM clause"; + Ctx.Error(GetPos(initialOrSeek.GetToken1())) << "InitialOrSeek subclause is not allowed in FROM clause"; return {}; } - auto pattern = ParsePattern(commonSyntax.GetRule_row_pattern5()); - const auto& patternPos = TokenPosition(commonSyntax.token3()); - - //this block is located before pattern block in grammar, - // but depends on it, so it is processed after pattern block - std::pair<TPosition, NYql::NMatchRecognize::TAfterMatchSkipTo> skipTo { - pos, - NYql::NMatchRecognize::TAfterMatchSkipTo{ - NYql::NMatchRecognize::EAfterMatchSkipTo::PastLastRow, - TString() - } - }; - if (commonSyntax.HasBlock1()){ - skipTo = ParseAfterMatchSkipTo(commonSyntax.GetBlock1().GetRule_row_pattern_skip_to3()); - const auto varRequired = - NYql::NMatchRecognize::EAfterMatchSkipTo::ToFirst == skipTo.second.To || - NYql::NMatchRecognize::EAfterMatchSkipTo::ToLast == skipTo.second.To || - NYql::NMatchRecognize::EAfterMatchSkipTo::To == skipTo.second.To; - if (varRequired) { - const auto& allVars = NYql::NMatchRecognize::GetPatternVars(pattern); - if (allVars.find(skipTo.second.Var) == allVars.cend()) { - Ctx.Error(skipTo.first) << "Unknown pattern variable in AFTER MATCH"; - return {}; - } - } + PatternVarNames.clear(); + PatternVars = BuildList(pos); + auto pattern = ParsePattern(pos, commonSyntax.GetRule_row_pattern5(), 0, true); + if (!pattern) { + return {}; } + auto skipTo = ParseAfterMatchSkipTo( + pos, + commonSyntax.HasBlock1() + ? std::addressof(commonSyntax.GetBlock1().GetRule_row_pattern_skip_to3()) + : nullptr + ); + if (!skipTo) { + return {}; + } - TNodePtr subset; - TPosition subsetPos = pos; - if (commonSyntax.HasBlock7()) { - const auto& rowPatternSubset = commonSyntax.GetBlock7().GetRule_row_pattern_subset_clause1(); - subsetPos = TokenPosition(rowPatternSubset.GetToken1()); - Ctx.Error() << "SUBSET is not implemented yet"; - //TODO https://st.yandex-team.ru/YQL-16225 + auto subset = ParseSubset( + pos, + commonSyntax.HasBlock7() + ? std::addressof(commonSyntax.GetBlock7().GetRule_row_pattern_subset_clause1()) + : nullptr + ); + if (!subset) { return {}; } - const auto& definitions = ParseDefinitions(commonSyntax.GetRule_row_pattern_definition_list9()); - const auto& definitionsPos = TokenPosition(commonSyntax.GetToken8()); - const auto& rowPatternVariables = GetPatternVars(pattern); + auto definitions = ParseDefinitions(commonSyntax.GetRule_row_pattern_definition_list9()); for (const auto& [callable, name]: definitions) { - if (!rowPatternVariables.contains(name)) { + if (!PatternVarNames.contains(name)) { Ctx.Error(callable->GetPos()) << "ROW PATTERN VARIABLE " << name << " is defined, but not mentioned in the PATTERN"; return {}; } } - return new TMatchRecognizeBuilder{ + return new TMatchRecognizeBuilder( pos, - std::pair{partitionsPos, std::move(partitioners)}, - std::pair{orderByPos, std::move(sortSpecs)}, - std::pair{measuresPos, measures}, + std::move(partitionKeySelector), + std::move(partitionColumns), + std::move(*sortSpecs), + std::move(measures), std::move(rowsPerMatch), std::move(skipTo), - std::pair{patternPos, std::move(pattern)}, - std::pair{subsetPos, std::move(subset)}, - std::pair{definitionsPos, std::move(definitions)} - }; - + std::move(pattern), + std::move(PatternVars), + std::move(*subset), + std::move(definitions) + ); +} +std::tuple<TNodePtr, TNodePtr> TSqlMatchRecognizeClause::ParsePartitionBy(TPosition pos, const TRule_window_partition_clause* node) { + auto [partitionKeySelector, partitionColumns] = [&]() -> std::tuple<TNodePtr, TNodePtr> { + auto partitionKeySelector = BuildList(pos); + auto partitionColumns = BuildList(pos); + if (!node) { + return {partitionKeySelector, partitionColumns}; + } + TColumnRefScope scope(Ctx, EColumnRefState::Allow); + TVector<TNodePtr> partitionExprs; + if (!NamedExprList(node->GetRule_named_expr_list4(), partitionExprs)) { + return {partitionKeySelector, partitionColumns}; + } + for (const auto& p : partitionExprs) { + auto label = p->GetLabel(); + if (!label && p->GetColumnName()) { + label = *p->GetColumnName(); + } + partitionKeySelector->Add(p); + partitionColumns->Add(BuildQuotedAtom(p->GetPos(), label)); + } + return {partitionKeySelector, partitionColumns}; + }(); + return { + BuildLambda(pos, BuildList(pos, {BuildAtom(pos, "row")}), BuildQuote(pos, std::move(partitionKeySelector))), + BuildQuote(pos, std::move(partitionColumns)) + }; } -TVector<TNamedFunction> TSqlMatchRecognizeClause::ParsePartitionBy(const TRule_window_partition_clause& partitionClause) { - TColumnRefScope scope(Ctx, EColumnRefState::Allow); - TVector<TNodePtr> partitionExprs; - if (!NamedExprList( - partitionClause.GetRule_named_expr_list4(), - partitionExprs)) { - return {}; +TMaybe<TVector<TSortSpecificationPtr>> TSqlMatchRecognizeClause::ParseOrderBy(const TRule_order_by_clause* node) { + if (!node) { + return TVector<TSortSpecificationPtr>{}; } - TVector<TNamedFunction> partitioners; - for (const auto& p: partitionExprs) { - auto label = p->GetLabel(); - if (!label && p->GetColumnName()) { - label = *p->GetColumnName(); - } - partitioners.push_back(TNamedFunction{p, label}); + TVector<TSortSpecificationPtr> result; + if (!OrderByClause(*node, result)) { + return {}; } - return partitioners; + return result; } TNamedFunction TSqlMatchRecognizeClause::ParseOneMeasure(const TRule_row_pattern_measure_definition& node) { - TColumnRefScope scope(Ctx, EColumnRefState::MatchRecognize); - const auto& expr = TSqlExpression(Ctx, Mode).Build(node.GetRule_expr1()); - const auto& name = Id(node.GetRule_an_id3(), *this); - //Each measure must be a lambda, that accepts 2 args: + TColumnRefScope scope(Ctx, EColumnRefState::MatchRecognizeMeasures); + auto callable = TSqlExpression(Ctx, Mode).Build(node.GetRule_expr1()); + auto measureName = Id(node.GetRule_an_id3(), *this); + // Each measure must be a lambda, that accepts 2 args: // - List<InputTableColumns + _yql_Classifier, _yql_MatchNumber> // - Struct that maps row pattern variables to ranges in the queue - return {expr, name}; + return {std::move(callable), std::move(measureName)}; } -TVector<TNamedFunction> TSqlMatchRecognizeClause::ParseMeasures(const TRule_row_pattern_measure_list& node) { - TVector<TNamedFunction> result{ ParseOneMeasure(node.GetRule_row_pattern_measure_definition1()) }; - for (const auto& m: node.GetBlock2()) { +TVector<TNamedFunction> TSqlMatchRecognizeClause::ParseMeasures(const TRule_row_pattern_measure_list* node) { + if (!node) { + return {}; + } + TVector<TNamedFunction> result{ParseOneMeasure(node->GetRule_row_pattern_measure_definition1())}; + for (const auto& m: node->GetBlock2()) { result.push_back(ParseOneMeasure(m.GetRule_row_pattern_measure_definition2())); } return result; } -std::pair<TPosition, NYql::NMatchRecognize::ERowsPerMatch> TSqlMatchRecognizeClause::ParseRowsPerMatch(const TRule_row_pattern_rows_per_match& rowsPerMatchClause) { - - switch(rowsPerMatchClause.GetAltCase()) { - case TRule_row_pattern_rows_per_match::kAltRowPatternRowsPerMatch1: - return std::pair { - TokenPosition(rowsPerMatchClause.GetAlt_row_pattern_rows_per_match1().GetToken1()), - NYql::NMatchRecognize::ERowsPerMatch::OneRow - }; - case TRule_row_pattern_rows_per_match::kAltRowPatternRowsPerMatch2: - return std::pair { - TokenPosition(rowsPerMatchClause.GetAlt_row_pattern_rows_per_match2().GetToken1()), - NYql::NMatchRecognize::ERowsPerMatch::AllRows - }; +TNodePtr TSqlMatchRecognizeClause::ParseRowsPerMatch(TPosition pos, const TRule_row_pattern_rows_per_match* node) { + const auto result = [&]() -> NYql::NMatchRecognize::ERowsPerMatch { + if (!node) { + return NYql::NMatchRecognize::ERowsPerMatch::OneRow; + } + switch (node->GetAltCase()) { + case TRule_row_pattern_rows_per_match::kAltRowPatternRowsPerMatch1: { + const auto& rowsPerMatch = node->GetAlt_row_pattern_rows_per_match1(); + pos = GetPos(rowsPerMatch.GetToken1()); + return NYql::NMatchRecognize::ERowsPerMatch::OneRow; + } + case TRule_row_pattern_rows_per_match::kAltRowPatternRowsPerMatch2: { + const auto& rowsPerMatch = node->GetAlt_row_pattern_rows_per_match2(); + pos = GetPos(rowsPerMatch.GetToken1()); + return NYql::NMatchRecognize::ERowsPerMatch::AllRows; + } case TRule_row_pattern_rows_per_match::ALT_NOT_SET: Y_ABORT("You should change implementation according to grammar changes"); - } + } + }(); + return BuildQuotedAtom(pos, "RowsPerMatch_" + ToString(result)); } -std::pair<TPosition, NYql::NMatchRecognize::TAfterMatchSkipTo> TSqlMatchRecognizeClause::ParseAfterMatchSkipTo(const TRule_row_pattern_skip_to& skipToClause) { - switch (skipToClause.GetAltCase()) { - case TRule_row_pattern_skip_to::kAltRowPatternSkipTo1: - return std::pair{ - TokenPosition(skipToClause.GetAlt_row_pattern_skip_to1().GetToken1()), - NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::NextRow, ""} - }; - case TRule_row_pattern_skip_to::kAltRowPatternSkipTo2: - return std::pair{ - TokenPosition(skipToClause.GetAlt_row_pattern_skip_to2().GetToken1()), - NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::PastLastRow, ""} - }; - case TRule_row_pattern_skip_to::kAltRowPatternSkipTo3: - return std::pair{ - TokenPosition(skipToClause.GetAlt_row_pattern_skip_to3().GetToken1()), - NYql::NMatchRecognize::TAfterMatchSkipTo{ - NYql::NMatchRecognize::EAfterMatchSkipTo::ToFirst, - skipToClause.GetAlt_row_pattern_skip_to3().GetRule_row_pattern_skip_to_variable_name4().GetRule_row_pattern_variable_name1().GetRule_identifier1().GetToken1().GetValue() - } - }; - case TRule_row_pattern_skip_to::kAltRowPatternSkipTo4: - return std::pair{ - TokenPosition(skipToClause.GetAlt_row_pattern_skip_to4().GetToken1()), - NYql::NMatchRecognize::TAfterMatchSkipTo{ - NYql::NMatchRecognize::EAfterMatchSkipTo::ToLast, - skipToClause.GetAlt_row_pattern_skip_to4().GetRule_row_pattern_skip_to_variable_name4().GetRule_row_pattern_variable_name1().GetRule_identifier1().GetToken1().GetValue() - } - }; - case TRule_row_pattern_skip_to::kAltRowPatternSkipTo5: - return std::pair{ - TokenPosition(skipToClause.GetAlt_row_pattern_skip_to5().GetToken1()), - NYql::NMatchRecognize::TAfterMatchSkipTo{ - NYql::NMatchRecognize::EAfterMatchSkipTo::To, - skipToClause.GetAlt_row_pattern_skip_to5().GetRule_row_pattern_skip_to_variable_name3().GetRule_row_pattern_variable_name1().GetRule_identifier1().GetToken1().GetValue() - } - }; +TNodePtr TSqlMatchRecognizeClause::ParseAfterMatchSkipTo(TPosition pos, const TRule_row_pattern_skip_to* node) { + auto skipToPos = pos; + auto varPos = pos; + const auto result = [&]() -> TMaybe<NYql::NMatchRecognize::TAfterMatchSkipTo> { + if (!node) { + return NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::PastLastRow, ""}; + } + switch (node->GetAltCase()) { + case TRule_row_pattern_skip_to::kAltRowPatternSkipTo1: { + const auto& skipTo = node->GetAlt_row_pattern_skip_to1(); + skipToPos = GetPos(skipTo.GetToken1()); + return NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::NextRow, ""}; + } + case TRule_row_pattern_skip_to::kAltRowPatternSkipTo2: { + const auto& skipTo = node->GetAlt_row_pattern_skip_to2(); + skipToPos = GetPos(skipTo.GetToken1()); + return NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::PastLastRow, ""}; + } + case TRule_row_pattern_skip_to::kAltRowPatternSkipTo3: { + const auto& skipTo = node->GetAlt_row_pattern_skip_to3(); + skipToPos = GetPos(skipTo.GetToken1()); + const auto& identifier = skipTo.GetRule_row_pattern_skip_to_variable_name4().GetRule_row_pattern_variable_name1().GetRule_identifier1(); + auto var = identifier.GetToken1().GetValue(); + varPos = GetPos(identifier.GetToken1()); + if (!PatternVarNames.contains(var)) { + Ctx.Error(varPos) << "Unknown pattern variable in AFTER MATCH SKIP TO FIRST"; + return {}; + } + return NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::ToFirst, std::move(var)}; + } + case TRule_row_pattern_skip_to::kAltRowPatternSkipTo4: { + const auto& skipTo = node->GetAlt_row_pattern_skip_to4(); + skipToPos = GetPos(skipTo.GetToken1()); + const auto& identifier = skipTo.GetRule_row_pattern_skip_to_variable_name4().GetRule_row_pattern_variable_name1().GetRule_identifier1(); + auto var = identifier.GetToken1().GetValue(); + varPos = GetPos(identifier.GetToken1()); + if (!PatternVarNames.contains(var)) { + Ctx.Error(varPos) << "Unknown pattern variable in AFTER MATCH SKIP TO LAST"; + return {}; + } + return NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::ToLast, std::move(var)}; + } + case TRule_row_pattern_skip_to::kAltRowPatternSkipTo5: { + const auto& skipTo = node->GetAlt_row_pattern_skip_to5(); + skipToPos = GetPos(skipTo.GetToken1()); + const auto& identifier = skipTo.GetRule_row_pattern_skip_to_variable_name3().GetRule_row_pattern_variable_name1().GetRule_identifier1(); + auto var = identifier.GetToken1().GetValue(); + varPos = GetPos(identifier.GetToken1()); + if (!PatternVarNames.contains(var)) { + Ctx.Error(varPos) << "Unknown pattern variable in AFTER MATCH SKIP TO"; + return {}; + } + return NYql::NMatchRecognize::TAfterMatchSkipTo{NYql::NMatchRecognize::EAfterMatchSkipTo::To, std::move(var)}; + } case TRule_row_pattern_skip_to::ALT_NOT_SET: Y_ABORT("You should change implementation according to grammar changes"); + } + }(); + if (!result) { + return {}; } + return BuildTuple(pos, { + BuildQuotedAtom(skipToPos, "AfterMatchSkip_" + ToString(result->To)), + BuildQuotedAtom(varPos, std::move(result->Var)) + }); } -NYql::NMatchRecognize::TRowPatternTerm TSqlMatchRecognizeClause::ParsePatternTerm(const TRule_row_pattern_term& node, size_t patternNestingLevel, bool outputArg) { - NYql::NMatchRecognize::TRowPatternTerm term; - TPosition pos; - for (const auto& factor: node.GetBlock1()) { - const auto& primaryVar = factor.GetRule_row_pattern_factor1().GetRule_row_pattern_primary1(); - NYql::NMatchRecognize::TRowPatternPrimary primary; - bool output = outputArg; - switch (primaryVar.GetAltCase()) { - case TRule_row_pattern_primary::kAltRowPatternPrimary1: - primary = PatternVar(primaryVar.GetAlt_row_pattern_primary1().GetRule_row_pattern_primary_variable_name1().GetRule_row_pattern_variable_name1(), *this); - break; - case TRule_row_pattern_primary::kAltRowPatternPrimary2: - primary = primaryVar.GetAlt_row_pattern_primary2().GetToken1().GetValue(); - Y_ENSURE("$" == std::get<0>(primary)); - break; - case TRule_row_pattern_primary::kAltRowPatternPrimary3: - primary = primaryVar.GetAlt_row_pattern_primary3().GetToken1().GetValue(); - Y_ENSURE("^" == std::get<0>(primary)); - break; - case TRule_row_pattern_primary::kAltRowPatternPrimary4: { - if (patternNestingLevel <= NYql::NMatchRecognize::MaxPatternNesting) { - primary = ParsePattern(primaryVar.GetAlt_row_pattern_primary4().GetBlock2().GetRule_row_pattern1(), patternNestingLevel + 1, output); - } else { - Ctx.Error(TokenPosition(primaryVar.GetAlt_row_pattern_primary4().GetToken1())) - << "To big nesting level in the pattern"; - return NYql::NMatchRecognize::TRowPatternTerm{}; - } - break; +TNodePtr TSqlMatchRecognizeClause::BuildPatternFactor(TPosition pos, TNodePtr primary, std::tuple<ui64, ui64, bool, bool, bool> quantifier) { + return std::apply([&](const auto& ...args) { + return BuildTuple(pos, {std::move(primary), BuildQuotedAtom(pos, ToString(args))...}); + }, quantifier); +} + +TNodePtr TSqlMatchRecognizeClause::ParsePatternFactor(TPosition pos, const TRule_row_pattern_factor& node, size_t nestingLevel, bool output) { + if (nestingLevel > MaxPatternNesting) { + Ctx.Error(pos) << "To big nesting level in the pattern"; + return {}; + } + auto primary = [&]() -> TNodePtr { + const auto& primaryAlt = node.GetRule_row_pattern_primary1(); + switch (primaryAlt.GetAltCase()) { + case TRule_row_pattern_primary::kAltRowPatternPrimary1: { + const auto& primary = primaryAlt.GetAlt_row_pattern_primary1(); + const auto& identifier = primary.GetRule_row_pattern_primary_variable_name1().GetRule_row_pattern_variable_name1().GetRule_identifier1(); + const auto varName = Id(identifier, *this); + const auto var = BuildQuotedAtom(GetPos(identifier.GetToken1()), varName); + if (PatternVarNames.insert(varName).second) { + PatternVars->Add(var); } - case TRule_row_pattern_primary::kAltRowPatternPrimary5: - output = false; - primary = ParsePattern(primaryVar.GetAlt_row_pattern_primary5().GetRule_row_pattern3(), patternNestingLevel + 1, output); - break; - case TRule_row_pattern_primary::kAltRowPatternPrimary6: { - std::vector<NYql::NMatchRecognize::TRowPatternPrimary> items{ParsePattern( - primaryVar.GetAlt_row_pattern_primary6().GetRule_row_pattern_permute1().GetRule_row_pattern3(), patternNestingLevel + 1, output) - }; - for (const auto& p: primaryVar.GetAlt_row_pattern_primary6().GetRule_row_pattern_permute1().GetBlock4()) { - items.push_back(ParsePattern(p.GetRule_row_pattern2(), patternNestingLevel + 1, output)); - } - //Permutations now is a syntactic sugar and converted to all possible alternatives - if (items.size() > NYql::NMatchRecognize::MaxPermutedItems) { - Ctx.Error(TokenPosition(primaryVar.GetAlt_row_pattern_primary4().GetToken1())) - << "Too many items in permute"; - return NYql::NMatchRecognize::TRowPatternTerm{}; - } - std::vector<size_t> indexes(items.size()); - std::generate(begin(indexes), end(indexes), [n = 0] () mutable { return n++; }); - NYql::NMatchRecognize::TRowPattern permuted; - do { - NYql::NMatchRecognize::TRowPatternTerm term; - term.reserve(indexes.size()); - for (size_t i = 0; i != indexes.size(); ++i) { - term.push_back({items[indexes[i]], 1, 1, true, false, false}); - } - permuted.push_back(std::move(term)); - } while (std::next_permutation(indexes.begin(), indexes.end())); - primary = permuted; - break; + return var; + } + case TRule_row_pattern_primary::kAltRowPatternPrimary2: { + const auto& primary = primaryAlt.GetAlt_row_pattern_primary2(); + const auto& token = primary.GetToken1(); + const auto varName = token.GetValue(); + const auto var = BuildQuotedAtom(GetPos(token), varName); + if (PatternVarNames.insert(varName).second) { + PatternVars->Add(var); } - case TRule_row_pattern_primary::ALT_NOT_SET: - Y_ABORT("You should change implementation according to grammar changes"); + return var; } - uint64_t quantityMin = 1; - uint64_t quantityMax = 1; - constexpr uint64_t infinity = std::numeric_limits<uint64_t>::max(); - bool greedy = true; - if (factor.GetRule_row_pattern_factor1().HasBlock2()) { - const auto& quantifier = factor.GetRule_row_pattern_factor1().GetBlock2().GetRule_row_pattern_quantifier1(); - switch(quantifier.GetAltCase()){ - case TRule_row_pattern_quantifier::kAltRowPatternQuantifier1: //* - quantityMin = 0; - quantityMax = infinity; - greedy = !quantifier.GetAlt_row_pattern_quantifier1().HasBlock2(); - break; - case TRule_row_pattern_quantifier::kAltRowPatternQuantifier2: //+ - quantityMax = infinity; - greedy = !quantifier.GetAlt_row_pattern_quantifier2().HasBlock2(); - break; - case TRule_row_pattern_quantifier::kAltRowPatternQuantifier3: //? - quantityMin = 0; - greedy = !quantifier.GetAlt_row_pattern_quantifier3().HasBlock2(); - break; - case TRule_row_pattern_quantifier::kAltRowPatternQuantifier4: //{ 2?, 4?} - if (quantifier.GetAlt_row_pattern_quantifier4().HasBlock2()) { - quantityMin = FromString(quantifier.GetAlt_row_pattern_quantifier4().GetBlock2().GetRule_integer1().GetToken1().GetValue()); - } - else { - quantityMin = 0;; - } - if (quantifier.GetAlt_row_pattern_quantifier4().HasBlock4()) { - quantityMax = FromString(quantifier.GetAlt_row_pattern_quantifier4().GetBlock4().GetRule_integer1().GetToken1().GetValue()); - } - else { - quantityMax = infinity; - } - greedy = !quantifier.GetAlt_row_pattern_quantifier4().HasBlock6(); - - break; - case TRule_row_pattern_quantifier::kAltRowPatternQuantifier5: - quantityMin = quantityMax = FromString(quantifier.GetAlt_row_pattern_quantifier5().GetRule_integer2().GetToken1().GetValue()); - break; - case TRule_row_pattern_quantifier::ALT_NOT_SET: - Y_ABORT("You should change implementation according to grammar changes"); + case TRule_row_pattern_primary::kAltRowPatternPrimary3: { + const auto& primary = primaryAlt.GetAlt_row_pattern_primary3(); + const auto& token = primary.GetToken1(); + const auto varName = token.GetValue(); + const auto var = BuildQuotedAtom(GetPos(token), varName); + if (PatternVarNames.insert(varName).second) { + PatternVars->Add(var); } + return var; + } + case TRule_row_pattern_primary::kAltRowPatternPrimary4: { + const auto& primary = primaryAlt.GetAlt_row_pattern_primary4(); + return ParsePattern(pos, primary.GetBlock2().GetRule_row_pattern1(), nestingLevel + 1, output); + } + case TRule_row_pattern_primary::kAltRowPatternPrimary5: { + const auto& primary = primaryAlt.GetAlt_row_pattern_primary5(); + output = false; + return ParsePattern(pos, primary.GetRule_row_pattern3(), nestingLevel + 1, output); + } + case TRule_row_pattern_primary::kAltRowPatternPrimary6: { + const auto& primary = primaryAlt.GetAlt_row_pattern_primary6(); + std::vector<TNodePtr> items{ + ParsePattern(pos, primary.GetRule_row_pattern_permute1().GetRule_row_pattern3(), nestingLevel + 1, output) + }; + for (const auto& p: primary.GetRule_row_pattern_permute1().GetBlock4()) { + items.push_back(ParsePattern(pos, p.GetRule_row_pattern2(), nestingLevel + 1, output)); + } + if (items.size() > MaxPermutedItems) { + Ctx.Error(GetPos(primary.GetRule_row_pattern_permute1().GetToken1())) << "Too many items in permute"; + return {}; + } + std::vector<size_t> indexes(items.size()); + Iota(indexes.begin(), indexes.end(), 0); + std::vector<TNodePtr> result; + do { + std::vector<TNodePtr> term; + term.reserve(items.size()); + for (auto index : indexes) { + term.push_back(BuildPatternFactor(pos, items[index], std::tuple{1, 1, true, output, false})); + } + result.push_back(BuildPatternTerm(pos, std::move(term))); + } while (std::next_permutation(indexes.begin(), indexes.end())); + return BuildPattern(pos, std::move(result)); + } + case TRule_row_pattern_primary::ALT_NOT_SET: + Y_ABORT("You should change implementation according to grammar changes"); } - term.push_back(NYql::NMatchRecognize::TRowPatternFactor{std::move(primary), quantityMin, quantityMax, greedy, output, false}); + }(); + if (!primary) { + return {}; } - return term; + + const auto quantifier = [&]() { + if (!node.HasBlock2()) { + const auto quantity = static_cast<ui64>(1); + return std::tuple{quantity, quantity, true, output, false}; + } + const auto& quantifierAlt = node.GetBlock2().GetRule_row_pattern_quantifier1(); + switch (quantifierAlt.GetAltCase()) { + case TRule_row_pattern_quantifier::kAltRowPatternQuantifier1: { // * + const auto& quantifier = quantifierAlt.GetAlt_row_pattern_quantifier1(); + pos = GetPos(quantifier.GetToken1()); + return std::tuple{static_cast<ui64>(0), static_cast<ui64>(Max()), !quantifier.HasBlock2(), output, false}; + } + case TRule_row_pattern_quantifier::kAltRowPatternQuantifier2: { // + + const auto& quantifier = quantifierAlt.GetAlt_row_pattern_quantifier2(); + pos = GetPos(quantifier.GetToken1()); + return std::tuple{static_cast<ui64>(1), static_cast<ui64>(Max()), !quantifier.HasBlock2(), output, false}; + } + case TRule_row_pattern_quantifier::kAltRowPatternQuantifier3: { // ? + const auto& quantifier = quantifierAlt.GetAlt_row_pattern_quantifier3(); + pos = GetPos(quantifier.GetToken1()); + return std::tuple{static_cast<ui64>(0), static_cast<ui64>(1), !quantifier.HasBlock2(), output, false}; + } + case TRule_row_pattern_quantifier::kAltRowPatternQuantifier4: { // {n?, m?} + const auto& quantifier = quantifierAlt.GetAlt_row_pattern_quantifier4(); + pos = GetPos(quantifier.GetToken1()); + return std::tuple{ + quantifier.HasBlock2() + ? FromString(quantifier.GetBlock2().GetRule_integer1().GetToken1().GetValue()) + : static_cast<ui64>(0), + quantifier.HasBlock4() + ? FromString(quantifier.GetBlock4().GetRule_integer1().GetToken1().GetValue()) + : static_cast<ui64>(Max()), + !quantifier.HasBlock6(), + output, + false + }; + } + case TRule_row_pattern_quantifier::kAltRowPatternQuantifier5: { // {n} + const auto quantifier = quantifierAlt.GetAlt_row_pattern_quantifier5(); + pos = GetPos(quantifier.GetToken1()); + const auto quantity = static_cast<ui64>(FromString(quantifier.GetRule_integer2().GetToken1().GetValue())); + return std::tuple{quantity, quantity, true, output, false}; + } + case TRule_row_pattern_quantifier::ALT_NOT_SET: + Y_ABORT("You should change implementation according to grammar changes"); + } + }(); + return BuildPatternFactor(pos, std::move(primary), std::move(quantifier)); } -NYql::NMatchRecognize::TRowPattern TSqlMatchRecognizeClause::ParsePattern(const TRule_row_pattern& node, size_t patternNestingLevel, bool output){ - TVector<NYql::NMatchRecognize::TRowPatternTerm> result; - result.push_back(ParsePatternTerm(node.GetRule_row_pattern_term1(), patternNestingLevel, output)); - for (const auto& term: node.GetBlock2()) - result.push_back(ParsePatternTerm(term.GetRule_row_pattern_term2(), patternNestingLevel, output)); +TNodePtr TSqlMatchRecognizeClause::BuildPatternTerm(TPosition pos, std::vector<TNodePtr> term) { + auto result = BuildList(pos); + for (auto& factor : term) { + if (!factor) { + return {}; + } + result->Add(std::move(factor)); + } + return BuildQuote(pos, std::move(result)); +} + +TNodePtr TSqlMatchRecognizeClause::ParsePatternTerm(TPosition pos, const TRule_row_pattern_term& node, size_t nestingLevel, bool output) { + std::vector<TNodePtr> result; + result.reserve(node.GetBlock1().size()); + for (const auto& factor: node.GetBlock1()) { + result.push_back(ParsePatternFactor(pos, factor.GetRule_row_pattern_factor1(), nestingLevel, output)); + } + return BuildPatternTerm(pos, std::move(result)); +} + +TNodePtr TSqlMatchRecognizeClause::BuildPattern(TPosition pos, std::vector<TNodePtr> pattern) { + const auto result = BuildList(pos, {BuildAtom(pos, "MatchRecognizePattern")}); + for (auto& term: pattern) { + if (!term) { + return {}; + } + result->Add(std::move(term)); + } return result; } -TNamedFunction TSqlMatchRecognizeClause::ParseOneDefinition(const TRule_row_pattern_definition& node){ - const auto& varName = PatternVar(node.GetRule_row_pattern_definition_variable_name1().GetRule_row_pattern_variable_name1(), *this); - TColumnRefScope scope(Ctx, EColumnRefState::MatchRecognize, true, varName); - const auto& searchCondition = TSqlExpression(Ctx, Mode).Build(node.GetRule_row_pattern_definition_search_condition3().GetRule_search_condition1().GetRule_expr1()); - return TNamedFunction{searchCondition, varName}; +TNodePtr TSqlMatchRecognizeClause::ParsePattern(TPosition pos, const TRule_row_pattern& node, size_t nestingLevel, bool output) { + std::vector<TNodePtr> result; + result.reserve(1 + node.GetBlock2().size()); + result.push_back(ParsePatternTerm(pos, node.GetRule_row_pattern_term1(), nestingLevel, output)); + for (const auto& term: node.GetBlock2()) { + result.push_back(ParsePatternTerm(pos, term.GetRule_row_pattern_term2(), nestingLevel, output)); + } + return BuildPattern(pos, std::move(result)); +} + +TMaybe<TNodePtr> TSqlMatchRecognizeClause::ParseSubset(TPosition pos, const TRule_row_pattern_subset_clause* node) { + if (!node) { + return TNodePtr{}; + } + pos = GetPos(node->GetToken1()); + // TODO https://st.yandex-team.ru/YQL-16225 + Ctx.Error(pos) << "SUBSET is not implemented yet"; + return {}; +} + +TNamedFunction TSqlMatchRecognizeClause::ParseOneDefinition(const TRule_row_pattern_definition& node) { + const auto& identifier = node.GetRule_row_pattern_definition_variable_name1().GetRule_row_pattern_variable_name1().GetRule_identifier1(); + auto defineName = Id(identifier, *this); + TColumnRefScope scope(Ctx, EColumnRefState::MatchRecognizeDefine, true, defineName); + const auto& searchCondition = node.GetRule_row_pattern_definition_search_condition3().GetRule_search_condition1().GetRule_expr1(); + auto callable = TSqlExpression(Ctx, Mode).Build(searchCondition); + // Each define must be a predicate lambda, that accepts 3 args: + // - List<input table rows> + // - A struct that maps row pattern variables to ranges in the queue + // - An index of the current row + return {std::move(callable), std::move(defineName)}; } TVector<TNamedFunction> TSqlMatchRecognizeClause::ParseDefinitions(const TRule_row_pattern_definition_list& node) { - TVector<TNamedFunction> result { ParseOneDefinition(node.GetRule_row_pattern_definition1())}; + TVector<TNamedFunction> result{ParseOneDefinition(node.GetRule_row_pattern_definition1())}; for (const auto& d: node.GetBlock2()) { - //Each define must be a predicate lambda, that accepts 3 args: - // - List<input table rows> - // - A struct that maps row pattern variables to ranges in the queue - // - An index of the current row result.push_back(ParseOneDefinition(d.GetRule_row_pattern_definition2())); } return result; } -} //namespace NSQLTranslationV1 +} // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/sql_match_recognize.h b/yql/essentials/sql/v1/sql_match_recognize.h index 219baeaa09..928ed35b5d 100644 --- a/yql/essentials/sql/v1/sql_match_recognize.h +++ b/yql/essentials/sql/v1/sql_match_recognize.h @@ -1,28 +1,38 @@ #pragma once -#include "sql_translation.h" #include "match_recognize.h" +#include "node.h" +#include "sql_translation.h" namespace NSQLTranslationV1 { -using namespace NSQLv1Generated; - -class TSqlMatchRecognizeClause: public TSqlTranslation { +class TSqlMatchRecognizeClause final : public TSqlTranslation { public: - TSqlMatchRecognizeClause(TContext& ctx, NSQLTranslation::ESqlMode mode) - : TSqlTranslation(ctx, mode) - {} + TSqlMatchRecognizeClause(TContext& ctx, NSQLTranslation::ESqlMode mode); TMatchRecognizeBuilderPtr CreateBuilder(const TRule_row_pattern_recognition_clause& node); + static constexpr size_t MaxPatternNesting = 20; + static constexpr size_t MaxPermutedItems = 6; + private: - TVector<TNamedFunction> ParsePartitionBy(const TRule_window_partition_clause& partitionClause); + std::tuple<TNodePtr, TNodePtr> ParsePartitionBy(TPosition pos, const TRule_window_partition_clause* node); + TMaybe<TVector<TSortSpecificationPtr>> ParseOrderBy(const TRule_order_by_clause* node); TNamedFunction ParseOneMeasure(const TRule_row_pattern_measure_definition& node); - TVector<TNamedFunction> ParseMeasures(const TRule_row_pattern_measure_list& node); - std::pair<TPosition, NYql::NMatchRecognize::ERowsPerMatch> ParseRowsPerMatch(const TRule_row_pattern_rows_per_match& rowsPerMatchClause); - std::pair<TPosition, NYql::NMatchRecognize::TAfterMatchSkipTo> ParseAfterMatchSkipTo(const TRule_row_pattern_skip_to& skipToClause); - NYql::NMatchRecognize::TRowPatternTerm ParsePatternTerm(const TRule_row_pattern_term& node, size_t patternNestingLevel, bool output); - NYql::NMatchRecognize::TRowPattern ParsePattern(const TRule_row_pattern& node, size_t patternNestingLevel = 1, bool output = true); + TVector<TNamedFunction> ParseMeasures(const TRule_row_pattern_measure_list* node); + TNodePtr ParseRowsPerMatch(TPosition pos, const TRule_row_pattern_rows_per_match* node); + TNodePtr ParseAfterMatchSkipTo(TPosition pos, const TRule_row_pattern_skip_to* node); + TNodePtr BuildPatternFactor(TPosition pos, TNodePtr primary, std::tuple<ui64, ui64, bool, bool, bool> quantifier); + TNodePtr ParsePatternFactor(TPosition pos, const TRule_row_pattern_factor& node, size_t nestingLevel, bool output); + TNodePtr BuildPatternTerm(TPosition pos, std::vector<TNodePtr> term); + TNodePtr ParsePatternTerm(TPosition pos, const TRule_row_pattern_term& node, size_t nestingLevel, bool output); + TNodePtr BuildPattern(TPosition pos, std::vector<TNodePtr> pattern); + TNodePtr ParsePattern(TPosition pos, const TRule_row_pattern& node, size_t nestingLevel, bool output); + TMaybe<TNodePtr> ParseSubset(TPosition pos, const TRule_row_pattern_subset_clause* node); TNamedFunction ParseOneDefinition(const TRule_row_pattern_definition& node); TVector<TNamedFunction> ParseDefinitions(const TRule_row_pattern_definition_list& node); + +private: + THashSet<TString> PatternVarNames; + TNodePtr PatternVars; }; } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/sql_match_recognize_ut.cpp b/yql/essentials/sql/v1/sql_match_recognize_ut.cpp index f591ef0647..f599dc6e99 100644 --- a/yql/essentials/sql/v1/sql_match_recognize_ut.cpp +++ b/yql/essentials/sql/v1/sql_match_recognize_ut.cpp @@ -41,7 +41,7 @@ bool IsLambda(const NYql::TAstNode* node, ui32 numberOfArgs) { return false; } if (!node->GetChild(0)->IsAtom() || node->GetChild(0)->GetContent() != "lambda") { - return false; + return false; } return IsQuotedListOfSize(node->GetChild(1), numberOfArgs); } @@ -71,7 +71,7 @@ FROM Input MATCH_RECOGNIZE( auto matchRecognizeAndSample = R"( USE plato; SELECT * -FROM Input MATCH_RECOGNIZE( +FROM Input MATCH_RECOGNIZE( PATTERN ( A ) DEFINE A as A ) TABLESAMPLE BERNOULLI(1.0) @@ -148,13 +148,13 @@ FROM Input MATCH_RECOGNIZE( auto r = MatchRecognizeSqlToYql(stmt); UNIT_ASSERT(r.IsOk()); const auto measures = FindMatchRecognizeParam(r.Root, "measures"); - UNIT_ASSERT_VALUES_EQUAL(6, measures->GetChildrenCount()); + UNIT_ASSERT_VALUES_EQUAL(7, measures->GetChildrenCount()); const auto columnNames = measures->GetChild(3); UNIT_ASSERT(IsQuotedListOfSize(columnNames, 2)); UNIT_ASSERT_VALUES_EQUAL("T", columnNames->GetChild(1)->GetChild(0)->GetChild(1)->GetContent()); UNIT_ASSERT_VALUES_EQUAL("Key", columnNames->GetChild(1)->GetChild(1)->GetChild(1)->GetContent()); - UNIT_ASSERT(IsLambda(measures->GetChild(4), 2)); - UNIT_ASSERT(IsLambda(measures->GetChild(5), 2)); + UNIT_ASSERT(IsQuotedListOfSize(measures->GetChild(4), 2)); + UNIT_ASSERT(IsQuotedListOfSize(measures->GetChild(5), 2)); } Y_UNIT_TEST(RowsPerMatch) { { @@ -326,7 +326,7 @@ USE plato; SELECT * FROM Input MATCH_RECOGNIZE( INITIAL - PATTERN (A+ B* C?) + PATTERN (A+ B* C?) DEFINE A as A ) )"; @@ -340,7 +340,7 @@ USE plato; SELECT * FROM Input MATCH_RECOGNIZE( SEEK - PATTERN (A+ B* C?) + PATTERN (A+ B* C?) DEFINE A as A ) )"; @@ -353,7 +353,7 @@ FROM Input MATCH_RECOGNIZE( USE plato; SELECT * FROM Input MATCH_RECOGNIZE( - PATTERN (A+ B* C?) + PATTERN (A+ B* C?) DEFINE A as A ) )"; @@ -427,7 +427,7 @@ PATTERN ( } Y_UNIT_TEST(PatternLimitedNesting) { - const size_t MaxNesting = 20; + constexpr size_t MaxNesting = 20; for (size_t extraNesting = 0; extraNesting <= 1; ++extraNesting) { std::string pattern; for (size_t i = 0; i != MaxNesting + extraNesting; ++i) @@ -469,7 +469,7 @@ FROM Input MATCH_RECOGNIZE( }; auto getTheFactor = [](const NYql::TAstNode* root) { const auto& patternCallable = FindMatchRecognizeParam(root, "pattern"); - const auto& factor = patternCallable->GetChild(1)->GetChild(1)->GetChild(0)->GetChild(1); + const auto& factor = patternCallable->GetChild(1)->GetChild(1)->GetChild(0)->GetChild(1); return NYql::NMatchRecognize::TRowPatternFactor{ TString(), //primary var or subexpression, not used in this test FromString<uint64_t>(factor->GetChild(1)->GetChild(1)->GetContent()), //QuantityMin @@ -651,7 +651,7 @@ FROM Input MATCH_RECOGNIZE( } Y_UNIT_TEST(PermuteTooMuch) { - for (size_t n = 1; n <= NYql::NMatchRecognize::MaxPermutedItems + 1; ++n) { + for (size_t n = 1; n <= 6 + 1; ++n) { std::vector<std::string> vars(n); std::generate(begin(vars), end(vars), [n = 0] () mutable { return "A" + std::to_string(n++);}); const auto stmt = TString(R"( @@ -671,7 +671,7 @@ FROM Input MATCH_RECOGNIZE( )" ); const auto &r = MatchRecognizeSqlToYql(stmt); - if (n <= NYql::NMatchRecognize::MaxPermutedItems) { + if (n <= 6) { UNIT_ASSERT(r.IsOk()); } else { UNIT_ASSERT(!r.IsOk()); diff --git a/yql/essentials/sql/v1/sql_ut.cpp b/yql/essentials/sql/v1/sql_ut.cpp index 6bdd2c7e44..bd385c8feb 100644 --- a/yql/essentials/sql/v1/sql_ut.cpp +++ b/yql/essentials/sql/v1/sql_ut.cpp @@ -8162,3 +8162,42 @@ Y_UNIT_TEST_SUITE(QuerySplit) { };)"); } } + +Y_UNIT_TEST_SUITE(MatchRecognizeMeasuresAggregation) { + Y_UNIT_TEST(InsideSelect) { + ExpectFailWithError(R"sql( + SELECT FIRST(0), LAST(1); + )sql", + "<main>:2:20: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + "<main>:2:30: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + ); + } + + Y_UNIT_TEST(OutsideSelect) { + ExpectFailWithError(R"sql( + $lambda = ($x) -> (FIRST($x) + LAST($x)); + SELECT $lambda(x) FROM plato.Input; + )sql", + "<main>:2:32: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + "<main>:2:44: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + ); + } + + Y_UNIT_TEST(AsAggregateFunction) { + ExpectFailWithError(R"sql( + SELECT FIRST(x), LAST(x) FROM plato.Input; + )sql", + "<main>:2:20: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + "<main>:2:30: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + ); + } + + Y_UNIT_TEST(AsWindowFunction) { + ExpectFailWithError(R"sql( + SELECT FIRST(x) OVER(), LAST(x) OVER() FROM plato.Input; + )sql", + "<main>:2:20: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + "<main>:2:37: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + ); + } +} diff --git a/yql/essentials/sql/v1/sql_ut_antlr4.cpp b/yql/essentials/sql/v1/sql_ut_antlr4.cpp index 4b1233bfee..9463408886 100644 --- a/yql/essentials/sql/v1/sql_ut_antlr4.cpp +++ b/yql/essentials/sql/v1/sql_ut_antlr4.cpp @@ -8121,3 +8121,42 @@ $__ydb_transfer_lambda = ($x) -> { } } + +Y_UNIT_TEST_SUITE(MatchRecognizeMeasuresAggregation) { + Y_UNIT_TEST(InsideSelect) { + ExpectFailWithError(R"sql( + SELECT FIRST(0), LAST(1); + )sql", + "<main>:2:20: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + "<main>:2:30: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + ); + } + + Y_UNIT_TEST(OutsideSelect) { + ExpectFailWithError(R"sql( + $lambda = ($x) -> (FIRST($x) + LAST($x)); + SELECT $lambda(x) FROM plato.Input; + )sql", + "<main>:2:32: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + "<main>:2:44: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + ); + } + + Y_UNIT_TEST(AsAggregateFunction) { + ExpectFailWithError(R"sql( + SELECT FIRST(x), LAST(x) FROM plato.Input; + )sql", + "<main>:2:20: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + "<main>:2:30: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + ); + } + + Y_UNIT_TEST(AsWindowFunction) { + ExpectFailWithError(R"sql( + SELECT FIRST(x) OVER(), LAST(x) OVER() FROM plato.Input; + )sql", + "<main>:2:20: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + "<main>:2:37: Error: Cannot use FIRST and LAST outside the MATCH_RECOGNIZE context\n" + ); + } +} diff --git a/yql/essentials/tests/sql/minirun/part0/canondata/result.json b/yql/essentials/tests/sql/minirun/part0/canondata/result.json index cecc1d216b..55b679803b 100644 --- a/yql/essentials/tests/sql/minirun/part0/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part0/canondata/result.json @@ -696,9 +696,9 @@ ], "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Debug]": [ { - "checksum": "5f35ea2e92d57979f9c4bba6b5646a34", - "size": 1152, - "uri": "https://{canondata_backend}/1920236/0e4eb7607962ae8fb5d7c33255de8ebf2bb42fa1/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Debug_/opt.yql" + "checksum": "ba5829938cca0f9da11d044432ba58c4", + "size": 1294, + "uri": "https://{canondata_backend}/1777230/d5f34e6dd6caad9e61f6ac66cc95521cc05a75cf/resource.tar.gz#test.test_match_recognize-after_match_skip_past_last_row-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-after_match_skip_past_last_row-default.txt-Results]": [ @@ -710,9 +710,9 @@ ], "test.test[match_recognize-alerts-streaming-default.txt-Debug]": [ { - "checksum": "4c0317b3fb6f4b79fec4d8b5d361a30a", - "size": 5963, - "uri": "https://{canondata_backend}/1936947/b760795ecc1ab66b9c70ac9ecfdf8a8ffa413f66/resource.tar.gz#test.test_match_recognize-alerts-streaming-default.txt-Debug_/opt.yql" + "checksum": "d84be5a785956f5561d3871d773bb117", + "size": 6021, + "uri": "https://{canondata_backend}/1777230/d5f34e6dd6caad9e61f6ac66cc95521cc05a75cf/resource.tar.gz#test.test_match_recognize-alerts-streaming-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-alerts-streaming-default.txt-Results]": [ @@ -724,16 +724,16 @@ ], "test.test[match_recognize-test_type-default.txt-Debug]": [ { - "checksum": "2a2950eafdd72b695adee8a0d1deddab", - "size": 2645, - "uri": "https://{canondata_backend}/1936947/b760795ecc1ab66b9c70ac9ecfdf8a8ffa413f66/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Debug_/opt.yql" + "checksum": "332b177bb25cf91a2a59f949bc30997a", + "size": 2667, + "uri": "https://{canondata_backend}/1777230/d5f34e6dd6caad9e61f6ac66cc95521cc05a75cf/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-test_type-default.txt-Results]": [ { - "checksum": "02e86332916457371461fd277e2f0003", - "size": 4370, - "uri": "https://{canondata_backend}/1936947/b760795ecc1ab66b9c70ac9ecfdf8a8ffa413f66/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Results_/results.txt" + "checksum": "568681ef1c935d6bdeabf22735cc4c63", + "size": 5138, + "uri": "https://{canondata_backend}/1777230/d5f34e6dd6caad9e61f6ac66cc95521cc05a75cf/resource.tar.gz#test.test_match_recognize-test_type-default.txt-Results_/results.txt" } ], "test.test[optimizers-or_absorption--Debug]": [ diff --git a/yql/essentials/tests/sql/minirun/part1/canondata/result.json b/yql/essentials/tests/sql/minirun/part1/canondata/result.json index 4ce5118f55..e9313a184a 100644 --- a/yql/essentials/tests/sql/minirun/part1/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part1/canondata/result.json @@ -694,9 +694,9 @@ ], "test.test[match_recognize-greedy_quantifiers-default.txt-Debug]": [ { - "checksum": "f715209ae286a07b1b13eaa7a4e5bb74", - "size": 2103, - "uri": "https://{canondata_backend}/1920236/97f248ad438e8879d0fc75f582d39c0a52739159/resource.tar.gz#test.test_match_recognize-greedy_quantifiers-default.txt-Debug_/opt.yql" + "checksum": "66fb0a8ccd3814cb306c356fcecea0d1", + "size": 2147, + "uri": "https://{canondata_backend}/1777230/e0a11b3037046c8b251f0102af3c7cc58af5ef2c/resource.tar.gz#test.test_match_recognize-greedy_quantifiers-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-greedy_quantifiers-default.txt-Results]": [ diff --git a/yql/essentials/tests/sql/minirun/part2/canondata/result.json b/yql/essentials/tests/sql/minirun/part2/canondata/result.json index 4c2099af44..39f596db22 100644 --- a/yql/essentials/tests/sql/minirun/part2/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part2/canondata/result.json @@ -773,9 +773,9 @@ ], "test.test[match_recognize-simple_paritioning-default.txt-Debug]": [ { - "checksum": "ed2ffd7ee602e50eb77df1f51835692d", - "size": 3030, - "uri": "https://{canondata_backend}/1773845/e276b05a77b889bd8fbb83a0bc0087dea7abd8f4/resource.tar.gz#test.test_match_recognize-simple_paritioning-default.txt-Debug_/opt.yql" + "checksum": "459d2a84dbc7335c291bc01ab9573c6e", + "size": 3201, + "uri": "https://{canondata_backend}/1937492/74169e2f8afb099b48d4cfe4c1190fcfe62c417a/resource.tar.gz#test.test_match_recognize-simple_paritioning-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-simple_paritioning-default.txt-Results]": [ diff --git a/yql/essentials/tests/sql/minirun/part4/canondata/result.json b/yql/essentials/tests/sql/minirun/part4/canondata/result.json index 02a5ceee0c..c4ada02ff9 100644 --- a/yql/essentials/tests/sql/minirun/part4/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part4/canondata/result.json @@ -894,6 +894,20 @@ "uri": "https://{canondata_backend}/1931696/ea5c7a58e2d64730a365efbdc95cdd2474857fcb/resource.tar.gz#test.test_like-like_rewrite-default.txt-Results_/results.txt" } ], + "test.test[match_recognize-measures_aggregate-default.txt-Debug]": [ + { + "checksum": "57fd3f8f46c4a0ab7bf9ef9a4842ea75", + "size": 5880, + "uri": "https://{canondata_backend}/1775059/fa3782efce42b73c9d2a91d53e640bb71168f31f/resource.tar.gz#test.test_match_recognize-measures_aggregate-default.txt-Debug_/opt.yql" + } + ], + "test.test[match_recognize-measures_aggregate-default.txt-Results]": [ + { + "checksum": "58ad2a60431fd720f1968f892142143e", + "size": 11093, + "uri": "https://{canondata_backend}/1775059/fa3782efce42b73c9d2a91d53e640bb71168f31f/resource.tar.gz#test.test_match_recognize-measures_aggregate-default.txt-Results_/results.txt" + } + ], "test.test[optimizers-coalesce_propagate-default.txt-Debug]": [ { "checksum": "9ce2404a84d5d2a812bb99e30502c2a9", diff --git a/yql/essentials/tests/sql/minirun/part5/canondata/result.json b/yql/essentials/tests/sql/minirun/part5/canondata/result.json index 0fc1d285af..3bbd5eafd1 100644 --- a/yql/essentials/tests/sql/minirun/part5/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part5/canondata/result.json @@ -1098,9 +1098,9 @@ ], "test.test[match_recognize-alerts-default.txt-Debug]": [ { - "checksum": "17ab30106ff230f4c490cadb8641e3c8", - "size": 5965, - "uri": "https://{canondata_backend}/1936947/73a52333839d1b24061b13e03914d1ed469097ed/resource.tar.gz#test.test_match_recognize-alerts-default.txt-Debug_/opt.yql" + "checksum": "9f76da8aa8f0d280d8a98e4eb3f892b1", + "size": 6023, + "uri": "https://{canondata_backend}/1130705/e4b7d47d9742325daff68c922887a7262afbacba/resource.tar.gz#test.test_match_recognize-alerts-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-alerts-default.txt-Results]": [ @@ -1112,9 +1112,9 @@ ], "test.test[match_recognize-all_rows_per_match-default.txt-Debug]": [ { - "checksum": "94481db077832a3635f63f613be474dc", - "size": 3913, - "uri": "https://{canondata_backend}/1942278/297fcbf291f6e0fa588710a14ca3180cf33bc176/resource.tar.gz#test.test_match_recognize-all_rows_per_match-default.txt-Debug_/opt.yql" + "checksum": "d165424c12b99f87fd8aca943b72458e", + "size": 3852, + "uri": "https://{canondata_backend}/1130705/e4b7d47d9742325daff68c922887a7262afbacba/resource.tar.gz#test.test_match_recognize-all_rows_per_match-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-all_rows_per_match-default.txt-Results]": [ diff --git a/yql/essentials/tests/sql/minirun/part6/canondata/result.json b/yql/essentials/tests/sql/minirun/part6/canondata/result.json index 088e4d2a31..268e02f868 100644 --- a/yql/essentials/tests/sql/minirun/part6/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part6/canondata/result.json @@ -849,9 +849,9 @@ ], "test.test[match_recognize-alerts_without_order-default.txt-Debug]": [ { - "checksum": "c724b7b2600d261b02e191689e728f28", - "size": 5813, - "uri": "https://{canondata_backend}/1936947/0ddea11997745cd059fbd894ec6ff75cd45330e7/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Debug_/opt.yql" + "checksum": "594f151e529e79595efe63067ad0cf67", + "size": 5900, + "uri": "https://{canondata_backend}/1775059/35c89b22136cd01420b6cc7ddf223a627fc65f0b/resource.tar.gz#test.test_match_recognize-alerts_without_order-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-alerts_without_order-default.txt-Results]": [ diff --git a/yql/essentials/tests/sql/minirun/part7/canondata/result.json b/yql/essentials/tests/sql/minirun/part7/canondata/result.json index 3adddc5fb4..900975682d 100644 --- a/yql/essentials/tests/sql/minirun/part7/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part7/canondata/result.json @@ -644,16 +644,16 @@ ], "test.test[match_recognize-test_type-streaming-default.txt-Debug]": [ { - "checksum": "5618af42a1caa66d16c80f966e94237f", - "size": 2546, - "uri": "https://{canondata_backend}/1936273/40a052daef6f43b6c825c32f19c5707a9107d4bc/resource.tar.gz#test.test_match_recognize-test_type-streaming-default.txt-Debug_/opt.yql" + "checksum": "78f9d1127f6c378a383e746f8e7ca2b6", + "size": 2582, + "uri": "https://{canondata_backend}/1900335/96f474c83eaaa22aac9f78dae9544c91548b2f2f/resource.tar.gz#test.test_match_recognize-test_type-streaming-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-test_type-streaming-default.txt-Results]": [ { - "checksum": "a426ea78777873f86c83681342304e1a", - "size": 4374, - "uri": "https://{canondata_backend}/1936273/40a052daef6f43b6c825c32f19c5707a9107d4bc/resource.tar.gz#test.test_match_recognize-test_type-streaming-default.txt-Results_/results.txt" + "checksum": "3c7b40f6b2281e1f94394cedd06f6f54", + "size": 5142, + "uri": "https://{canondata_backend}/1900335/96f474c83eaaa22aac9f78dae9544c91548b2f2f/resource.tar.gz#test.test_match_recognize-test_type-streaming-default.txt-Results_/results.txt" } ], "test.test[order_by-order_by_missing_project_column_nosimple--Debug]": [ diff --git a/yql/essentials/tests/sql/minirun/part8/canondata/result.json b/yql/essentials/tests/sql/minirun/part8/canondata/result.json index de9da9c5b2..99429e3c85 100644 --- a/yql/essentials/tests/sql/minirun/part8/canondata/result.json +++ b/yql/essentials/tests/sql/minirun/part8/canondata/result.json @@ -903,9 +903,9 @@ ], "test.test[match_recognize-permute-default.txt-Debug]": [ { - "checksum": "58f5d288f2e3837a813be53a69b0595b", - "size": 2421, - "uri": "https://{canondata_backend}/1817427/e19dd6d5c506c6d882f6e409830437f1245b321b/resource.tar.gz#test.test_match_recognize-permute-default.txt-Debug_/opt.yql" + "checksum": "f33a6938ad6524015fe493f9e45a9a1a", + "size": 2300, + "uri": "https://{canondata_backend}/1900335/3c460c2946ea734b521d2f5710a7c3237ef371b3/resource.tar.gz#test.test_match_recognize-permute-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-permute-default.txt-Results]": [ @@ -917,9 +917,9 @@ ], "test.test[match_recognize-simple_paritioning-streaming-default.txt-Debug]": [ { - "checksum": "5d5202ad383ce1d089d694cc81131011", - "size": 3076, - "uri": "https://{canondata_backend}/1773845/20507d42e246d2393b5ea9633b85501f042faf59/resource.tar.gz#test.test_match_recognize-simple_paritioning-streaming-default.txt-Debug_/opt.yql" + "checksum": "8e6443cf3e5f569f1e06dbf8207d334d", + "size": 3262, + "uri": "https://{canondata_backend}/1900335/3c460c2946ea734b521d2f5710a7c3237ef371b3/resource.tar.gz#test.test_match_recognize-simple_paritioning-streaming-default.txt-Debug_/opt.yql" } ], "test.test[match_recognize-simple_paritioning-streaming-default.txt-Results]": [ diff --git a/yql/essentials/tests/sql/sql2yql/canondata/result.json b/yql/essentials/tests/sql/sql2yql/canondata/result.json index 7ea57dddf1..0d7fe3bcf9 100644 --- a/yql/essentials/tests/sql/sql2yql/canondata/result.json +++ b/yql/essentials/tests/sql/sql2yql/canondata/result.json @@ -4096,86 +4096,93 @@ ], "test_sql2yql.test[match_recognize-after_match_skip_past_last_row]": [ { - "checksum": "060a3ed010fea646088a7a4494c4a505", - "size": 3125, - "uri": "https://{canondata_backend}/1942173/99e88108149e222741552e7e6cddef041d6a2846/resource.tar.gz#test_sql2yql.test_match_recognize-after_match_skip_past_last_row_/sql.yql" + "checksum": "6455d6bae63fe3d95a749a7952863604", + "size": 3104, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-after_match_skip_past_last_row_/sql.yql" } ], "test_sql2yql.test[match_recognize-alerts-streaming]": [ { - "checksum": "19cfd4ac71802d49039da78644667d60", - "size": 9768, - "uri": "https://{canondata_backend}/1775319/f1fa0c55bf9f13cff57cf1c990c2330caed8eb1b/resource.tar.gz#test_sql2yql.test_match_recognize-alerts-streaming_/sql.yql" + "checksum": "55a3af0bf19e627ca6dec04367c179c1", + "size": 10044, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-alerts-streaming_/sql.yql" } ], "test_sql2yql.test[match_recognize-alerts]": [ { - "checksum": "08c72510d05ec8baa8050b2ab75175d3", - "size": 9770, - "uri": "https://{canondata_backend}/1775319/f1fa0c55bf9f13cff57cf1c990c2330caed8eb1b/resource.tar.gz#test_sql2yql.test_match_recognize-alerts_/sql.yql" + "checksum": "abe472c1e4dd4d38ce3af7954c0a4425", + "size": 10046, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-alerts_/sql.yql" } ], "test_sql2yql.test[match_recognize-alerts_without_order]": [ { - "checksum": "d8075bb34a86c528ce47ea2be97a4e86", - "size": 9651, - "uri": "https://{canondata_backend}/1775319/f1fa0c55bf9f13cff57cf1c990c2330caed8eb1b/resource.tar.gz#test_sql2yql.test_match_recognize-alerts_without_order_/sql.yql" + "checksum": "7e6cd1cda9ddc8a2fe0f41ace902517e", + "size": 9927, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-alerts_without_order_/sql.yql" } ], "test_sql2yql.test[match_recognize-all_rows_per_match]": [ { - "checksum": "ee193116db0e897c8a3fbb98a75caaac", - "size": 6732, - "uri": "https://{canondata_backend}/1889210/a0cc9e3113699f51443284ed56291923dfc3735d/resource.tar.gz#test_sql2yql.test_match_recognize-all_rows_per_match_/sql.yql" + "checksum": "4eca671d5cbce0a4457b1b7918ff3a7b", + "size": 6627, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-all_rows_per_match_/sql.yql" } ], "test_sql2yql.test[match_recognize-greedy_quantifiers]": [ { - "checksum": "41e90a3a986f9b2a7a36a83b918667cc", - "size": 4352, - "uri": "https://{canondata_backend}/1942173/99e88108149e222741552e7e6cddef041d6a2846/resource.tar.gz#test_sql2yql.test_match_recognize-greedy_quantifiers_/sql.yql" + "checksum": "a37f9b7a1bbf0e3ab25d265bfb1cdf95", + "size": 4214, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-greedy_quantifiers_/sql.yql" + } + ], + "test_sql2yql.test[match_recognize-measures_aggregate]": [ + { + "checksum": "a5c9c77b24db505963aa1c7af3e9e9ba", + "size": 7401, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-measures_aggregate_/sql.yql" } ], "test_sql2yql.test[match_recognize-permute]": [ { - "checksum": "cf3fb8aecbdea0008e9bacd9025e97c8", - "size": 8409, - "uri": "https://{canondata_backend}/1942173/99e88108149e222741552e7e6cddef041d6a2846/resource.tar.gz#test_sql2yql.test_match_recognize-permute_/sql.yql" + "checksum": "319494095f46f445b1334166cb139412", + "size": 5237, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-permute_/sql.yql" } ], "test_sql2yql.test[match_recognize-simple_paritioning-streaming]": [ { - "checksum": "9971f2943b1f9fcf52dfdc36ceabeaf2", - "size": 5067, - "uri": "https://{canondata_backend}/1775319/f1fa0c55bf9f13cff57cf1c990c2330caed8eb1b/resource.tar.gz#test_sql2yql.test_match_recognize-simple_paritioning-streaming_/sql.yql" + "checksum": "517bd077b852e7e35f5ef61b6ce5ab2e", + "size": 4882, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-simple_paritioning-streaming_/sql.yql" } ], "test_sql2yql.test[match_recognize-simple_paritioning]": [ { - "checksum": "edf5680780dd393593d27f08aff839e2", - "size": 5036, - "uri": "https://{canondata_backend}/1775319/f1fa0c55bf9f13cff57cf1c990c2330caed8eb1b/resource.tar.gz#test_sql2yql.test_match_recognize-simple_paritioning_/sql.yql" + "checksum": "07184284d8c30188d7345a28f0cb4c8d", + "size": 4851, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-simple_paritioning_/sql.yql" } ], "test_sql2yql.test[match_recognize-test_type-streaming]": [ { - "checksum": "1efff3745da1ff0c1c7a784d84b6e773", - "size": 9899, - "uri": "https://{canondata_backend}/1775319/f1fa0c55bf9f13cff57cf1c990c2330caed8eb1b/resource.tar.gz#test_sql2yql.test_match_recognize-test_type-streaming_/sql.yql" + "checksum": "26d2b2f5b48b4f537d82a9c70f2509e5", + "size": 9394, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-test_type-streaming_/sql.yql" } ], "test_sql2yql.test[match_recognize-test_type]": [ { - "checksum": "c3477e1413f59cd6ecff1b7b5b1f7cdc", - "size": 9904, - "uri": "https://{canondata_backend}/1775319/f1fa0c55bf9f13cff57cf1c990c2330caed8eb1b/resource.tar.gz#test_sql2yql.test_match_recognize-test_type_/sql.yql" + "checksum": "9782d694d28decb54e3194bddadcaf87", + "size": 9399, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-test_type_/sql.yql" } ], "test_sql2yql.test[match_recognize-test_type_predicate]": [ { - "checksum": "1578ebf4c45fe97f2acf85bd83c5707d", - "size": 3307, - "uri": "https://{canondata_backend}/1775319/f1fa0c55bf9f13cff57cf1c990c2330caed8eb1b/resource.tar.gz#test_sql2yql.test_match_recognize-test_type_predicate_/sql.yql" + "checksum": "bebea49359906f66cdfff88c13f467ec", + "size": 3221, + "uri": "https://{canondata_backend}/1920236/5e37b541c71c89b1b95dee0463a5a2e9bc5999f4/resource.tar.gz#test_sql2yql.test_match_recognize-test_type_predicate_/sql.yql" } ], "test_sql2yql.test[optimizers-and_absorption]": [ @@ -10087,6 +10094,11 @@ "uri": "file://test_sql_format.test_match_recognize-greedy_quantifiers_/formatted.sql" } ], + "test_sql_format.test[match_recognize-measures_aggregate]": [ + { + "uri": "file://test_sql_format.test_match_recognize-measures_aggregate_/formatted.sql" + } + ], "test_sql_format.test[match_recognize-permute]": [ { "uri": "file://test_sql_format.test_match_recognize-permute_/formatted.sql" @@ -10800,6 +10812,13 @@ "uri": "https://{canondata_backend}/1937429/434276f26b2857be3c5ad3fdbbf877d2bf775ac5/resource.tar.gz#test_sql_negative.test_action-parallel_for_values-default.txt_/err_file.out" } ], + "test_sql_negative.test[aggr_factory-fail_on_match_recognize_specific-default.txt]": [ + { + "checksum": "2eaa5b85aec56e50ab4209c4508ff432", + "size": 366, + "uri": "https://{canondata_backend}/1817427/6ce5bfb2e802fb52a4df6bc050f6fcc1af3305e1/resource.tar.gz#test_sql_negative.test_aggr_factory-fail_on_match_recognize_specific-default.txt_/err_file.out" + } + ], "test_sql_negative.test[aggregate-fail_group_by_struct_member-default.txt]": [ { "checksum": "18d827f5fa448873f5b2ad05b664be70", @@ -10807,6 +10826,13 @@ "uri": "https://{canondata_backend}/1937429/434276f26b2857be3c5ad3fdbbf877d2bf775ac5/resource.tar.gz#test_sql_negative.test_aggregate-fail_group_by_struct_member-default.txt_/err_file.out" } ], + "test_sql_negative.test[aggregate-fail_on_match_recognize_specific-default.txt]": [ + { + "checksum": "d4b1dfac0c7ded90c1a932c0bca076a9", + "size": 252, + "uri": "https://{canondata_backend}/1817427/6ce5bfb2e802fb52a4df6bc050f6fcc1af3305e1/resource.tar.gz#test_sql_negative.test_aggregate-fail_on_match_recognize_specific-default.txt_/err_file.out" + } + ], "test_sql_negative.test[aggregate-having_without_aggregation-default.txt]": [ { "checksum": "59ad103860c119f608fcecc67fdf1973", @@ -11024,6 +11050,13 @@ "uri": "https://{canondata_backend}/1937429/434276f26b2857be3c5ad3fdbbf877d2bf775ac5/resource.tar.gz#test_sql_negative.test_lambda-lambda_no_dollar_assign-default.txt_/err_file.out" } ], + "test_sql_negative.test[match_recognize-measures_aggr_factory-default.txt]": [ + { + "checksum": "e66a11807db413ca11443f09d81f13f7", + "size": 209, + "uri": "https://{canondata_backend}/1899731/ca0a5d07c2713bd45430516401913aa15dee4ba5/resource.tar.gz#test_sql_negative.test_match_recognize-measures_aggr_factory-default.txt_/err_file.out" + } + ], "test_sql_negative.test[order_by-order_by_subquery-default.txt]": [ { "checksum": "373a17f049d856da759cae5496b6c772", diff --git a/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_match_recognize-measures_aggregate_/formatted.sql b/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_match_recognize-measures_aggregate_/formatted.sql new file mode 100644 index 0000000000..3178e73e9c --- /dev/null +++ b/yql/essentials/tests/sql/sql2yql/canondata/test_sql_format.test_match_recognize-measures_aggregate_/formatted.sql @@ -0,0 +1,49 @@ +PRAGMA FeatureR010 = 'prototype'; + +$input = ( + SELECT + * + FROM + AS_TABLE([ + <|time: 0, value: 1u, name: 'A'|>, + <|time: 100, value: 2u, name: 'A'|>, + <|time: 200, value: 3u, name: 'B'|>, + <|time: 300, value: 3u, name: 'B'|>, + <|time: 400, value: 4u, name: 'A'|>, + <|time: 500, value: 5u, name: 'A'|>, + ]) +); + +SELECT + * +FROM + $input MATCH_RECOGNIZE ( + ORDER BY + CAST(time AS Timestamp) + MEASURES + SUM(A.value + 1u + LENGTH(A.name)) AS aggr_expr, + FIRST(A.value) AS first_a, + LAST(A.value) AS last_a, + COUNT(A.value) AS count_a, + COUNT(DISTINCT A.value) AS count_distinct_a, + AGGREGATE_LIST(A.value) AS aggrlist_a, + AGGREGATE_LIST_DISTINCT(A.value) AS aggrlist_distinct_a, + FIRST(B.value) AS first_b, + LAST(B.value) AS last_b, + COUNT(B.value) AS count_b, + COUNT(DISTINCT B.value) AS count_distinct_b, + AGGREGATE_LIST(B.value) AS aggrlist_b, + AGGREGATE_LIST_DISTINCT(B.value) AS aggrlist_distinct_b, + FIRST(C.value) AS first_c, + LAST(C.value) AS last_c, + COUNT(C.value) AS count_c, + COUNT(DISTINCT C.value) AS count_distinct_c, + AGGREGATE_LIST(C.value) AS aggrlist_c, + AGGREGATE_LIST_DISTINCT(C.value) AS aggrlist_distinct_c + PATTERN (A * B C * B A *) + DEFINE + A AS A.name == 'A' AND COALESCE(FIRST(B.value + 1u + LENGTH(B.name)) == 5, TRUE), + B AS B.name == 'B' AND FIRST(A.value + 1u + LENGTH(A.name)) == 3, + C AS C.name == 'C' + ) +; diff --git a/yql/essentials/tests/sql/suites/aggr_factory/fail_on_match_recognize_specific.sqlx b/yql/essentials/tests/sql/suites/aggr_factory/fail_on_match_recognize_specific.sqlx new file mode 100644 index 0000000000..92059f0a23 --- /dev/null +++ b/yql/essentials/tests/sql/suites/aggr_factory/fail_on_match_recognize_specific.sqlx @@ -0,0 +1,14 @@ +$input = SELECT * FROM AS_TABLE([ + <|time: 0, value: 1u, name: "A"|>, + <|time: 100, value: 2u, name: "A"|>, + <|time: 200, value: 3u, name: "B"|>, + <|time: 300, value: 3u, name: "B"|>, + <|time: 400, value: 4u, name: "A"|>, + <|time: 500, value: 5u, name: "A"|>, +]); + +SELECT + AGGREGATE_BY(value, AggregationFactory("first")) as first, + AGGREGATE_BY(value, AggregationFactory("last")) as last +FROM $input +GROUP BY name; diff --git a/yql/essentials/tests/sql/suites/aggregate/fail_on_match_recognize_specific.sqlx b/yql/essentials/tests/sql/suites/aggregate/fail_on_match_recognize_specific.sqlx new file mode 100644 index 0000000000..79b8bf01ac --- /dev/null +++ b/yql/essentials/tests/sql/suites/aggregate/fail_on_match_recognize_specific.sqlx @@ -0,0 +1,14 @@ +$input = SELECT * FROM AS_TABLE([ + <|time: 0, value: 1u, name: "A"|>, + <|time: 100, value: 2u, name: "A"|>, + <|time: 200, value: 3u, name: "B"|>, + <|time: 300, value: 3u, name: "B"|>, + <|time: 400, value: 4u, name: "A"|>, + <|time: 500, value: 5u, name: "A"|>, +]); + +SELECT + FIRST(value) as first, + LAST(value) as last +FROM $input +GROUP BY name; diff --git a/yql/essentials/tests/sql/suites/match_recognize/measures_aggr_factory.sqlx b/yql/essentials/tests/sql/suites/match_recognize/measures_aggr_factory.sqlx new file mode 100644 index 0000000000..7d74c6792f --- /dev/null +++ b/yql/essentials/tests/sql/suites/match_recognize/measures_aggr_factory.sqlx @@ -0,0 +1,14 @@ +PRAGMA FeatureR010="prototype"; + +$input = SELECT * FROM AS_TABLE([ + <|time: 0, value: 1u, name: "A"|>, +]); + +SELECT * FROM $input MATCH_RECOGNIZE ( + ORDER BY CAST(time AS Timestamp) + MEASURES + AGGREGATE_BY(A.value, AggregationFactory("sum")) AS aggr_factory_expr + PATTERN (A) + DEFINE + A AS A.name = "A" +); diff --git a/yql/essentials/tests/sql/suites/match_recognize/measures_aggregate.sql b/yql/essentials/tests/sql/suites/match_recognize/measures_aggregate.sql new file mode 100644 index 0000000000..eec70ff8bc --- /dev/null +++ b/yql/essentials/tests/sql/suites/match_recognize/measures_aggregate.sql @@ -0,0 +1,39 @@ +PRAGMA FeatureR010="prototype"; + +$input = SELECT * FROM AS_TABLE([ + <|time: 0, value: 1u, name: "A"|>, + <|time: 100, value: 2u, name: "A"|>, + <|time: 200, value: 3u, name: "B"|>, + <|time: 300, value: 3u, name: "B"|>, + <|time: 400, value: 4u, name: "A"|>, + <|time: 500, value: 5u, name: "A"|>, +]); + +SELECT * FROM $input MATCH_RECOGNIZE ( + ORDER BY CAST(time AS Timestamp) + MEASURES + SUM(A.value + 1u + LENGTH(A.name)) AS aggr_expr, + FIRST(A.value) AS first_a, + LAST(A.value) AS last_a, + COUNT(A.value) AS count_a, + COUNT(DISTINCT A.value) AS count_distinct_a, + AGGREGATE_LIST(A.value) AS aggrlist_a, + AGGREGATE_LIST_DISTINCT(A.value) AS aggrlist_distinct_a, + FIRST(B.value) AS first_b, + LAST(B.value) AS last_b, + COUNT(B.value) AS count_b, + COUNT(DISTINCT B.value) AS count_distinct_b, + AGGREGATE_LIST(B.value) AS aggrlist_b, + AGGREGATE_LIST_DISTINCT(B.value) AS aggrlist_distinct_b, + FIRST(C.value) AS first_c, + LAST(C.value) AS last_c, + COUNT(C.value) AS count_c, + COUNT(DISTINCT C.value) AS count_distinct_c, + AGGREGATE_LIST(C.value) AS aggrlist_c, + AGGREGATE_LIST_DISTINCT(C.value) AS aggrlist_distinct_c + PATTERN (A* B C* B A*) + DEFINE + A AS A.name = "A" AND COALESCE(FIRST(B.value + 1u + LENGTH(B.name)) = 5, TRUE), + B AS B.name = "B" AND FIRST(A.value + 1u + LENGTH(A.name)) = 3, + C AS C.name = "C" +); |