summaryrefslogtreecommitdiffstats
path: root/yql/essentials/minikql/jsonpath/parser/ast_builder.cpp
diff options
context:
space:
mode:
authorvvvv <[email protected]>2024-12-18 21:57:17 +0300
committervvvv <[email protected]>2024-12-18 22:58:34 +0300
commitf7f8a1dcd4b4b4a1af7acc6d8127d285c3850795 (patch)
treec22ca71d9eb197030fbd946452f4aadf613df95c /yql/essentials/minikql/jsonpath/parser/ast_builder.cpp
parent9690d79e797ada85de4a648fbc1e386ea4621efe (diff)
Extracted parser from jsonpath lib
init commit_hash:8c6dd46ee72034f0480757612dcceb524d19a1d1
Diffstat (limited to 'yql/essentials/minikql/jsonpath/parser/ast_builder.cpp')
-rw-r--r--yql/essentials/minikql/jsonpath/parser/ast_builder.cpp500
1 files changed, 500 insertions, 0 deletions
diff --git a/yql/essentials/minikql/jsonpath/parser/ast_builder.cpp b/yql/essentials/minikql/jsonpath/parser/ast_builder.cpp
new file mode 100644
index 00000000000..2cfecb94310
--- /dev/null
+++ b/yql/essentials/minikql/jsonpath/parser/ast_builder.cpp
@@ -0,0 +1,500 @@
+#include "ast_builder.h"
+#include "ast_nodes.h"
+#include "parse_double.h"
+
+#include <yql/essentials/core/issue/protos/issue_id.pb.h>
+#include <yql/essentials/minikql/jsonpath/rewrapper/proto/serialization.pb.h>
+#include <yql/essentials/ast/yql_ast_escaping.h>
+
+#include <util/generic/singleton.h>
+#include <util/system/compiler.h>
+#include <util/string/cast.h>
+#include <util/string/builder.h>
+#include <util/charset/utf8.h>
+#include <util/system/cpu_id.h>
+
+#include <cmath>
+
+using namespace NYql;
+using namespace NYql::NJsonPath;
+using namespace NJsonPathGenerated;
+using namespace NReWrapper;
+
+namespace {
+
+constexpr ui32 RegexpLibId = NReWrapper::TSerialization::YDB_REWRAPPER_LIB_ID;
+
+TPosition GetPos(const TToken& token) {
+ return TPosition(token.GetColumn(), token.GetLine());
+}
+
+bool TryStringContent(const TString& str, TString& result, TString& error, bool onlyDoubleQuoted = true) {
+ result.clear();
+ error.clear();
+
+ const bool doubleQuoted = str.StartsWith('"') && str.EndsWith('"');
+ const bool singleQuoted = str.StartsWith('\'') && str.EndsWith('\'');
+ if (!doubleQuoted && !singleQuoted) {
+ error = "String must be quoted";
+ return false;
+ }
+ if (singleQuoted && onlyDoubleQuoted) {
+ error = "Only double quoted strings allowed";
+ return false;
+ }
+
+ char quoteChar = doubleQuoted ? '"' : '\'';
+ size_t readBytes = 0;
+ TStringBuf atom(str);
+ atom.Skip(1);
+ TStringOutput sout(result);
+ result.reserve(str.size());
+
+ auto unescapeResult = UnescapeArbitraryAtom(atom, quoteChar, &sout, &readBytes);
+
+ if (unescapeResult == EUnescapeResult::OK) {
+ return true;
+ } else {
+ error = UnescapeResultToString(unescapeResult);
+ return false;
+ }
+}
+
+}
+
+TAstBuilder::TAstBuilder(TIssues& issues)
+ : Issues(issues)
+{
+}
+
+void TAstBuilder::Error(TPosition pos, const TStringBuf message) {
+ Issues.AddIssue(pos, message);
+ Issues.back().SetCode(TIssuesIds::JSONPATH_PARSE_ERROR, TSeverityIds::S_ERROR);
+}
+
+TArrayAccessNode::TSubscript TAstBuilder::BuildArraySubscript(const TRule_array_subscript& node) {
+ TAstNodePtr from = BuildExpr(node.GetRule_expr1());
+ TAstNodePtr to = nullptr;
+ if (node.HasBlock2()) {
+ to = BuildExpr(node.GetBlock2().GetRule_expr2());
+ }
+ return {from, to};
+}
+
+TAstNodePtr TAstBuilder::BuildArrayAccessor(const TRule_array_accessor& node, TAstNodePtr input) {
+ TVector<TArrayAccessNode::TSubscript> subscripts;
+ subscripts.reserve(1 + node.Block3Size());
+
+ subscripts.push_back(BuildArraySubscript(node.GetRule_array_subscript2()));
+ for (size_t i = 0; i < node.Block3Size(); i++) {
+ subscripts.push_back(BuildArraySubscript(node.GetBlock3(i).GetRule_array_subscript2()));
+ }
+
+ return new TArrayAccessNode(GetPos(node.GetToken1()), subscripts, input);
+}
+
+TAstNodePtr TAstBuilder::BuildWildcardArrayAccessor(const TRule_wildcard_array_accessor& node, TAstNodePtr input) {
+ return new TWildcardArrayAccessNode(GetPos(node.GetToken1()), input);
+}
+
+TString TAstBuilder::BuildIdentifier(const TRule_identifier& node) {
+ switch (node.GetAltCase()) {
+ case TRule_identifier::kAltIdentifier1:
+ return node.GetAlt_identifier1().GetToken1().GetValue();
+ case TRule_identifier::kAltIdentifier2:
+ return node.GetAlt_identifier2().GetRule_keyword1().GetToken1().GetValue();
+ case TRule_identifier::ALT_NOT_SET:
+ Y_ABORT("Alternative for 'identifier' rule is not set");
+ }
+}
+
+TAstNodePtr TAstBuilder::BuildMemberAccessor(const TRule_member_accessor& node, TAstNodePtr input) {
+ TString name;
+ const auto& nameBlock = node.GetBlock2();
+ switch (nameBlock.GetAltCase()) {
+ case TRule_member_accessor_TBlock2::kAlt1:
+ name = BuildIdentifier(nameBlock.GetAlt1().GetRule_identifier1());
+ break;
+ case TRule_member_accessor_TBlock2::kAlt2: {
+ const auto& token = nameBlock.GetAlt2().GetToken1();
+ TString error;
+ if (!TryStringContent(token.GetValue(), name, error, /* onlyDoubleQuoted */ false)) {
+ Error(GetPos(token), error);
+ return nullptr;
+ }
+ break;
+ }
+ case TRule_member_accessor_TBlock2::ALT_NOT_SET:
+ Y_ABORT("Alternative for 'member_accessor' rule is not set");
+ }
+
+ return new TMemberAccessNode(GetPos(node.GetToken1()), name, input);
+}
+
+TAstNodePtr TAstBuilder::BuildWildcardMemberAccessor(const TRule_wildcard_member_accessor& node, TAstNodePtr input) {
+ const auto& token = node.GetToken2();
+ return new TWildcardMemberAccessNode(GetPos(token), input);
+}
+
+TAstNodePtr TAstBuilder::BuildFilter(const TRule_filter& node, TAstNodePtr input) {
+ const auto predicate = BuildExpr(node.GetRule_expr3());
+ return new TFilterPredicateNode(GetPos(node.GetToken2()), predicate, input);
+}
+
+TAstNodePtr TAstBuilder::BuildMethod(const TRule_method& node, TAstNodePtr input) {
+ const auto& token = node.GetToken2();
+ const auto pos = GetPos(token);
+ const auto& value = token.GetValue();
+ auto type = EMethodType::Double;
+ if (value == "abs") {
+ type = EMethodType::Abs;
+ } else if (value == "floor") {
+ type = EMethodType::Floor;
+ } else if (value == "ceiling") {
+ type = EMethodType::Ceiling;
+ } else if (value == "type") {
+ type = EMethodType::Type;
+ } else if (value == "size") {
+ type = EMethodType::Size;
+ } else if (value == "keyvalue") {
+ type = EMethodType::KeyValue;
+ }
+
+ return new TMethodCallNode(pos, type, input);
+}
+
+TAstNodePtr TAstBuilder::BuildAccessorOp(const TRule_accessor_op& node, TAstNodePtr input) {
+ switch (node.GetAltCase()) {
+ case TRule_accessor_op::kAltAccessorOp1:
+ return BuildMemberAccessor(node.GetAlt_accessor_op1().GetRule_member_accessor1(), input);
+ case TRule_accessor_op::kAltAccessorOp2:
+ return BuildWildcardMemberAccessor(node.GetAlt_accessor_op2().GetRule_wildcard_member_accessor1(), input);
+ case TRule_accessor_op::kAltAccessorOp3:
+ return BuildArrayAccessor(node.GetAlt_accessor_op3().GetRule_array_accessor1(), input);
+ case TRule_accessor_op::kAltAccessorOp4:
+ return BuildWildcardArrayAccessor(node.GetAlt_accessor_op4().GetRule_wildcard_array_accessor1(), input);
+ case TRule_accessor_op::kAltAccessorOp5:
+ return BuildFilter(node.GetAlt_accessor_op5().GetRule_filter1(), input);
+ case TRule_accessor_op::kAltAccessorOp6:
+ return BuildMethod(node.GetAlt_accessor_op6().GetRule_method1(), input);
+ case TRule_accessor_op::ALT_NOT_SET:
+ Y_ABORT("Alternative for 'accessor_op' rule is not set");
+ }
+}
+
+TAstNodePtr TAstBuilder::BuildPrimary(const TRule_primary& node) {
+ switch (node.GetAltCase()) {
+ case TRule_primary::kAltPrimary1: {
+ const auto& token = node.GetAlt_primary1().GetToken1();
+ const auto& numberString = token.GetValue();
+ const double parsedValue = ParseDouble(numberString);
+ if (Y_UNLIKELY(std::isnan(parsedValue))) {
+ Y_ABORT("Invalid number was allowed by JsonPath grammar");
+ }
+ if (Y_UNLIKELY(std::isinf(parsedValue))) {
+ Error(GetPos(token), "Number literal is infinity");
+ return nullptr;
+ }
+ return new TNumberLiteralNode(GetPos(token), parsedValue);
+ }
+ case TRule_primary::kAltPrimary2: {
+ const auto& token = node.GetAlt_primary2().GetToken1();
+ return new TContextObjectNode(GetPos(token));
+ }
+ case TRule_primary::kAltPrimary3: {
+ const auto& token = node.GetAlt_primary3().GetToken1();
+ return new TLastArrayIndexNode(GetPos(token));
+ }
+ case TRule_primary::kAltPrimary4: {
+ const auto& primary = node.GetAlt_primary4().GetBlock1();
+ const auto input = BuildExpr(primary.GetRule_expr2());
+ if (primary.HasBlock4()) {
+ const auto& token = primary.GetBlock4().GetToken1();
+ return new TIsUnknownPredicateNode(GetPos(token), input);
+ }
+ return input;
+ }
+ case TRule_primary::kAltPrimary5: {
+ const auto& token = node.GetAlt_primary5().GetToken1();
+ return new TVariableNode(GetPos(token), token.GetValue().substr(1));
+ }
+ case TRule_primary::kAltPrimary6: {
+ const auto& token = node.GetAlt_primary6().GetToken1();
+ return new TBooleanLiteralNode(GetPos(token), true);
+ }
+ case TRule_primary::kAltPrimary7: {
+ const auto& token = node.GetAlt_primary7().GetToken1();
+ return new TBooleanLiteralNode(GetPos(token), false);
+ }
+ case TRule_primary::kAltPrimary8: {
+ const auto& token = node.GetAlt_primary8().GetToken1();
+ return new TNullLiteralNode(GetPos(token));
+ }
+ case TRule_primary::kAltPrimary9: {
+ const auto& token = node.GetAlt_primary9().GetToken1();
+ TString value;
+ TString error;
+ if (!TryStringContent(token.GetValue(), value, error)) {
+ Error(GetPos(token), error);
+ return nullptr;
+ }
+ return new TStringLiteralNode(GetPos(token), value);
+ }
+ case TRule_primary::kAltPrimary10: {
+ const auto& token = node.GetAlt_primary10().GetToken1();
+ return new TFilterObjectNode(GetPos(token));
+ }
+ case TRule_primary::ALT_NOT_SET:
+ Y_ABORT("Alternative for 'primary' rule is not set");
+ }
+}
+
+TAstNodePtr TAstBuilder::BuildAccessorExpr(const TRule_accessor_expr& node) {
+ TAstNodePtr input = BuildPrimary(node.GetRule_primary1());
+ for (size_t i = 0; i < node.Block2Size(); i++) {
+ input = BuildAccessorOp(node.GetBlock2(i).GetRule_accessor_op1(), input);
+ }
+ return input;
+}
+
+TAstNodePtr TAstBuilder::BuildPlainExpr(const TRule_plain_expr& node) {
+ return BuildAccessorExpr(node.GetRule_accessor_expr1());
+}
+
+TAstNodePtr TAstBuilder::BuildLikeRegexExpr(const TRule_like_regex_expr& node, TAstNodePtr input) {
+ const auto& regexToken = node.GetToken2();
+ TString regex;
+ TString error;
+ if (!TryStringContent(regexToken.GetValue(), regex, error)) {
+ Error(GetPos(regexToken), error);
+ return nullptr;
+ }
+
+ ui32 parsedFlags = 0;
+ if (node.HasBlock3()) {
+ TString flags;
+ const auto& flagsToken = node.GetBlock3().GetToken2();
+ if (!TryStringContent(flagsToken.GetValue(), flags, error)) {
+ Error(GetPos(flagsToken), error);
+ return nullptr;
+ }
+
+ for (char flag : flags) {
+ switch (flag) {
+ case 'i':
+ parsedFlags |= FLAGS_CASELESS;
+ break;
+ default:
+ Error(GetPos(flagsToken), TStringBuilder() << "Unsupported regex flag '" << flag << "'");
+ break;
+ }
+ }
+ }
+
+ IRePtr compiledRegex;
+ try {
+ compiledRegex = NDispatcher::Compile(regex, parsedFlags,
+ NDispatcher::Has(RegexpLibId) ? RegexpLibId : TSerialization::kRe2);
+ } catch (const NReWrapper::TCompileException& e) {
+ Error(GetPos(regexToken), e.AsStrBuf());
+ return nullptr;
+ }
+
+ return new TLikeRegexPredicateNode(GetPos(node.GetToken1()), input, std::move(compiledRegex));
+}
+
+TAstNodePtr TAstBuilder::BuildPredicateExpr(const TRule_predicate_expr& node) {
+ switch (node.GetAltCase()) {
+ case TRule_predicate_expr::kAltPredicateExpr1: {
+ const auto& predicate = node.GetAlt_predicate_expr1().GetBlock1();
+ const auto input = BuildPlainExpr(predicate.GetRule_plain_expr1());
+ if (!predicate.HasBlock2()) {
+ return input;
+ }
+
+ const auto& block = predicate.GetBlock2();
+ switch (block.GetAltCase()) {
+ case TRule_predicate_expr_TAlt1_TBlock1_TBlock2::kAlt1: {
+ const auto& innerBlock = block.GetAlt1().GetRule_starts_with_expr1();
+ const auto& prefix = BuildPlainExpr(innerBlock.GetRule_plain_expr3());
+ return new TStartsWithPredicateNode(GetPos(innerBlock.GetToken1()), input, prefix);
+ }
+ case TRule_predicate_expr_TAlt1_TBlock1_TBlock2::kAlt2: {
+ return BuildLikeRegexExpr(block.GetAlt2().GetRule_like_regex_expr1(), input);
+ }
+ case TRule_predicate_expr_TAlt1_TBlock1_TBlock2::ALT_NOT_SET:
+ Y_ABORT("Alternative for inner block of 'predicate_expr' rule is not set");
+ }
+ Y_UNREACHABLE();
+ }
+ case TRule_predicate_expr::kAltPredicateExpr2: {
+ const auto& predicate = node.GetAlt_predicate_expr2().GetBlock1();
+ const auto input = BuildExpr(predicate.GetRule_expr3());
+ return new TExistsPredicateNode(GetPos(predicate.GetToken1()), input);
+ }
+ case TRule_predicate_expr::ALT_NOT_SET:
+ Y_ABORT("Alternative for 'predicate' rule is not set");
+ }
+ Y_UNREACHABLE();
+}
+
+TAstNodePtr TAstBuilder::BuildUnaryExpr(const TRule_unary_expr& node) {
+ const auto predicateExpr = BuildPredicateExpr(node.GetRule_predicate_expr2());
+ if (!node.HasBlock1()) {
+ return predicateExpr;
+ }
+
+ const auto& opToken = node.GetBlock1().GetToken1();
+ const auto& opValue = opToken.GetValue();
+ auto operation = EUnaryOperation::Plus;
+ if (opValue == "-") {
+ operation = EUnaryOperation::Minus;
+ } else if (opValue == "!") {
+ operation = EUnaryOperation::Not;
+ }
+ return new TUnaryOperationNode(GetPos(opToken), operation, predicateExpr);
+}
+
+TAstNodePtr TAstBuilder::BuildMulExpr(const TRule_mul_expr& node) {
+ TAstNodePtr result = BuildUnaryExpr(node.GetRule_unary_expr1());
+
+ for (size_t i = 0; i < node.Block2Size(); i++) {
+ const auto& block = node.GetBlock2(i);
+
+ const auto& opToken = block.GetToken1();
+ const auto& opValue = opToken.GetValue();
+ auto operation = EBinaryOperation::Multiply;
+ if (opValue == "/") {
+ operation = EBinaryOperation::Divide;
+ } else if (opValue == "%") {
+ operation = EBinaryOperation::Modulo;
+ }
+
+ const auto rightOperand = BuildUnaryExpr(block.GetRule_unary_expr2());
+ result = new TBinaryOperationNode(GetPos(opToken), operation, result, rightOperand);
+ }
+
+ return result;
+}
+
+TAstNodePtr TAstBuilder::BuildAddExpr(const TRule_add_expr& node) {
+ TAstNodePtr result = BuildMulExpr(node.GetRule_mul_expr1());
+
+ for (size_t i = 0; i < node.Block2Size(); i++) {
+ const auto& block = node.GetBlock2(i);
+
+ const auto& opToken = block.GetToken1();
+ auto operation = EBinaryOperation::Add;
+ if (opToken.GetValue() == "-") {
+ operation = EBinaryOperation::Substract;
+ }
+
+ const auto rightOperand = BuildMulExpr(block.GetRule_mul_expr2());
+ result = new TBinaryOperationNode(GetPos(opToken), operation, result, rightOperand);
+ }
+
+ return result;
+}
+
+TAstNodePtr TAstBuilder::BuildCompareExpr(const TRule_compare_expr& node) {
+ TAstNodePtr result = BuildAddExpr(node.GetRule_add_expr1());
+
+ if (node.HasBlock2()) {
+ const auto& block = node.GetBlock2();
+
+ const auto& opToken = block.GetToken1();
+ const auto& opValue = opToken.GetValue();
+ auto operation = EBinaryOperation::Less;
+ if (opValue == "<=") {
+ operation = EBinaryOperation::LessEqual;
+ } else if (opValue == ">") {
+ operation = EBinaryOperation::Greater;
+ } else if (opValue == ">=") {
+ operation = EBinaryOperation::GreaterEqual;
+ }
+
+ const auto rightOperand = BuildAddExpr(block.GetRule_add_expr2());
+ result = new TBinaryOperationNode(GetPos(opToken), operation, result, rightOperand);
+ }
+
+ return result;
+}
+
+TAstNodePtr TAstBuilder::BuildEqualExpr(const TRule_equal_expr& node) {
+ TAstNodePtr result = BuildCompareExpr(node.GetRule_compare_expr1());
+
+ if (node.HasBlock2()) {
+ const auto& block = node.GetBlock2();
+
+ const auto& opToken = block.GetToken1();
+ const auto& opValue = opToken.GetValue();
+ auto operation = EBinaryOperation::Equal;
+ if (opValue == "<>" || opValue == "!=") {
+ operation = EBinaryOperation::NotEqual;
+ }
+
+ const auto rightOperand = BuildCompareExpr(block.GetRule_compare_expr2());
+ result = new TBinaryOperationNode(GetPos(opToken), operation, result, rightOperand);
+ }
+
+ return result;
+}
+
+TAstNodePtr TAstBuilder::BuildAndExpr(const TRule_and_expr& node) {
+ TAstNodePtr result = BuildEqualExpr(node.GetRule_equal_expr1());
+
+ for (size_t i = 0; i < node.Block2Size(); i++) {
+ const auto& block = node.GetBlock2(i);
+
+ const auto& opToken = block.GetToken1();
+ const auto rightOperand = BuildEqualExpr(block.GetRule_equal_expr2());
+ result = new TBinaryOperationNode(GetPos(opToken), EBinaryOperation::And, result, rightOperand);
+ }
+
+ return result;
+}
+
+TAstNodePtr TAstBuilder::BuildOrExpr(const TRule_or_expr& node) {
+ TAstNodePtr result = BuildAndExpr(node.GetRule_and_expr1());
+
+ for (size_t i = 0; i < node.Block2Size(); i++) {
+ const auto& block = node.GetBlock2(i);
+
+ const auto& opToken = block.GetToken1();
+ const auto rightOperand = BuildAndExpr(block.GetRule_and_expr2());
+ result = new TBinaryOperationNode(GetPos(opToken), EBinaryOperation::Or, result, rightOperand);
+ }
+
+ return result;
+}
+
+TAstNodePtr TAstBuilder::BuildExpr(const TRule_expr& node) {
+ return BuildOrExpr(node.GetRule_or_expr1());
+}
+
+TAstNodePtr TAstBuilder::BuildJsonPath(const TRule_jsonpath& node) {
+ TPosition pos;
+ auto mode = EJsonPathMode::Lax;
+ if (node.HasBlock1()) {
+ const auto& modeToken = node.GetBlock1().GetToken1();
+ pos = GetPos(modeToken);
+ if (modeToken.GetValue() == "strict") {
+ mode = EJsonPathMode::Strict;
+ }
+ }
+
+ const auto expr = BuildExpr(node.GetRule_expr2());
+ return new TRootNode(pos, expr, mode);
+}
+
+TAstNodePtr TAstBuilder::Build(const TJsonPathParserAST& ast) {
+ return BuildJsonPath(ast.GetRule_jsonpath());
+}
+
+namespace NYql::NJsonPath {
+
+ui32 GetReLibId() {
+ return RegexpLibId;
+}
+
+}