summaryrefslogtreecommitdiffstats
path: root/yql/essentials/ast/yql_ast.cpp
diff options
context:
space:
mode:
authorvvvv <[email protected]>2024-11-06 10:40:15 +0300
committervvvv <[email protected]>2024-11-06 10:50:39 +0300
commit8cf98f8169af124576399a29eac2bc2a691124e3 (patch)
tree54260d85e28822402e0d17d1e840ccfb62b33b61 /yql/essentials/ast/yql_ast.cpp
parent13cc9ffc62224711fd2923aed53525fc7d1838f8 (diff)
Moved yql/ast YQL-19206
init commit_hash:a6a63582073784b9318cc04ffcc1e212f3df703b
Diffstat (limited to 'yql/essentials/ast/yql_ast.cpp')
-rw-r--r--yql/essentials/ast/yql_ast.cpp665
1 files changed, 665 insertions, 0 deletions
diff --git a/yql/essentials/ast/yql_ast.cpp b/yql/essentials/ast/yql_ast.cpp
new file mode 100644
index 00000000000..4ba247130ba
--- /dev/null
+++ b/yql/essentials/ast/yql_ast.cpp
@@ -0,0 +1,665 @@
+#include "yql_ast.h"
+#include "yql_ast_escaping.h"
+
+#include <util/string/builder.h>
+#include <util/system/compiler.h>
+#include <library/cpp/containers/stack_vector/stack_vec.h>
+#include <yql/essentials/utils/utf8.h>
+
+#include <cstdlib>
+
+namespace NYql {
+
+namespace {
+
+ inline bool IsWhitespace(char c) {
+ return c == ' ' || c == '\n' || c == '\r' || c == '\t';
+ }
+
+ inline bool IsListStart(char c) { return c == '('; }
+ inline bool IsListEnd(char c) { return c == ')'; }
+ inline bool IsCommentStart(char c) { return c == '#'; }
+ inline bool IsQuoteStart(char c) { return c == '\''; }
+ inline bool IsStringStart(char c) { return c == '"'; }
+ inline bool IsHexStringStart(char c) { return c == 'x'; }
+ inline bool IsMultilineStringStart(char c) { return c == '@'; }
+
+ inline bool NeedEscaping(const TStringBuf& str) {
+ for (char ch: str) {
+ if (IsWhitespace(ch) || IsListStart(ch) ||
+ IsListEnd(ch) || IsCommentStart(ch) ||
+ IsQuoteStart(ch) || IsStringStart(ch) ||
+ !isprint(ch & 0xff))
+ {
+ return true;
+ }
+ }
+
+ return str.empty();
+ }
+
+ ///////////////////////////////////////////////////////////////////////////
+ // TAstParser
+ ///////////////////////////////////////////////////////////////////////////
+ class TAstParserContext {
+ public:
+ inline TAstParserContext(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
+ : Str_(str)
+ , Position_(1, 1, file)
+ , Offset_(0)
+ , Pool_(externalPool)
+ {
+ if (!Pool_) {
+ InnerPool_ = std::make_unique<TMemoryPool>(4096);
+ Pool_ = InnerPool_.get();
+ }
+ }
+
+ inline char Peek() const {
+ Y_ABORT_UNLESS(!AtEnd());
+ return Str_[Offset_];
+ }
+
+ inline bool AtEnd() const {
+ return Str_.size() == Offset_;
+ }
+
+ inline char Next() {
+ Y_ABORT_UNLESS(!AtEnd());
+ char ch = Str_[Offset_];
+ if (ch == '\n') {
+ ++Position_.Row;
+ Position_.Column = 1;
+ } else {
+ ++Position_.Column;
+ }
+
+ ++Offset_;
+ return ch;
+ }
+
+ // stops right afetr stopChar
+ inline void SeekTo(char stopChar) {
+ while (!AtEnd() && Next() != stopChar) {
+ // empty loop
+ }
+ }
+
+ inline TStringBuf GetToken(ui32 begin, ui32 end) {
+ Y_ABORT_UNLESS(end >= begin);
+ return Str_.SubString(begin, end - begin);
+ }
+
+ inline bool IsAtomEnded() {
+ if (AtEnd()) {
+ return true;
+ }
+ char c = Peek();
+ return IsWhitespace(c) || IsListStart(c) || IsListEnd(c);
+ }
+
+ inline const TStringBuf& Str() const { return Str_; }
+ inline ui32 Offset() const { return Offset_; }
+ inline const TPosition& Position() const { return Position_; }
+ inline TMemoryPool& Pool() { return *Pool_; }
+ inline std::unique_ptr<TMemoryPool>&& InnerPool() { return std::move(InnerPool_); }
+
+ private:
+ TStringBuf Str_;
+ TPosition Position_;
+ ui32 Offset_;
+ TMemoryPool* Pool_;
+ std::unique_ptr<TMemoryPool> InnerPool_;
+ };
+
+ ///////////////////////////////////////////////////////////////////////////
+ // TAstParser
+ ///////////////////////////////////////////////////////////////////////////
+ class TAstParser {
+ public:
+ TAstParser(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
+ : Ctx_(str, externalPool, file)
+ {
+ }
+
+ TAstParseResult Parse() {
+ TAstNode* root = nullptr;
+ if (!IsUtf8(Ctx_.Str())) {
+ AddError("Invalid UTF8 input");
+ } else {
+ root = ParseList(0U);
+
+ SkipSpace();
+ if (!Ctx_.AtEnd()) {
+ AddError("Unexpected symbols after end of root list");
+ }
+ }
+
+ TAstParseResult result;
+ if (!Issues_.Empty()) {
+ result.Issues = std::move(Issues_);
+ } else {
+ result.Root = root;
+ result.Pool = Ctx_.InnerPool();
+ }
+ return result;
+ }
+
+ private:
+ inline void AddError(const TString& message) {
+ Issues_.AddIssue(Ctx_.Position(), message);
+ }
+
+ inline void SkipComment() {
+ Ctx_.SeekTo('\n');
+ }
+
+ void SkipSpace() {
+ while (!Ctx_.AtEnd()) {
+ char c = Ctx_.Peek();
+ if (IsWhitespace(c)) {
+ Ctx_.Next();
+ continue;
+ }
+
+ if (IsCommentStart(c)) {
+ SkipComment();
+ continue;
+ }
+
+ break;
+ }
+ }
+
+ TAstNode* ParseList(size_t level) {
+ if (level >= 1000U) {
+ AddError("Too deep graph!");
+ return nullptr;
+ }
+
+ SkipSpace();
+
+ if (Ctx_.AtEnd()) {
+ AddError("Unexpected end");
+ return nullptr;
+ }
+
+ if (!IsListStart(Ctx_.Peek())) {
+ AddError("Expected (");
+ return nullptr;
+ }
+
+ Ctx_.Next();
+
+ TSmallVec<TAstNode*> children;
+ auto listPos = Ctx_.Position();
+ while (true) {
+ SkipSpace();
+
+ if (Ctx_.AtEnd()) {
+ AddError("Expected )");
+ return nullptr;
+ }
+
+ if (IsListEnd(Ctx_.Peek())) {
+ Ctx_.Next();
+ return TAstNode::NewList(listPos, children.data(), children.size(), Ctx_.Pool());
+ }
+
+ TAstNode* elem = ParseElement(level);
+ if (!elem)
+ return nullptr;
+
+ children.push_back(elem);
+ }
+ }
+
+ TAstNode* ParseElement(size_t level) {
+ if (Ctx_.AtEnd()) {
+ AddError("Expected element");
+ return nullptr;
+ }
+
+ char c = Ctx_.Peek();
+ if (IsQuoteStart(c)) {
+ auto resPosition = Ctx_.Position();
+ Ctx_.Next();
+
+ char ch = Ctx_.Peek();
+ if (Ctx_.AtEnd() || IsWhitespace(ch) || IsCommentStart(ch) ||
+ IsListEnd(ch))
+ {
+ AddError("Expected quotation");
+ return nullptr;
+ }
+
+ TAstNode* content = IsListStart(ch)
+ ? ParseList(++level)
+ : ParseAtom();
+ if (!content)
+ return nullptr;
+
+ return TAstNode::NewList(resPosition, Ctx_.Pool(), &TAstNode::QuoteAtom, content);
+ }
+
+ if (IsListStart(c))
+ return ParseList(++level);
+
+ return ParseAtom();
+ }
+
+ TAstNode* ParseAtom() {
+ if (Ctx_.AtEnd()) {
+ AddError("Expected atom");
+ return nullptr;
+ }
+
+ auto resPosition = Ctx_.Position();
+ ui32 atomStart = Ctx_.Offset();
+
+ while (true) {
+ char c = Ctx_.Peek();
+ // special symbols = 0x20, 0x0a, 0x0d, 0x09 space
+ // 0x22, 0x23, 0x28, 0x29 "#()
+ // 0x27 '
+ // special symbols = 0x40, 0x78 @x
+ // &0x3f = 0x00,0x38
+#define MASK(x) (1ull << ui64(x))
+ const ui64 mask1 = MASK(0x20) | MASK(0x0a) | MASK(0x0d)
+ | MASK(0x09) | MASK(0x22) | MASK(0x23) | MASK(0x28) | MASK(0x29) | MASK(0x27);
+ const ui64 mask2 = MASK(0x00) | MASK(0x38);
+#undef MASK
+ if (!(c & 0x80) && ((1ull << (c & 0x3f)) & (c <= 0x3f ? mask1 : mask2))) {
+ if (IsWhitespace(c) || IsListStart(c) || IsListEnd(c))
+ break;
+
+ if (IsCommentStart(c)) {
+ AddError("Unexpected comment");
+ return nullptr;
+ }
+
+ if (IsQuoteStart(c)) {
+ AddError("Unexpected quotation");
+ return nullptr;
+ }
+
+ // multiline starts with '@@'
+ if (IsMultilineStringStart(c)) {
+ Ctx_.Next();
+ if (Ctx_.AtEnd()) break;
+
+ if (!IsMultilineStringStart(Ctx_.Peek())) {
+ continue;
+ }
+
+ TString token;
+ if (!TryParseMultilineToken(token)) {
+ return nullptr;
+ }
+
+ if (!Ctx_.IsAtomEnded()) {
+ AddError("Unexpected end of @@");
+ return nullptr;
+ }
+
+ return TAstNode::NewAtom(resPosition, token, Ctx_.Pool(), TNodeFlags::MultilineContent);
+ }
+ // hex string starts with 'x"'
+ else if (IsHexStringStart(c)) {
+ Ctx_.Next(); // skip 'x'
+ if (Ctx_.AtEnd()) break;
+
+ if (!IsStringStart(Ctx_.Peek())) {
+ continue;
+ }
+
+ Ctx_.Next(); // skip first '"'
+
+ size_t readBytes = 0;
+ TStringStream ss;
+ TStringBuf atom = Ctx_.Str().SubStr(Ctx_.Offset());
+ EUnescapeResult unescapeResult = UnescapeBinaryAtom(
+ atom, '"', &ss, &readBytes);
+
+ // advance position
+ while (readBytes-- != 0) {
+ Ctx_.Next();
+ }
+
+ if (unescapeResult != EUnescapeResult::OK) {
+ AddError(TString(UnescapeResultToString(unescapeResult)));
+ return nullptr;
+ }
+
+ Ctx_.Next(); // skip last '"'
+ if (!Ctx_.IsAtomEnded()) {
+ AddError("Unexpected end of \"");
+ return nullptr;
+ }
+
+ return TAstNode::NewAtom(resPosition, ss.Str(), Ctx_.Pool(), TNodeFlags::BinaryContent);
+ }
+ else if (IsStringStart(c)) {
+ if (Ctx_.Offset() != atomStart) {
+ AddError("Unexpected \"");
+ return nullptr;
+ }
+
+ Ctx_.Next(); // skip first '"'
+
+ size_t readBytes = 0;
+ TStringStream ss;
+ TStringBuf atom = Ctx_.Str().SubStr(Ctx_.Offset());
+ EUnescapeResult unescapeResult = UnescapeArbitraryAtom(
+ atom, '"', &ss, &readBytes);
+
+ // advance position
+ while (readBytes-- != 0) {
+ Ctx_.Next();
+ }
+
+ if (unescapeResult != EUnescapeResult::OK) {
+ AddError(TString(UnescapeResultToString(unescapeResult)));
+ return nullptr;
+ }
+
+ if (!Ctx_.IsAtomEnded()) {
+ AddError("Unexpected end of \"");
+ return nullptr;
+ }
+
+ return TAstNode::NewAtom(resPosition, ss.Str(), Ctx_.Pool(), TNodeFlags::ArbitraryContent);
+ }
+ }
+
+ Ctx_.Next();
+ if (Ctx_.AtEnd()) {
+ break;
+ }
+ }
+
+ return TAstNode::NewAtom(resPosition, Ctx_.GetToken(atomStart, Ctx_.Offset()), Ctx_.Pool());
+ }
+
+ bool TryParseMultilineToken(TString& token) {
+ Ctx_.Next(); // skip second '@'
+
+ ui32 start = Ctx_.Offset();
+ while (true) {
+ Ctx_.SeekTo('@');
+
+ if (Ctx_.AtEnd()) {
+ AddError("Unexpected multiline atom end");
+ return false;
+ }
+
+ ui32 count = 1; // we seek to first '@'
+ while (!Ctx_.AtEnd() && Ctx_.Peek() == '@') {
+ count++;
+ Ctx_.Next();
+ if (count == 4) {
+ // Reduce each four '@' to two
+ token.append(Ctx_.GetToken(start, Ctx_.Offset() - 2));
+ start = Ctx_.Offset();
+ count = 0;
+ }
+ }
+ if (count >= 2) {
+ break;
+ }
+ }
+
+ // two '@@' at the end
+ token.append(Ctx_.GetToken(start, Ctx_.Offset() - 2));
+ return true;
+ }
+
+ private:
+ TAstParserContext Ctx_;
+ TIssues Issues_;
+ };
+
+ ///////////////////////////////////////////////////////////////////////////
+ // ast node printing functions
+ ///////////////////////////////////////////////////////////////////////////
+
+ inline bool IsQuoteNode(const TAstNode& node) {
+ return node.GetChildrenCount() == 2
+ && node.GetChild(0)->GetType() == TAstNode::Atom
+ && node.GetChild(0)->GetContent() == TStringBuf("quote");
+ }
+
+ inline bool IsBlockNode(const TAstNode& node) {
+ return node.GetChildrenCount() == 2
+ && node.GetChild(0)->GetType() == TAstNode::Atom
+ && node.GetChild(0)->GetContent() == TStringBuf("block");
+ }
+
+ Y_NO_INLINE void Indent(IOutputStream& out, ui32 indentation) {
+ char* whitespaces = reinterpret_cast<char*>(alloca(indentation));
+ memset(whitespaces, ' ', indentation);
+ out.Write(whitespaces, indentation);
+ }
+
+ void MultilineAtomPrint(IOutputStream& out, const TStringBuf& str) {
+ out << TStringBuf("@@");
+ size_t idx = str.find('@');
+ if (idx == TString::npos) {
+ out << str;
+ } else {
+ const char* begin = str.data();
+ do {
+ ui32 count = 0;
+ for (; idx < str.length() && str[idx] == '@'; ++idx) {
+ ++count;
+ }
+
+ if (count % 2 == 0) {
+ out.Write(begin, idx - (begin - str.data()) - count);
+ begin = str.data() + idx;
+
+ while (count--) {
+ out.Write(TStringBuf("@@"));
+ }
+ }
+ idx = str.find('@', idx);
+ } while (idx != TString::npos);
+ out.Write(begin, str.end() - begin);
+ }
+ out << TStringBuf("@@");
+ }
+
+ void PrintNode(IOutputStream& out, const TAstNode& node) {
+ if (node.GetType() == TAstNode::Atom) {
+ if (node.GetFlags() & TNodeFlags::ArbitraryContent) {
+ EscapeArbitraryAtom(node.GetContent(), '"', &out);
+ } else if (node.GetFlags() & TNodeFlags::BinaryContent) {
+ EscapeBinaryAtom(node.GetContent(), '"', &out);
+ } else if (node.GetFlags() & TNodeFlags::MultilineContent) {
+ MultilineAtomPrint(out, node.GetContent());
+ } else if (node.GetContent().empty()) {
+ EscapeArbitraryAtom(node.GetContent(), '"', &out);
+ } else {
+ out << node.GetContent();
+ }
+ } else if (node.GetType() == TAstNode::List) {
+ if (!node.GetChildrenCount()) {
+ out << TStringBuf("()");
+ } else if (IsQuoteNode(node)) {
+ out << '\'';
+ PrintNode(out, *node.GetChild(1));
+ } else {
+ out << '(';
+ ui32 index = 0;
+ while (true) {
+ PrintNode(out, *node.GetChild(index));
+ ++index;
+ if (index == node.GetChildrenCount()) break;
+ out << ' ';
+ }
+ out << ')';
+ }
+ }
+ }
+
+ void PrettyPrintNode(
+ IOutputStream& out, const TAstNode& node,
+ i32 indent, i32 blockLevel, i32 localIndent, ui32 flags)
+ {
+ if (!(flags & TAstPrintFlags::PerLine)) {
+ Indent(out, indent * 2);
+ } else if (localIndent == 1) {
+ Indent(out, blockLevel * 2);
+ }
+
+ bool isQuote = false;
+ if (node.GetType() == TAstNode::Atom) {
+ if (node.GetFlags() & TNodeFlags::ArbitraryContent) {
+ if ((flags & TAstPrintFlags::AdaptArbitraryContent) &&
+ !NeedEscaping(node.GetContent()))
+ {
+ out << node.GetContent();
+ } else {
+ EscapeArbitraryAtom(node.GetContent(), '"', &out);
+ }
+ } else if (node.GetFlags() & TNodeFlags::BinaryContent) {
+ EscapeBinaryAtom(node.GetContent(), '"', &out);
+ } else if (node.GetFlags() & TNodeFlags::MultilineContent) {
+ MultilineAtomPrint(out, node.GetContent());
+ } else {
+ if (((flags & TAstPrintFlags::AdaptArbitraryContent) && NeedEscaping(node.GetContent())) ||
+ node.GetContent().empty())
+ {
+ EscapeArbitraryAtom(node.GetContent(), '"', &out);
+ } else {
+ out << node.GetContent();
+ }
+ }
+ } else if (node.GetType() == TAstNode::List) {
+ isQuote = IsQuoteNode(node);
+ if (isQuote && (flags & TAstPrintFlags::ShortQuote)) {
+ out << '\'';
+ if (localIndent == 0 || !(flags & TAstPrintFlags::PerLine)) {
+ out << Endl;
+ }
+
+ PrettyPrintNode(out, *node.GetChild(1), indent + 1, blockLevel, localIndent + 1, flags);
+ } else {
+ out << '(';
+ if (localIndent == 0 || !(flags & TAstPrintFlags::PerLine)) {
+ out << Endl;
+ }
+
+ bool isBlock = IsBlockNode(node);
+ for (ui32 index = 0; index < node.GetChildrenCount(); ++index) {
+ if (localIndent > 0 && (index > 0) && (flags & TAstPrintFlags::PerLine)) {
+ out << ' ';
+ }
+
+ if (isBlock && (index > 0)) {
+ PrettyPrintNode(out, *node.GetChild(index), indent + 1, blockLevel + 1, -1, flags);
+ } else {
+ PrettyPrintNode(out, *node.GetChild(index), indent + 1, blockLevel, localIndent + 1, flags);
+ }
+ }
+ }
+
+ if (!isQuote || !(flags & TAstPrintFlags::ShortQuote)) {
+ if (!(flags & TAstPrintFlags::PerLine)) {
+ Indent(out, indent * 2);
+ }
+
+ if (localIndent == 0 && blockLevel > 0) {
+ Indent(out, (blockLevel - 1) * 2);
+ }
+
+ out << ')';
+ }
+ }
+
+ if (!isQuote || !(flags & TAstPrintFlags::ShortQuote)) {
+ if (localIndent > 0 || blockLevel == 0) {
+ if (localIndent <= 1 || !(flags & TAstPrintFlags::PerLine)) {
+ out << Endl;
+ }
+ }
+ }
+ }
+
+ void DestroyNode(TAstNode* node) {
+ if (node->IsList()) {
+ for (ui32 i = 0; i < node->GetChildrenCount(); ++i) {
+ DestroyNode(node->GetChild(i));
+ }
+ }
+
+ if (node != &TAstNode::QuoteAtom) {
+ node->Destroy();
+ }
+ }
+} // namespace
+
+TAstParseResult::~TAstParseResult() {
+ Destroy();
+}
+
+TAstParseResult::TAstParseResult(TAstParseResult&& other)
+ : Pool(std::move(other.Pool))
+ , Root(other.Root)
+ , Issues(std::move(other.Issues))
+ , PgAutoParamValues(std::move(other.PgAutoParamValues))
+ , ActualSyntaxType(other.ActualSyntaxType)
+{
+ other.Root = nullptr;
+}
+
+TAstParseResult& TAstParseResult::operator=(TAstParseResult&& other) {
+ Destroy();
+ Pool = std::move(other.Pool);
+ Root = other.Root;
+ other.Root = nullptr;
+ Issues = std::move(other.Issues);
+ PgAutoParamValues = std::move(other.PgAutoParamValues);
+ ActualSyntaxType = other.ActualSyntaxType;
+ return *this;
+}
+
+void TAstParseResult::Destroy() {
+ if (Root) {
+ DestroyNode(Root);
+ Root = nullptr;
+ }
+}
+
+TAstParseResult ParseAst(const TStringBuf& str, TMemoryPool* externalPool, const TString& file)
+{
+ TAstParser parser(str, externalPool, file);
+ return parser.Parse();
+}
+
+void TAstNode::PrintTo(IOutputStream& out) const {
+ PrintNode(out, *this);
+}
+
+void TAstNode::PrettyPrintTo(IOutputStream& out, ui32 flags) const {
+ PrettyPrintNode(out, *this, 0, 0, 0, flags);
+}
+
+TAstNode TAstNode::QuoteAtom(TPosition(0, 0), TStringBuf("quote"), TNodeFlags::Default);
+
+} // namespace NYql
+
+template<>
+void Out<NYql::TAstNode::EType>(class IOutputStream &o, NYql::TAstNode::EType x) {
+#define YQL_AST_NODE_TYPE_MAP_TO_STRING_IMPL(name, ...) \
+ case ::NYql::TAstNode::name: \
+ o << #name; \
+ return;
+
+ switch (x) {
+ YQL_AST_NODE_TYPE_MAP(YQL_AST_NODE_TYPE_MAP_TO_STRING_IMPL)
+ default:
+ o << static_cast<int>(x);
+ return;
+ }
+}