diff options
author | aneporada <aneporada@yandex-team.com> | 2024-11-12 07:55:22 +0300 |
---|---|---|
committer | aneporada <aneporada@yandex-team.com> | 2024-11-12 08:10:50 +0300 |
commit | 621c8c7dade57165d6d431295f7a9a9143a062fa (patch) | |
tree | cc5dee0e889655fded4d4878071a0a6a76d2e17d | |
parent | 55cec9f6b0618fb3570fc8ef66aad151f4932591 (diff) | |
download | ydb-621c8c7dade57165d6d431295f7a9a9143a062fa.tar.gz |
Merge GH PR #9404
commit_hash:d780798556aedbe2be898d69185380f2ecb95f9c
-rw-r--r-- | yql/essentials/parser/lexer_common/lexer.h | 4 | ||||
-rw-r--r-- | yql/essentials/parser/lexer_common/tokens.cpp | 2 | ||||
-rw-r--r-- | yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h | 2 | ||||
-rw-r--r-- | yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h | 5 | ||||
-rw-r--r-- | yql/essentials/sql/v1/SQLv1.g.in | 9 | ||||
-rw-r--r-- | yql/essentials/sql/v1/SQLv1Antlr4.g.in | 9 | ||||
-rw-r--r-- | yql/essentials/sql/v1/format/sql_format.cpp | 3 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/lexer.cpp | 6 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/lexer.h | 6 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/lexer_ut.cpp | 185 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/ut/ya.make | 12 | ||||
-rw-r--r-- | yql/essentials/sql/v1/lexer/ya.make | 4 | ||||
-rw-r--r-- | yql/essentials/sql/v1/sql_ut_antlr4.cpp | 8 | ||||
-rw-r--r-- | yql/essentials/udfs/common/file/file_udf.cpp | 47 | ||||
-rw-r--r-- | yql/essentials/utils/line_split.cpp | 39 | ||||
-rw-r--r-- | yql/essentials/utils/line_split.h | 16 | ||||
-rw-r--r-- | yql/essentials/utils/ya.make | 1 |
17 files changed, 296 insertions, 62 deletions
diff --git a/yql/essentials/parser/lexer_common/lexer.h b/yql/essentials/parser/lexer_common/lexer.h index 560ea20482..f5d9600681 100644 --- a/yql/essentials/parser/lexer_common/lexer.h +++ b/yql/essentials/parser/lexer_common/lexer.h @@ -18,7 +18,9 @@ struct TParsedToken { // TODO: TStringBuf for Name & Content TString Name; TString Content; - // Position of first token symbol + // Position of first token byte/symbol + // When antlr3 lexer is used, LinePos is a position as in a byte array, + // but when antlr4 lexer is used, LinePos is a position as in a symbol array, ui32 Line = 0; // starts from 1 ui32 LinePos = 0; // starts from 0 }; diff --git a/yql/essentials/parser/lexer_common/tokens.cpp b/yql/essentials/parser/lexer_common/tokens.cpp index b37b0139ef..014a815e43 100644 --- a/yql/essentials/parser/lexer_common/tokens.cpp +++ b/yql/essentials/parser/lexer_common/tokens.cpp @@ -1,6 +1,5 @@ #include "lexer.h" - namespace NSQLTranslation { IOutputStream& OutputTokens(IOutputStream& out, TParsedTokenList::const_iterator begin, TParsedTokenList::const_iterator end) { @@ -18,5 +17,4 @@ bool Tokenize(ILexer& lexer, const TString& query, const TString& queryName, TPa return lexer.Tokenize(query, queryName, onNextToken, issues, maxErrors); } - } diff --git a/yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h b/yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h index 8f14ff6979..22f40fd1fd 100644 --- a/yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h +++ b/yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h @@ -60,6 +60,7 @@ namespace NProtoAST { try { Lexer.ReportErrors(&errors); auto src = Lexer.get_tokSource(); + for (;;) { auto token = src->nextToken(); auto type = token->getType(); @@ -69,6 +70,7 @@ namespace NProtoAST { last.Content = token->getText(); last.Line = token->get_line(); last.LinePos = token->get_charPositionInLine(); + onNextToken(std::move(last)); if (isEOF) { break; diff --git a/yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h b/yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h index 9a638ff1a5..81973a400a 100644 --- a/yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h +++ b/yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h @@ -89,6 +89,11 @@ namespace NProtoAST { void CollectTokens(IErrorCollector& errors, const NSQLTranslation::ILexer::TTokenCallback& onNextToken) { try { + bool error = false; + typename antlr4::YqlErrorListener listener(&errors, &error); + Lexer.removeErrorListeners(); + Lexer.addErrorListener(&listener); + for (;;) { auto token = Lexer.nextToken(); auto type = token->getType(); diff --git a/yql/essentials/sql/v1/SQLv1.g.in b/yql/essentials/sql/v1/SQLv1.g.in index d640c6d611..653fd7a4b0 100644 --- a/yql/essentials/sql/v1/SQLv1.g.in +++ b/yql/essentials/sql/v1/SQLv1.g.in @@ -1724,7 +1724,6 @@ MINUS: '-'; TILDA: '~'; ASTERISK: '*'; SLASH: '/'; -BACKSLASH: '\\'; PERCENT: '%'; SEMICOLON: ';'; DOT: '.'; @@ -1736,9 +1735,6 @@ COLON: ':'; COMMAT: '@'; DOUBLE_COMMAT: '@@'; DOLLAR: '$'; -QUOTE_DOUBLE: '"'; // This comment for fix syntax highlighting " -QUOTE_SINGLE: '\''; -BACKTICK: '`'; LBRACE_CURLY: '{'; RBRACE_CURLY: '}'; CARET: '^'; @@ -1747,6 +1743,11 @@ ARROW: '->'; RBRACE_SQUARE: ']'; LBRACE_SQUARE: '['; // pair ] +fragment BACKSLASH: '\\'; +fragment QUOTE_DOUBLE: '"'; +fragment QUOTE_SINGLE: '\''; +fragment BACKTICK: '`'; + // http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782 fragment A:('a'|'A'); fragment B:('b'|'B'); diff --git a/yql/essentials/sql/v1/SQLv1Antlr4.g.in b/yql/essentials/sql/v1/SQLv1Antlr4.g.in index 25c6fa438b..875774323a 100644 --- a/yql/essentials/sql/v1/SQLv1Antlr4.g.in +++ b/yql/essentials/sql/v1/SQLv1Antlr4.g.in @@ -1723,7 +1723,6 @@ MINUS: '-'; TILDA: '~'; ASTERISK: '*'; SLASH: '/'; -BACKSLASH: '\\'; PERCENT: '%'; SEMICOLON: ';'; DOT: '.'; @@ -1735,9 +1734,6 @@ COLON: ':'; COMMAT: '@'; DOUBLE_COMMAT: '@@'; DOLLAR: '$'; -QUOTE_DOUBLE: '"'; // This comment for fix syntax highlighting " -QUOTE_SINGLE: '\''; -BACKTICK: '`'; LBRACE_CURLY: '{'; RBRACE_CURLY: '}'; CARET: '^'; @@ -1746,6 +1742,11 @@ ARROW: '->'; RBRACE_SQUARE: ']'; LBRACE_SQUARE: '['; // pair ] +fragment BACKSLASH: '\\'; +fragment QUOTE_DOUBLE: '"'; +fragment QUOTE_SINGLE: '\''; +fragment BACKTICK: '`'; + // http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782 fragment A:('a'|'A'); fragment B:('b'|'B'); diff --git a/yql/essentials/sql/v1/format/sql_format.cpp b/yql/essentials/sql/v1/format/sql_format.cpp index 463c52ede4..80ce5d139e 100644 --- a/yql/essentials/sql/v1/format/sql_format.cpp +++ b/yql/essentials/sql/v1/format/sql_format.cpp @@ -26,6 +26,7 @@ using namespace NSQLv1Generated; using NSQLTranslation::TParsedToken; using NSQLTranslation::TParsedTokenList; +using NSQLTranslationV1::IsProbablyKeyword; using TTokenIterator = TParsedTokenList::const_iterator; TTokenIterator SkipWS(TTokenIterator curr, TTokenIterator end) { @@ -55,7 +56,7 @@ bool Validate(const TParsedTokenList& query, const TParsedTokenList& formattedQu if (in->Name != out->Name) { return false; } - if (AsciiEqualsIgnoreCase(in->Name, in->Content)) { + if (IsProbablyKeyword(*in)) { if (!AsciiEqualsIgnoreCase(in->Content, out->Content)) { return false; } diff --git a/yql/essentials/sql/v1/lexer/lexer.cpp b/yql/essentials/sql/v1/lexer/lexer.cpp index b6d2362f21..1d38ec3d8b 100644 --- a/yql/essentials/sql/v1/lexer/lexer.cpp +++ b/yql/essentials/sql/v1/lexer/lexer.cpp @@ -9,6 +9,8 @@ #include <yql/essentials/parser/proto_ast/gen/v1_antlr4/SQLv1Antlr4Lexer.h> #include <yql/essentials/parser/proto_ast/gen/v1_ansi_antlr4/SQLv1Antlr4Lexer.h> +#include <util/string/ascii.h> + #if defined(_tsan_enabled_) #include <util/system/mutex.h> #endif @@ -74,4 +76,8 @@ NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4) { return NSQLTranslation::ILexer::TPtr(new TV1Lexer(ansi, antlr4)); } +bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) { + return AsciiEqualsIgnoreCase(token.Name, token.Content); +} + } // namespace NSQLTranslationV1 diff --git a/yql/essentials/sql/v1/lexer/lexer.h b/yql/essentials/sql/v1/lexer/lexer.h index fe0102be79..25bfe28f81 100644 --- a/yql/essentials/sql/v1/lexer/lexer.h +++ b/yql/essentials/sql/v1/lexer/lexer.h @@ -6,4 +6,10 @@ namespace NSQLTranslationV1 { NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4); +// "Probably" because YQL keyword can be an identifier +// depending on a query context. For example +// in SELECT * FROM group - group is an identifier, but +// in SELECT * FROM ... GROUP BY ... - group is a keyword. +bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token); + } diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp new file mode 100644 index 0000000000..679ae472b1 --- /dev/null +++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp @@ -0,0 +1,185 @@ +#include "lexer.h" + +#include <yql/essentials/core/issue/yql_issue.h> +#include <yql/essentials/sql/settings/translation_settings.h> + +#include <library/cpp/testing/unittest/registar.h> + +using namespace NSQLTranslation; +using namespace NSQLTranslationV1; + +std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, TString queryUtf8) { + TParsedTokenList tokens; + NYql::TIssues issues; + Tokenize(*lexer, queryUtf8, "Query", tokens, issues, SQL_MAX_PARSER_ERRORS); + return {tokens, issues}; +} + +TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, TString queryUtf8) { + TVector<TString> messages; + for (const auto& issue : Tokenize(lexer, queryUtf8).second) { + messages.emplace_back(issue.ToString(/* oneLine = */ true)); + } + return messages; +} + +TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) { + TVector<TString> names; + for (auto& token : Tokenize(lexer, queryUtf8).first) { + TString view = std::move(token.Name); + if (view == "ID_PLAIN" || view == "STRING_VALUE") { + view.append(" ("); + view.append(token.Content); + view.append(")"); + } + names.emplace_back(std::move(view)); + } + return names; +} + +void AssertEquivialent(const TParsedToken& lhs, const TParsedToken& rhs) { + if (lhs.Name == "EOF" && rhs.Name == "EOF") { + return; + } + + UNIT_ASSERT_VALUES_EQUAL(lhs.Name, rhs.Name); + UNIT_ASSERT_VALUES_EQUAL(lhs.Content, rhs.Content); + UNIT_ASSERT_VALUES_EQUAL(lhs.Line, rhs.Line); +} + +void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) { + UNIT_ASSERT_VALUES_EQUAL(lhs.size(), rhs.size()); + for (size_t i = 0; i < lhs.size(); ++i) { + AssertEquivialent(lhs.at(i), rhs.at(i)); + } +} + +Y_UNIT_TEST_SUITE(SQLv1Lexer) { + Y_UNIT_TEST(AntlrVersionIndependent) { + const TVector<TString> queriesUtf8 = { + "", + " ", + "SELECT", + "SEL", // identifier + "SELECT FROM test", + "SELECT * FROM", + " SELECT * FROM ", + "SELECT \"\xF0\x9F\x98\x8A\" FROM ydb", + ( + "SELECT \"\xF0\x9F\x98\x8A Hello, друзья\", count, name\n" + "FROM table -- главная таблица 数据库 \n" + "WHERE count < 6\n" + " AND name = \"可靠性\"\n" + " AND count > 12"), + "\"select\"select", + }; + + auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false); + auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true); + + for (const auto& query : queriesUtf8) { + auto [tokens3, issues3] = Tokenize(lexer3, query); + auto [tokens4, issues4] = Tokenize(lexer4, query); + AssertEquivialent(tokens3, tokens4); + UNIT_ASSERT(issues3.Empty()); + UNIT_ASSERT(issues4.Empty()); + } + } + + TVector<TString> InvalidQueries(); + + void TestInvalidTokensSkipped(bool antlr4, const TVector<TVector<TString>>& expected) { + auto lexer = MakeLexer(/* ansi = */ false, antlr4); + + auto input = InvalidQueries(); + UNIT_ASSERT_VALUES_EQUAL(input.size(), expected.size()); + + for (size_t i = 0; i < input.size(); ++i) { + UNIT_ASSERT_VALUES_EQUAL(GetTokenViews(lexer, input[i]), expected[i]); + } + } + + TVector<TString> InvalidQueries() { + return { + /* 0: */ "\xF0\x9F\x98\x8A", + /* 1: */ "select \"aaaa", + /* 2: */ "\"\\\"", + /* 3: */ "\xF0\x9F\x98\x8A SELECT * FR", + /* 4: */ "! SELECT * from", + /* 5: */ "\xF0\x9F\x98\x8Aselect ! from", + /* 6: */ "\"", + /* 7: */ "!select", + /* 8: */ "SELECT \\\"\xF0\x9F\x98\x8A\\\" FROM test", + }; + } + + Y_UNIT_TEST(ErrorRecoveryAntlr3) { + TVector<TVector<TString>> actual = { + /* 0: */ {"EOF"}, + /* 1: */ {"SELECT", "WS", "EOF"}, + /* 2: */ {"EOF"}, + /* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"}, + /* 4: */ {"ID_PLAIN (ELECT)", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"}, + /* 5: */ {"SELECT", "WS", "ID_PLAIN (rom)", "EOF"}, + /* 6: */ {"EOF"}, + /* 7: */ {"ID_PLAIN (lect)", "EOF"}, + /* 8: */ {"SELECT", "WS", "EOF"}, + }; + TestInvalidTokensSkipped(/* antlr4 = */ false, actual); + } + + Y_UNIT_TEST(ErrorRecoveryAntlr4) { + TVector<TVector<TString>> actual = { + /* 0: */ {"EOF"}, + /* 1: */ {"SELECT", "WS", "EOF"}, + /* 2: */ {"EOF"}, + /* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"}, + /* 4: */ {"SELECT", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"}, + /* 5: */ {"SELECT", "WS", "FROM", "EOF"}, + /* 6: */ {"EOF"}, + /* 7: */ {"ID_PLAIN (elect)", "EOF"}, + /* 8: */ {"SELECT", "WS", "EOF"}, + }; + TestInvalidTokensSkipped(/* antlr4 = */ true, actual); + } + + Y_UNIT_TEST(IssuesCollected) { + auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false); + auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true); + + for (const auto& query : InvalidQueries()) { + auto issues3 = GetIssueMessages(lexer3, query); + auto issues4 = GetIssueMessages(lexer4, query); + + UNIT_ASSERT(!issues3.empty()); + UNIT_ASSERT(!issues4.empty()); + } + } + + Y_UNIT_TEST(IssueMessagesAntlr3) { + auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false); + + auto actual = GetIssueMessages(lexer3, "\xF0\x9F\x98\x8A SELECT * FR"); + + TVector<TString> expected = { + "<main>:1:0: Error: Unexpected character '\xF0\x9F\x98\x8A' (Unicode character <128522>) : cannot match to any predicted input...", + "<main>:1:1: Error: Unexpected character : cannot match to any predicted input...", + "<main>:1:2: Error: Unexpected character : cannot match to any predicted input...", + "<main>:1:3: Error: Unexpected character : cannot match to any predicted input...", + }; + + UNIT_ASSERT_VALUES_EQUAL(actual, expected); + } + + Y_UNIT_TEST(IssueMessagesAntlr4) { + auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true); + + auto actual = GetIssueMessages(lexer4, "\xF0\x9F\x98\x8A SELECT * FR"); + + TVector<TString> expected = { + "<main>:1:0: Error: token recognition error at: '\xF0\x9F\x98\x8A'", + }; + + UNIT_ASSERT_VALUES_EQUAL(actual, expected); + } +} diff --git a/yql/essentials/sql/v1/lexer/ut/ya.make b/yql/essentials/sql/v1/lexer/ut/ya.make new file mode 100644 index 0000000000..a05178ada5 --- /dev/null +++ b/yql/essentials/sql/v1/lexer/ut/ya.make @@ -0,0 +1,12 @@ +UNITTEST_FOR(yql/essentials/sql/v1/lexer) + +PEERDIR( + yql/essentials/core/issue + yql/essentials/parser/lexer_common +) + +SRCS( + lexer_ut.cpp +) + +END() diff --git a/yql/essentials/sql/v1/lexer/ya.make b/yql/essentials/sql/v1/lexer/ya.make index 5174f6f595..cd5eea1b70 100644 --- a/yql/essentials/sql/v1/lexer/ya.make +++ b/yql/essentials/sql/v1/lexer/ya.make @@ -17,3 +17,7 @@ SUPPRESSIONS( ) END() + +RECURSE_FOR_TESTS( + ut +) diff --git a/yql/essentials/sql/v1/sql_ut_antlr4.cpp b/yql/essentials/sql/v1/sql_ut_antlr4.cpp index 7f11822cca..b92ea5f460 100644 --- a/yql/essentials/sql/v1/sql_ut_antlr4.cpp +++ b/yql/essentials/sql/v1/sql_ut_antlr4.cpp @@ -3004,7 +3004,7 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) { UNIT_ASSERT(!res.Root); TString a1 = Err2Str(res); - TString a2(R"foo(<main>:1:16: Error: Unknown cluster: edar + TString a2(R"foo(<main>:1:14: Error: token recognition error at: 'с' )foo"); UNIT_ASSERT_NO_DIFF(a1, a2); @@ -3016,8 +3016,8 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) { UNIT_ASSERT(!res1.Root); UNIT_ASSERT(!res2.Root); - UNIT_ASSERT_NO_DIFF(Err2Str(res1), "<main>:1:12: Error: mismatched input 'b' expecting {<EOF>, ';'}\n"); - UNIT_ASSERT_NO_DIFF(Err2Str(res2), "<main>:1:12: Error: mismatched input 'b' expecting {<EOF>, ';'}\n"); + UNIT_ASSERT_NO_DIFF(Err2Str(res1), "<main>:1:13: Error: token recognition error at: '';'\n"); + UNIT_ASSERT_NO_DIFF(Err2Str(res2), "<main>:1:13: Error: token recognition error at: '\";'\n"); } Y_UNIT_TEST(InvalidHexInStringLiteral) { @@ -3055,7 +3055,7 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) { Y_UNIT_TEST(InvalidStringFromTable) { NYql::TAstParseResult res = SqlToYql("select \"FOO\"\"BAR from plato.foo"); UNIT_ASSERT(!res.Root); - UNIT_ASSERT_NO_DIFF(Err2Str(res), "<main>:1:12: Error: mismatched input '\"' expecting {<EOF>, ';'}\n"); + UNIT_ASSERT_NO_DIFF(Err2Str(res), "<main>:1:12: Error: token recognition error at: '\"BAR from plato.foo'\n"); } Y_UNIT_TEST(InvalidDoubleAtStringFromTable) { diff --git a/yql/essentials/udfs/common/file/file_udf.cpp b/yql/essentials/udfs/common/file/file_udf.cpp index 57db826591..fa5f84a2c5 100644 --- a/yql/essentials/udfs/common/file/file_udf.cpp +++ b/yql/essentials/udfs/common/file/file_udf.cpp @@ -1,4 +1,5 @@ #include <yql/essentials/public/udf/udf_helpers.h> +#include <yql/essentials/utils/line_split.h> #include <util/generic/yexception.h> #include <util/stream/buffered.h> @@ -236,52 +237,6 @@ namespace { const TTerminateFunc TerminateFunc; }; - class TLineSplitter { - public: - TLineSplitter(IInputStream& stream) - : Stream_(stream) - { - } - - size_t Next(TString& st) { - st.clear(); - char c; - size_t ret = 0; - if (HasPendingLineChar_) { - st.push_back(PendingLineChar_); - HasPendingLineChar_ = false; - ++ret; - } - - while (Stream_.ReadChar(c)) { - ++ret; - if (c == '\n') { - break; - } else if (c == '\r') { - if (Stream_.ReadChar(c)) { - ++ret; - if (c != '\n') { - --ret; - PendingLineChar_ = c; - HasPendingLineChar_ = true; - } - } - - break; - } else { - st.push_back(c); - } - } - - return ret; - } - - private: - IInputStream& Stream_; - bool HasPendingLineChar_ = false; - char PendingLineChar_ = 0; - }; - template <class TUserType> class TLineByLineBoxedValueIterator: public TBoxedValue { public: diff --git a/yql/essentials/utils/line_split.cpp b/yql/essentials/utils/line_split.cpp new file mode 100644 index 0000000000..657d0edd9b --- /dev/null +++ b/yql/essentials/utils/line_split.cpp @@ -0,0 +1,39 @@ +#include "line_split.h" + +TLineSplitter::TLineSplitter(IInputStream& stream) + : Stream_(stream) +{ +} + +size_t TLineSplitter::Next(TString& st) { + st.clear(); + char c; + size_t ret = 0; + if (HasPendingLineChar_) { + st.push_back(PendingLineChar_); + HasPendingLineChar_ = false; + ++ret; + } + + while (Stream_.ReadChar(c)) { + ++ret; + if (c == '\n') { + break; + } else if (c == '\r') { + if (Stream_.ReadChar(c)) { + ++ret; + if (c != '\n') { + --ret; + PendingLineChar_ = c; + HasPendingLineChar_ = true; + } + } + + break; + } else { + st.push_back(c); + } + } + + return ret; +} diff --git a/yql/essentials/utils/line_split.h b/yql/essentials/utils/line_split.h new file mode 100644 index 0000000000..65e940990c --- /dev/null +++ b/yql/essentials/utils/line_split.h @@ -0,0 +1,16 @@ +#pragma once + +#include <util/stream/input.h> +#include <util/generic/string.h> + +class TLineSplitter final { +public: + explicit TLineSplitter(IInputStream& stream); + + size_t Next(TString& st); + +private: + IInputStream& Stream_; + bool HasPendingLineChar_ = false; + char PendingLineChar_ = 0; +}; diff --git a/yql/essentials/utils/ya.make b/yql/essentials/utils/ya.make index 8f0181520a..8f34cc54ba 100644 --- a/yql/essentials/utils/ya.make +++ b/yql/essentials/utils/ya.make @@ -13,6 +13,7 @@ SRCS( hash.cpp hash.h limiting_allocator.cpp + line_split.cpp md5_stream.cpp md5_stream.h method_index.cpp |