aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoraneporada <aneporada@yandex-team.com>2024-11-12 07:55:22 +0300
committeraneporada <aneporada@yandex-team.com>2024-11-12 08:10:50 +0300
commit621c8c7dade57165d6d431295f7a9a9143a062fa (patch)
treecc5dee0e889655fded4d4878071a0a6a76d2e17d
parent55cec9f6b0618fb3570fc8ef66aad151f4932591 (diff)
downloadydb-621c8c7dade57165d6d431295f7a9a9143a062fa.tar.gz
Merge GH PR #9404
commit_hash:d780798556aedbe2be898d69185380f2ecb95f9c
-rw-r--r--yql/essentials/parser/lexer_common/lexer.h4
-rw-r--r--yql/essentials/parser/lexer_common/tokens.cpp2
-rw-r--r--yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h2
-rw-r--r--yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h5
-rw-r--r--yql/essentials/sql/v1/SQLv1.g.in9
-rw-r--r--yql/essentials/sql/v1/SQLv1Antlr4.g.in9
-rw-r--r--yql/essentials/sql/v1/format/sql_format.cpp3
-rw-r--r--yql/essentials/sql/v1/lexer/lexer.cpp6
-rw-r--r--yql/essentials/sql/v1/lexer/lexer.h6
-rw-r--r--yql/essentials/sql/v1/lexer/lexer_ut.cpp185
-rw-r--r--yql/essentials/sql/v1/lexer/ut/ya.make12
-rw-r--r--yql/essentials/sql/v1/lexer/ya.make4
-rw-r--r--yql/essentials/sql/v1/sql_ut_antlr4.cpp8
-rw-r--r--yql/essentials/udfs/common/file/file_udf.cpp47
-rw-r--r--yql/essentials/utils/line_split.cpp39
-rw-r--r--yql/essentials/utils/line_split.h16
-rw-r--r--yql/essentials/utils/ya.make1
17 files changed, 296 insertions, 62 deletions
diff --git a/yql/essentials/parser/lexer_common/lexer.h b/yql/essentials/parser/lexer_common/lexer.h
index 560ea20482..f5d9600681 100644
--- a/yql/essentials/parser/lexer_common/lexer.h
+++ b/yql/essentials/parser/lexer_common/lexer.h
@@ -18,7 +18,9 @@ struct TParsedToken {
// TODO: TStringBuf for Name & Content
TString Name;
TString Content;
- // Position of first token symbol
+ // Position of first token byte/symbol.
+ // When the antlr3 lexer is used, LinePos is a byte offset within the line,
+ // but when the antlr4 lexer is used, LinePos is a symbol (code point) offset within the line.
ui32 Line = 0; // starts from 1
ui32 LinePos = 0; // starts from 0
};
diff --git a/yql/essentials/parser/lexer_common/tokens.cpp b/yql/essentials/parser/lexer_common/tokens.cpp
index b37b0139ef..014a815e43 100644
--- a/yql/essentials/parser/lexer_common/tokens.cpp
+++ b/yql/essentials/parser/lexer_common/tokens.cpp
@@ -1,6 +1,5 @@
#include "lexer.h"
-
namespace NSQLTranslation {
IOutputStream& OutputTokens(IOutputStream& out, TParsedTokenList::const_iterator begin, TParsedTokenList::const_iterator end) {
@@ -18,5 +17,4 @@ bool Tokenize(ILexer& lexer, const TString& query, const TString& queryName, TPa
return lexer.Tokenize(query, queryName, onNextToken, issues, maxErrors);
}
-
}
diff --git a/yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h b/yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h
index 8f14ff6979..22f40fd1fd 100644
--- a/yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h
+++ b/yql/essentials/parser/proto_ast/antlr3/proto_ast_antlr3.h
@@ -60,6 +60,7 @@ namespace NProtoAST {
try {
Lexer.ReportErrors(&errors);
auto src = Lexer.get_tokSource();
+
for (;;) {
auto token = src->nextToken();
auto type = token->getType();
@@ -69,6 +70,7 @@ namespace NProtoAST {
last.Content = token->getText();
last.Line = token->get_line();
last.LinePos = token->get_charPositionInLine();
+
onNextToken(std::move(last));
if (isEOF) {
break;
diff --git a/yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h b/yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h
index 9a638ff1a5..81973a400a 100644
--- a/yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h
+++ b/yql/essentials/parser/proto_ast/antlr4/proto_ast_antlr4.h
@@ -89,6 +89,11 @@ namespace NProtoAST {
void CollectTokens(IErrorCollector& errors, const NSQLTranslation::ILexer::TTokenCallback& onNextToken) {
try {
+ bool error = false;
+ typename antlr4::YqlErrorListener listener(&errors, &error);
+ Lexer.removeErrorListeners();
+ Lexer.addErrorListener(&listener);
+
for (;;) {
auto token = Lexer.nextToken();
auto type = token->getType();
diff --git a/yql/essentials/sql/v1/SQLv1.g.in b/yql/essentials/sql/v1/SQLv1.g.in
index d640c6d611..653fd7a4b0 100644
--- a/yql/essentials/sql/v1/SQLv1.g.in
+++ b/yql/essentials/sql/v1/SQLv1.g.in
@@ -1724,7 +1724,6 @@ MINUS: '-';
TILDA: '~';
ASTERISK: '*';
SLASH: '/';
-BACKSLASH: '\\';
PERCENT: '%';
SEMICOLON: ';';
DOT: '.';
@@ -1736,9 +1735,6 @@ COLON: ':';
COMMAT: '@';
DOUBLE_COMMAT: '@@';
DOLLAR: '$';
-QUOTE_DOUBLE: '"'; // This comment for fix syntax highlighting "
-QUOTE_SINGLE: '\'';
-BACKTICK: '`';
LBRACE_CURLY: '{';
RBRACE_CURLY: '}';
CARET: '^';
@@ -1747,6 +1743,11 @@ ARROW: '->';
RBRACE_SQUARE: ']';
LBRACE_SQUARE: '['; // pair ]
+fragment BACKSLASH: '\\';
+fragment QUOTE_DOUBLE: '"';
+fragment QUOTE_SINGLE: '\'';
+fragment BACKTICK: '`';
+
// http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782
fragment A:('a'|'A');
fragment B:('b'|'B');
diff --git a/yql/essentials/sql/v1/SQLv1Antlr4.g.in b/yql/essentials/sql/v1/SQLv1Antlr4.g.in
index 25c6fa438b..875774323a 100644
--- a/yql/essentials/sql/v1/SQLv1Antlr4.g.in
+++ b/yql/essentials/sql/v1/SQLv1Antlr4.g.in
@@ -1723,7 +1723,6 @@ MINUS: '-';
TILDA: '~';
ASTERISK: '*';
SLASH: '/';
-BACKSLASH: '\\';
PERCENT: '%';
SEMICOLON: ';';
DOT: '.';
@@ -1735,9 +1734,6 @@ COLON: ':';
COMMAT: '@';
DOUBLE_COMMAT: '@@';
DOLLAR: '$';
-QUOTE_DOUBLE: '"'; // This comment for fix syntax highlighting "
-QUOTE_SINGLE: '\'';
-BACKTICK: '`';
LBRACE_CURLY: '{';
RBRACE_CURLY: '}';
CARET: '^';
@@ -1746,6 +1742,11 @@ ARROW: '->';
RBRACE_SQUARE: ']';
LBRACE_SQUARE: '['; // pair ]
+fragment BACKSLASH: '\\';
+fragment QUOTE_DOUBLE: '"';
+fragment QUOTE_SINGLE: '\'';
+fragment BACKTICK: '`';
+
// http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782
fragment A:('a'|'A');
fragment B:('b'|'B');
diff --git a/yql/essentials/sql/v1/format/sql_format.cpp b/yql/essentials/sql/v1/format/sql_format.cpp
index 463c52ede4..80ce5d139e 100644
--- a/yql/essentials/sql/v1/format/sql_format.cpp
+++ b/yql/essentials/sql/v1/format/sql_format.cpp
@@ -26,6 +26,7 @@ using namespace NSQLv1Generated;
using NSQLTranslation::TParsedToken;
using NSQLTranslation::TParsedTokenList;
+using NSQLTranslationV1::IsProbablyKeyword;
using TTokenIterator = TParsedTokenList::const_iterator;
TTokenIterator SkipWS(TTokenIterator curr, TTokenIterator end) {
@@ -55,7 +56,7 @@ bool Validate(const TParsedTokenList& query, const TParsedTokenList& formattedQu
if (in->Name != out->Name) {
return false;
}
- if (AsciiEqualsIgnoreCase(in->Name, in->Content)) {
+ if (IsProbablyKeyword(*in)) {
if (!AsciiEqualsIgnoreCase(in->Content, out->Content)) {
return false;
}
diff --git a/yql/essentials/sql/v1/lexer/lexer.cpp b/yql/essentials/sql/v1/lexer/lexer.cpp
index b6d2362f21..1d38ec3d8b 100644
--- a/yql/essentials/sql/v1/lexer/lexer.cpp
+++ b/yql/essentials/sql/v1/lexer/lexer.cpp
@@ -9,6 +9,8 @@
#include <yql/essentials/parser/proto_ast/gen/v1_antlr4/SQLv1Antlr4Lexer.h>
#include <yql/essentials/parser/proto_ast/gen/v1_ansi_antlr4/SQLv1Antlr4Lexer.h>
+#include <util/string/ascii.h>
+
#if defined(_tsan_enabled_)
#include <util/system/mutex.h>
#endif
@@ -74,4 +76,8 @@ NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4) {
return NSQLTranslation::ILexer::TPtr(new TV1Lexer(ansi, antlr4));
}
+bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
+ return AsciiEqualsIgnoreCase(token.Name, token.Content);
+}
+
} // namespace NSQLTranslationV1
diff --git a/yql/essentials/sql/v1/lexer/lexer.h b/yql/essentials/sql/v1/lexer/lexer.h
index fe0102be79..25bfe28f81 100644
--- a/yql/essentials/sql/v1/lexer/lexer.h
+++ b/yql/essentials/sql/v1/lexer/lexer.h
@@ -6,4 +6,10 @@ namespace NSQLTranslationV1 {
NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4);
+// "Probably" because a YQL keyword can be an identifier
+// depending on the query context. For example,
+// in SELECT * FROM group - group is an identifier, but
+// in SELECT * FROM ... GROUP BY ... - group is a keyword.
+bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token);
+
}
diff --git a/yql/essentials/sql/v1/lexer/lexer_ut.cpp b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
new file mode 100644
index 0000000000..679ae472b1
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/lexer_ut.cpp
@@ -0,0 +1,185 @@
+#include "lexer.h"
+
+#include <yql/essentials/core/issue/yql_issue.h>
+#include <yql/essentials/sql/settings/translation_settings.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NSQLTranslation;
+using namespace NSQLTranslationV1;
+
+std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, TString queryUtf8) {
+ TParsedTokenList tokens;
+ NYql::TIssues issues;
+ Tokenize(*lexer, queryUtf8, "Query", tokens, issues, SQL_MAX_PARSER_ERRORS);
+ return {tokens, issues};
+}
+
+TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, TString queryUtf8) {
+ TVector<TString> messages;
+ for (const auto& issue : Tokenize(lexer, queryUtf8).second) {
+ messages.emplace_back(issue.ToString(/* oneLine = */ true));
+ }
+ return messages;
+}
+
+TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) {
+ TVector<TString> names;
+ for (auto& token : Tokenize(lexer, queryUtf8).first) {
+ TString view = std::move(token.Name);
+ if (view == "ID_PLAIN" || view == "STRING_VALUE") {
+ view.append(" (");
+ view.append(token.Content);
+ view.append(")");
+ }
+ names.emplace_back(std::move(view));
+ }
+ return names;
+}
+
+void AssertEquivialent(const TParsedToken& lhs, const TParsedToken& rhs) {
+ if (lhs.Name == "EOF" && rhs.Name == "EOF") {
+ return;
+ }
+
+ UNIT_ASSERT_VALUES_EQUAL(lhs.Name, rhs.Name);
+ UNIT_ASSERT_VALUES_EQUAL(lhs.Content, rhs.Content);
+ UNIT_ASSERT_VALUES_EQUAL(lhs.Line, rhs.Line);
+}
+
+void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) {
+ UNIT_ASSERT_VALUES_EQUAL(lhs.size(), rhs.size());
+ for (size_t i = 0; i < lhs.size(); ++i) {
+ AssertEquivialent(lhs.at(i), rhs.at(i));
+ }
+}
+
+Y_UNIT_TEST_SUITE(SQLv1Lexer) {
+ Y_UNIT_TEST(AntlrVersionIndependent) {
+ const TVector<TString> queriesUtf8 = {
+ "",
+ " ",
+ "SELECT",
+ "SEL", // identifier
+ "SELECT FROM test",
+ "SELECT * FROM",
+ " SELECT * FROM ",
+ "SELECT \"\xF0\x9F\x98\x8A\" FROM ydb",
+ (
+ "SELECT \"\xF0\x9F\x98\x8A Hello, друзья\", count, name\n"
+ "FROM table -- главная таблица 数据库 \n"
+ "WHERE count < 6\n"
+ " AND name = \"可靠性\"\n"
+ " AND count > 12"),
+ "\"select\"select",
+ };
+
+ auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
+ auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
+
+ for (const auto& query : queriesUtf8) {
+ auto [tokens3, issues3] = Tokenize(lexer3, query);
+ auto [tokens4, issues4] = Tokenize(lexer4, query);
+ AssertEquivialent(tokens3, tokens4);
+ UNIT_ASSERT(issues3.Empty());
+ UNIT_ASSERT(issues4.Empty());
+ }
+ }
+
+ TVector<TString> InvalidQueries();
+
+ void TestInvalidTokensSkipped(bool antlr4, const TVector<TVector<TString>>& expected) {
+ auto lexer = MakeLexer(/* ansi = */ false, antlr4);
+
+ auto input = InvalidQueries();
+ UNIT_ASSERT_VALUES_EQUAL(input.size(), expected.size());
+
+ for (size_t i = 0; i < input.size(); ++i) {
+ UNIT_ASSERT_VALUES_EQUAL(GetTokenViews(lexer, input[i]), expected[i]);
+ }
+ }
+
+ TVector<TString> InvalidQueries() {
+ return {
+ /* 0: */ "\xF0\x9F\x98\x8A",
+ /* 1: */ "select \"aaaa",
+ /* 2: */ "\"\\\"",
+ /* 3: */ "\xF0\x9F\x98\x8A SELECT * FR",
+ /* 4: */ "! SELECT * from",
+ /* 5: */ "\xF0\x9F\x98\x8Aselect ! from",
+ /* 6: */ "\"",
+ /* 7: */ "!select",
+ /* 8: */ "SELECT \\\"\xF0\x9F\x98\x8A\\\" FROM test",
+ };
+ }
+
+ Y_UNIT_TEST(ErrorRecoveryAntlr3) {
+ TVector<TVector<TString>> actual = {
+ /* 0: */ {"EOF"},
+ /* 1: */ {"SELECT", "WS", "EOF"},
+ /* 2: */ {"EOF"},
+ /* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
+ /* 4: */ {"ID_PLAIN (ELECT)", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
+ /* 5: */ {"SELECT", "WS", "ID_PLAIN (rom)", "EOF"},
+ /* 6: */ {"EOF"},
+ /* 7: */ {"ID_PLAIN (lect)", "EOF"},
+ /* 8: */ {"SELECT", "WS", "EOF"},
+ };
+ TestInvalidTokensSkipped(/* antlr4 = */ false, actual);
+ }
+
+ Y_UNIT_TEST(ErrorRecoveryAntlr4) {
+ TVector<TVector<TString>> actual = {
+ /* 0: */ {"EOF"},
+ /* 1: */ {"SELECT", "WS", "EOF"},
+ /* 2: */ {"EOF"},
+ /* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
+ /* 4: */ {"SELECT", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
+ /* 5: */ {"SELECT", "WS", "FROM", "EOF"},
+ /* 6: */ {"EOF"},
+ /* 7: */ {"ID_PLAIN (elect)", "EOF"},
+ /* 8: */ {"SELECT", "WS", "EOF"},
+ };
+ TestInvalidTokensSkipped(/* antlr4 = */ true, actual);
+ }
+
+ Y_UNIT_TEST(IssuesCollected) {
+ auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
+ auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
+
+ for (const auto& query : InvalidQueries()) {
+ auto issues3 = GetIssueMessages(lexer3, query);
+ auto issues4 = GetIssueMessages(lexer4, query);
+
+ UNIT_ASSERT(!issues3.empty());
+ UNIT_ASSERT(!issues4.empty());
+ }
+ }
+
+ Y_UNIT_TEST(IssueMessagesAntlr3) {
+ auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
+
+ auto actual = GetIssueMessages(lexer3, "\xF0\x9F\x98\x8A SELECT * FR");
+
+ TVector<TString> expected = {
+ "<main>:1:0: Error: Unexpected character '\xF0\x9F\x98\x8A' (Unicode character <128522>) : cannot match to any predicted input...",
+ "<main>:1:1: Error: Unexpected character : cannot match to any predicted input...",
+ "<main>:1:2: Error: Unexpected character : cannot match to any predicted input...",
+ "<main>:1:3: Error: Unexpected character : cannot match to any predicted input...",
+ };
+
+ UNIT_ASSERT_VALUES_EQUAL(actual, expected);
+ }
+
+ Y_UNIT_TEST(IssueMessagesAntlr4) {
+ auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
+
+ auto actual = GetIssueMessages(lexer4, "\xF0\x9F\x98\x8A SELECT * FR");
+
+ TVector<TString> expected = {
+ "<main>:1:0: Error: token recognition error at: '\xF0\x9F\x98\x8A'",
+ };
+
+ UNIT_ASSERT_VALUES_EQUAL(actual, expected);
+ }
+}
diff --git a/yql/essentials/sql/v1/lexer/ut/ya.make b/yql/essentials/sql/v1/lexer/ut/ya.make
new file mode 100644
index 0000000000..a05178ada5
--- /dev/null
+++ b/yql/essentials/sql/v1/lexer/ut/ya.make
@@ -0,0 +1,12 @@
+UNITTEST_FOR(yql/essentials/sql/v1/lexer)
+
+PEERDIR(
+ yql/essentials/core/issue
+ yql/essentials/parser/lexer_common
+)
+
+SRCS(
+ lexer_ut.cpp
+)
+
+END()
diff --git a/yql/essentials/sql/v1/lexer/ya.make b/yql/essentials/sql/v1/lexer/ya.make
index 5174f6f595..cd5eea1b70 100644
--- a/yql/essentials/sql/v1/lexer/ya.make
+++ b/yql/essentials/sql/v1/lexer/ya.make
@@ -17,3 +17,7 @@ SUPPRESSIONS(
)
END()
+
+RECURSE_FOR_TESTS(
+ ut
+)
diff --git a/yql/essentials/sql/v1/sql_ut_antlr4.cpp b/yql/essentials/sql/v1/sql_ut_antlr4.cpp
index 7f11822cca..b92ea5f460 100644
--- a/yql/essentials/sql/v1/sql_ut_antlr4.cpp
+++ b/yql/essentials/sql/v1/sql_ut_antlr4.cpp
@@ -3004,7 +3004,7 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) {
UNIT_ASSERT(!res.Root);
TString a1 = Err2Str(res);
- TString a2(R"foo(<main>:1:16: Error: Unknown cluster: edar
+ TString a2(R"foo(<main>:1:14: Error: token recognition error at: 'с'
)foo");
UNIT_ASSERT_NO_DIFF(a1, a2);
@@ -3016,8 +3016,8 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) {
UNIT_ASSERT(!res1.Root);
UNIT_ASSERT(!res2.Root);
- UNIT_ASSERT_NO_DIFF(Err2Str(res1), "<main>:1:12: Error: mismatched input 'b' expecting {<EOF>, ';'}\n");
- UNIT_ASSERT_NO_DIFF(Err2Str(res2), "<main>:1:12: Error: mismatched input 'b' expecting {<EOF>, ';'}\n");
+ UNIT_ASSERT_NO_DIFF(Err2Str(res1), "<main>:1:13: Error: token recognition error at: '';'\n");
+ UNIT_ASSERT_NO_DIFF(Err2Str(res2), "<main>:1:13: Error: token recognition error at: '\";'\n");
}
Y_UNIT_TEST(InvalidHexInStringLiteral) {
@@ -3055,7 +3055,7 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) {
Y_UNIT_TEST(InvalidStringFromTable) {
NYql::TAstParseResult res = SqlToYql("select \"FOO\"\"BAR from plato.foo");
UNIT_ASSERT(!res.Root);
- UNIT_ASSERT_NO_DIFF(Err2Str(res), "<main>:1:12: Error: mismatched input '\"' expecting {<EOF>, ';'}\n");
+ UNIT_ASSERT_NO_DIFF(Err2Str(res), "<main>:1:12: Error: token recognition error at: '\"BAR from plato.foo'\n");
}
Y_UNIT_TEST(InvalidDoubleAtStringFromTable) {
diff --git a/yql/essentials/udfs/common/file/file_udf.cpp b/yql/essentials/udfs/common/file/file_udf.cpp
index 57db826591..fa5f84a2c5 100644
--- a/yql/essentials/udfs/common/file/file_udf.cpp
+++ b/yql/essentials/udfs/common/file/file_udf.cpp
@@ -1,4 +1,5 @@
#include <yql/essentials/public/udf/udf_helpers.h>
+#include <yql/essentials/utils/line_split.h>
#include <util/generic/yexception.h>
#include <util/stream/buffered.h>
@@ -236,52 +237,6 @@ namespace {
const TTerminateFunc TerminateFunc;
};
- class TLineSplitter {
- public:
- TLineSplitter(IInputStream& stream)
- : Stream_(stream)
- {
- }
-
- size_t Next(TString& st) {
- st.clear();
- char c;
- size_t ret = 0;
- if (HasPendingLineChar_) {
- st.push_back(PendingLineChar_);
- HasPendingLineChar_ = false;
- ++ret;
- }
-
- while (Stream_.ReadChar(c)) {
- ++ret;
- if (c == '\n') {
- break;
- } else if (c == '\r') {
- if (Stream_.ReadChar(c)) {
- ++ret;
- if (c != '\n') {
- --ret;
- PendingLineChar_ = c;
- HasPendingLineChar_ = true;
- }
- }
-
- break;
- } else {
- st.push_back(c);
- }
- }
-
- return ret;
- }
-
- private:
- IInputStream& Stream_;
- bool HasPendingLineChar_ = false;
- char PendingLineChar_ = 0;
- };
-
template <class TUserType>
class TLineByLineBoxedValueIterator: public TBoxedValue {
public:
diff --git a/yql/essentials/utils/line_split.cpp b/yql/essentials/utils/line_split.cpp
new file mode 100644
index 0000000000..657d0edd9b
--- /dev/null
+++ b/yql/essentials/utils/line_split.cpp
@@ -0,0 +1,39 @@
+#include "line_split.h"
+
+TLineSplitter::TLineSplitter(IInputStream& stream)
+ : Stream_(stream)
+{
+}
+
+size_t TLineSplitter::Next(TString& st) {
+ st.clear();
+ char c;
+ size_t ret = 0;
+ if (HasPendingLineChar_) {
+ st.push_back(PendingLineChar_);
+ HasPendingLineChar_ = false;
+ ++ret;
+ }
+
+ while (Stream_.ReadChar(c)) {
+ ++ret;
+ if (c == '\n') {
+ break;
+ } else if (c == '\r') {
+ if (Stream_.ReadChar(c)) {
+ ++ret;
+ if (c != '\n') {
+ --ret;
+ PendingLineChar_ = c;
+ HasPendingLineChar_ = true;
+ }
+ }
+
+ break;
+ } else {
+ st.push_back(c);
+ }
+ }
+
+ return ret;
+}
diff --git a/yql/essentials/utils/line_split.h b/yql/essentials/utils/line_split.h
new file mode 100644
index 0000000000..65e940990c
--- /dev/null
+++ b/yql/essentials/utils/line_split.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <util/stream/input.h>
+#include <util/generic/string.h>
+
+class TLineSplitter final {
+public:
+ explicit TLineSplitter(IInputStream& stream);
+
+ size_t Next(TString& st);
+
+private:
+ IInputStream& Stream_;
+ bool HasPendingLineChar_ = false;
+ char PendingLineChar_ = 0;
+};
diff --git a/yql/essentials/utils/ya.make b/yql/essentials/utils/ya.make
index 8f0181520a..8f34cc54ba 100644
--- a/yql/essentials/utils/ya.make
+++ b/yql/essentials/utils/ya.make
@@ -13,6 +13,7 @@ SRCS(
hash.cpp
hash.h
limiting_allocator.cpp
+ line_split.cpp
md5_stream.cpp
md5_stream.h
method_index.cpp