aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/unicode/set/unicode_set_parser.cpp
diff options
context:
space:
mode:
authoramnosov <amnosov@yandex-team.com>2022-10-26 11:59:40 +0300
committeramnosov <amnosov@yandex-team.com>2022-10-26 11:59:40 +0300
commit4225eab76862f099d4d55a0205ab0cdd39c0433c (patch)
tree842ff268488999a8f54243cfb10ba96fb333645b /library/cpp/unicode/set/unicode_set_parser.cpp
parent2399206380b6eab57bb7b9ad0bf0ecf851c94c1d (diff)
downloadydb-4225eab76862f099d4d55a0205ab0cdd39c0433c.tar.gz
Unicode::Is{Category}
Unicode::Is{Category} udfs added
Diffstat (limited to 'library/cpp/unicode/set/unicode_set_parser.cpp')
-rw-r--r--library/cpp/unicode/set/unicode_set_parser.cpp109
1 files changed, 109 insertions, 0 deletions
diff --git a/library/cpp/unicode/set/unicode_set_parser.cpp b/library/cpp/unicode/set/unicode_set_parser.cpp
new file mode 100644
index 0000000000..754c7add41
--- /dev/null
+++ b/library/cpp/unicode/set/unicode_set_parser.cpp
@@ -0,0 +1,109 @@
+#include "quoted_pair.h"
+#include "unicode_set_lexer.h"
+
+#include <util/string/cast.h>
+#include <util/charset/wide.h>
+
+namespace NUnicode {
+ namespace NPrivate {
+// Throws a yexception that reports the token most recently returned by the
+// lexer. Expects a variable named `lexer` to be in scope at the expansion site.
+#define UNEXPECTED_TOKEN throw yexception() << "Unexpected token: " << lexer.GetLastToken()
+
+// Reads the next token from `lexer` and throws via UNEXPECTED_TOKEN unless it
+// equals `type`. Wrapped in do { } while (false) so that the macro behaves as a
+// single statement (safe inside un-braced if/else); the caller supplies the
+// terminating semicolon.
+#define EXPECT_TOKEN(type) \
+ do { \
+ if (lexer.GetToken() != (type)) { \
+ UNEXPECTED_TOKEN; \
+ } \
+ } while (false)
+
+ // Forward declaration: ParseCharSequence calls this overload to parse a
+ // nested bracketed set, and this overload in turn calls ParseCharSequence.
+ void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer);
+
+ // Parses the members of a bracketed set and adds them to `set`.
+ //
+ // Tokens are consumed from `lexer` until USTT_RBRACKET is seen; that closing
+ // token is pushed back so the caller can consume it. Accepted members are
+ // single characters (symbols, a literal '^', hexadecimal codepoint escapes),
+ // quoted pairs, ranges "first-last" between two single characters, and nested
+ // bracketed sets.
+ //
+ // Throws yexception when:
+ //   - a token that cannot be a set member is encountered;
+ //   - a codepoint escape is >= TUnicodeSet::CODEPOINT_HIGH;
+ //   - a range has no single character on either side, or its first character
+ //     is not strictly less than its last.
+ void ParseCharSequence(TUnicodeSet& set, TUnicodeSetLexer& lexer) {
+     // Whether a character is available as a range start is tracked with an
+     // explicit flag rather than by comparing against 0, so that an escaped
+     // U+0000 is a usable set member and range start.
+     wchar32 prevChar = 0;
+     bool hasPrev = false;
+     bool range = false;
+     for (EUnicodeSetTokenType type = lexer.GetToken(); type != USTT_RBRACKET; type = lexer.GetToken()) {
+         wchar32 curChar = 0;
+         bool isCodepoint = false;
+         switch (type) {
+             case USTT_SYMBOL:
+                 curChar = lexer.GetLastToken().Symbol;
+                 break;
+             case USTT_NEGATION:
+                 // Past the first position the negation token is an ordinary '^'.
+                 curChar = '^';
+                 break;
+             case USTT_QUOTED_PAIR:
+                 // NOTE(review): ResolveUnicodeQuotedPair is declared in quoted_pair.h;
+                 // presumably it either stores a single character in curChar or adds
+                 // a character class directly to `set` and leaves curChar at 0.
+                 ResolveUnicodeQuotedPair(lexer.GetLastToken().Symbol, curChar, set);
+                 break;
+             case USTT_CODEPOINT8:
+             case USTT_CODEPOINT16:
+             case USTT_CODEPOINT32:
+                 curChar = IntFromString<ui32, 16>(lexer.GetLastToken().Data);
+                 if (curChar >= TUnicodeSet::CODEPOINT_HIGH) {
+                     throw yexception() << "Invalid unicode codepoint: " << lexer.GetLastToken();
+                 }
+                 // An explicit escape always denotes a character, even U+0000.
+                 isCodepoint = true;
+                 break;
+             case USTT_RANGE:
+                 // A range needs a single character immediately before it.
+                 if (!hasPrev) {
+                     UNEXPECTED_TOKEN;
+                 }
+                 range = true;
+                 continue;
+             case USTT_LBRACKET: {
+                 // Nested set: hand the opening bracket back and parse recursively.
+                 lexer.PushBack();
+                 TUnicodeSet inner;
+                 ParseUnicodeSet(inner, lexer);
+                 set.Add(inner);
+                 break;
+             }
+             default:
+                 UNEXPECTED_TOKEN;
+         }
+
+         // For every token kind other than an explicit codepoint escape, a zero
+         // curChar means "this token did not produce a single character".
+         bool hasChar = isCodepoint || curChar != 0;
+         if (hasChar) {
+             if (range) {
+                 if (prevChar >= curChar) {
+                     throw yexception() << "Invalid character range";
+                 }
+                 set.Add(prevChar, curChar);
+                 // The end of a completed range must not start another range.
+                 curChar = 0;
+                 hasChar = false;
+             } else {
+                 set.Add(curChar);
+             }
+         } else if (range) {
+             // The right-hand side of a range was not a single character.
+             UNEXPECTED_TOKEN;
+         }
+         range = false;
+         prevChar = curChar;
+         hasPrev = hasChar;
+     }
+     // A range operator directly before the closing bracket is incomplete.
+     if (range) {
+         UNEXPECTED_TOKEN;
+     }
+     lexer.PushBack();
+ }
+
+ // Parses one complete bracketed set from `lexer` into `set`.
+ //
+ // Token sequence accepted:
+ //   USTT_LBRACKET [USTT_NEGATION] (USTT_CATEGORY | <char sequence>) USTT_RBRACKET
+ //
+ // A category token is forwarded to TUnicodeSet::AddCategory as UTF-8; anything
+ // else is handled by ParseCharSequence. When the negation token follows the
+ // opening bracket, `set` is inverted after the closing bracket has been read.
+ // Throws yexception if the opening or closing bracket token is missing.
+ void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer) {
+     EXPECT_TOKEN(USTT_LBRACKET);
+
+     // Optional leading negation; any other token is handed back to the lexer.
+     const bool negated = (lexer.GetToken() == USTT_NEGATION);
+     if (!negated) {
+         lexer.PushBack();
+     }
+
+     // The body is either a single category name or a character sequence.
+     const bool isCategory = (lexer.GetToken() == USTT_CATEGORY);
+     if (isCategory) {
+         set.AddCategory(WideToUTF8(lexer.GetLastToken().Data));
+     } else {
+         lexer.PushBack();
+         ParseCharSequence(set, lexer);
+     }
+
+     EXPECT_TOKEN(USTT_RBRACKET);
+
+     if (negated) {
+         set.Invert();
+     }
+ }
+
+ // Entry point: parses the whole of `data` as a single bracketed set into `set`.
+ // Throws yexception if the set is malformed or if any token other than
+ // USTT_EOS follows the closing bracket.
+ void ParseUnicodeSet(TUnicodeSet& set, const TWtringBuf& data) {
+     TUnicodeSetLexer lexer(data);
+     ParseUnicodeSet(set, lexer);
+
+     // Nothing may follow the set expression.
+     if (lexer.GetToken() != USTT_EOS) {
+         UNEXPECTED_TOKEN;
+     }
+ }
+
+ } // NPrivate
+}