diff options
author | amnosov <amnosov@yandex-team.com> | 2022-10-26 11:59:40 +0300 |
---|---|---|
committer | amnosov <amnosov@yandex-team.com> | 2022-10-26 11:59:40 +0300 |
commit | 4225eab76862f099d4d55a0205ab0cdd39c0433c (patch) | |
tree | 842ff268488999a8f54243cfb10ba96fb333645b /library/cpp/unicode/set/unicode_set_parser.cpp | |
parent | 2399206380b6eab57bb7b9ad0bf0ecf851c94c1d (diff) | |
download | ydb-4225eab76862f099d4d55a0205ab0cdd39c0433c.tar.gz |
Unicode::Is{Category}
Unicode::Is{Category} udfs added
Diffstat (limited to 'library/cpp/unicode/set/unicode_set_parser.cpp')
-rw-r--r-- | library/cpp/unicode/set/unicode_set_parser.cpp | 109 |
1 files changed, 109 insertions, 0 deletions
diff --git a/library/cpp/unicode/set/unicode_set_parser.cpp b/library/cpp/unicode/set/unicode_set_parser.cpp new file mode 100644 index 0000000000..754c7add41 --- /dev/null +++ b/library/cpp/unicode/set/unicode_set_parser.cpp @@ -0,0 +1,109 @@ +#include "quoted_pair.h" +#include "unicode_set_lexer.h" + +#include <util/string/cast.h> +#include <util/charset/wide.h> + +namespace NUnicode { + namespace NPrivate { +#define UNEXPECTED_TOKEN throw yexception() << "Unexpected token: " << lexer.GetLastToken() + +#define EXPECT_TOKEN(type) \ + if (lexer.GetToken() != type) { \ + UNEXPECTED_TOKEN; \ + } + + void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer); + + void ParseCharSequence(TUnicodeSet& set, TUnicodeSetLexer& lexer) { + wchar32 prevChar = 0; + bool range = false; + for (EUnicodeSetTokenType type = lexer.GetToken(); type != USTT_RBRACKET; type = lexer.GetToken()) { + wchar32 curChar = 0; + switch (type) { + case USTT_SYMBOL: + curChar = lexer.GetLastToken().Symbol; + break; + case USTT_NEGATION: + curChar = '^'; + break; + case USTT_QUOTED_PAIR: + ResolveUnicodeQuotedPair(lexer.GetLastToken().Symbol, curChar, set); + break; + case USTT_CODEPOINT8: + case USTT_CODEPOINT16: + case USTT_CODEPOINT32: + curChar = IntFromString<ui32, 16>(lexer.GetLastToken().Data); + if (curChar >= TUnicodeSet::CODEPOINT_HIGH) { + throw yexception() << "Invalid unicode codepoint: " << lexer.GetLastToken(); + } + break; + case USTT_RANGE: + if (0 == prevChar) { + UNEXPECTED_TOKEN; + } + range = true; + continue; + case USTT_LBRACKET: { + lexer.PushBack(); + TUnicodeSet inner; + ParseUnicodeSet(inner, lexer); + set.Add(inner); + break; + } + default: + UNEXPECTED_TOKEN; + } + if (curChar) { + if (range) { + if (prevChar >= curChar) { + throw yexception() << "Invalid character range"; + } + set.Add(prevChar, curChar); + curChar = 0; + } else { + set.Add(curChar); + } + } else if (range) { + UNEXPECTED_TOKEN; + } + range = false; + prevChar = curChar; + } + if (range) { + UNEXPECTED_TOKEN; + } + lexer.PushBack(); + } + + void ParseUnicodeSet(TUnicodeSet& set, TUnicodeSetLexer& lexer) { + EXPECT_TOKEN(USTT_LBRACKET); + bool invert = false; + if (USTT_NEGATION == lexer.GetToken()) { + invert = true; + } else { + lexer.PushBack(); + } + + if (USTT_CATEGORY == lexer.GetToken()) { + set.AddCategory(WideToUTF8(lexer.GetLastToken().Data)); + } else { + lexer.PushBack(); + ParseCharSequence(set, lexer); + } + + EXPECT_TOKEN(USTT_RBRACKET); + + if (invert) { + set.Invert(); + } + } + + void ParseUnicodeSet(TUnicodeSet& set, const TWtringBuf& data) { + TUnicodeSetLexer lexer(data); + ParseUnicodeSet(set, lexer); + EXPECT_TOKEN(USTT_EOS); + } + + } // NPrivate +} |