diff options
author | amnosov <amnosov@yandex-team.com> | 2022-10-26 11:59:40 +0300 |
---|---|---|
committer | amnosov <amnosov@yandex-team.com> | 2022-10-26 11:59:40 +0300 |
commit | 4225eab76862f099d4d55a0205ab0cdd39c0433c (patch) | |
tree | 842ff268488999a8f54243cfb10ba96fb333645b /library/cpp/unicode/set/unicode_set_lexer.rl6 | |
parent | 2399206380b6eab57bb7b9ad0bf0ecf851c94c1d (diff) | |
download | ydb-4225eab76862f099d4d55a0205ab0cdd39c0433c.tar.gz |
Unicode::Is{Category}
Unicode::Is{Category} udfs added
Diffstat (limited to 'library/cpp/unicode/set/unicode_set_lexer.rl6')
-rw-r--r-- | library/cpp/unicode/set/unicode_set_lexer.rl6 | 125 |
1 files changed, 125 insertions, 0 deletions
diff --git a/library/cpp/unicode/set/unicode_set_lexer.rl6 b/library/cpp/unicode/set/unicode_set_lexer.rl6
new file mode 100644
index 0000000000..ebbc131556
--- /dev/null
+++ b/library/cpp/unicode/set/unicode_set_lexer.rl6
@@ -0,0 +1,125 @@
+#include <library/cpp/unicode/set/unicode_set_lexer.h>
+
+#include <util/generic/yexception.h>
+
+namespace NUnicode {
+namespace NPrivate {
+/*
+ * Ragel specification of the unicode_set_lexer machine over unsigned short input symbols.
+ *
+ * The main scanner produces tokens by returning from the enclosing function through
+ * YieldToken(). Its patterns, in order:
+ *   - the symbols ^, -, [ and ] yield USTT_NEGATION, USTT_RANGE, USTT_LBRACKET and
+ *     USTT_RBRACKET;
+ *   - a category, written as an identifier between two colons, yields USTT_CATEGORY
+ *     whose data is the identifier without the colons (ts + 1, length te - ts - 2);
+ *   - an escape character (percent sign or backslash) followed by U, u or x and exactly
+ *     8, 4 or 2 hex digits yields USTT_CODEPOINT32, USTT_CODEPOINT16 or USTT_CODEPOINT8
+ *     whose data is the run of hex digits starting at ts + 2;
+ *   - an escape character followed by a symbol yields USTT_QUOTED_PAIR carrying the
+ *     symbol at ts + 1;
+ *   - any single symbol yields USTT_SYMBOL carrying the symbol at ts.
+ * Which pattern wins when several apply is decided by the Ragel scanner rules.
+ *
+ * The three Incorrect...Error actions throw yexception; they are attached to the category,
+ * the hex digit runs and the quoted symbol with Ragel error-action operators.
+ */
+%%{
+    machine unicode_set_lexer;
+
+    alphtype unsigned short;
+
+    action IncorrectCategoryError {
+        throw yexception() << "incorrect category";
+    }
+
+    action IncorrectEscapedCodepointError {
+        throw yexception() << "incorrect escaped codepoint";
+    }
+
+    action IncorrectQuotedPairError {
+        throw yexception() << "incorrect quoted pair";
+    }
+
+    id = alpha (alnum | '_')*;
+    escape = [%\\];
+
+    category = (':' id ':') <>^IncorrectCategoryError;
+    xdigit8 = xdigit{8} @^IncorrectEscapedCodepointError;
+    xdigit4 = xdigit{4} @^IncorrectEscapedCodepointError;
+    xdigit2 = xdigit{2} @^IncorrectEscapedCodepointError;
+    symbol = any @^IncorrectQuotedPairError;
+
+    main := |*
+        '^' => {
+            return YieldToken(USTT_NEGATION);
+        };
+        '-' => {
+            return YieldToken(USTT_RANGE);
+        };
+        '[' => {
+            return YieldToken(USTT_LBRACKET);
+        };
+        ']' => {
+            return YieldToken(USTT_RBRACKET);
+        };
+        category => {
+            return YieldToken(USTT_CATEGORY, ts + 1, te - ts -2);
+        };
+        escape 'U' xdigit8 => {
+            return YieldToken(USTT_CODEPOINT32, ts + 2, 8);
+        };
+        escape 'u' xdigit4 => {
+            return YieldToken(USTT_CODEPOINT16, ts + 2, 4);
+        };
+        escape 'x' xdigit2 => {
+            return YieldToken(USTT_CODEPOINT8, ts + 2, 2);
+        };
+        escape symbol => {
+            return YieldToken(USTT_QUOTED_PAIR, *(ts + 1));
+        };
+        any => {
+            return YieldToken(USTT_SYMBOL, *ts);
+        };
+    *|;
+
+}%%
+/*
+ * Whatever the Ragel write-data directive emits for this machine is confined to an
+ * anonymous namespace, i.e. it is local to this translation unit.
+ */
+namespace {
+
+%% write data;
+
+}
+/*
+ * Creates a lexer over data.
+ *
+ * Data is initialized from the argument; p points at the first symbol of Data and pe one
+ * past the last one. eof is set to pe, which tells the Ragel-generated code that the
+ * buffer is the final chunk of input. The remaining scanner variables (cs, act, ts, te)
+ * are zeroed before the Ragel-generated init code runs, and UseLast starts out false.
+ *
+ * NOTE(review): the initializers of p, pe and eof read members initialized earlier in
+ * this list; this presumes the header declares the members in the same order - confirm.
+ */
+TUnicodeSetLexer::TUnicodeSetLexer(const TWtringBuf& data)
+    : Data(data)
+    , cs(0)
+    , act(0)
+    , ts(NULL)
+    , te(NULL)
+    , p(Data.data())
+    , pe(Data.data() + Data.size())
+    , eof(pe)
+    , UseLast(false)
+{
+    %% write init;
+}
+
+// Returns the type of the next token of the input.
+//
+// If UseLast is set, the flag is cleared and the type of the previously produced token
+// (LastToken) is returned again without running the machine. Otherwise the Ragel-generated
+// scanner is executed; its token actions return from this function through YieldToken().
+// When the scanner finishes without yielding a token, USTT_EOS is produced.
+// The error actions of the machine throw yexception on malformed input.
+EUnicodeSetTokenType TUnicodeSetLexer::GetToken() {
+    if (UseLast) {
+        UseLast = false;
+        return LastToken.Type;
+    }
+
+    %% write exec;
+
+    return YieldToken(USTT_EOS);
+}
+
+// Records a token of the given type that carries no payload and returns the type.
+// The scanner position is reset first, so the next GetToken() call continues after the token.
+EUnicodeSetTokenType TUnicodeSetLexer::YieldToken(EUnicodeSetTokenType type) {
+    Reset();
+    LastToken = TUnicodeSetToken(type);
+    return type;
+}
+
+// Records a token of the given type that carries a single symbol and returns the type.
+EUnicodeSetTokenType TUnicodeSetLexer::YieldToken(EUnicodeSetTokenType type, wchar16 symbol) {
+    Reset();
+    LastToken = TUnicodeSetToken(type, symbol);
+    return type;
+}
+
+// Records a token of the given type that refers to dataSize symbols starting at dataBegin
+// and returns the type. dataBegin is computed by the caller before Reset() clears ts and te.
+EUnicodeSetTokenType TUnicodeSetLexer::YieldToken(EUnicodeSetTokenType type, const wchar16* dataBegin, size_t dataSize) {
+    Reset();
+    LastToken = TUnicodeSetToken(type, dataBegin, dataSize);
+    return type;
+}
+
+// Prepares the scanner variables for the next GetToken() call: the read position moves to
+// the end of the token that was just matched and the token boundaries are cleared.
+void TUnicodeSetLexer::Reset() {
+    // te is NULL when no token was matched, which is the case on the end-of-stream path of
+    // GetToken(). Copying it into p unconditionally would leave p == NULL, and a further
+    // GetToken() call would then run the machine from a null pointer instead of reporting
+    // USTT_EOS again. Keep p where it is in that case.
+    if (te != NULL) {
+        p = te;
+    }
+    ts = NULL;
+    te = NULL;
+}
+
+} // NPrivate
+} // NUnicode |