diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/multitoken_v3.rl | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/multitoken_v3.rl')
-rw-r--r-- | library/cpp/tokenizer/multitoken_v3.rl | 77 |
1 files changed, 77 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/multitoken_v3.rl b/library/cpp/tokenizer/multitoken_v3.rl
new file mode 100644
index 0000000000..53fce9be87
--- /dev/null
+++ b/library/cpp/tokenizer/multitoken_v3.rl
@@ -0,0 +1,77 @@
+%%{
+    machine MultitokenDef;
+
+    # In addition to the member functions called from the actions here, AddLastToken(ts, tokend) should be implemented
+
+    action begin_token {
+        BeginToken(ts, p);
+    }
+
+    action begin_word {
+        BeginToken(ts, p, TOKEN_WORD);
+    }
+
+    action begin_number {
+        BeginToken(ts, p, TOKEN_NUMBER);
+    }
+
+    action update_token {
+        UpdateToken();
+    }
+
+    action add_token {
+        AddToken();
+    }
+
+    action update_prefix {
+        UpdatePrefix(*p);
+    }
+
+    action update_suffix {
+        UpdateSuffix(*p);
+    }
+
+    # @ATTENTION: adding '%' to subtokdelim breaks the code in MakeMultitokenEntry(): utf8 = Find(.., PERCENT_CHAR, ..);
+    #             in that case the two chars that follow '%' must be checked to be one of '0123456789ABCDEF'
+    # @note when a '%' (leaving) action fires, 'p' already points to the next character, so use 'p[-1]' to get the previous one
+
+    tokendelim = ( cc_apostrophe %{ SetTokenDelim(TOKDELIM_APOSTROPHE, p[-1]); } )
+               | ( cc_minus %{ SetTokenDelim(TOKDELIM_MINUS, p[-1]); } ); # ['-] = tokdelim
+
+    multitokendelim = ( cc_plus %{ SetTokenDelim(TOKDELIM_PLUS, p[-1]); } )
+                    | ( cc_underscore %{ SetTokenDelim(TOKDELIM_UNDERSCORE, p[-1]); } )
+                    | ( cc_slash %{ SetTokenDelim(TOKDELIM_SLASH, p[-1]); } )
+                    | ( cc_atsign %{ SetTokenDelim(TOKDELIM_AT_SIGN, p[-1]); } )
+                    | ( cc_dot %{ SetTokenDelim(TOKDELIM_DOT, p[-1]); } ); # [+_/@.] = identdelim + [.]
+
+    tokpart = ( tokchar ( tokchar | accent )* ); # | ( yspecialkey );
+    numpart = ( ydigit ( ydigit | accent )* ); # one ydigit, then any run of ydigit / accent
+
+    tokfirst = ( ( ( accent* >begin_token ) ( tokpart >begin_word ) ) $update_token %add_token );
+    tokfirst_special = ( ( ( accent* >begin_token ) ( yspecialkey >begin_word ) ) $update_token %add_token );
+    toknext = ( tokpart >begin_word $update_token %add_token );
+
+    numfirst = ( ( ( accent* >begin_token ) ( numpart >begin_number ) ) $update_token %add_token );
+    numnext = ( numpart >begin_number $update_token %add_token );
+
+    #wordpart = tokfirst;
+
+    toksuffix = (cc_numbersign | cc_plus | cc_plus . cc_plus) $update_suffix; # ([#] | [+] | [++])
+
+    # - in case of " abc&x301;123 " the accent is attached to "abc"
+    # - 'accent*' cannot be removed from the front of 'token' and 'number' because then text like "abc-&x301;123" or
+    #   "123-&x301;abc" would be processed incorrectly
+    # - begin_token can be called twice in a case like "exa­́mple", so BeginToken() has 'if (CurCharSpan.Len == 0)'
+    #   and processes only the first call
+
+    solidtoken = ( tokfirst ( numnext toknext )* ) # word ( number word )*
+               | ( numfirst toknext ( numnext toknext )* ) # number word ( number word )*
+               | ( numfirst ( toknext numnext )* ) # number ( word number )*
+               | ( tokfirst numnext ( toknext numnext )* ) # word number ( word number )*
+               | (tokfirst_special); # a single yspecialkey, optionally preceded by accents
+
+    multitoken = ( solidtoken ( tokendelim solidtoken ){,4} ); # one solidtoken plus at most 4 more, each joined by a tokendelim
+    multitokenwithsuffix = ( ( tokprefix $update_prefix )? multitoken toksuffix? ); # optional tokprefix (not defined in this file) and optional toksuffix
+    compositemultitoken = ( multitokenwithsuffix ( multitokendelim multitokenwithsuffix )* ); # one or more multitokenwithsuffix joined by multitokendelim
+
+}%% |