aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/multitoken_v2.rl
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
committerqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
commit22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
treebffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/multitoken_v2.rl
parent332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
downloadydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/multitoken_v2.rl')
-rw-r--r--library/cpp/tokenizer/multitoken_v2.rl77
1 files changed, 77 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/multitoken_v2.rl b/library/cpp/tokenizer/multitoken_v2.rl
new file mode 100644
index 0000000000..7c66273e97
--- /dev/null
+++ b/library/cpp/tokenizer/multitoken_v2.rl
@@ -0,0 +1,77 @@
+%%{
+ machine MultitokenDef;
+
+ # AddLastToken(ts, tokend) should be implemented except member functions called here
+
+ action begin_token {
+ BeginToken(ts, p);
+ }
+
+ action begin_word {
+ BeginToken(ts, p, TOKEN_WORD);
+ }
+
+ action begin_number {
+ BeginToken(ts, p, TOKEN_NUMBER);
+ }
+
+ action update_token {
+ UpdateToken();
+ }
+
+ action add_token {
+ AddToken();
+ }
+
+ action update_prefix {
+ UpdatePrefix(*p);
+ }
+
+ action update_suffix {
+ UpdateSuffix(*p);
+ }
+
+ # @ATTENTION if '%' is added to subtokdelim it breaks the code in MakeMultitokenEntry(): utf8 = Find(.., PERCENT_CHAR, ..);
+ # in this case two chars that follow '%' must be checked for one of '0123456789ABCDEF'
+ # @note when '%' action fired 'p' points to the next character so to take the previous character use 'p[-1]'
+
+ tokendelim = ( cc_apostrophe %{ SetTokenDelim(TOKDELIM_APOSTROPHE, p[-1]); } )
+ | ( cc_minus %{ SetTokenDelim(TOKDELIM_MINUS, p[-1]); } ); # ['-] = tokdelim
+
+ multitokendelim = ( cc_plus %{ SetTokenDelim(TOKDELIM_PLUS, p[-1]); } )
+ | ( cc_underscore %{ SetTokenDelim(TOKDELIM_UNDERSCORE, p[-1]); } )
+ | ( cc_slash %{ SetTokenDelim(TOKDELIM_SLASH, p[-1]); } )
+ | ( cc_atsign %{ SetTokenDelim(TOKDELIM_AT_SIGN, p[-1]); } )
+ | ( cc_dot %{ SetTokenDelim(TOKDELIM_DOT, p[-1]); } ); # [+_/@.] = identdelim + [.]
+
+ tokpart = ( tokchar ( tokchar | accent )* ); # | ( yspecialkey );
+ numpart = ( ydigit ( ydigit | accent )* );
+
+ tokfirst = ( ( ( accent* >begin_token ) ( tokpart >begin_word ) ) $update_token %add_token );
+ tokfirst_special = ( ( ( accent* >begin_token ) ( yspecialkey >begin_word ) ) $update_token %add_token );
+ toknext = ( tokpart >begin_word $update_token %add_token );
+
+ numfirst = ( ( ( accent* >begin_token ) ( numpart >begin_number ) ) $update_token %add_token );
+ numnext = ( numpart >begin_number $update_token %add_token );
+
+ #wordpart = tokfirst;
+
+ toksuffix = (cc_numbersign | cc_plus | cc_plus.cc_plus) $update_suffix; # ([#] | [+] | [+][+])
+
+ # - in case of " abc&x301;123 " accent is attached to "abc"
+ # - 'accent*' cannot be removed from the front 'token' and 'number' because in this case text "abc-&x301;123" or
+ # "123-&x301;abc" it will be processed incorrectly
+ # - begin_token can be called twice in case "exa&shy;&#x301;mple" so BeginToken() has 'if (CurCharSpan.Len == 0)'
+ # and it processes only the first call
+
+ solidtoken = ( tokfirst ( numnext toknext )* )
+ | ( numfirst toknext ( numnext toknext )* )
+ | ( numfirst ( toknext numnext )* )
+ | ( tokfirst numnext ( toknext numnext )* )
+ | (tokfirst_special);
+
+ multitoken = ( solidtoken ( tokendelim solidtoken ){,4} );
+ multitokenwithsuffix = ( ( tokprefix $update_prefix )? multitoken toksuffix? );
+ compositemultitoken = ( multitokenwithsuffix ( multitokendelim multitokenwithsuffix )* );
+
+}%%