diff options
author | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
---|---|---|
committer | qrort <qrort@yandex-team.com> | 2022-11-30 23:47:12 +0300 |
commit | 22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch) | |
tree | bffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/symbols.rl | |
parent | 332b99e2173f0425444abb759eebcb2fafaa9209 (diff) | |
download | ydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz |
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/symbols.rl')
-rw-r--r-- | library/cpp/tokenizer/symbols.rl | 48 |
1 files changed, 48 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/symbols.rl b/library/cpp/tokenizer/symbols.rl
new file mode 100644
index 00000000000..3a77bd2cad2
--- /dev/null
+++ b/library/cpp/tokenizer/symbols.rl
@@ -0,0 +1,48 @@
+%%{
+    machine Symbols;
+
+    include CharacterClasses "charclasses_8.rl";
+
+    #
+    # CODES_YANDEX symbols: named character sets composed from cc_* classes (presumably defined by the included CharacterClasses machine)
+    #
+
+    EOF = cc_zero; # EOF is an alias for the cc_zero class
+    accent = cc_accent; # required for multitoken.rl
+
+    yc_lf = cc_linefeed; # [\n]
+    yc_cr = cc_carriagereturn; # [\r]
+    yc_sp = cc_whitespace; # [\t\n\v\f\r ]
+
+    yspecialkey = cc_math_non_ascii | cc_currency_non_ascii | cc_special_non_ascii | cc_numerosign | cc_copyrightsign; # subtracted from both miscnlp and othermisc below
+    yspecial = accent | cc_softhyphen | cc_nbsp | cc_sectionsign | cc_special | cc_special_non_ascii | cc_numerosign | cc_copyrightsign; # shares cc_special_non_ascii, cc_numerosign and cc_copyrightsign with yspecialkey
+
+    ydigit = cc_digit;
+    ycapital = cc_capitalalpha;
+    ysmall = cc_smallalpha;
+
+    yalpha = ycapital | ysmall | cc_unicasealpha; # any letter class: capital, small or unicase
+    yalnum = ydigit | yalpha; # digit or letter
+
+    ytitle = ydigit | ycapital | cc_unicasealpha; # may be at the beginning of sentence
+    cjk_title = ytitle | cc_ideograph; # ytitle extended with cc_ideograph
+    ylower = ysmall; # the same as (yalnum - ytitle)
+
+    termpunct = cc_termpunct;
+    cjk_termpunct = cc_cjk_termpunct;
+
+    # Multitoken composition: delimiters and prefixes (the original comment said "suffixes"; no suffix set is defined here)
+    tokdelim = cc_apostrophe | cc_minus; # [\'\-] TODO: add yc_underscore [_]
+    tokprefix = cc_numbersign | cc_atsign | cc_dollarsign; # [#@$]
+
+    # 1..31 | termpunct | [ \"#\$%&\'()*+,\-/;<=>@\[\\\]\^_\`{|}~] | 0x7F | yspecial
+    # yc_07 and yc_1B do not exist
+    miscnlp =
+        (cc_nbsp | # cc_nbsp is also a member of yspecial, so listing it here is redundant but harmless
+        cc_misctext |
+        yspecial) - yspecialkey; # members of yspecialkey are removed from the union
+
+    # fallback: any symbol not in yalnum, cc_zero, miscnlp, cc_ideograph, the surrogate lead/tail classes or yspecialkey
+    othermisc = any - yalnum - cc_zero - miscnlp - cc_ideograph - cc_surrogatelead - cc_surrogatetail - yspecialkey;
+}%%
+
|