aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/symbols.rl
diff options
context:
space:
mode:
authorqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
committerqrort <qrort@yandex-team.com>2022-11-30 23:47:12 +0300
commit22f8ae0e3f5d68b92aecccdf96c1d841a0334311 (patch)
treebffa27765faf54126ad44bcafa89fadecb7a73d7 /library/cpp/tokenizer/symbols.rl
parent332b99e2173f0425444abb759eebcb2fafaa9209 (diff)
downloadydb-22f8ae0e3f5d68b92aecccdf96c1d841a0334311.tar.gz
validate canons without yatest_common
Diffstat (limited to 'library/cpp/tokenizer/symbols.rl')
-rw-r--r--library/cpp/tokenizer/symbols.rl48
1 files changed, 48 insertions, 0 deletions
diff --git a/library/cpp/tokenizer/symbols.rl b/library/cpp/tokenizer/symbols.rl
new file mode 100644
index 00000000000..3a77bd2cad2
--- /dev/null
+++ b/library/cpp/tokenizer/symbols.rl
@@ -0,0 +1,48 @@
+%%{
+ machine Symbols; # this spec holds only named definitions (no ':=' instantiation)
+
+ include CharacterClasses "charclasses_8.rl"; # presumably the source of the cc_* names used below
+
+ #
+ # CODES_YANDEX symbols
+ #
+
+ EOF = cc_zero; # alias of cc_zero, which othermisc below excludes
+ accent = cc_accent; # required for multitoken.rl
+
+ yc_lf = cc_linefeed; # [\n]
+ yc_cr = cc_carriagereturn; # [\r]
+ yc_sp = cc_whitespace; # [\t\n\v\f\r ]
+ # yspecialkey is subtracted from both miscnlp and othermisc below; three of its members also appear in yspecial
+ yspecialkey = cc_math_non_ascii | cc_currency_non_ascii | cc_special_non_ascii | cc_numerosign | cc_copyrightsign;
+ yspecial = accent | cc_softhyphen | cc_nbsp | cc_sectionsign | cc_special | cc_special_non_ascii | cc_numerosign | cc_copyrightsign;
+
+ ydigit = cc_digit;
+ ycapital = cc_capitalalpha;
+ ysmall = cc_smallalpha;
+
+ yalpha = ycapital | ysmall | cc_unicasealpha; # union of the three alpha classes
+ yalnum = ydigit | yalpha;
+
+ ytitle = ydigit | ycapital | cc_unicasealpha; # may be at the beginning of sentence
+ cjk_title = ytitle | cc_ideograph; # ytitle extended with cc_ideograph
+ ylower = ysmall; # the same as (yalnum - ytitle)
+
+ termpunct = cc_termpunct;
+ cjk_termpunct = cc_cjk_termpunct;
+
+ # Multitoken composition: delimiters and prefixes
+ tokdelim = cc_apostrophe | cc_minus; # [\'\-] TODO: add yc_underscore [_]
+ tokprefix = cc_numbersign | cc_atsign | cc_dollarsign; # [#@$]
+
+ # 1..31 | termpunct | [ \"#\$%&\'()*+,\-/;<=>@\[\\\]\^_\`{|}~] | 0x7F | yspecial
+ # yc_07 and yc_1B do not exist
+ miscnlp =
+ (cc_nbsp | # cc_nbsp is also a member of yspecial, so it is listed twice in this union
+ cc_misctext |
+ yspecial) - yspecialkey; # the difference removes every yspecialkey member from the union
+
+ # fallback: any character left after removing all of the classes subtracted here
+ othermisc = any - yalnum - cc_zero - miscnlp - cc_ideograph - cc_surrogatelead - cc_surrogatetail - yspecialkey;
+}%%
+