aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tokenizer/symbols.rl
blob: 3a77bd2cad207cb41e690f230843c10ac4693486 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
%%{
    # Symbol-class layer of the tokenizer grammar: gives the raw cc_* character
    # classes the y*/tok*/misc* names defined below.
    machine Symbols;

    # The cc_* names used below are not defined in this file; presumably they
    # all come from this include -- confirm against charclasses_8.rl.
    include CharacterClasses "charclasses_8.rl";

    #
    # CODES_YANDEX symbols
    #

    EOF = cc_zero;
    accent = cc_accent; # required for multitoken.rl

    yc_lf = cc_linefeed; # [\n]
    yc_cr = cc_carriagereturn; # [\r]
    yc_sp = cc_whitespace; # [\t\n\v\f\r ]

    # yspecialkey is subtracted from both miscnlp and othermisc below, so its
    # members belong to neither of those classes.
    # yspecial overlaps yspecialkey in cc_special_non_ascii, cc_numerosign and
    # cc_copyrightsign.
    yspecialkey = cc_math_non_ascii | cc_currency_non_ascii | cc_special_non_ascii | cc_numerosign | cc_copyrightsign;
    yspecial = accent | cc_softhyphen | cc_nbsp | cc_sectionsign | cc_special | cc_special_non_ascii | cc_numerosign | cc_copyrightsign;

    # Basic alphanumeric building blocks.
    ydigit = cc_digit;
    ycapital = cc_capitalalpha;
    ysmall = cc_smallalpha;

    # Letters: capital, small, or unicase; alphanumerics add the digits.
    yalpha = ycapital | ysmall | cc_unicasealpha;
    yalnum = ydigit | yalpha;

    # ytitle is yalnum without ysmall; cjk_title additionally accepts ideographs.
    ytitle = ydigit | ycapital | cc_unicasealpha; # may be at the beginning of sentence
    cjk_title = ytitle | cc_ideograph;
    # NOTE(review): the equivalence stated on the next line holds only if ysmall
    # is disjoint from ydigit, ycapital and cc_unicasealpha -- confirm against
    # charclasses_8.rl.
    ylower = ysmall;            # the same as (yalnum - ytitle)

    # Terminating punctuation, regular and CJK variants.
    termpunct = cc_termpunct;
    cjk_termpunct = cc_cjk_termpunct;

    # Multitoken composition: delimiters and prefixes
    # (this section defines only tokdelim and tokprefix).
    tokdelim = cc_apostrophe | cc_minus;   # [\'\-] TODO: add yc_underscore [_]
    tokprefix = cc_numbersign | cc_atsign | cc_dollarsign; # [#@$]

    # 1..31 | termpunct | [ \"#\$%&\'()*+,\-/;<=>@\[\\\]\^_\`{|}~] | 0x7F | yspecial
    # yc_07 and yc_1B do not exist
    #
    # miscnlp is the union of cc_nbsp, cc_misctext and yspecial with every
    # yspecialkey symbol removed. cc_nbsp is already a member of yspecial, so
    # listing it separately does not change the resulting set.
    miscnlp =
        (cc_nbsp  |
        cc_misctext |
        yspecial) - yspecialkey;

    # fallback
    # Any symbol left after removing alphanumerics, cc_zero (EOF), miscnlp,
    # ideographs, surrogate lead/tail symbols and yspecialkey.
    othermisc = any - yalnum - cc_zero - miscnlp - cc_ideograph - cc_surrogatelead - cc_surrogatetail - yspecialkey;
}%%