blob: 3a77bd2cad207cb41e690f230843c10ac4693486 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
%%{
machine Symbols;
#
# Named character-class definitions for the tokenizer grammars.
# Every definition below is built from the cc_* classes pulled in by the
# include statement that follows; nothing here introduces new raw character ranges.
#
include CharacterClasses "charclasses_8.rl";
#
# CODES_YANDEX symbols
#
# Alias for the cc_zero class (presumably the zero character marks end of input -- confirm in charclasses_8.rl).
EOF = cc_zero;
accent = cc_accent; # required for multitoken.rl
yc_lf = cc_linefeed; # [\n]
yc_cr = cc_carriagereturn; # [\r]
yc_sp = cc_whitespace; # [\t\n\v\f\r ]
# "Key" special symbols. This class is subtracted from both miscnlp and othermisc below,
# so neither of them can match a yspecialkey character.
yspecialkey = cc_math_non_ascii | cc_currency_non_ascii | cc_special_non_ascii | cc_numerosign | cc_copyrightsign;
# All special symbols. Shares cc_special_non_ascii, cc_numerosign and cc_copyrightsign with yspecialkey.
yspecial = accent | cc_softhyphen | cc_nbsp | cc_sectionsign | cc_special | cc_special_non_ascii | cc_numerosign | cc_copyrightsign;
ydigit = cc_digit;
ycapital = cc_capitalalpha;
ysmall = cc_smallalpha;
# Letters: upper-case, lower-case and cc_unicasealpha.
yalpha = ycapital | ysmall | cc_unicasealpha;
yalnum = ydigit | yalpha;
ytitle = ydigit | ycapital | cc_unicasealpha; # may be at the beginning of sentence
# ytitle extended with ideographs.
cjk_title = ytitle | cc_ideograph;
# NOTE(review): the equivalence stated below holds only if ysmall does not overlap ydigit, ycapital or cc_unicasealpha -- confirm.
ylower = ysmall; # the same as (yalnum - ytitle)
termpunct = cc_termpunct;
cjk_termpunct = cc_cjk_termpunct;
# Multitoken composition: delimiters and prefixes (no suffix class is defined in this block)
tokdelim = cc_apostrophe | cc_minus; # [\'\-] TODO: add yc_underscore [_]
tokprefix = cc_numbersign | cc_atsign | cc_dollarsign; # [#@$]
# Miscellaneous text symbols, excluding yspecialkey.
# Intended contents, as listed by the original author:
# 1..31 | termpunct | [ \"#\$%&\'()*+,\-/;<=>@\[\\\]\^_\`{|}~] | 0x7F | yspecial
# yc_07 and yc_1B do not exist
# cc_nbsp is listed explicitly even though yspecial already contains it; the union is unaffected.
miscnlp =
(cc_nbsp |
cc_misctext |
yspecial) - yspecialkey;
# fallback: any character not claimed by yalnum, cc_zero, miscnlp, cc_ideograph,
# the surrogate lead/tail classes, or yspecialkey
othermisc = any - yalnum - cc_zero - miscnlp - cc_ideograph - cc_surrogatelead - cc_surrogatetail - yspecialkey;
}%%
|