1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
%%{
machine MultitokenDef;
# Rules for multitokens: word and number parts glued together by delimiter characters,
# optionally wrapped in a prefix and a suffix.
#
# The character classes referenced below (cc_*, tokchar, ydigit, accent, yspecialkey, tokprefix)
# are not defined in this section; presumably they come from definitions placed before it
# or from an included machine - TODO confirm.
#
# Ragel action-embedding operators used in this section:
#   >action  - entering action (fires on the transitions that enter the sub-machine)
#   $action  - all-transition action (fires on every character of the sub-machine)
#   %action  - leaving action (fires on the transition that leaves the sub-machine)
#
# Besides the member functions called in this section, AddLastToken(ts, tokend) should also be implemented.

# Starts a token at [ts, p) without specifying a token type.
action begin_token {
BeginToken(ts, p);
}
# Starts a token at [ts, p) and marks it as a word.
action begin_word {
BeginToken(ts, p, TOKEN_WORD);
}
# Starts a token at [ts, p) and marks it as a number.
action begin_number {
BeginToken(ts, p, TOKEN_NUMBER);
}
# Called for every character consumed inside a token part.
action update_token {
UpdateToken();
}
# Called when a token part has been left, i.e. the part is complete.
action add_token {
AddToken();
}
# Passes the current character (*p) to the prefix handler.
action update_prefix {
UpdatePrefix(*p);
}
# Passes the current character (*p) to the suffix handler.
action update_suffix {
UpdateSuffix(*p);
}
# @ATTENTION if '%' is added to subtokdelim, it breaks the code in MakeMultitokenEntry(): utf8 = Find(.., PERCENT_CHAR, ..);
# in that case the two chars that follow '%' must be checked for being one of '0123456789ABCDEF'
# @note when a '%' (leaving) action fires, 'p' already points to the next character, so use 'p[-1]' to get the delimiter itself
# Delimiters allowed between the solid tokens of a single multitoken.
tokendelim = ( cc_apostrophe %{ SetTokenDelim(TOKDELIM_APOSTROPHE, p[-1]); } )
| ( cc_minus %{ SetTokenDelim(TOKDELIM_MINUS, p[-1]); } ); # ['-] = tokdelim
# Delimiters allowed between multitokens inside a composite multitoken.
multitokendelim = ( cc_plus %{ SetTokenDelim(TOKDELIM_PLUS, p[-1]); } )
| ( cc_underscore %{ SetTokenDelim(TOKDELIM_UNDERSCORE, p[-1]); } )
| ( cc_slash %{ SetTokenDelim(TOKDELIM_SLASH, p[-1]); } )
| ( cc_atsign %{ SetTokenDelim(TOKDELIM_AT_SIGN, p[-1]); } )
| ( cc_dot %{ SetTokenDelim(TOKDELIM_DOT, p[-1]); } ); # [+_/@.] = identdelim + [.]
# A word part: one tokchar followed by any mix of tokchar and accent characters.
tokpart = ( tokchar ( tokchar | accent )* ); # | ( yspecialkey );
# A number part: one ydigit followed by any mix of ydigit and accent characters.
numpart = ( ydigit ( ydigit | accent )* );
# "first" variants accept optional leading accents (begin_token is attached to them) before the part itself;
# "next" variants are the bare part. All of them call update_token on every character and add_token on leaving.
tokfirst = ( ( ( accent* >begin_token ) ( tokpart >begin_word ) ) $update_token %add_token );
tokfirst_special = ( ( ( accent* >begin_token ) ( yspecialkey >begin_word ) ) $update_token %add_token );
toknext = ( tokpart >begin_word $update_token %add_token );
numfirst = ( ( ( accent* >begin_token ) ( numpart >begin_number ) ) $update_token %add_token );
numnext = ( numpart >begin_number $update_token %add_token );
#wordpart = tokfirst;
# Suffix characters; update_suffix is called for each of them.
toksuffix = (cc_numbersign | cc_plus | cc_plus.cc_plus) $update_suffix; # ([#] | [+] | [+][+])
# - in case of " abc&x301;123 " the accent is attached to "abc"
# - 'accent*' cannot be removed from the front of 'token' and 'number' because then text such as
# "abc-&x301;123" or "123-&x301;abc" would be processed incorrectly
# - begin_token can be called twice in case of "exa­́mple", so BeginToken() has 'if (CurCharSpan.Len == 0)'
# and it processes only the first call
# A solid token is a run of strictly alternating word and number parts with no delimiters in between.
# The four alternatives below cover: word..word, number..word, number..number, word..number;
# the fifth one is a single special key.
solidtoken = ( tokfirst ( numnext toknext )* )
| ( numfirst toknext ( numnext toknext )* )
| ( numfirst ( toknext numnext )* )
| ( tokfirst numnext ( toknext numnext )* )
| (tokfirst_special);
# One solid token followed by at most four "tokendelim solidtoken" pairs (i.e. up to five solid tokens).
multitoken = ( solidtoken ( tokendelim solidtoken ){,4} );
# A multitoken with an optional prefix (update_prefix is called for each prefix character) and an optional suffix.
multitokenwithsuffix = ( ( tokprefix $update_prefix )? multitoken toksuffix? );
# One or more multitokens (each with optional prefix/suffix) separated by multitokendelim characters.
compositemultitoken = ( multitokenwithsuffix ( multitokendelim multitokenwithsuffix )* );
}%%
|