// library/cpp/tokenizer/multitokenparser.h

#pragma once

#include <library/cpp/token/nlptypes.h>
#include <library/cpp/token/token_structure.h>
#include "multitokenutil.h"

// NOTE: this class inherited by request tokenizers only
class TMultitokenParser {
    TWideToken Multitoken; //!< length of the whole token can include suffixes, if difference between pos and endpos of two tokens is greater than 1 then there is suffix
    TCharSpan CurCharSpan;
    wchar16 PrefixChar;
    wchar16 SuffixChar;

protected:
    TMultitokenParser()
        : PrefixChar(0)
        , SuffixChar(0)
    {
    }

    //! used for deferring tokens in TReqTokenizer
    void SetMultitoken(const TWideToken& tok) {
        Multitoken = tok;
    }

    const TWideToken& GetMultitoken() const {
        return Multitoken;
    }

    //! @param token    start of the multitoken
    //! @param len      length includes suffix: 'ab-c+' length is equal to 5, in case of prefix operator: '-!abc' length is equal to 3
    //!                 and prefix: '#hashtag', '@user_name'
    void SetMultitoken(const wchar16* token, size_t len) {
        TTokenStructure& subtokens = Multitoken.SubTokens;
        if (!subtokens.empty()) {
            // positions of the first subtoken can be non-zero in the request parser
            // but 'token' in this case must point to the first character of the first subtoken
            const size_t pos = subtokens[0].Pos - subtokens[0].PrefixLen;
            Y_ASSERT((subtokens.back().EndPos() - pos + subtokens.back().SuffixLen) == len);
            if (pos) {
                const size_t n = subtokens.size();
                for (size_t i = 0; i < n; ++i)
                    subtokens[i].Pos -= pos;
            }
        }
        Multitoken.Token = token;
        Multitoken.Leng = len;

        Y_ASSERT(CheckMultitoken(Multitoken));
    }

    const TTokenStructure& GetSubtokens() const {
        return Multitoken.SubTokens;
    }

    void BeginToken(const wchar16* tokstart, const wchar16* p) {
        // it can be called twice in case "exa&shy;&#x301;mple", see nlptok.rl, mixedtoken, tokfirst/toknext, numfirst/numnext
        if (CurCharSpan.Len == 0)
            CurCharSpan.Pos = p - tokstart;
    }

    void BeginToken(const wchar16* tokstart, const wchar16* p, ETokenType type) {
        BeginToken(tokstart, p);
        CurCharSpan.Type = type;
    }

    void UpdateToken() {
        CurCharSpan.Len += 1;
    }

    void AddToken() {
        Y_ASSERT(CurCharSpan.Len);
        Y_ASSERT(CurCharSpan.Type == TOKEN_WORD || CurCharSpan.Type == TOKEN_NUMBER);

        TTokenStructure& tokens = Multitoken.SubTokens;

        // @todo if number of tokens is greater than 64 then the last token can consist of numbers, letters and delimiters...
        tokens.push_back(CurCharSpan);

        const size_t n = tokens.size();
        if (n > 1)
            CorrectDelimiters(tokens[n - 2], SuffixChar, tokens[n - 1], PrefixChar);

        CurCharSpan.Pos = 0;
        CurCharSpan.Len = 0; // it is checked in AddLastToken()
        CurCharSpan.PrefixLen = 0;
        PrefixChar = 0;
        SuffixChar = 0;
    }

    void AddIdeograph(size_t len) {
        Y_ASSERT(!CurCharSpan.Len && (len == 1 || len == 2));
        TTokenStructure& tokens = Multitoken.SubTokens;
        Y_ASSERT(tokens.empty());
        tokens.push_back(0, len, TOKEN_WORD);
    }

    void AddLastToken(const wchar16* tokstart, const wchar16* tokend) {
        // - CurCharSpan.Len assigned to 0 in AddToken() because in case of multitoken with '.' at the end, for
        //   example: " well-formed. " parser already called to %add_token because '.' can be delimiter of the next token
        if (CurCharSpan.Len) {
            const wchar16* const actualStart = tokstart + CurCharSpan.Pos;
            // for ex. "5% " can have (actualStart == tokend) because '%' could be part of the next token with utf8 characters
            if (actualStart < tokend) {
                const size_t actualLen = tokend - actualStart;
                if (CurCharSpan.Len != actualLen) // for example "WORD% NEXTWORD" - '%' could be part of UTF8 encoded character and already counted...
                    CurCharSpan.Len = actualLen;
                AddToken();
            } else
                CancelToken();
        } else
            CancelToken();

        TTokenStructure& tokens = Multitoken.SubTokens;
        if (!tokens.empty())
            tokens.back().TokenDelim = TOKDELIM_NULL; // reset delimiter if any
    }

    //! correct the last token if it contains words and numbers and changes length of multitoken
    //! @param len      length of multitoken (including all subtokens), for ex. (te - ts)
    //! @return true if the last token is valid, false - last token is cut off and length is changed
    //! @note in case of '+!abc-...-xyz' length includes '+!', this function doesn't take into account offset of the first subtoken
    //!       if number of subtokens equal to 63 all superfluous subtokens are put into the last subtoken TOKEN_MIXED
    //!       which is cut off in this function
    bool CheckLastToken(size_t& len) {
        TTokenStructure& tokens = Multitoken.SubTokens;
        if (tokens.size() == MAX_SUBTOKENS && tokens.back().Type == TOKEN_MIXED) {
            tokens.pop_back();
            // change delimiter (+) to suffix
            TCharSpan& lasttok = tokens.back();
            len = lasttok.EndPos() + lasttok.SuffixLen;
            // actually '+' should be added if subtoken has suffix '+' because '++' is valid suffix as well
            if (lasttok.TokenDelim == TOKDELIM_PLUS && lasttok.SuffixLen == 0) {
                lasttok.SuffixLen = 1;
                len += 1;
            }
            lasttok.TokenDelim = TOKDELIM_NULL;
            return false;
        }
        Y_ASSERT(tokens.empty() || tokens.back().TokenDelim == TOKDELIM_NULL);
        return true;
    }

    //! @return result of CheckLastToken()
    //! @note positions of the first subtoken can be non-zero in case: +abc, -!xyz,
    //!       tokstart in this cases can point to + and - respectively,
    //!       SetMultitoken() resets position of the first subtoken to 0
    //! @param pos      old position value of the first subtoken before resetting it to 0
    //! @param len      new len of multitoken after cutting off the last subtoken if it is invalid
    //! @note in case of '+!abc-efg' returned 'len' includes the prefix operators '+!' and is equal to 9
    bool SetRequestMultitoken(const wchar16* tokstart, const wchar16* tokend, size_t& len) {
        AddLastToken(tokstart, tokend);
        const TTokenStructure& subtokens = GetSubtokens();
        Y_ASSERT(!subtokens.empty());
        const TCharSpan& firsttok = subtokens[0];
        const TCharSpan& lasttok = subtokens.back();
        len = lasttok.EndPos() + lasttok.SuffixLen; // can't use (te - ts) because postfix can be there
        const bool res = CheckLastToken(len);
        const size_t firsttokStart = firsttok.Pos - firsttok.PrefixLen;
        SetMultitoken(tokstart + firsttokStart, len - firsttokStart);
        return res;
    }

    void UpdatePrefix(wchar16 c) {
        Y_ASSERT(c == '#' || c == '@' || c == '$');
        CurCharSpan.PrefixLen = 1; // length of prefix can't be more than 1
        PrefixChar = c;
    }

    void UpdateSuffix(wchar16 c) {
        Y_ASSERT(c == '#' || c == '+');
        TTokenStructure& tokens = Multitoken.SubTokens;
        if (!tokens.empty()) {
            tokens.back().SuffixLen += 1;
            SuffixChar = c;
        } else
            Y_ASSERT(!"can't update suffix: no subtokens");
    }

    void CancelToken() {
        // example: "abc 5% def", '%' can be the first symbol of utf8 encoded character so token is started by call to BeginToken()
        // and then UpdateToken() is called as well but there is no call to AddToken() because '%' is interpreted as a misc character so
        // CurCharSpan.Len must be reset
        CurCharSpan.Len = 0;
        CurCharSpan.PrefixLen = 0;
        PrefixChar = 0;
        SuffixChar = 0;
    }

    void ClearSubtokens() {
        Multitoken.SubTokens.clear();
    }

    //! @param delim    type of delimiter
    //! @param c        delimiter unicode character
    void SetTokenDelim(ETokenDelim delim, wchar16 /*c*/) {
        Y_ASSERT(!Multitoken.SubTokens.empty());
        Multitoken.SubTokens.back().TokenDelim = delim;
        // @todo remove this condition because unicode delimiters are removed before lemmatization
        //        if (c >= 0x7F) // if it is non-ASCII character
        //            SimpleMultitoken = false;
    }
};