// library/cpp/token/token_iterator.h

#pragma once

#include "nlptypes.h"
#include "token_structure.h"

//! Walks over the subtokens of a multitoken and regroups them into smaller multitokens:
//! subtokens that follow each other without a delimiter are merged into marks and
//! "number.number" pairs are merged into floats. This provides backward compatibility
//! for the new tokenization of marks.
//!
//! Intended call sequence (see the assertions in the individual methods):
//!   GetPrefix()                                  - optional, only before the first Next()
//!   while (Next()) { GetMultitoken() / GetNlpType() / GetDelimiter() }
//!
//! The iterator stores references to the token passed to the constructor and to its
//! subtokens, so that token must stay alive while the iterator is in use.
//!
//! @note there are exceptions that are not compatible with the old tokenization:
//!       [v1.0] -> [v 1.0] instead of old [v1 0]
//!       [a+b]  -> [a + b] instead of old [a+ b]
class TTokenIterator {
    const TWideToken& Tok;            //!< the original multitoken
    const TTokenStructure& Subtokens; //!< subtokens of the original multitoken (Tok.SubTokens)
    TTokenStructure Tokens;           //!< group of subtokens produced by the last call to Next()
    NLP_TYPE NlpType;                 //!< NLP type of the group produced by the last call to Next()
    size_t First;                     //!< index of the first subtoken that is not consumed yet
    const size_t Last;                //!< index of the last subtoken of the original multitoken

private:
    //! decides whether the group being built must end right after subtoken @c i
    //! @param subtokens    subtokens of the original multitoken
    //! @param first        index of the first subtoken of the group being built
    //! @param last         index of the last subtoken of the original multitoken
    //! @param i            index of the subtoken that has just been added to the group
    //! @param n            number of entries accumulated in the group so far (Next() passes Tokens.size())
    //! @return true if the group ends after subtoken @c i, false if subtoken i + 1 belongs to the same group
    static bool BreakMultitoken(const TTokenStructure& subtokens, size_t first, size_t last, size_t i, size_t n) {
        Y_ASSERT(i >= first && i <= last);
        if (i == last)
            return true; // nothing follows, so i + 1 is never accessed below for the last subtoken

        const TCharSpan& s = subtokens[i];

        if (s.SuffixLen != 0 || subtokens[i + 1].PrefixLen != 0)
            return true; // no prefix/suffix in the middle

        // no delimiter between the current and the next subtoken
        if (s.TokenDelim == TOKDELIM_NULL) {
            // i < last - 1 guarantees that subtokens[i + 2] exists
            if (i < (last - 1) && subtokens[i + 1].Type == TOKEN_NUMBER && subtokens[i + 1].TokenDelim == TOKDELIM_DOT && subtokens[i + 2].Type == TOKEN_NUMBER)
                return true; // v1.0 -> v 1.0: break before "number.number"

            if (i == first || s.Type != TOKEN_NUMBER || subtokens[i - 1].TokenDelim != TOKDELIM_DOT || subtokens[i - 1].Type != TOKEN_NUMBER)
                return false; // if the current token '2': 1-2a then the current token is a part of a mark

            return true; // 1.2a -> 1.2 a
        }

        // here the current subtoken is followed by a delimiter
        if (s.Type == TOKEN_NUMBER) {
            if (n == 2) {
                Y_ASSERT(i > first && (i + 1) == (first + n));
                if (subtokens[i - 1].TokenDelim == TOKDELIM_DOT) // && subtokens[i + 1].Type == TOKEN_NUMBER)
                    return true;                                 // it is FLOAT
            }

            if (s.TokenDelim == TOKDELIM_DOT && subtokens[i + 1].Type == TOKEN_NUMBER)
                return false; // the current token is a part of a float

            return true; // the current token is number
        }

        // the current token is word

        if (s.TokenDelim != TOKDELIM_APOSTROPHE && s.TokenDelim != TOKDELIM_MINUS)
            return true; // only '-' and '\'' can keep words together (baden-baden, caffrey's)

        // delimiter is '-' or '\''

        if (s.Type != subtokens[i + 1].Type)
            return true; // types of tokens are different

        if (i > first && subtokens[i - 1].TokenDelim == TOKDELIM_NULL)
            return true; // the current token 'a' and the previous token '1' has no delimiter: 1a-b

        return (i < (last - 1) && subtokens[i + 1].TokenDelim == TOKDELIM_NULL); // mark follows the current token 'a': a-b2
    }

public:
    //! @param tok  multitoken to iterate over; it is stored by reference and is expected to
    //!             have at least one subtoken (asserted)
    //! @note if the assertion is compiled out and @c tok has no subtokens, the iterator is
    //!       created in the finished state, so Next() returns false instead of reading
    //!       outside of the subtoken array (size() - 1 would otherwise wrap around)
    explicit TTokenIterator(const TWideToken& tok)
        : Tok(tok)
        , Subtokens(tok.SubTokens)
        , NlpType(NLP_END)
        , First(tok.SubTokens.empty() ? 1 : 0)
        , Last(tok.SubTokens.empty() ? 0 : tok.SubTokens.size() - 1)
    {
        Y_ASSERT(tok.SubTokens.size());
    }
    //! builds the next group of subtokens; the result is available via Get(), GetMultitoken() and GetNlpType()
    //! @return true if one more multitoken is found, false if all subtokens are already consumed
    bool Next() {
        if (Finished())
            return false;

        Tokens.clear();
        size_t i = First;
        do {
            const TCharSpan& s = Subtokens[i];
            if (!Tokens.empty() && Tokens.back().TokenDelim == TOKDELIM_NULL) {
                // the previous entry has no delimiter after it: glue the current subtoken to it
                TCharSpan& mark = Tokens.back();
                mark.Len += s.Len;
                mark.SuffixLen = s.SuffixLen;
                mark.Type = TOKEN_MARK; // change type
                NlpType = NLP_MARK;
            } else {
                Y_ASSERT(Tokens.empty() || Tokens.back().Type == s.Type);
                Tokens.push_back(s);
                NlpType = (s.Type == TOKEN_WORD ? NLP_WORD : NLP_INTEGER);
            }
            // i is post-incremented: BreakMultitoken() examines the subtoken that has just been
            // processed and after the loop i is the index of the first unconsumed subtoken
        } while (!BreakMultitoken(Subtokens, First, Last, i++, Tokens.size()));
        Y_ASSERT(!Tokens.empty());

        if (NlpType == NLP_INTEGER && Tokens.size() == 2) {
            // two numbers separated by a dot are collapsed into a single float entry
            Y_ASSERT(Tokens[0].SuffixLen == 0 && Tokens[0].TokenDelim == TOKDELIM_DOT); // && Tokens[1].SuffixLen == 0);
            NlpType = NLP_FLOAT;
            TCharSpan& first = Tokens[0];
            const TCharSpan& second = Tokens[1];
            first.Len = second.EndPos() - first.Pos;
            first.SuffixLen = second.SuffixLen;
            first.Type = TOKEN_FLOAT;
            first.TokenDelim = TOKDELIM_NULL;
            Tokens.resize(1);
        }

        Tokens.back().TokenDelim = TOKDELIM_NULL; // reset the last delimiter
        First = i;
        return true;
    }
    //! returns the group built by the last call to Next()
    //! @note positions of subtokens of the original multitoken are not changed;
    //!       all tokens can have suffixes
    const TTokenStructure& Get() const {
        return Tokens;
    }
    //! returns true if all subtokens of the original multitoken are consumed
    bool Finished() const {
        return First > Last;
    }
    //! fills @c tok with the group built by the last call to Next();
    //! the first subtoken of multitoken has position equal to 0
    //! @note only word tokens can have suffixes
    void GetMultitoken(TWideToken& tok) const {
        Y_ASSERT(!Tokens.empty());
        tok.SubTokens = Tokens;
        TTokenStructure& subtokens = tok.SubTokens;
        const TCharSpan& first = subtokens[0];
        TCharSpan& last = subtokens.back();
        tok.Token = Tok.Token + first.Pos;
        if (last.Type == TOKEN_WORD) {
            // words keep their suffix; it can be extended if the next subtoken has a prefix
            tok.Leng = last.EndPos() + last.SuffixLen - first.Pos;
            if (!Finished() && Subtokens[First].PrefixLen) {
                const ui16 suffixLen = GetAdditionalSuffixLen();
                tok.Leng += suffixLen;
                last.SuffixLen += suffixLen;
            }
        } else {
            // non-words lose their suffix; integers can get one back from GetIntegerSuffixLen()
            tok.Leng = last.EndPos() - first.Pos;
            last.SuffixLen = 0;
            if (NlpType == NLP_INTEGER && !Finished() && Subtokens[First].PrefixLen) {
                const ui16 suffixLen = GetIntegerSuffixLen();
                tok.Leng += suffixLen;
                last.SuffixLen = suffixLen;
            }
        }
        subtokens[0].PrefixLen = 0;
        // make positions relative to the beginning of the resulting multitoken;
        // the offset is saved first because the loop changes first.Pos as well
        const size_t diff = first.Pos;
        for (auto& subtoken : subtokens)
            subtoken.Pos -= diff;
    }
    //! returns the number of characters to be added to the suffix of a word group when
    //! the next subtoken has a prefix: 1 if the last consumed subtoken is followed by '+', otherwise 0
    //! @note must be called after Next() and only while the iterator is not finished
    ui16 GetAdditionalSuffixLen() const {
        // check the index before the subtoken preceding First is accessed
        Y_ASSERT(First > 0 && !Finished() && Subtokens[First].PrefixLen);
        const TCharSpan& origtok = Subtokens[First - 1];
        Y_ASSERT(origtok.Type == TOKEN_WORD);
        ui16 suffixLen = 0;
        if (origtok.TokenDelim == TOKDELIM_PLUS)
            suffixLen = 1;
        return suffixLen;
    }
    //! returns the suffix length of an integer group when the next subtoken has a prefix:
    //! 0 if the last consumed subtoken is not followed by '+', otherwise the suffix length of
    //! that subtoken, increased by 1 if it is less than 2
    //! @note must be called after Next() and only while the iterator is not finished
    ui16 GetIntegerSuffixLen() const {
        Y_ASSERT(NlpType == NLP_INTEGER && !Finished() && Subtokens[First].PrefixLen && Tokens.size() == 1);
        const TCharSpan& origtok = Subtokens[First - 1];
        ui16 suffixLen = 0;
        if (origtok.TokenDelim == TOKDELIM_PLUS) {
            suffixLen = origtok.SuffixLen;
            if (origtok.SuffixLen < 2)
                suffixLen += 1;
        }
        return suffixLen;
    }
    //! returns NLP type of multitoken returned by GetMultitoken(tok)
    NLP_TYPE GetNlpType() const {
        return NlpType;
    }
    //! called for the first prefix, other prefixes returned as delimiters by GetDelimiter()
    //! @note if there is no prefix then tok.Leng is set to 0 and tok.Token is left untouched
    void GetPrefix(TWideToken& tok) const {
        Y_ASSERT(Tokens.empty()); // Next() must NOT be called
        if (Subtokens.empty() || Subtokens[0].PrefixLen == 0) {
            tok.Leng = 0;
            tok.SubTokens.clear();
        } else {
            tok.Token = Tok.Token;
            tok.Leng = Subtokens[0].PrefixLen;
            tok.SubTokens.clear();
        }
    }
    //! fills @c tok with the text between the group built by the last call to Next() and
    //! the next unconsumed subtoken (or the end of the original multitoken)
    //! @note NLP type of token is NLP_MISCTEXT;
    //!       prefixes always considered as "misctext";
    //!       suffixes of non-words considered as "misctext";
    //!       this function can be called after the last token as well,
    //!       especially when the last non-word token has the suffix
    void GetDelimiter(TWideToken& tok) const {
        Y_ASSERT(!Tokens.empty()); // Next() must be called
        //Y_ASSERT(!Finished());
        const TCharSpan& prev = Tokens.back();
        // endpos is computed the same way as the end of the multitoken in GetMultitoken()
        size_t endpos = prev.EndPos();
        if (prev.Type == TOKEN_WORD) {
            endpos += prev.SuffixLen;
            if (!Finished() && Subtokens[First].PrefixLen)
                endpos += GetAdditionalSuffixLen();
        } else if (NlpType == NLP_INTEGER && !Finished() && Subtokens[First].PrefixLen)
            endpos += GetIntegerSuffixLen();
        tok.Token = Tok.Token + endpos;
        tok.Leng = (Finished() ? Tok.Leng : Tok.SubTokens[First].Pos) - endpos; // length can be equal to 0 in case v1.0 -> v 1.0
        tok.SubTokens.clear();
    }
};