path: root/library/cpp/tokenizer/nlpparserbase.h
#pragma once

#include <library/cpp/token/nlptypes.h>
#include <library/cpp/token/token_structure.h>
#include "multitokenutil.h"

// represents a single multitoken in the collection kept by TNlpParserBase
class TParserToken {
    TTokenStructure Subtokens;
    NLP_TYPE NlpType; // type of multitoken
    bool Hyphen;

public:
    TParserToken()
        : NlpType(NLP_WORD)
        , Hyphen(false)
    {
    }

    explicit TParserToken(const TCharSpan& subtok)
        : NlpType(NLP_WORD)
        , Hyphen(false)
    {
        Subtokens.push_back(subtok);
    }
    bool HasHyphen() const {
        return Hyphen;
    }
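    // marks the last subtoken with the given hyphen type and records that this multitoken contains a hyphen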
    void SetHyphen(EHyphenType type) {
        Y_ASSERT(!Subtokens.empty());
        Subtokens.back().Hyphen = type;
        Hyphen = true;
    }
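    // sets the delimiter that separates the last subtoken from the subtoken that follows it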
    void SetTokenDelim(ETokenDelim delim) {
        Y_ASSERT(!Subtokens.empty());
        Subtokens.back().TokenDelim = delim;
    }
    NLP_TYPE GetNlpType() const {
        return NlpType;
    }
    size_t GetSubtokenCount() const {
        return Subtokens.size();
    }
    size_t GetStart() const {
        Y_ASSERT(!Subtokens.empty());
        return Subtokens[0].Pos - Subtokens[0].PrefixLen;
    }
    size_t GetEnd() const {
        Y_ASSERT(!Subtokens.empty());
        return Subtokens.back().EndPos() + Subtokens.back().SuffixLen;
    }
    size_t GetLength() const {
        return GetEnd() - GetStart();
    }
    // allowed suffixes: "+", "++" and "#"
    void AddSubtoken(const TCharSpan& span, size_t prefixLen, wchar16 prefixChar, wchar16 suffixChar) {
        Y_ASSERT(Subtokens.size() < MAX_SUBTOKENS);

        if (Subtokens.empty()) {
            NlpType = (span.Type == TOKEN_WORD ? NLP_WORD : NLP_INTEGER);
        } else {
            if (NlpType != NLP_MARK && span.Type != Subtokens.back().Type)
                NlpType = NLP_MARK;
        }

        Subtokens.push_back(span);
        Subtokens.back().PrefixLen = prefixLen;

        const size_t n = Subtokens.size();
        if (n > 1)
            CorrectDelimiters(Subtokens[n - 2], suffixChar, Subtokens[n - 1], prefixChar);

        Y_ASSERT(NlpType == NLP_WORD || NlpType == NLP_MARK || NlpType == NLP_INTEGER);
    }
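    // a single ideograph (one character, or two for a surrogate pair) becomes a one-subtoken NLP_WORD token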
    void AddIdeograph(size_t len) {
        Y_ASSERT(Subtokens.empty());
        Subtokens.push_back(0, len, TOKEN_WORD);
        NlpType = NLP_WORD;
    }
    void SwapSubtokens(TTokenStructure& other) {
        Subtokens.swap(other);
    }
    // returns length of all subtokens including prefix of the first subtoken and suffix of the last subtoken
    size_t CorrectPositions() {
        Y_ASSERT(!Subtokens.empty());
        // the position of the first subtoken can be non-zero if it was not the first subtoken parsed,
        // but TWideToken::Token must point to the first character of the first subtoken (including its prefix, if any)
        const size_t pos = Subtokens[0].Pos - Subtokens[0].PrefixLen;
        if (pos) {
            const size_t n = Subtokens.size();
            for (size_t i = 0; i < n; ++i)
                Subtokens[i].Pos -= pos;
        }
        Y_ASSERT(Subtokens[0].Pos == Subtokens[0].PrefixLen); // PrefixLen can be equal to 0
        return Subtokens.back().EndPos() + Subtokens.back().SuffixLen;
    }
    void UpdateSuffix() {
        if (Subtokens.empty())
            Y_ASSERT(!"can't update suffix: no subtokens");
        else
            Subtokens.back().SuffixLen += 1;
    }
    size_t GetSuffixLength() const {
        return Subtokens.empty() ? 0 : Subtokens.back().SuffixLen;
    }
    void ResetSuffix() {
        if (!Subtokens.empty()) {
            Subtokens.back().SuffixLen = 0;
        }
    }
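    // called when the multitoken is already full (MAX_SUBTOKENS): the last subtoken is moved out into 'last'
    // so that it can start a new multitoken, and the delimiter before it is turned into a suffix/prefix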
    void CorrectLastToken(TCharSpan& last) {
        Y_ASSERT(Subtokens.size() == MAX_SUBTOKENS);
        last = Subtokens.back();
        Subtokens.pop_back();
        // turn the trailing delimiter '+' into a suffix
        TCharSpan& lasttok = Subtokens.back();
        // strictly speaking, '+' should also be appended when the subtoken already has the suffix '+', because '++' is a valid suffix as well
        if (lasttok.TokenDelim == TOKDELIM_PLUS && lasttok.SuffixLen == 0)
            lasttok.SuffixLen = 1;
        else if (lasttok.TokenDelim == TOKDELIM_AT_SIGN && last.PrefixLen == 0)
            last.PrefixLen = 1;
        lasttok.TokenDelim = TOKDELIM_NULL;
    }
    void Reset() {
        Subtokens.clear();
        NlpType = NLP_WORD;
        Hyphen = false;
    }
};

// represents collection of multitokens in TNlpParser
class TNlpParserBase {
    TVector<TParserToken> Tokens; // it has at least 1 multitoken
    TParserToken* Current;
    TCharSpan CurCharSpan;
    size_t PrefixLen;
    wchar16 PrefixChar;
    wchar16 SuffixChar;

public:
    TNlpParserBase()
        : Tokens(1)
        , Current(&Tokens[0])
        , PrefixLen(0)
        , PrefixChar(0)
        , SuffixChar(0)
    {
    }
    TParserToken& GetToken(size_t i) {
        return Tokens[i];
    }
    size_t GetTokenCount() const {
        return Tokens.size();
    }
    void BeginToken(const wchar16* tokstart, const wchar16* p) {
        // this can be called twice for input such as "exa&shy;&#x301;mple"; see nlptok.rl: mixedtoken, tokfirst/toknext, numfirst/numnext
        if (CurCharSpan.Len == 0)
            CurCharSpan.Pos = p - tokstart;
    }
    void BeginToken(const wchar16* tokstart, const wchar16* p, ETokenType type) {
        BeginToken(tokstart, p);
        CurCharSpan.Type = type;
    }
    void UpdateToken() {
        CurCharSpan.Len += 1;
    }
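    // finishes the current character span and appends it to the current multitoken as a subtoken,
    // starting a new multitoken first if the current one is already full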
    void AddToken() {
        Y_ASSERT(CurCharSpan.Len);
        Y_ASSERT(CurCharSpan.Type == TOKEN_WORD || CurCharSpan.Type == TOKEN_NUMBER);

        if (Current->GetSubtokenCount() == MAX_SUBTOKENS) {
            TCharSpan last; // for backward compatibility the last subtoken of the previous multitoken is popped and ...
            Current->CorrectLastToken(last);
            Tokens.push_back(TParserToken(last)); // ... added to the front of the new multitoken
            Current = &Tokens.back();             // new parser token, Current reassigned immediately after push_back()
        }

        Current->AddSubtoken(CurCharSpan, PrefixLen, PrefixChar, SuffixChar);

        CurCharSpan.Pos = 0;
        CurCharSpan.Len = 0; // it is checked in AddLastToken()
        PrefixLen = 0;
        PrefixChar = 0;
        SuffixChar = 0;
    }
    void AddIdeograph(size_t len) {
        Y_ASSERT(!CurCharSpan.Len && (len == 1 || len == 2));
        Current->AddIdeograph(len);
    }
    void AddLastToken(const wchar16* tokstart, const wchar16* tokend) {
        // - CurCharSpan.Len is reset to 0 in AddToken(): for a multitoken ending with '.', for example
        //   " well-formed. ", the parser has already invoked %add_token because '.' can be the delimiter of the next token
        if (CurCharSpan.Len) {
            const wchar16* const actualStart = tokstart + CurCharSpan.Pos;
            // for example "5% " can end up with (actualStart == tokend) because '%' could be part of the next token when the input contains UTF-8 characters
            if (actualStart < tokend) {
                const size_t actualLen = tokend - actualStart;
                if (CurCharSpan.Len != actualLen) // for example "WORD% NEXTWORD" - '%' could be part of a UTF-8 encoded character and already counted
                    CurCharSpan.Len = actualLen;
                AddToken();
            } else
                CancelToken();
        } else
            CancelToken();

        if (Current->GetSubtokenCount())
            Current->SetTokenDelim(TOKDELIM_NULL); // reset delimiter if any
    }
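    // remembers a one-character prefix ('#', '@' or '$') to be attached to the next subtoken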
    void UpdatePrefix(wchar16 c) {
        Y_ASSERT(c == '#' || c == '@' || c == '$');
        PrefixLen = 1; // length of prefix can't be more than 1
        PrefixChar = c;
    }
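    // appends a suffix character ('#' or '+') to the last subtoken of the current multitoken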
    void UpdateSuffix(wchar16 c) {
        Y_ASSERT(c == '#' || c == '+');
        Current->UpdateSuffix();
        SuffixChar = c;
    }
    TParserToken* GetCurrentToken() {
        return Current;
    }
    void CancelToken() {
        // example: "abc 5% def" - '%' can be the first byte of a UTF-8 encoded character, so the token is started by a call to BeginToken()
        // and UpdateToken() is called as well, but AddToken() is never called because '%' is ultimately interpreted as a misc character,
        // so CurCharSpan.Len must be reset here
        CurCharSpan.Len = 0;
        PrefixLen = 0;
        PrefixChar = 0;
        SuffixChar = 0;
    }
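    // drops all accumulated multitokens and starts over with a single empty one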
    void ResetTokens() {
        Tokens.resize(1);
        Current = &Tokens[0];
        Current->Reset();
    }
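    // the two setters below mark a soft / ordinary hyphen on the last subtoken of the current multitoken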
    void SetSoftHyphen() {
        Current->SetHyphen(HYPHEN_SOFT);
    }
    void SetHyphenation() {
        Current->SetHyphen(HYPHEN_ORDINARY);
    }
    void SetTokenDelim(ETokenDelim delim, wchar16 /*c*/) {
        Current->SetTokenDelim(delim);
    }
};
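
// A minimal usage sketch (not part of the original header), assuming the Ragel-generated scanner
// in nlptok.rl drives the base class roughly as follows for a plain word made of TOKEN_WORD
// characters; the driver function is hypothetical and only illustrates the call order.
// It assumes a non-empty [tokstart, tokend) range.
inline void ExampleDriveNlpParserBase(const wchar16* tokstart, const wchar16* tokend) {
    TNlpParserBase base;
    base.BeginToken(tokstart, tokstart, TOKEN_WORD);  // remember where the subtoken starts and its type
    for (const wchar16* p = tokstart; p != tokend; ++p)
        base.UpdateToken();                           // one call per character of the subtoken
    base.AddLastToken(tokstart, tokend);              // flush the pending span at the end of input
    const TParserToken& tok = base.GetToken(0);       // the accumulated multitoken
    Y_ASSERT(tok.GetNlpType() == NLP_WORD && tok.GetSubtokenCount() == 1);
}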