library/cpp/token/charfilter.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224

#pragma once

#include <util/charset/wide.h>
#include <util/generic/singleton.h>

#include "token_structure.h"

//! represents a set of accent characters
//! @note this class is intended to be a singleton because it has quite a large array (64 KB)
class TAccentTable: private TNonCopyable {
public:
    TAccentTable();

    //! @return @c true if the table entry for the code unit @c c is non-zero
    bool operator[](wchar16 c) const {
        return Data[c];
    }

private:
    // One entry per possible index 0x0000..0xFFFF inclusive. The previous size of 0xFFFF
    // left index 0xFFFF one past the end of the array, so looking up that code unit
    // was an out-of-bounds read (assuming wchar16 is a 16-bit code unit).
    enum { DATA_SIZE = 0x10000 };
    unsigned char Data[DATA_SIZE];
};

//! lightweight accessor to the accent table; unlike the table itself it can be copy-constructed
class TAccents {
public:
    //! binds the accessor to the instance returned by @c HugeSingleton<TAccentTable>()
    TAccents()
        : Table(*HugeSingleton<TAccentTable>())
    {
    }

    //! @param c character to look up
    //! @return @c true if the table marks @c c as an accent symbol
    bool Check(wchar16 c) const {
        const bool isAccent = Table[c];
        return isAccent;
    }

private:
    const TAccentTable& Table; // non-owning reference to the table obtained in the constructor
};

//! removes characters from @c TWideToken using a filter
//! @note the checker class must have the default constructor and @c Check member function
//! @note when something is removed, the result is the filter's internal token whose characters
//!       are written to the internal buffer, so it is overwritten by the next such call
//! NOTE(review): nothing in this class compares the token length with @c bufSize - confirm
//!               that callers size the buffer for the longest token they pass in
template <typename TChecker>
class TCharFilter {
    TWideToken Token;  // result token, filled only when at least one character was removed
    TCharTemp Buffer;  // destination storage for the characters of the result token
    TChecker Checker;

private:
    //! copies already verified data - characters and subtokens
    //! @param token    source token
    //! @param n        number of leading characters to copy
    //! @param t        number of leading subtokens to copy
    //! @param data     destination character buffer
    //! @return pointer to the position in @c data right after the copied characters
    wchar16* CopyData(const TWideToken& token, size_t n, size_t t, wchar16* const data) {
        std::char_traits<wchar16>::copy(data, token.Token, n); // without current character

        Token.SubTokens.clear();
        for (size_t i = 0; i < t; ++i) {
            Token.SubTokens.push_back(token.SubTokens[i]);
        }

        return data + n;
    }

    //! looks for the first character that lies inside a subtoken and is accepted by the checker
    //! @param checker  checker to query
    //! @param token    token to scan
    //! @param pos      receives the index of the found character (unchanged if nothing is found)
    //! @param sub      receives the index of the subtoken containing it (unchanged if nothing is found)
    //! @return @c true if such a character exists
    //! @note characters located after the last subtoken are never inspected
    static bool FindFirstInSubTokens(TChecker& checker, const TWideToken& token, size_t& pos, size_t& sub) {
        const TTokenStructure& subtokens = token.SubTokens;
        const wchar16* const s = token.Token; // source character sequence
        size_t i = 0;                       // the index of the current source character
        size_t t = 0;                       // the index of the next source subtoken

        while (i < token.Leng && t < subtokens.size()) {
            if (i >= subtokens[t].Pos && i < subtokens[t].EndPos()) {
                // inside a token
                if (checker.Check(s[i])) {
                    pos = i;
                    sub = t;
                    return true;
                }
            }

            ++i;

            if (i >= subtokens[t].EndPos()) {
                ++t;
            }
        }

        return false;
    }

public:
    //! @param bufSize size passed to the internal character buffer
    explicit TCharFilter(size_t bufSize)
        : Buffer(bufSize)
    {
    }

    //! removes the characters accepted by the checker from the token
    //! @return the source token if it contains no such character,
    //!         otherwise the internal token holding the filtered copy
    const TWideToken& Filter(const TWideToken& token) {
        if (token.SubTokens.empty())
            return FilterTokenNoSubtokens(token);
        else
            return FilterTokenWithSubtokens(token);
    }

    //! @return @c true if any character inside a subtoken of @c token is accepted by the checker
    //! @note only characters covered by subtokens are inspected, so a token without
    //!       subtokens always yields @c false here, although @c Filter does process such tokens
    static bool HasChars(const TWideToken& token) {
        TChecker checker;
        size_t pos = 0;
        size_t sub = 0;
        return FindFirstInSubTokens(checker, token, pos, sub);
    }

private:
    //! scans a token that has subtokens and switches to the copying overload
    //! as soon as the first character to be removed is found
    const TWideToken& FilterTokenWithSubtokens(const TWideToken& token) {
        Y_ASSERT(!token.SubTokens.empty());
        Y_ASSERT(token.SubTokens.back().EndPos() <= token.Leng);

        size_t i = 0; // the index of the first character to be removed
        size_t t = 0; // the index of the subtoken containing it

        if (FindFirstInSubTokens(Checker, token, i, t))
            return FilterTokenWithSubtokens(token, token.Token, i, t, Buffer.Data());

        return token;
    }

    //! builds the filtered copy of a token with subtokens
    //! @param token    source token
    //! @param s        source character sequence
    //! @param i        index of the first character to be removed
    //! @param t        index of the subtoken containing that character
    //! @param buffer   destination character buffer
    //! @return the internal token; subtokens that lose all their characters are dropped
    const TWideToken& FilterTokenWithSubtokens(
        const TWideToken& token, const wchar16* s, size_t i, size_t t, wchar16* const buffer) {
        Y_ASSERT(i < token.Leng && t < token.SubTokens.size() && s >= token.Token);

        const TTokenStructure& subtokens = token.SubTokens;
        wchar16* d = CopyData(token, i, t, buffer); // destination character
        TCharSpan span = subtokens[t];              // nothing was removed before i, so Pos is still valid

        while (i < token.Leng) {
            if (t < subtokens.size()) {
                if (i >= subtokens[t].Pos && i < subtokens[t].EndPos()) {
                    // inside a token
                    if (Checker.Check(s[i])) {
                        Y_ASSERT(span.Len);
                        --span.Len;
                    } else {
                        *d++ = s[i];
                    }
                } else {
                    // outside of tokens
                    *d++ = s[i];
                }

                ++i;

                if (i >= subtokens[t].EndPos()) {
                    ++t;

                    if (span.Len)
                        Token.SubTokens.push_back(span);

                    if (t < subtokens.size()) {
                        // number of characters removed so far
                        const size_t diff = i - (d - buffer);
                        Y_ASSERT(subtokens[t].Pos >= diff);
                        // start from a full copy of the next source subtoken so that any
                        // fields other than Pos/Len come from that subtoken and are not
                        // inherited from the previous one, then shift its position
                        span = subtokens[t];
                        span.Pos = subtokens[t].Pos - diff;
                    }
                }
            } else {
                // copy the remainder of characters
                const size_t n = token.Leng - i;
                std::char_traits<wchar16>::copy(d, &s[i], n);
                d += n;
                break;
            }
        }

        Token.Token = buffer;
        Token.Leng = d - buffer;
        Y_ASSERT(!Token.SubTokens.size() || (Token.SubTokens.size() && Token.Leng >= Token.SubTokens.back().EndPos()));
        return Token;
    }

    //! scans a token without subtokens and switches to the copying overload
    //! as soon as the first character to be removed is found
    const TWideToken& FilterTokenNoSubtokens(const TWideToken& token) {
        Y_ASSERT(token.SubTokens.empty());
        const wchar16* s = token.Token;
        const wchar16* const e = s + token.Leng;

        for (; s != e; ++s) {
            if (Checker.Check(*s))
                return FilterTokenNoSubtokens(token.Token, s, e, Buffer.Data());
        }

        return token;
    }

    //! builds the filtered copy of a token without subtokens
    //! @param token    beginning of the source characters
    //! @param s        first character to be removed
    //! @param e        end of the source characters
    //! @param buffer   destination character buffer
    //! @return the internal token
    const TWideToken& FilterTokenNoSubtokens(
        const wchar16* const token, const wchar16* s, const wchar16* const e, wchar16* const buffer) {
        const size_t n = s - token;
        std::char_traits<wchar16>::copy(buffer, token, n);
        wchar16* d = buffer + n;

        for (; s != e; ++s) {
            if (!Checker.Check(*s))
                *d++ = *s;
        }

        // the source has no subtokens, so the result must not keep the ones
        // left in the internal token by an earlier call on a token with subtokens
        Token.SubTokens.clear();
        Token.Token = buffer;
        Token.Leng = d - buffer;
        Y_ASSERT(Token.Leng);
        return Token;
    }
};

// Declarations only; the definitions are not part of this header.
// NOTE(review): the descriptions below are inferred from the names and signatures - confirm against the implementation.

//! presumably returns the decomposition sequence used for the character @c ch
const wchar32* LemmerDecomposition(wchar32 ch, bool advancedGermanUmlauts = true, bool extTable = false);
//! presumably normalizes @c length characters of @c word into @c converted (capacity @c bufLen) and returns the resulting length
size_t NormalizeUnicode(const wchar16* word, size_t length, wchar16* converted, size_t bufLen, bool advancedGermanUmlauts = true, bool extTable = false);
//! overload taking a string and returning the result as a new string
TUtf16String NormalizeUnicode(const TUtf16String& w, bool advancedGermanUmlauts = true, bool extTable = false);
//! overload taking a string buffer and returning the result as a new string
TUtf16String NormalizeUnicode(const TWtringBuf& wbuf, bool advancedGermanUmlauts = true, bool extTable = false);