aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/charset/codepage.h
blob: 9e73fbd6154e9f1b5e4d7e0e2c06672e9d4dd383 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#pragma once

#include "doccodes.h"

#include <util/charset/recode_result.h>
#include <util/charset/unidata.h> // all wchar32 functions
#include <util/charset/utf8.h>
#include <util/generic/string.h>
#include <util/generic/ylimits.h>
#include <util/generic/yexception.h>
#include <util/system/yassert.h>
#include <util/system/defaults.h>

#include <cctype>

// Forward declarations of the three core types that are fully defined later
// in this header.
struct CodePage;
struct Recoder;
struct Encoder;

/*****************************************************************\
*                    struct CodePage                              *
\*****************************************************************/
// Description of one charset: its ECharset id, its names, a 256-entry table
// of wchar32 values indexed by byte, and helpers that classify or case-convert
// a byte by going through that table.
struct CodePage {
    // Charset id; also used as the index into the per-charset case recoder
    // tables by the inline ToUpper/ToLower/ToTitle(unsigned char) definitions.
    ECharset CPEnum;       // int MIBEnum;
    const char* Names[30]; // name[0] -- preferred mime-name
    // wchar32 value for each of the 256 possible byte values; every
    // classification method below looks its argument up here.
    wchar32 unicode[256];
    // Non-null exactly for codepages that SingleByteCodepage() reports as single byte.
    const char* DefaultChar; //[CCL_NUM]

    // Byte classification: each method forwards unicode[ch] to the global
    // wchar32 function of the same name.
    bool IsLower(unsigned char ch) const {
        return ::IsLower(unicode[ch]);
    }
    bool IsUpper(unsigned char ch) const {
        return ::IsUpper(unicode[ch]);
    }
    bool IsAlpha(unsigned char ch) const {
        return ::IsAlpha(unicode[ch]);
    }
    bool IsDigit(unsigned char ch) const {
        return ::IsDigit(unicode[ch]);
    }
    bool IsXdigit(unsigned char ch) const {
        return ::IsXdigit(unicode[ch]);
    }
    bool IsAlnum(unsigned char ch) const {
        return ::IsAlnum(unicode[ch]);
    }
    bool IsSpace(unsigned char ch) const {
        return ::IsSpace(unicode[ch]);
    }
    bool IsPunct(unsigned char ch) const {
        return ::IsPunct(unicode[ch]);
    }
    bool IsCntrl(unsigned char ch) const {
        return ::IsCntrl(unicode[ch]);
    }
    bool IsGraph(unsigned char ch) const {
        return ::IsGraph(unicode[ch]);
    }
    bool IsPrint(unsigned char ch) const {
        return ::IsPrint(unicode[ch]);
    }
    bool IsComposed(unsigned char ch) const {
        return ::IsComposed(unicode[ch]);
    }

    // Buffer lower-casing (defined out of line).
    // return pointer to char after the last char
    // NOTE(review): the two-argument overload presumably expects a
    // NUL-terminated input -- confirm against the definition.
    char* ToLower(const char* begin, const char* end, char* to) const;
    char* ToLower(const char* begin, char* to) const;

    // Buffer upper-casing (defined out of line).
    // return pointer to char after the last char
    char* ToUpper(const char* begin, const char* end, char* to) const;
    char* ToUpper(const char* begin, char* to) const;

    // Defined out of line.
    // NOTE(review): by their names these are case-insensitive comparisons in
    // this codepage -- confirm the return convention against the definition.
    int stricmp(const char* s1, const char* s2) const;
    int strnicmp(const char* s1, const char* s2, size_t len) const;

    // Single-byte case conversion; the inline definitions appear later in this
    // header and read the recoder tables of NCodepagePrivate::TCodePageData.
    inline unsigned char ToUpper(unsigned char ch) const;
    inline unsigned char ToLower(unsigned char ch) const;
    inline unsigned char ToTitle(unsigned char ch) const;

    // Forwards unicode[ch] to the global ::ToDigit.
    inline int ToDigit(unsigned char ch) const {
        return ::ToDigit(unicode[ch]);
    }

    // Defined out of line; what it sets up is not visible from this header.
    static void Initialize();

    // True when DefaultChar is set.
    inline bool SingleByteCodepage() const {
        return DefaultChar != nullptr;
    }
    // True for single-byte codepages and for CODES_UTF8.
    inline bool NativeCodepage() const {
        return SingleByteCodepage() || CPEnum == CODES_UTF8;
    }
};

// Forward declaration; NCodepagePrivate::TCodepagesMap grants this class friend access.
class TCodePageHash;

namespace NCodepagePrivate {
    class TCodepagesMap {
    private:
        static const int DataShift = 2;
        static const int DataSize = CODES_MAX + DataShift;
        const CodePage* Data[DataSize];

    private:
        inline const CodePage* GetPrivate(ECharset e) const {
            Y_ASSERT(e + DataShift >= 0 && e + DataShift < DataSize); 
            return Data[e + DataShift];
        }

        void SetData(const CodePage* cp);

    public:
        TCodepagesMap();

        inline const CodePage* Get(ECharset e) const {
            const CodePage* res = GetPrivate(e);
            if (!res->SingleByteCodepage()) {
                ythrow yexception() << "CodePage (" << (int)e << ") structure can only be used for single byte encodings"; 
            }

            return res;
        }

        inline bool SingleByteCodepage(ECharset e) const {
            return GetPrivate(e)->SingleByteCodepage();
        }
        inline bool NativeCodepage(ECharset e) const {
            return GetPrivate(e)->NativeCodepage();
        }
        inline const char* NameByCharset(ECharset e) const {
            return GetPrivate(e)->Names[0];
        }

        static const TCodepagesMap& Instance();

        friend class ::TCodePageHash;
    };

    inline bool NativeCodepage(ECharset e) {
        return ::NCodepagePrivate::TCodepagesMap::Instance().NativeCodepage(e);
    }
}

// Tells whether the codepage registered for `e` is a single byte one.
inline bool SingleByteCodepage(ECharset e) {
    const auto& codepages = ::NCodepagePrivate::TCodepagesMap::Instance();
    return codepages.SingleByteCodepage(e);
}

// True when `e` lies in the half-open range [0, CODES_MAX).
inline bool ValidCodepage(ECharset e) {
    const bool outOfRange = e < 0 || e >= CODES_MAX;
    return !outOfRange;
}

// Returns the CodePage for `e` through TCodepagesMap::Get, which throws
// yexception for charsets that are not single byte.
inline const CodePage* CodePageByCharset(ECharset e) {
    const auto& codepages = ::NCodepagePrivate::TCodepagesMap::Instance();
    return codepages.Get(e);
}

// Looks up a charset by name (defined out of line). CodePageByName() in this
// header treats a CODES_UNKNOWN result as "no such charset".
ECharset CharsetByName(TStringBuf name);

// Same as CharsetByName, but throws yexception() if the name is invalid.
ECharset CharsetByNameOrDie(TStringBuf name);

// Returns the charset id stored in the code page. `CP` must not be null.
inline ECharset CharsetByCodePage(const CodePage* CP) {
    const CodePage& page = *CP;
    return page.CPEnum;
}

// Returns the first (preferred) name of charset `e`. No range check is made
// here beyond the assertion inside the map lookup.
inline const char* NameByCharset(ECharset e) {
    const auto& codepages = ::NCodepagePrivate::TCodepagesMap::Instance();
    return codepages.NameByCharset(e);
}

// Range-checked variant of NameByCharset: throws yexception unless
// CODES_UNKNOWN < e < CODES_MAX.
inline const char* NameByCharsetSafe(ECharset e) {
    const bool known = CODES_UNKNOWN < e && e < CODES_MAX;
    if (!known) {
        ythrow yexception() << "unknown encoding: " << (int)e;
    }
    return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e);
}

// Returns the first entry of the code page's name list. `CP` must not be null.
inline const char* NameByCodePage(const CodePage* CP) {
    const char* const* names = CP->Names;
    return names[0];
}

// Resolves a charset name to its CodePage. Returns nullptr when the name is
// unknown; otherwise defers to CodePageByCharset (which throws for charsets
// that are not single byte).
inline const CodePage* CodePageByName(const char* name) {
    const ECharset charset = CharsetByName(name);
    return charset == CODES_UNKNOWN ? nullptr : CodePageByCharset(charset);
}

// Defined out of line.
// NOTE(review): presumably maps an encoding-hint name to an ECharset; how it
// differs from CharsetByName is not visible in this header -- confirm.
ECharset EncodingHintByName(const char* name);

/*****************************************************************\
*                    struct Encoder                               *
\*****************************************************************/
// Maps wchar32 values to single bytes through a two-level table:
// Table[high byte][low byte] for values up to 0xFFFF.
struct Encoder {
    char* Table[256];
    const char* DefaultChar;
    char* DefaultPlane;

    // Raw table lookup. Yields 0 for anything above 0xFFFF; for other values
    // it returns whatever the table holds.
    char Code(wchar32 ch) const {
        if (ch > 0xFFFF) {
            return 0;
        }
        const char* const plane = Table[(ch >> 8) & 0xFF];
        return static_cast<unsigned char>(plane[ch & 0xFF]);
    }

    // Like Code(), but when the lookup gives 0 for a non-zero `ch` the result
    // is taken from DefaultChar, indexed by the character's Unicode type.
    char Tr(wchar32 ch) const {
        const char direct = Code(ch);
        const bool unmapped = direct == 0 && ch != 0;
        const char result = unmapped ? DefaultChar[NUnicode::CharType(ch)] : direct;
        Y_ASSERT(result != 0 || ch == 0);
        return result;
    }

    // Same conversion as Tr(ch), returned as unsigned char.
    unsigned char operator[](wchar32 ch) const {
        return Tr(ch);
    }

    // Buffer variants, defined out of line.
    // NOTE(review): the overload without `len` presumably stops at a
    // terminating zero -- confirm against the definition.
    void Tr(const wchar32* in, char* out, size_t len) const;
    void Tr(const wchar32* in, char* out) const;
};

/*****************************************************************\
*                    struct Recoder                               *
\*****************************************************************/
// Byte-to-byte translation table with helpers to build and apply it.
struct Recoder {
    // NOTE(review): 257 entries although the inline accessors only index by
    // unsigned char; the purpose of the extra slot is not visible here.
    unsigned char Table[257];

    // Table builders, defined out of line.
    void Create(const CodePage& source, const CodePage& target);
    void Create(const CodePage& source, const Encoder* wideTarget);

    void Create(const CodePage& page, wchar32 (*mapper)(wchar32));
    void Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapper)(wchar32));

    // Translates a single byte through the table.
    unsigned char Tr(unsigned char c) const {
        const unsigned char mapped = Table[c];
        return mapped;
    }
    // Subscript form of Tr(unsigned char).
    unsigned char operator[](unsigned char c) const {
        return Tr(c);
    }

    // Buffer variants, defined out of line: in -> out, and in place.
    void Tr(const char* in, char* out, size_t len) const;
    void Tr(const char* in, char* out) const;
    void Tr(char* in_out, size_t len) const;
    void Tr(char* in_out) const;
};

// Reference to an Encoder defined elsewhere.
// NOTE(review): by its name, the wide-char -> Yandex codepage encoder -- confirm.
extern const struct Encoder& WideCharToYandex;

// Declared ahead of its inline definition further down so that
// NCodepagePrivate::TCodePageData can name it in a friend declaration.
const Encoder& EncoderByCharset(ECharset enc);

namespace NCodepagePrivate {
    // Holder for the static codepage / recoder / encoder tables. Everything is
    // private; access is granted only to the friends listed at the bottom.
    // The arrays are declared without a bound here and defined elsewhere.
    class TCodePageData {
    private:
        // NOTE(review): presumably the list of every known code page -- confirm
        // against the definition.
        static const CodePage* const AllCodePages[];

        // NOTE(review): by their names, recoders to and from the Yandex
        // codepage; how they are indexed is not visible in this header.
        static const Recoder rcdr_to_yandex[];
        static const Recoder rcdr_from_yandex[];
        // Case-mapping recoders; the inline CodePage::ToLower / ToUpper /
        // ToTitle(unsigned char) definitions index these by CodePage::CPEnum.
        static const Recoder rcdr_to_lower[];
        static const Recoder rcdr_to_upper[];
        static const Recoder rcdr_to_title[];

        // Indexed by ECharset in the inline EncoderByCharset() definition.
        static const Encoder* const EncodeTo[];

        friend struct ::CodePage;
        friend class TCodepagesMap;
        friend RECODE_RESULT _recodeToYandex(ECharset, const char*, char*, size_t, size_t, size_t&, size_t&);
        friend RECODE_RESULT _recodeFromYandex(ECharset, const char*, char*, size_t, size_t, size_t&, size_t&);
        friend const Encoder& ::EncoderByCharset(ECharset enc);
    };
}

// Returns the Encoder registered for `enc`; throws yexception when `enc` is
// not a single byte codepage.
inline const Encoder& EncoderByCharset(ECharset enc) {
    if (SingleByteCodepage(enc)) {
        const Encoder* const encoder = NCodepagePrivate::TCodePageData::EncodeTo[enc];
        return *encoder;
    }

    ythrow yexception() << "Encoder structure can only be used for single byte encodings";
}

inline unsigned char CodePage::ToUpper(unsigned char ch) const {
    return NCodepagePrivate::TCodePageData::rcdr_to_upper[CPEnum].Table[ch];
}
inline unsigned char CodePage::ToLower(unsigned char ch) const {
    return NCodepagePrivate::TCodePageData::rcdr_to_lower[CPEnum].Table[ch];
}
inline unsigned char CodePage::ToTitle(unsigned char ch) const {
    return NCodepagePrivate::TCodePageData::rcdr_to_title[CPEnum].Table[ch];
}

// Reference to a CodePage defined elsewhere; used as the default `cp`
// argument of the in-place ToLower / ToUpper(char*, size_t) helpers in this header.
extern const CodePage& csYandex;

/// These functions may move [end] backwards (lower it) when the encoding is UTF-8.
/// A terminating null character is NOT assumed to be present, nor written, at [*end].
void DecodeUnknownPlane(wchar16* start, wchar16*& end, const ECharset enc4unk);
void DecodeUnknownPlane(wchar32* start, wchar32*& end, const ECharset enc4unk);

// Lower-cases the first `n` bytes of `s` in place using `cp`
// (csYandex when no code page is given).
inline void ToLower(char* s, size_t n, const CodePage& cp = csYandex) {
    for (size_t i = 0; i < n; ++i) {
        s[i] = cp.ToLower(s[i]);
    }
}

// Upper-cases the first `n` bytes of `s` in place using `cp`
// (csYandex when no code page is given).
inline void ToUpper(char* s, size_t n, const CodePage& cp = csYandex) {
    for (size_t i = 0; i < n; ++i) {
        s[i] = cp.ToUpper(s[i]);
    }
}

// Returns a copy of `s` in which the characters selected by (pos, n) have
// been lower-cased with `cp`; the range is handled by TString::Transform.
inline TString ToLower(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
    auto lowerByte = [&cp](size_t, char c) {
        return cp.ToLower(c);
    };
    s.Transform(lowerByte, pos, n);
    return s;
}

// Returns a copy of `s` in which the characters selected by (pos, n) have
// been upper-cased with `cp`; the range is handled by TString::Transform.
inline TString ToUpper(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
    auto upperByte = [&cp](size_t, char c) {
        return cp.ToUpper(c);
    };
    s.Transform(upperByte, pos, n);
    return s;
}

// Returns a copy of `s` in which, over the range selected by (pos, n), the
// character whose index equals `pos` is title-cased and every other one is
// lower-cased, all with `cp`.
inline TString ToTitle(TString s, const CodePage& cp, size_t pos = 0, size_t n = TString::npos) {
    auto titleThenLower = [&cp, pos](size_t i, char c) {
        if (i != pos) {
            return cp.ToLower(c);
        }
        return cp.ToTitle(c);
    };
    s.Transform(titleThenLower, pos, n);
    return s;
}