library/cpp/charset/wide.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306

#pragma once

#include "codepage.h"
#include "iconv.h"

#include <util/charset/recode_result.h>
#include <util/charset/unidata.h>
#include <util/charset/utf8.h>
#include <util/charset/wide.h>
#include <util/generic/string.h> 
#include <util/generic/algorithm.h>
#include <util/generic/yexception.h>
#include <util/memory/tempbuf.h>
#include <util/system/yassert.h>

//! converts text from unicode to yandex codepage
//! @attention destination buffer must be long enough to fit all characters of the text
//! @note @c dest buffer must fit at least @c len number of characters
template <typename TCharType>
inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) {
    Y_ASSERT(SingleByteCodepage(enc));

    const char* start = dest;

    const Encoder* const encoder = &EncoderByCharset(enc);
    const TCharType* const last = text + len;
    for (const TCharType* cur = text; cur != last; ++dest) {
        *dest = encoder->Tr(ReadSymbolAndAdvance(cur, last));
    }

    return dest - start;
}

//! converts text to unicode using a codepage object
//! @attention destination buffer must be long enough to fit all characters of the text
//! @note @c dest buffer must fit at least @c len number of characters;
//!       if you need convert zero terminated string you should determine length of the
//!       string using the @c strlen function and pass as the @c len parameter;
//!       it does not make sense to create an additional version of this function because
//!       it will call to @c strlen anyway in order to allocate destination buffer
template <typename TCharType>
inline void CharToWide(const char* text, size_t len, TCharType* dest, const CodePage& cp) {
    const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
    const unsigned char* const last = cur + len;
    for (; cur != last; ++cur, ++dest) {
        *dest = static_cast<TCharType>(cp.unicode[*cur]); // static_cast is safe as no 1char codepage contains non-BMP symbols
    }
}

namespace NDetail {
    namespace NBaseOps {
        // Template interface base recoding drivers, do not perform any memory management,
        // do not care about buffer size, so supplied @dst
        // should have enough room for the result (with proper reserve for the worst case)

        // Depending on template params, perform conversion of single-byte/multi-byte/utf8 string to/from wide string.

        template <typename TCharType>
        inline TBasicStringBuf<TCharType> RecodeSingleByteChar(const TStringBuf src, TCharType* dst, const CodePage& cp) {
            Y_ASSERT(cp.SingleByteCodepage());
            ::CharToWide(src.data(), src.size(), dst, cp);
            return TBasicStringBuf<TCharType>(dst, src.size());
        }

        template <typename TCharType>
        inline TStringBuf RecodeSingleByteChar(const TBasicStringBuf<TCharType> src, char* dst, const CodePage& cp) {
            Y_ASSERT(cp.SingleByteCodepage());
            ::WideToChar(src.data(), src.size(), dst, cp.CPEnum);
            return TStringBuf(dst, src.size());
        }

        template <typename TCharType>
        inline TBasicStringBuf<TCharType> RecodeMultiByteChar(const TStringBuf src, TCharType* dst, ECharset encoding) {
            Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
            size_t read = 0;
            size_t written = 0;
            ::NICONVPrivate::RecodeToUnicode(encoding, src.data(), dst, src.size(), src.size(), read, written);
            return TBasicStringBuf<TCharType>(dst, written);
        }

        template <typename TCharType>
        inline TStringBuf RecodeMultiByteChar(const TBasicStringBuf<TCharType> src, char* dst, ECharset encoding) {
            Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
            size_t read = 0;
            size_t written = 0;
            ::NICONVPrivate::RecodeFromUnicode(encoding, src.data(), dst, src.size(), src.size() * 3, read, written);
            return TStringBuf(dst, written);
        }

        template <typename TCharType>
        inline TBasicStringBuf<TCharType> RecodeUtf8(const TStringBuf src, TCharType* dst) {
            size_t len = 0;
            if (!::UTF8ToWide(src.data(), src.size(), dst, len))
                ythrow yexception() << "Invalid UTF8: \"" << src.SubStr(0, 50) << (src.size() > 50 ? "...\"" : "\"");
            return TBasicStringBuf<TCharType>(dst, len);
        }

        template <typename TCharType>
        inline TStringBuf RecodeUtf8(const TBasicStringBuf<TCharType> src, char* dst) {
            size_t len = 0;
            ::WideToUTF8(src.data(), src.size(), dst, len);
            return TStringBuf(dst, len);
        }

        // Select one of re-coding methods from above, based on provided @encoding

        template <typename TCharFrom, typename TCharTo>
        TBasicStringBuf<TCharTo> Recode(const TBasicStringBuf<TCharFrom> src, TCharTo* dst, ECharset encoding) {
            if (encoding == CODES_UTF8)
                return RecodeUtf8(src, dst);
            else if (SingleByteCodepage(encoding))
                return RecodeSingleByteChar(src, dst, *CodePageByCharset(encoding));
            else
                return RecodeMultiByteChar(src, dst, encoding);
        }

    }

    template <typename TCharFrom>
    struct TRecodeTraits;

    template <>
    struct TRecodeTraits<char> {
        using TCharTo = wchar16;
        using TStringBufTo = TWtringBuf;
        using TStringTo = TUtf16String; 
        enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case
                                  // Here an unicode character can be converted up to 4 bytes of UTF8
    };

    template <>
    struct TRecodeTraits<wchar16> {
        using TCharTo = char;
        using TStringBufTo = TStringBuf;
        using TStringTo = TString; 
        enum { ReserveSize = 2 }; // possible surrogate pairs ?
    };

    // Operations with destination buffer where recoded string will be written
    template <typename TResult>
    struct TRecodeResultOps {
        // default implementation will work with TString and TUtf16String - 99% of usage 
        using TResultChar = typename TResult::char_type;

        static inline size_t Size(const TResult& dst) {
            return dst.size();
        }

        static inline TResultChar* Reserve(TResult& dst, size_t len) {
            dst.ReserveAndResize(len);
            return dst.begin();
        }

        static inline void Truncate(TResult& dst, size_t len) {
            dst.resize(len);
        }
    };

    // Main template interface for recoding in both directions

    template <typename TCharFrom, typename TResult>
    typename TRecodeTraits<TCharFrom>::TStringBufTo Recode(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
        using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
        // make enough room for re-coded string
        TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<TCharTo>::ReserveSize);
        // do re-coding
        TBasicStringBuf<TCharTo> res = NBaseOps::Recode(src, dstbuf, encoding);
        // truncate result back to proper size
        TRecodeResultOps<TResult>::Truncate(dst, res.size());
        return res;
    }

    // appending version of Recode()
    template <typename TCharFrom, typename TResult>
    typename TRecodeTraits<TCharFrom>::TStringBufTo RecodeAppend(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
        using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
        size_t dstOrigSize = TRecodeResultOps<TResult>::Size(dst);
        TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, dstOrigSize + src.size() * TRecodeTraits<TCharTo>::ReserveSize);
        TBasicStringBuf<TCharTo> appended = NBaseOps::Recode(src, dstbuf + dstOrigSize, encoding);
        size_t dstFinalSize = dstOrigSize + appended.size();
        TRecodeResultOps<TResult>::Truncate(dst, dstFinalSize);
        return TBasicStringBuf<TCharTo>(dstbuf, dstFinalSize);
    }

    // special implementation for robust utf8 functions
    template <typename TResult>
    TWtringBuf RecodeUTF8Robust(const TStringBuf src, TResult& dst) {
        // make enough room for re-coded string
        wchar16* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<wchar16>::ReserveSize);

        // do re-coding
        size_t written = 0;
        UTF8ToWide<true>(src.data(), src.size(), dstbuf, written);

        // truncate result back to proper size
        TRecodeResultOps<TResult>::Truncate(dst, written);
        return TWtringBuf(dstbuf, written);
    }

    template <typename TCharFrom>
    inline typename TRecodeTraits<TCharFrom>::TStringTo Recode(const TBasicStringBuf<TCharFrom> src, ECharset encoding) {
        typename TRecodeTraits<TCharFrom>::TStringTo res;
        Recode<TCharFrom>(src, res, encoding);
        return res;
    }
}

// Write result into @dst. Return string-buffer pointing to re-coded content of @dst.

template <bool robust>
inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
    if (robust && CODES_UTF8 == encoding)
        return ::NDetail::RecodeUTF8Robust(src, dst);
    return ::NDetail::Recode<char>(src, dst, encoding);
}

inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
    return ::NDetail::Recode<char>(src, dst, encoding);
}

inline TStringBuf WideToChar(const TWtringBuf src, TString& dst, ECharset encoding) {
    return ::NDetail::Recode<wchar16>(src, dst, encoding);
}

//! calls either to @c WideToUTF8 or @c WideToChar depending on the encoding type
inline TString WideToChar(const wchar16* text, size_t len, ECharset enc) { 
    if (NCodepagePrivate::NativeCodepage(enc)) {
        if (enc == CODES_UTF8)
            return WideToUTF8(text, len);

        TString s = TString::Uninitialized(len); 
        s.remove(WideToChar(text, len, s.begin(), enc));

        return s;
    }

    TString s = TString::Uninitialized(len * 3); 

    size_t read = 0;
    size_t written = 0;
    NICONVPrivate::RecodeFromUnicode(enc, text, s.begin(), len, s.size(), read, written);
    s.remove(written);

    return s;
}

inline TUtf16String CharToWide(const char* text, size_t len, const CodePage& cp) { 
    TUtf16String w = TUtf16String::Uninitialized(len); 
    CharToWide(text, len, w.begin(), cp);
    return w;
}

//! calls either to @c UTF8ToWide or @c CharToWide depending on the encoding type
template <bool robust>
inline TUtf16String CharToWide(const char* text, size_t len, ECharset enc) { 
    if (NCodepagePrivate::NativeCodepage(enc)) {
        if (enc == CODES_UTF8)
            return UTF8ToWide<robust>(text, len);

        return CharToWide(text, len, *CodePageByCharset(enc));
    }

    TUtf16String w = TUtf16String::Uninitialized(len * 2); 

    size_t read = 0;
    size_t written = 0;
    NICONVPrivate::RecodeToUnicode(enc, text, w.begin(), len, len, read, written);
    w.remove(written);

    return w;
}

//! converts text from UTF8 to unicode, if conversion fails it uses codepage to convert the text
//! @param text     text to be converted
//! @param len      length of the text in characters
//! @param cp       a codepage that is used in case of failed conversion from UTF8
inline TUtf16String UTF8ToWide(const char* text, size_t len, const CodePage& cp) { 
    TUtf16String w = TUtf16String::Uninitialized(len); 
    size_t written = 0;
    if (UTF8ToWide(text, len, w.begin(), written))
        w.remove(written);
    else
        CharToWide(text, len, w.begin(), cp);
    return w;
}

inline TString WideToChar(const TWtringBuf w, ECharset enc) {
    return WideToChar(w.data(), w.size(), enc);
}

inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
    return CharToWide<false>(s.data(), s.size(), enc);
}

template <bool robust>
inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
    return CharToWide<robust>(s.data(), s.size(), enc);
}

inline TUtf16String CharToWide(const TStringBuf s, const CodePage& cp) {
    return CharToWide(s.data(), s.size(), cp);
}

// true if @text can be fully encoded to specified @encoding,
// with possibility to recover exact original text after decoding
bool CanBeEncoded(TWtringBuf text, ECharset encoding);