aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/charset/recyr.hh
blob: 7362c8940a427fbaf6a843a0691b75885cc12ba4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#pragma once

#include <cstdlib>

#include <util/charset/recode_result.h>
#include <util/generic/ptr.h> 
#include <util/generic/yexception.h>
 
#include "codepage.h" 
#include "doccodes.h" 
#include "iconv.h" 
#include "recyr_int.hh"

///////////////////////////////////////////////////////////////////////////////////////
//     input buf -> output buf                                                       //
///////////////////////////////////////////////////////////////////////////////////////
/**
 * Convert bytes in charset @c from into wide characters.
 * Rejects one-byte TCharType at compile time, then hands every argument
 * unchanged to NCodepagePrivate::_recodeToUnicode.
 * @param[in]  from       charset of the input bytes
 * @param[in]  in         input buffer
 * @param[out] out        destination wide-character buffer
 * @param[in]  inSize     size of @c in
 * @param[in]  outSize    capacity of @c out
 * @param[out] inRead     passed through to the implementation (presumably the amount of input consumed)
 * @param[out] outWritten passed through to the implementation (presumably the amount of output produced)
 * @return the result code reported by the implementation
 */
template <class TCharType>
inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
    static_assert(sizeof(TCharType) > 1, "expect wide character type");

    const RECODE_RESULT status = NCodepagePrivate::_recodeToUnicode(from, in, out, inSize, outSize, inRead, outWritten);
    return status;
}

/**
 * Convert wide characters into bytes of charset @c to.
 * Rejects one-byte TCharType at compile time, then hands every argument
 * unchanged to NCodepagePrivate::_recodeFromUnicode.
 * @param[in]  to         charset of the produced bytes
 * @param[in]  in         input wide-character buffer
 * @param[out] out        destination byte buffer
 * @param[in]  inSize     size of @c in
 * @param[in]  outSize    capacity of @c out
 * @param[out] inRead     passed through to the implementation (presumably the amount of input consumed)
 * @param[out] outWritten passed through to the implementation (presumably the amount of output produced)
 * @return the result code reported by the implementation
 */
template <class TCharType>
inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
    static_assert(sizeof(TCharType) > 1, "expect wide character type");

    const RECODE_RESULT status = NCodepagePrivate::_recodeFromUnicode(to, in, out, inSize, outSize, inRead, outWritten);
    return status;
}

/**
 * Convert a single code point into bytes of charset @c to by forwarding to the
 * rune overload of NCodepagePrivate::_recodeFromUnicode.
 * @param[in]  to         charset of the produced bytes
 * @param[in]  rune       the code point to encode
 * @param[out] out        destination byte buffer
 * @param[in]  outSize    capacity of @c out
 * @param[out] outWritten passed through to the implementation (presumably the amount of output produced)
 * @return the result code reported by the implementation
 */
inline RECODE_RESULT RecodeFromUnicode(ECharset to, wchar32 rune, char* out, size_t outSize, size_t& outWritten) {
    const RECODE_RESULT status = NCodepagePrivate::_recodeFromUnicode(to, rune, out, outSize, outWritten);
    return status;
}

/**
 * Convenience form of RecodeToUnicode for callers that do not need the
 * consumed/produced counters: supplies throw-away counters and returns only
 * the result code.
 * @param[in]  from    charset of the input bytes
 * @param[in]  in      input buffer
 * @param[out] out     destination wide-character buffer
 * @param[in]  inSize  size of @c in
 * @param[in]  outSize capacity of @c out
 * @return the result code of the full RecodeToUnicode overload
 */
template <class TCharType>
inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize) {
    // Counters are required by the full overload but discarded here.
    size_t consumed = 0;
    size_t produced = 0;

    const RECODE_RESULT status = RecodeToUnicode(from, in, out, inSize, outSize, consumed, produced);
    return status;
}
 
/**
 * Convenience form of RecodeFromUnicode for callers that do not need the
 * consumed/produced counters: supplies throw-away counters and returns only
 * the result code.
 * @param[in]  to      charset of the produced bytes
 * @param[in]  in      input wide-character buffer
 * @param[out] out     destination byte buffer
 * @param[in]  inSize  size of @c in
 * @param[in]  outSize capacity of @c out
 * @return the result code of the full RecodeFromUnicode overload
 */
template <class TCharType>
inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize) {
    // Counters are required by the full overload but discarded here.
    size_t consumed = 0;
    size_t produced = 0;

    const RECODE_RESULT status = RecodeFromUnicode(to, in, out, inSize, outSize, consumed, produced);
    return status;
}
 
/**
 * Convert a wchar16 buffer into bytes of @c theEncoding, reporting the
 * counters through optional pointers instead of references.
 * @param[in]  theEncoding charset of the produced bytes
 * @param[in]  chars       input wide-character buffer
 * @param[in]  length      size of @c chars
 * @param[out] bytes       destination byte buffer
 * @param[in]  size        capacity of @c bytes
 * @param[out] read        if non-null, receives the input counter reported by the conversion
 * @param[out] written     if non-null, receives the output counter reported by the conversion
 * @return the result code of the reference-based RecodeFromUnicode overload
 */
inline RECODE_RESULT RecodeFromUnicode(ECharset theEncoding, const wchar16* chars, size_t length,
                                       char* bytes, size_t size, size_t* read = nullptr, size_t* written = nullptr) {
    size_t consumed = 0;
    size_t produced = 0;
    const RECODE_RESULT status = ::RecodeFromUnicode(theEncoding, chars, bytes, length, size, consumed, produced);

    // Counters are copied out regardless of the result code.
    if (read != nullptr) {
        *read = consumed;
    }
    if (written != nullptr) {
        *written = produced;
    }
    return status;
}

/**
 * Recode a byte buffer from charset @c from to charset @c to.
 *
 * Strategy, in order:
 *  1. either charset fails ValidCodepage -> RECODE_ERROR;
 *  2. identical charsets -> NCodepagePrivate::_recodeCopy;
 *  3. both charsets native and one side is UTF-8 or Yandex -> the matching
 *     dedicated NCodepagePrivate routine;
 *  4. not both native, and NICONVPrivate::CanConvert agrees -> NICONVPrivate::RecodeNoThrow;
 *  5. everything else -> two-step conversion through a temporary wchar16 buffer.
 *
 * @param[in]  from       source charset
 * @param[in]  to         target charset
 * @param[in]  in         input buffer
 * @param[out] out        destination buffer
 * @param[in]  inSize     size of @c in
 * @param[in]  outSize    capacity of @c out
 * @param[out] inRead     zeroed on entry, then filled by the chosen conversion routine
 * @param[out] outWritten zeroed on entry, then filled by the chosen conversion routine
 * @return RECODE_ERROR for an invalid charset, otherwise the result code of the routine used
 */
inline RECODE_RESULT Recode(ECharset from, ECharset to, const char* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
    // Reset the counters up front so they are defined on every exit path.
    inRead = 0;
    outWritten = 0;

    const bool bothValid = ValidCodepage(to) && ValidCodepage(from);
    if (!bothValid) {
        return RECODE_ERROR;
    }

    if (from == to) {
        return NCodepagePrivate::_recodeCopy(in, out, inSize, outSize, inRead, outWritten);
    }

    const bool bothNative = NCodepagePrivate::NativeCodepage(from) && NCodepagePrivate::NativeCodepage(to);
    if (bothNative) {
        // Dedicated routines exist only when one side is UTF-8 or Yandex;
        // any other native pair drops through to the wide-character fallback.
        if (from == CODES_UTF8) {
            return NCodepagePrivate::_recodeFromUTF8(to, in, out, inSize, outSize, inRead, outWritten);
        }
        if (to == CODES_UTF8) {
            return NCodepagePrivate::_recodeToUTF8(from, in, out, inSize, outSize, inRead, outWritten);
        }
        if (from == CODES_YANDEX) {
            return NCodepagePrivate::_recodeFromYandex(to, in, out, inSize, outSize, inRead, outWritten);
        }
        if (to == CODES_YANDEX) {
            return NCodepagePrivate::_recodeToYandex(from, in, out, inSize, outSize, inRead, outWritten);
        }
    } else if (NICONVPrivate::CanConvert(from, to)) {
        return NICONVPrivate::RecodeNoThrow(from, to, in, out, inSize, outSize, inRead, outWritten);
    }

    // Fallback: decode into a scratch wchar16 buffer (three elements per input
    // unit), then encode that buffer into the target charset.
    const size_t scratchCapacity = inSize * 3;
    TArrayHolder<wchar16> scratch(new wchar16[scratchCapacity]);

    size_t scratchFilled = 0;
    const RECODE_RESULT decodeStatus = RecodeToUnicode(from, in, scratch.Get(), inSize, scratchCapacity, inRead, scratchFilled);
    if (decodeStatus != RECODE_OK) {
        return decodeStatus;
    }

    // The amount of scratch consumed by the encode step is not reported to the
    // caller; inRead keeps the value produced by the decode step.
    size_t scratchConsumed = 0;
    return RecodeFromUnicode(to, scratch.Get(), out, scratchFilled, outSize, scratchConsumed, outWritten);
}

/**
 * Convenience form of the buffer-to-buffer Recode for callers that do not need
 * the consumed/produced counters.
 * @param[in]  from    source charset
 * @param[in]  to      target charset
 * @param[in]  in      input buffer
 * @param[out] out     destination buffer
 * @param[in]  inSize  size of @c in
 * @param[in]  outSize capacity of @c out
 * @return the result code of the full Recode overload
 */
inline RECODE_RESULT Recode(ECharset from, ECharset to, const char* in, char* out, size_t inSize, size_t outSize) {
    // Counters are required by the full overload but discarded here.
    size_t consumed = 0;
    size_t produced = 0;

    const RECODE_RESULT status = Recode(from, to, in, out, inSize, outSize, consumed, produced);
    return status;
}
 
/**
 * Recode a string from one charset to another into a TString.
 *
 * The output is sized for the worst case before converting: the input length
 * when the target is a single-byte codepage, three times the input length
 * otherwise. After conversion it is cut down to the number of bytes written.
 *
 * Throws if the underlying conversion does not return RECODE_OK, or if it
 * reports more bytes written than the buffer holds.
 *
 * @param[in] from the source character set
 * @param[in] to the target character set
 * @param[in] in    the input string buffer
 * @param[out] out  receives the converted text; cleared and overwritten whenever
 *                  a conversion is attempted, untouched when the charsets match
 * @return false if conversion was not attempted (charsets were the same),
 *         true if successful
 */
inline bool Recode(ECharset from, ECharset to, const TStringBuf& in, TString& out) {
    if (from == to) {
        return false;
    }

    const size_t srcLen = in.length();
    const size_t growth = SingleByteCodepage(to) ? 1 : 3;
    const size_t capacity = growth * srcLen;

    // Empty the string first so that resizing has no old contents to carry over.
    out.clear();
    out.ReserveAndResize(capacity);

    size_t consumed = 0;
    size_t produced = 0;
    const RECODE_RESULT res = Recode(from, to, in.data(), out.begin(), srcLen, capacity, consumed, produced);
    Y_ENSURE(RECODE_OK == res, "Recode failed. ");

    // Sanity check on the reported output length before trimming to it.
    if (produced > capacity) {
        ythrow yexception() << "Recode overrun the buffer: size="
                            << capacity << " need=" << produced;
    }

    out.remove(produced);
    return true;
}

///////////////////////////////////////////////////////////////////////////////////////
//     TString -> TString                                                              //
///////////////////////////////////////////////////////////////////////////////////////
/**
 * Recode a string and return the result by value.
 * @param[in] from the source character set
 * @param[in] to   the target character set
 * @param[in] in   the input string
 * @return the converted string, or a copy of @c in when the charsets are the
 *         same (or the in-place overload reports that nothing was converted)
 */
inline TString Recode(ECharset from, ECharset to, const TString& in) {
    if (to == from) {
        return in;
    }

    TString converted;
    if (Recode(from, to, in, converted)) {
        return converted;
    }
    return in;
}
/**
 * Recode a string from @c from into CODES_YANDEX.
 * @param[in] from the source character set
 * @param[in] in   the input string
 * @return the result of Recode(from, CODES_YANDEX, in)
 */
inline TString RecodeToYandex(ECharset from, const TString& in) {
    const ECharset target = CODES_YANDEX;
    return Recode(from, target, in);
}
/**
 * Recode a string from CODES_YANDEX into @c to.
 * @param[in] to the target character set
 * @param[in] in the input string
 * @return the result of Recode(CODES_YANDEX, to, in)
 */
inline TString RecodeFromYandex(ECharset to, const TString& in) {
    const ECharset source = CODES_YANDEX;
    return Recode(source, to, in);
}

/**
 * Recode a string in charset @c from using NCodepagePrivate::_recodeToHTMLEntities.
 *
 * The output buffer is first sized at 8 bytes per input byte. If the
 * conversion reports RECODE_EOOUTPUT, it is retried once with 12 bytes per
 * input byte.
 *
 * @param[in] from the source character set
 * @param[in] in   the input string
 * @return the converted string
 * @throws yexception if the conversion does not finish with RECODE_OK
 */
inline TString RecodeToHTMLEntities(ECharset from, const TString& in) {
    // Zero-initialised so the counters are defined even if the implementation
    // returns without assigning them.
    size_t inRead = 0;
    size_t outWritten = 0;

    TString out;
    out.resize(in.length() * (4 + 4));
    RECODE_RESULT res = NCodepagePrivate::_recodeToHTMLEntities(from, in.c_str(), out.begin(), in.length(), out.length(), inRead, outWritten);
    if (res == RECODE_EOOUTPUT) { // input contains many 8-byte characters?
        out.resize(in.length() * (4 + 8));
        res = NCodepagePrivate::_recodeToHTMLEntities(from, in.c_str(), out.begin(), in.length(), out.length(), inRead, outWritten);
    }
    if (res != RECODE_OK) {
        ythrow yexception() << "Recode to HTML entities failed";
    }

    // NOTE(review): the "- 1" presumably drops a trailing terminator that the
    // implementation counts in outWritten - confirm against _recodeToHTMLEntities.
    // Clamp at zero so a zero count cannot wrap around to SIZE_MAX.
    out.resize(outWritten > 0 ? outWritten - 1 : 0);
    return out;
}