// Source path: library/cpp/charset/decodeunknownplane.cpp
// Blob: 201a25cd54bbcb9ed706cb168d298c4564483b1f
#include "ci_string.h"
#include "codepage.h"
#include "recyr.hh"

#include <util/system/hi_lo.h>
#include <util/generic/vector.h>

/**
 * Reads one rune starting at `s`, where UTF-8 bytes may be stored one per
 * character as 0xF0NN values (NN being the byte).
 *
 * @param rune      receives the decoded character; written only on RECODE_OK.
 * @param rune_len  receives the number of input characters consumed
 *                  (0 when an error code is returned).
 * @param s         current input position; must be dereferenceable.
 * @param end       one past the last input character.
 * @return RECODE_OK on success, RECODE_EOINPUT if the sequence would run past
 *         `end`, RECODE_BROKENSYMBOL on a bad lead or continuation byte.
 */
template <typename TxChar>
static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) {
    // Anything outside the 0xF0NN range is an ordinary character: pass it through.
    if ((*s & 0xFF00) != 0xF000) {
        rune_len = 1;
        rune = *s;
        return RECODE_OK;
    }

    rune_len = 0;

    const size_t byteCount = UTF8RuneLen(static_cast<unsigned char>(*s));
    if (s + byteCount > end)
        return RECODE_EOINPUT; // sequence is cut off by the end of input
    if (byteCount == 0)
        return RECODE_BROKENSYMBOL; // invalid lead byte

    wchar32 value = static_cast<ui8>(*s); // single-byte case: [00000000 0XXXXXXX]
    if (byteCount > 1) {
        value &= UTF8LeadByteMask(byteCount);

        // At most three continuation bytes are consumed, whatever length was reported.
        const size_t lastIndex = byteCount < 4 ? byteCount : 4;
        for (size_t i = 1; i < lastIndex; ++i) {
            const wchar32 ch = s[i];
            // A continuation byte 10YYYYYY is stored as 0xF080..0xF0BF.
            if ((ch & 0xFFC0) != 0xF080)
                return RECODE_BROKENSYMBOL; // broken continuation byte
            value <<= 6;
            value |= ch & 0x3F;
        }
    }

    rune_len = byteCount;
    // `rune` may refer to an input slot, so it is stored only after all reads are done.
    if (value > Max<TxChar>()) {
        rune = ' '; // code point does not fit into TxChar
    } else {
        rune = TxChar(value);
    }
    return RECODE_OK;
}

template <typename TxChar>
void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
    TxChar* e = ee;
    if (SingleByteCodepage(enc)) {
        const CodePage* cp = CodePageByCharset(enc);
        for (TxChar* s = str; s < e; s++) {
            if (Hi8(Lo16(*s)) == 0xF0)
                *s = (TxChar)cp->unicode[Lo8(Lo16(*s))]; // NOT mb compliant
        }
    } else if (enc == CODES_UTF8) {
        TxChar* s;
        TxChar* d;

        for (s = d = str; s < e;) {
            size_t l = 0;

            if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) {
                d++, s += l;
            } else {
                *d++ = BROKEN_RUNE;
                ++s;
            }
        }
        e = d;
    } else if (enc == CODES_UNKNOWN) {
        for (TxChar* s = str; s < e; s++) {
            if (Hi8(Lo16(*s)) == 0xF0)
                *s = Lo8(Lo16(*s));
        }
    } else {
        Y_ASSERT(!SingleByteCodepage(enc));

        TxChar* s = str;
        TxChar* d = str;

        TVector<char> buf;

        size_t read = 0;
        size_t written = 0;
        for (; s < e; ++s) {
            if (Hi8(Lo16(*s)) == 0xF0) {
                buf.push_back(Lo8(Lo16(*s)));
            } else {
                if (!buf.empty()) {
                    if (RecodeToUnicode(enc, buf.data(), d, buf.size(), e - d, read, written) == RECODE_OK) {
                        Y_ASSERT(read == buf.size());
                        d += written;
                    } else { // just copying broken symbols
                        Y_ASSERT(buf.size() <= static_cast<size_t>(e - d));
                        Copy(buf.data(), buf.size(), d);
                        d += buf.size();
                    }
                    buf.clear();
                }
                *d++ = *s;
            }
        }
    }
    ee = e;
}

// wchar16 entry point: forwards to the shared template implementation,
// which rewrites [str, ee) in place and updates `ee`.
void DecodeUnknownPlane(wchar16* str, wchar16*& ee, const ECharset enc) {
    DoDecodeUnknownPlane<wchar16>(str, ee, enc);
}
// wchar32 entry point: forwards to the shared template implementation,
// which rewrites [str, ee) in place and updates `ee`.
void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) {
    DoDecodeUnknownPlane<wchar32>(str, ee, enc);
}