aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/unicode/normalization/custom_encoder.cpp
blob: f164a53f3bda2878519250de51b301194b3ab38b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#include "custom_encoder.h" 
#include "normalization.h" 
 
#include <util/string/cast.h> 
#include <util/stream/output.h> 
 
// Registers the mapping "Unicode code point `ucode` -> single byte `code`" in
// the two-level lookup table (Table[high byte][low byte]).
//
// - `ucode`  : code point to register; only values up to 0xFFFF fit the table.
// - `code`   : byte to store for that code point (0 is the "unmapped" marker).
// - `target` : code page being built; used only to name it in the warning.
//
// The first mapping written to a slot wins. A later attempt to map the same
// code point is ignored, and a warning is printed to Cerr when the byte already
// stored is above 127 and the new byte is non-zero.
void TCustomEncoder::addToTable(wchar32 ucode, unsigned char code, const CodePage* target) {
    // The table is addressed by two 8-bit indices, so it can represent at most
    // 16 bits of code point. Without this guard the high bits would be dropped
    // by the narrowing casts below and the entry would land in an unrelated
    // slot (e.g. 0x1F600 would overwrite the slot of 0xF600).
    if (ucode > 0xFFFF) {
        return;
    }

    const unsigned char plane = static_cast<unsigned char>(ucode >> 8);
    const unsigned char pos = static_cast<unsigned char>(ucode & 255);

    // Planes start out aliasing the shared all-zero DefaultPlane; give this
    // plane private storage before the first write into it.
    if (Table[plane] == DefaultPlane) {
        Table[plane] = new char[256];
        memset(Table[plane], 0, 256 * sizeof(char));
    }

    if (Table[plane][pos] == 0) {
        Table[plane][pos] = code;
    } else {
        // Slot already taken: keep the existing byte and only report.
        Y_ASSERT(target && *target->Names);
        if (static_cast<unsigned char>(Table[plane][pos]) > 127 && code) {
            Cerr << "WARNING: Only lower part of ASCII should have duplicate encodings "
                 << target->Names[0]
                 << " " << IntToString<16>(ucode)
                 << " " << IntToString<16>(code)
                 << " " << IntToString<16>(static_cast<unsigned char>(Table[plane][pos]))
                 << Endl;
        }
    }
}
 
// Tells whether `decomp` is an acceptable substitute for `rune`.
// The pair is accepted when any of the following holds (checked in this order):
//   1. both characters have equal NUnicode::NPrivate::CharInfo values;
//   2. both are alphabetic (IsAlpha);
//   3. both are numeric (IsNumeric);
//   4. both are quotation marks (IsQuotation).
bool isGoodDecomp(wchar32 rune, wchar32 decomp) {
    if (NUnicode::NPrivate::CharInfo(rune) == NUnicode::NPrivate::CharInfo(decomp)) {
        return true;
    }
    if (IsAlpha(rune) && IsAlpha(decomp)) {
        return true;
    }
    if (IsNumeric(rune) && IsNumeric(decomp)) {
        return true;
    }
    if (IsQuotation(rune) && IsQuotation(decomp)) {
        return true;
    }
    return false;
}
 
// Builds the Unicode -> single-byte lookup table for the code page `target`.
//
// - `target`   : code page to encode into; must not be null.
// - `extended` : when false, only the code points listed in target->unicode
//                are mapped. When true, every still-unmapped code point in
//                [1, 65535) is additionally mapped to the byte of a character
//                reached through its decomposition, if one is encodable and
//                passes isGoodDecomp().
//
// NOTE(review): the buffers allocated here are released only in the destructor;
// calling Create() more than once on the same object looks like it would leak
// the previous tables — confirm against callers.
void TCustomEncoder::Create(const CodePage* target, bool extended) {
    Y_ASSERT(target);

    DefaultChar = (const char*)target->DefaultChar;

    // One shared all-zero plane backs every high byte until addToTable()
    // needs to write into it.
    DefaultPlane = new char[256];

    memset(DefaultPlane, 0, 256 * sizeof(char));
    for (size_t i = 0; i != 256; ++i)
        Table[i] = DefaultPlane;

    // Direct mappings: byte i encodes target->unicode[i].
    for (size_t i = 0; i != 256; ++i) {
        wchar32 ucode = target->unicode[i];
        if (ucode != BROKEN_RUNE) // always UNASSIGNED
            addToTable(ucode, (unsigned char)i, target);
    }

    if (!extended)
        return;

    // Extended mappings: approximate unmapped characters by their decomposition.
    for (wchar32 w = 1; w < 65535; w++) {
        if (Code(w) != 0)
            continue; // already encodable

        // Follow the decomposition chain until an encodable character is
        // reached or the character cannot be decomposed any further.
        wchar32 dw = w;
        while (IsComposed(dw) && Code(dw) == 0) {
            const wchar32* decomp_p = NUnicode::Decomposition<true>(dw);
            Y_ASSERT(decomp_p != nullptr);
            // Do not rely on the assertion alone: if it is compiled out, a
            // missing decomposition would be dereferenced below. Stop here
            // instead; dw stays unencodable, so no mapping is added for w.
            if (decomp_p == nullptr)
                break;

            dw = decomp_p[0];
            // Skip a leading space or '(' and use the character after it.
            if (std::char_traits<wchar32>::length(decomp_p) > 1 && (dw == (wchar32)' ' || dw == (wchar32)'('))
                dw = decomp_p[1];
        }
        if (Code(dw) != 0 && isGoodDecomp(w, dw))
            addToTable(w, Code(dw), target);
    }
}
 
// Frees the lookup table: every plane that has its own storage, then the
// shared default plane.
TCustomEncoder::~TCustomEncoder() {
    for (size_t plane = 0; plane < 256; ++plane) {
        // Entries equal to DefaultPlane alias the shared buffer; it is
        // released once, after the loop.
        if (Table[plane] == DefaultPlane) {
            continue;
        }
        delete[] Table[plane];
    }
    delete[] DefaultPlane;
}