aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/unicode/normalization/custom_encoder.cpp
blob: c6f186405f13d44d331aa95b49104c6c2b749b32 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11









                                                                                            
                                                    



                                 
                                           
                                                                                        



                                                                                         


         
                                                 
        
                                                                                                                                                                                                                   





                                                                    
                     



                                                   
                                                




                                                      
                                                        



                  
                                         
                           
                                                     
                                                                            
                                              
                                 
                                                                                                                  









                                                     
                              
         
                          
 
#include "custom_encoder.h"
#include "normalization.h"

#include <util/string/cast.h>
#include <util/stream/output.h>

// Registers `code` as the single-byte encoding of the code point `ucode`.
//
// The lookup table has two levels: the high byte of the code point selects one
// of the planes in Table, the low byte selects the slot inside that plane.
// A plane that still aliases the shared DefaultPlane is replaced by a private,
// zero-filled 256-byte array before anything is stored in it.
//
// A slot holding 0 is treated as free and receives `code`. An occupied slot is
// never overwritten; if its current value is above 127 and `code` is non-zero,
// a warning naming the code page is written to Cerr.
//
// ucode  - code point to register; values above U+FFFF are ignored (see below).
// code   - byte that should encode `ucode`.
// target - code page; only used for the name printed in the warning.
void TCustomEncoder::addToTable(wchar32 ucode, unsigned char code, const CodePage* target) {
    // Both indices are a single byte wide, so the table can only describe
    // U+0000..U+FFFF. Truncating a larger code point would store its encoding
    // in the slot of an unrelated character, so such input is skipped.
    if (ucode > 0xFFFF) {
        return;
    }

    const auto plane = static_cast<unsigned char>(ucode >> 8);
    const auto pos = static_cast<unsigned char>(ucode & 0xFF);

    if (Table[plane] == DefaultPlane) {
        // First write into this plane: give it its own zero-initialised storage.
        Table[plane] = new char[256]();
    }

    if (Table[plane][pos] == 0) {
        Table[plane][pos] = static_cast<char>(code);
        return;
    }

    // The slot is already taken; keep the existing encoding.
    Y_ASSERT(target && *target->Names);
    const auto existing = static_cast<unsigned char>(Table[plane][pos]);
    if (existing > 127 && code) {
        Cerr << "WARNING: Only lower part of ASCII should have duplicate encodings "
             << target->Names[0]
             << " " << IntToString<16>(ucode)
             << " " << IntToString<16>(code)
             << " " << IntToString<16>(existing)
             << Endl;
    }
}

// Tells whether `decomp` is an acceptable stand-in for `rune`.
//
// The pair is accepted when any one of the following holds, checked in order:
//   - NUnicode::NPrivate::CharInfo() returns equal values for both;
//   - both satisfy IsAlpha();
//   - both satisfy IsNumeric();
//   - both satisfy IsQuotation().
bool isGoodDecomp(wchar32 rune, wchar32 decomp) {
    if (NUnicode::NPrivate::CharInfo(rune) == NUnicode::NPrivate::CharInfo(decomp)) {
        return true;
    }
    if (IsAlpha(rune) && IsAlpha(decomp)) {
        return true;
    }
    if (IsNumeric(rune) && IsNumeric(decomp)) {
        return true;
    }
    return IsQuotation(rune) && IsQuotation(decomp);
}

// Builds the encoding table for the code page `target`.
//
// Steps:
//   1. DefaultChar is taken from the code page.
//   2. A zero-filled DefaultPlane is allocated and all 256 entries of Table are
//      pointed at it.
//   3. For every byte value i in [0, 256) whose target->unicode[i] is not
//      BROKEN_RUNE, that code point is registered with encoding i.
//   4. Only when `extended` is true: every code point w in [1, 65535) that still
//      has no encoding is followed through NUnicode::Decomposition<true>() until
//      a code point is reached that is either encodable or no longer composed.
//      If the result is encodable and isGoodDecomp(w, result) accepts it, w is
//      registered with the same encoding.
//
// NOTE(review): this function allocates unconditionally and does not release
// tables left over from an earlier call, so it appears to be meant to run once
// per object - confirm against callers.
void TCustomEncoder::Create(const CodePage* target, bool extended) {
    Y_ASSERT(target);

    DefaultChar = (const char*)target->DefaultChar;

    // Value-initialisation zero-fills the shared "nothing encoded here" plane.
    DefaultPlane = new char[256]();
    for (size_t i = 0; i != 256; ++i)
        Table[i] = DefaultPlane;

    for (size_t i = 0; i != 256; ++i) {
        const wchar32 ucode = target->unicode[i];
        if (ucode != BROKEN_RUNE) // always UNASSIGNED
            addToTable(ucode, (unsigned char)i, target);
    }

    if (!extended)
        return;

    for (wchar32 w = 1; w < 65535; w++) {
        if (Code(w) != 0)
            continue; // already encodable, nothing to derive

        wchar32 dw = w;
        while (IsComposed(dw) && Code(dw) == 0) {
            const wchar32* decomp_p = NUnicode::Decomposition<true>(dw);
            Y_ASSERT(decomp_p != nullptr);
            // The assertion may be compiled out; never dereference a missing
            // decomposition. dw keeps its unencodable value, so w is skipped.
            if (decomp_p == nullptr)
                break;

            dw = decomp_p[0];
            // A leading space or opening parenthesis is skipped in favour of
            // the character that follows it, when there is one.
            if (std::char_traits<wchar32>::length(decomp_p) > 1 && (dw == (wchar32)' ' || dw == (wchar32)'('))
                dw = decomp_p[1];
        }

        if (Code(dw) != 0 && isGoodDecomp(w, dw))
            addToTable(w, Code(dw), target);
    }
}

// Releases the table storage: every plane that was given its own array is
// deleted, then the shared DefaultPlane itself. Planes that still alias
// DefaultPlane are skipped so that it is deleted exactly once.
TCustomEncoder::~TCustomEncoder() {
    for (size_t idx = 0; idx < 256; ++idx) {
        if (Table[idx] == DefaultPlane) {
            continue;
        }
        delete[] Table[idx];
    }
    delete[] DefaultPlane;
}