aboutsummaryrefslogblamecommitdiffstats
path: root/library/cpp/charset/recyr_int.hh
blob: 353af53305e9c090b31f93c8f2da561be98bfd19 (plain) (tree)
1
2
3
4
5
6
7
8
9
            
 
                                       
                             
                                
                                 
 
                     
                     
                  
 
                            









                                                                                                                                         
     


                                                                                                                                                          
 


                                                                 
 

                                                                 
                                                                                                                 



                                                     
     
 

                                                                                                                                                          
                                                       
                                                   
 


                                                                 
 


                                                                                                       
                                                                                     









                                                                                                        
     
























                                                                                                                                                            
 








                                                                                                                                                       
                                                                





                                                                 
             
         
                                                   
 
                                                                                     
 
                   
 













                                                                                                                                                                    
 







                                                                                                                                                           
 
                                                                                   
                                                                   





                                                                                    
     
                                                                                                           
                                                                                
     
 
                                                            
 
                                          
                                      
      
 
                                          
                                      
      
 

                                                                                                                                                       
 
                                                                                 
 
                                                                                                                                   
 




                                                                                                                                                                  
 


                                                                
 
                                     
 
                                   
 
                         
 





                                                                                                                      
 










                                                                                                
             
                   
     



                                                                                                                                                             
 

















                                                                               
             
                 
         


                                   
     


                                                                                                                                                                  
 
                                                                                                                  
 
                                                                                           
 
                                                                                           
 


                                                                                                                                                                  
 
                                                                                                                  
 
                                                                                                             
 
                                                                                                           
     

                                                                                                                      
 


                                                                                                         
 
                                                                                         
 
                                                                                       
 



                                                                                                                                                                  
 
                                                                                          
 
                                                                                                
 
                                                 
 
 
#pragma once

#include <util/charset/recode_result.h>
#include <util/charset/utf8.h>
#include <util/generic/ptr.h>
#include <util/generic/string.h>
#include <util/system/defaults.h>

#include "codepage.h"
#include "doccodes.h"
#include "iconv.h"
#include "wide.h"

namespace NCodepagePrivate {
    // Pass-through "recoding": copies as many bytes of `in` as fit into `out`.
    //
    // in_readed and out_writed both receive the number of bytes copied
    // (min(in_size, out_size)). Returns RECODE_EOOUTPUT when the input had to be
    // truncated to out_size and RECODE_OK otherwise. The memcpy is skipped when
    // `in` and `out` are the same pointer.
    inline RECODE_RESULT _recodeCopy(const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        const bool truncated = in_size > out_size;
        const size_t count = truncated ? out_size : in_size;

        in_readed = count;
        if (out != in) {
            memcpy(out, in, count);
        }
        out_writed = count;

        return truncated ? RECODE_EOOUTPUT : RECODE_OK;
    }

    // Recodes text in charset `From` into UTF-8.
    //
    // When From is CODES_UTF8 the bytes are copied via _recodeCopy. Otherwise each
    // input byte is mapped through the code page's `unicode` table and written
    // with SafeWriteUTF8Char.
    //
    // in_readed  - number of input bytes whose UTF-8 form was written to `out`.
    // out_writed - number of bytes written to `out`.
    // Returns the status of the last SafeWriteUTF8Char call (RECODE_OK if the
    // input is empty).
    //
    // An input byte is counted as read only after its rune was written
    // successfully. Previously the byte was consumed before the write result was
    // known, so on RECODE_EOOUTPUT in_readed included a byte that never reached
    // the output and a caller resuming from in_readed would lose it.
    inline RECODE_RESULT _recodeToUTF8(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        if (From == CODES_UTF8)
            return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
        const CodePage* cp = CodePageByCharset(From);

        const unsigned char* const in_start = reinterpret_cast<const unsigned char*>(in);
        const unsigned char* const in_end = in_start + in_size;
        unsigned char* const out_start = reinterpret_cast<unsigned char*>(out);
        const unsigned char* const out_end = out_start + out_size;

        const unsigned char* src = in_start;
        unsigned char* dst = out_start;
        size_t rune_len = 0;
        RECODE_RESULT res = RECODE_OK;
        while (src < in_end) {
            res = SafeWriteUTF8Char(cp->unicode[*src], rune_len, dst, out_end);
            if (res != RECODE_OK)
                break; // nothing is consumed for a rune that was not written
            ++src;
            dst += rune_len;
        }
        in_readed = src - in_start;
        out_writed = dst - out_start;
        return res;
    }

    // Recodes UTF-8 text into the single-byte charset `to`.
    //
    // When `to` is CODES_UTF8 the bytes are copied via _recodeCopy. Otherwise each
    // rune read with SafeReadUTF8Char is translated by the charset's Encoder and
    // stored as one output byte. A broken UTF-8 sequence consumes exactly one
    // input byte and still emits one output byte (the encoder's translation of
    // whatever rune SafeReadUTF8Char reported).
    //
    // in_readed  - number of input bytes consumed.
    // out_writed - number of bytes written to `out`.
    // Returns RECODE_EOOUTPUT when the output buffer is exhausted while input
    // remains, otherwise the status of the last SafeReadUTF8Char call
    // (RECODE_OK for empty input).
    //
    // The output bound is checked BEFORE every store. The previous version
    // checked it only after writing and only when the last status was RECODE_OK,
    // so it wrote one byte out of bounds for out_size == 0 and kept writing past
    // the buffer when a broken symbol landed on the buffer boundary.
    inline RECODE_RESULT _recodeFromUTF8(ECharset to, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        if (to == CODES_UTF8)
            return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
        Y_ASSERT(CODES_UNKNOWN < to && to < CODES_MAX);
        const Encoder* enc = &EncoderByCharset(to);

        const unsigned char* const in_start = reinterpret_cast<const unsigned char*>(in);
        const unsigned char* const in_end = in_start + in_size;
        const char* const out_end = out + out_size;

        const unsigned char* src = in_start;
        char* dst = out;
        wchar32 rune = 0;
        size_t rune_len = 0;
        RECODE_RESULT res = RECODE_OK;
        while (src < in_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL)) {
            if (dst >= out_end) {
                // Input remains but there is no room for another byte.
                res = RECODE_EOOUTPUT;
                break;
            }
            res = SafeReadUTF8Char(rune, rune_len, src, in_end);
            if (res == RECODE_BROKENSYMBOL)
                rune_len = 1; // skip a single byte of the malformed sequence
            if (res != RECODE_EOINPUT)
                *dst++ = enc->Tr(rune);
            src += rune_len;
        }
        in_readed = src - in_start;
        out_writed = dst - out;
        return res;
    }

    // Recodes text in charset `From` into CODES_YANDEX.
    //
    // CODES_YANDEX input is copied as is, CODES_UTF8 input goes through
    // _recodeFromUTF8; every other charset is translated byte-for-byte with the
    // rcdr_to_yandex table entry for `From`. In the table-driven case
    // min(in_size, out_size) bytes are processed, both counters receive that
    // number, and RECODE_EOOUTPUT is returned when the output buffer is smaller
    // than the input.
    inline RECODE_RESULT _recodeToYandex(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        if (From == CODES_YANDEX) {
            return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
        } else if (From == CODES_UTF8) {
            return _recodeFromUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed);
        }

        const bool fits = in_size <= out_size;
        in_readed = fits ? in_size : out_size;

        const Recoder& table = NCodepagePrivate::TCodePageData::rcdr_to_yandex[From];
        table.Tr(in, out, in_readed);
        out_writed = in_readed;

        return fits ? RECODE_OK : RECODE_EOOUTPUT;
    }
    // Recodes CODES_YANDEX text into charset `To`.
    //
    // A CODES_YANDEX target is a plain copy, a CODES_UTF8 target goes through
    // _recodeToUTF8; every other charset is translated byte-for-byte with the
    // rcdr_from_yandex table entry for `To`. In the table-driven case
    // min(in_size, out_size) bytes are processed, both counters receive that
    // number, and RECODE_EOOUTPUT is returned when the output buffer is smaller
    // than the input.
    inline RECODE_RESULT _recodeFromYandex(ECharset To, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        if (To == CODES_YANDEX) {
            return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
        } else if (To == CODES_UTF8) {
            return _recodeToUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed);
        }

        const bool fits = in_size <= out_size;
        in_readed = fits ? in_size : out_size;

        const Recoder& table = NCodepagePrivate::TCodePageData::rcdr_from_yandex[To];
        table.Tr(in, out, in_readed);
        out_writed = in_readed;

        return fits ? RECODE_OK : RECODE_EOOUTPUT;
    }

    // Decodes UTF-8 bytes from `in` into wide characters of type TCharType.
    //
    // Runes are read with SafeReadUTF8Char and stored with WriteSymbol. A broken
    // sequence consumes one input byte and still produces one symbol. Input is
    // advanced only after WriteSymbol succeeded, so in_readed never covers a rune
    // that was not stored.
    //
    // in_readed  - number of input bytes consumed.
    // out_writed - number of TCharType units written.
    // Returns RECODE_EOOUTPUT when decoding stopped in an OK/BROKENSYMBOL state
    // with input left over; otherwise the status of the last read.
    template <class TCharType>
    inline RECODE_RESULT _recodeUTF8ToUnicode(const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        const unsigned char* const src_begin = reinterpret_cast<const unsigned char*>(in);
        const unsigned char* const src_end = src_begin + in_size;
        const unsigned char* src = src_begin;
        TCharType* dst = out;
        const TCharType* dst_end = out + out_size;
        size_t rune_len;
        wchar32 rune;
        RECODE_RESULT status = RECODE_OK;

        for (;;) {
            const bool may_continue = (status == RECODE_OK || status == RECODE_BROKENSYMBOL);
            if (!may_continue || src >= src_end || dst >= dst_end) {
                break;
            }

            status = SafeReadUTF8Char(rune, rune_len, src, src_end);
            if (status != RECODE_OK && status != RECODE_BROKENSYMBOL) {
                break; // any other status terminates decoding
            }

            // A malformed sequence is skipped one byte at a time.
            const size_t consumed = (status == RECODE_BROKENSYMBOL) ? 1 : rune_len;
            if (!WriteSymbol(rune, dst, dst_end)) {
                break;
            }
            src += consumed;
        }

        in_readed = src - src_begin;
        out_writed = dst - out;

        const bool stopped_cleanly = (status == RECODE_OK || status == RECODE_BROKENSYMBOL);
        if (stopped_cleanly && in_readed != in_size) {
            return RECODE_EOOUTPUT;
        }
        return status;
    }

    // Converts text in the single-byte charset `From` into wide characters by
    // looking every input byte up in the code page's `unicode` table.
    //
    // Exactly min(in_size, out_size) characters are converted; both counters
    // receive that number. Returns RECODE_EOOUTPUT when not all input fit into
    // the output buffer and RECODE_OK otherwise.
    template <class TCharType>
    inline RECODE_RESULT _recodeSBToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        const CodePage* cp = CodePageByCharset(From);
        const unsigned char* const bytes = reinterpret_cast<const unsigned char*>(in);
        const size_t count = (in_size < out_size) ? in_size : out_size;

        for (size_t i = 0; i < count; ++i) {
            out[i] = static_cast<TCharType>(cp->unicode[bytes[i]]);
        }

        in_readed = count;
        out_writed = count;
        return (count != in_size) ? RECODE_EOOUTPUT : RECODE_OK;
    }

    // Encodes wide characters from `in` as UTF-8 into `out`.
    //
    // Symbols are fetched with ReadSymbolAndAdvance (which moves the input
    // pointer itself) and written with SafeWriteUTF8Char.
    //
    // in_readed  - number of TCharType units whose UTF-8 form was handled.
    // out_writed - number of bytes written to `out`.
    // Returns RECODE_EOOUTPUT when the output buffer is exhausted, otherwise the
    // status of the last write (RECODE_OK for empty input).
    //
    // When the output runs out, the input pointer is rewound to the start of the
    // symbol that did not fit. Previously that symbol stayed consumed, so
    // in_readed covered input that never reached the output.
    template <class TCharType>
    inline RECODE_RESULT _recodeUnicodeToUTF8Impl(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        const TCharType* inp = in;
        const TCharType* in_end = in + in_size;
        unsigned char* outp = reinterpret_cast<unsigned char*>(out);
        const unsigned char* out_end = outp + out_size;
        size_t rune_len = 0;
        RECODE_RESULT res = RECODE_OK;

        while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp != in_end) {
            const TCharType* const symbol_begin = inp;
            const wchar32 rune = ReadSymbolAndAdvance(inp, in_end);
            res = SafeWriteUTF8Char(rune, rune_len, outp, out_end);
            // Kept from the original: a full buffer is always reported as EOOUTPUT.
            if (outp >= out_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL))
                res = RECODE_EOOUTPUT;
            if (res == RECODE_EOOUTPUT) {
                inp = symbol_begin; // the symbol was not emitted, so do not count it
                break;
            }
            outp += rune_len;
        }
        in_readed = inp - in;
        out_writed = outp - reinterpret_cast<const unsigned char*>(out);
        return res;
    }

    // Encodes a single rune as UTF-8 into a buffer of out_size bytes by
    // forwarding to SafeWriteUTF8Char; nwritten is filled in by that call and its
    // status is returned unchanged.
    inline RECODE_RESULT _recodeUnicodeToUTF8(wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
        unsigned char* const dst = reinterpret_cast<unsigned char*>(out);
        return SafeWriteUTF8Char(rune, nwritten, dst, out_size);
    }

    // Maps an arbitrary wide character type onto the project's character type of
    // the same size: 2-byte types select wchar16, 4-byte types select wchar32.
    // The primary template is only declared, so any other size fails to compile.
    template <class TCharType, int Size = sizeof(TCharType)>
    struct TCharTypeSwitch;

    // 2-byte character types.
    template <class T>
    struct TCharTypeSwitch<T, 2> {
        using TRealCharType = wchar16;
    };

    // 4-byte character types.
    template <class T>
    struct TCharTypeSwitch<T, 4> {
        using TRealCharType = wchar32;
    };

    // Encodes a buffer of wide characters as UTF-8.
    //
    // TCharType may be any character type wider than one byte; the input is
    // reinterpreted as the same-sized type chosen by TCharTypeSwitch and handed
    // to _recodeUnicodeToUTF8Impl, whose result and counters are passed through.
    template <class TCharType>
    inline RECODE_RESULT _recodeUnicodeToUTF8(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        static_assert(sizeof(TCharType) > 1, "expect some wide type");

        using TRealCharType = typename TCharTypeSwitch<TCharType>::TRealCharType;
        const TRealCharType* const real_in = reinterpret_cast<const TRealCharType*>(in);

        return _recodeUnicodeToUTF8Impl(real_in, out, in_size, out_size, in_readed, out_writed);
    }

    // Encodes wide characters into the single-byte charset `To`.
    //
    // Each symbol fetched with ReadSymbolAndAdvance is translated by the
    // charset's Encoder into exactly one output byte. Conversion stops when
    // either the input or the output buffer is exhausted.
    //
    // in_readed  - number of TCharType units consumed.
    // out_writed - number of bytes written.
    // Returns RECODE_EOOUTPUT when input is left over, RECODE_OK otherwise.
    template <class TCharType>
    inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        const TCharType* cur = in;
        const TCharType* const src_end = in + in_size;
        char* dst = out;
        const char* const dst_end = out + out_size;

        const Encoder& encoder = EncoderByCharset(To);
        for (; cur != src_end && dst != dst_end; ++dst) {
            *dst = encoder.Tr(ReadSymbolAndAdvance(cur, src_end));
        }

        in_readed = cur - in;
        out_writed = dst - out;

        return (in_readed == in_size) ? RECODE_OK : RECODE_EOOUTPUT;
    }

    // Encodes a single rune into the single-byte charset `To`.
    //
    // Writes one byte to `out` and sets nwritten to 1 on success. When the
    // buffer is empty, returns RECODE_EOOUTPUT with nwritten set to 0; previously
    // nwritten was left untouched on that path, so a caller reading it after a
    // failure could see an indeterminate value.
    inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
        nwritten = 0;
        if (0 == out_size)
            return RECODE_EOOUTPUT;
        *out = EncoderByCharset(To).Tr(rune);
        nwritten = 1;
        return RECODE_OK;
    }

    // Writes the upper-case hexadecimal representation of `in` into `out`
    // without leading zeros (a zero value is written as a single "0"). No
    // terminator is written.
    //
    // out_writed - number of digits written.
    // Returns RECODE_EOOUTPUT when the next digit would not leave at least one
    // spare byte in the out_size-byte buffer (digits written so far are kept and
    // counted), RECODE_OK otherwise.
    //
    // Only LEADING zero nibbles are skipped. The previous condition
    // `h || i == 0` dropped every zero nibble, so e.g. 0x400 was printed as "40"
    // and 0x100 as "10", producing references to the wrong characters.
    inline RECODE_RESULT _rune2hex(wchar32 in, char* out, size_t out_size, size_t& out_writed) {
        static const char hex_digs[] = "0123456789ABCDEF";
        out_writed = 0;
        RECODE_RESULT res = RECODE_OK;
        for (int i = 7; i >= 0; i--) {
            const unsigned char h = static_cast<unsigned char>((in >> (i * 4)) & 0x0F);
            // Emit the nibble if it is non-zero, if a more significant digit was
            // already emitted, or if it is the last nibble (so zero prints "0").
            if (h != 0 || out_writed != 0 || i == 0) {
                if (out_writed + 1 >= out_size) {
                    res = RECODE_EOOUTPUT;
                    break;
                }
                out[out_writed++] = hex_digs[h];
            }
        }
        return res;
    }

    // Converts UTF-32 text into NUL-terminated ASCII in which every character
    // that is not plain ASCII, as well as '<', '&' and '>', is replaced by a
    // hexadecimal numeric character reference ("&#x...;").
    //
    // in_readed  - number of input characters fully converted.
    // out_writed - number of bytes written, INCLUDING the terminating NUL.
    // Returns RECODE_OK when all input was converted, RECODE_EOOUTPUT when the
    // output buffer was too small, or the failure status of _rune2hex.
    //
    // Differences from the previous version:
    //  * out_size == 0 is rejected up front. It used to form a pointer before the
    //    buffer and then write the terminator out of bounds.
    //  * the "room for an entity" check compares sizes instead of computing
    //    `out_end - 6`, which could point before the buffer start.
    //  * running out of output space while input remains is reported as
    //    RECODE_EOOUTPUT instead of RECODE_OK.
    inline RECODE_RESULT _recodeUnicodeToHTMLEntities(const wchar32* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        if (out_size == 0) {
            // Not even the terminator fits.
            in_readed = 0;
            out_writed = 0;
            return RECODE_EOOUTPUT;
        }

        const wchar32* const in_beg = in;
        const wchar32* const in_end = in + in_size;
        const char* const out_beg = out;
        RECODE_RESULT res = RECODE_OK;

        // The last byte of the buffer is reserved for the terminating NUL.
        const char* const out_end = out + out_size - 1;
        while (in < in_end && out < out_end) {
            if (*in < 0x80 && *in != '<' && *in != '&' && *in != '>') { // ascii
                *out++ = char(*in & 0x00FF);
            } else { // entity
                // Same threshold as before: at least 6 bytes must be available.
                if (out_end - out < 6) {
                    res = RECODE_EOOUTPUT;
                    break;
                }
                char* ent = out;
                size_t ent_writed = 0;
                memcpy(ent, "&#x", 3);
                ent += 3;
                res = _rune2hex(*in, ent, out_end - 1 - ent, ent_writed);
                if (res != RECODE_OK)
                    break; // `out` is not advanced, so the partial entity is dropped
                ent += ent_writed;
                *ent++ = ';';
                out = ent;
            }
            in++;
        }
        if (res == RECODE_OK && in < in_end)
            res = RECODE_EOOUTPUT; // stopped because the buffer filled up
        *out++ = '\x00';
        out_writed = out - out_beg;
        in_readed = in - in_beg;
        return res;
    }

    // Dispatches conversion from charset `From` to wide characters.
    //
    // Returns RECODE_ERROR (without touching the counters) for an invalid code
    // page. Non-native code pages are delegated to the iconv-based
    // RecodeToUnicodeNoThrow; native ones use the UTF-8 decoder for CODES_UTF8
    // and the single-byte table converter for everything else.
    template <class TCharType>
    inline RECODE_RESULT _recodeToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        if (!ValidCodepage(From)) {
            return RECODE_ERROR;
        }

        if (NCodepagePrivate::NativeCodepage(From)) {
            if (From == CODES_UTF8) {
                return _recodeUTF8ToUnicode(in, out, in_size, out_size, in_readed, out_writed);
            }
            return _recodeSBToUnicode(From, in, out, in_size, out_size, in_readed, out_writed);
        }

        return NICONVPrivate::RecodeToUnicodeNoThrow(From, in, out, in_size, out_size, in_readed, out_writed);
    }

    // Dispatches conversion of a wide-character buffer into charset `To`.
    //
    // Returns RECODE_ERROR (without touching the counters) for an invalid code
    // page. Non-native code pages are delegated to the iconv-based
    // RecodeFromUnicodeNoThrow; native ones use the UTF-8 encoder for CODES_UTF8
    // and the single-byte encoder for everything else.
    template <class TCharType>
    inline RECODE_RESULT _recodeFromUnicode(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        if (!ValidCodepage(To)) {
            return RECODE_ERROR;
        }

        if (NCodepagePrivate::NativeCodepage(To)) {
            if (To == CODES_UTF8) {
                return NCodepagePrivate::_recodeUnicodeToUTF8(in, out, in_size, out_size, in_readed, out_writed);
            }
            return NCodepagePrivate::_recodeUnicodeToSB(To, in, out, in_size, out_size, in_readed, out_writed);
        }

        return NICONVPrivate::RecodeFromUnicodeNoThrow(To, in, out, in_size, out_size, in_readed, out_writed);
    }

    // Dispatches conversion of a single rune into charset `To`.
    //
    // Returns RECODE_ERROR (without touching nwritten) for an invalid code page.
    // Non-native code pages are delegated to RecodeFromUnicodeNoThrow with a
    // one-element input; native ones use the single-rune UTF-8 or single-byte
    // encoder.
    inline RECODE_RESULT _recodeFromUnicode(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
        if (!ValidCodepage(To)) {
            return RECODE_ERROR;
        }

        if (NCodepagePrivate::NativeCodepage(To)) {
            if (To == CODES_UTF8) {
                return NCodepagePrivate::_recodeUnicodeToUTF8(rune, out, out_size, nwritten);
            }
            return NCodepagePrivate::_recodeUnicodeToSB(To, rune, out, out_size, nwritten);
        }

        // The number of consumed input units is of no interest to the caller.
        size_t consumed = 0;
        return NICONVPrivate::RecodeFromUnicodeNoThrow(To, &rune, out, 1, out_size, consumed, nwritten);
    }

    // Recodes text in charset `From` into NUL-terminated ASCII with HTML numeric
    // character references, going through an intermediate UTF-32 buffer of
    // in_size elements.
    //
    // in_readed  - number of input BYTES whose output was produced.
    // out_writed - number of bytes written by the entity pass.
    // Returns the status of the entity pass if it is not RECODE_OK, otherwise
    // the status of the decoding pass.
    //
    // Fixes relative to the previous version:
    //  * the entity pass receives the number of code points actually decoded
    //    (unicode_size) instead of in_size; for multi-byte input the old code
    //    read uninitialized buffer elements.
    //  * unicode_size and the counters are initialized, so a failed first pass
    //    (e.g. RECODE_ERROR, which sets nothing) no longer leaves them
    //    indeterminate.
    //  * in_readed stays a byte count instead of being overwritten with the
    //    code-point count of the second pass.
    inline RECODE_RESULT _recodeToHTMLEntities(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
        TArrayHolder<wchar32> bufHolder(new wchar32[in_size]);
        wchar32* buf = bufHolder.Get();
        size_t unicode_size = 0;
        in_readed = 0;
        out_writed = 0;

        // first pass - to unicode
        const RECODE_RESULT res1 = _recodeToUnicode(From, in, buf, in_size, in_size, in_readed, unicode_size);

        // second pass - to entities
        size_t unicode_readed = 0;
        RECODE_RESULT res2 = _recodeUnicodeToHTMLEntities(buf, out, unicode_size, out_size, unicode_readed, out_writed);

        if (unicode_readed < unicode_size) {
            // Output was truncated. Decode again with the output limited to the
            // code points that were emitted, to learn how many input bytes they
            // correspond to. NOTE(review): assumes the decoder stops exactly at
            // the given output limit for every charset - confirm for the iconv path.
            size_t ignored = 0;
            in_readed = 0;
            _recodeToUnicode(From, in, buf, in_size, unicode_readed, in_readed, ignored);
            if (res2 == RECODE_OK)
                res2 = RECODE_EOOUTPUT; // truncation must not be reported as success
        }

        return (res2 != RECODE_OK) ? res2 : res1;
    }

}