diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/charset | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/charset')
-rw-r--r-- | library/cpp/charset/codepage.cpp | 278 | ||||
-rw-r--r-- | library/cpp/charset/codepage.h | 90 | ||||
-rw-r--r-- | library/cpp/charset/codepage_ut.cpp | 138 | ||||
-rw-r--r-- | library/cpp/charset/cp_encrec.cpp | 10 | ||||
-rw-r--r-- | library/cpp/charset/doccodes.cpp | 2 | ||||
-rw-r--r-- | library/cpp/charset/doccodes.h | 72 | ||||
-rw-r--r-- | library/cpp/charset/iconv.cpp | 186 | ||||
-rw-r--r-- | library/cpp/charset/iconv.h | 82 | ||||
-rw-r--r-- | library/cpp/charset/iconv_ut.cpp | 54 | ||||
-rw-r--r-- | library/cpp/charset/recyr.hh | 34 | ||||
-rw-r--r-- | library/cpp/charset/recyr_int.hh | 540 | ||||
-rw-r--r-- | library/cpp/charset/wide.h | 20 | ||||
-rw-r--r-- | library/cpp/charset/wide_ut.cpp | 96 | ||||
-rw-r--r-- | library/cpp/charset/ya.make | 6 |
14 files changed, 804 insertions, 804 deletions
diff --git a/library/cpp/charset/codepage.cpp b/library/cpp/charset/codepage.cpp index 0431bef31b..816f3fec67 100644 --- a/library/cpp/charset/codepage.cpp +++ b/library/cpp/charset/codepage.cpp @@ -1,45 +1,45 @@ #include "ci_string.h" -#include "wide.h" -#include "recyr.hh" -#include "codepage.h" - -#include <util/string/cast.h> +#include "wide.h" +#include "recyr.hh" +#include "codepage.h" + +#include <util/string/cast.h> #include <util/string/subst.h> -#include <util/string/util.h> +#include <util/string/util.h> #include <util/system/hi_lo.h> -#include <util/system/yassert.h> -#include <util/generic/hash.h> +#include <util/system/yassert.h> +#include <util/generic/hash.h> #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/generic/hash_set.h> -#include <util/generic/singleton.h> +#include <util/generic/vector.h> +#include <util/generic/hash_set.h> +#include <util/generic/singleton.h> #include <util/generic/yexception.h> #include <util/memory/pool.h> - -#include <cstring> - -#include <ctype.h> - + +#include <cstring> + +#include <ctype.h> + using namespace NCodepagePrivate; -void Recoder::Create(const CodePage& source, const CodePage& target) { +void Recoder::Create(const CodePage& source, const CodePage& target) { const Encoder* wideTarget = &EncoderByCharset(target.CPEnum); Create(source, wideTarget); } -void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) { +void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) { const Encoder* widePage = &EncoderByCharset(page.CPEnum); Create(page, widePage, mapfunc); } -template <class T, class T1> +template <class T, class T1> static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) { - while (b != e) { - *to++ = mapper.Table[(unsigned char)*b++]; - } - - return to; -} - + while (b != e) { + *to++ = mapper.Table[(unsigned char)*b++]; + } + + return to; +} + template <class T, class T1> static inline T1 Apply(T b, T1 to, const Recoder& mapper) { while (*b != 0) { @@ -49,21 +49,21 @@ static inline T1 Apply(T b, T1 to, const Recoder& mapper) { return to; } -char* CodePage::ToLower(const char* b, const char* e, char* to) const { - return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]); -} +char* CodePage::ToLower(const char* b, const char* e, char* to) const { + return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]); +} char* CodePage::ToLower(const char* b, char* to) const { return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]); } - -char* CodePage::ToUpper(const char* b, const char* e, char* to) const { + +char* CodePage::ToUpper(const char* b, const char* e, char* to) const { return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]); -} +} char* CodePage::ToUpper(const char* b, char* to) const { return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]); } - -int CodePage::stricmp(const char* dst, const char* src) const { + +int CodePage::stricmp(const char* dst, const char* src) const { unsigned char f, l; do { f = ToLower(*dst++); @@ -86,18 +86,18 @@ int CodePage::strnicmp(const char* dst, const char* src, size_t len) const { static const CodePage UNSUPPORTED_CODEPAGE = { CODES_UNSUPPORTED, - { - "unsupported", - }, + { + "unsupported", + }, {}, nullptr, }; static const CodePage UNKNOWN_CODEPAGE = { CODES_UNKNOWN, - { - "unknown", - }, + { + "unknown", + }, {}, nullptr, }; @@ -122,14 +122,14 @@ NCodepagePrivate::TCodepagesMap::TCodepagesMap() { } } -const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() { - return *Singleton<NCodepagePrivate::TCodepagesMap>(); -} - +const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() { + return *Singleton<NCodepagePrivate::TCodepagesMap>(); +} + class TCodePageHash { private: using TData = THashMap<TStringBuf, ECharset, ci_hash, ci_equal_to>; - + TData Data; TMemoryPool Pool; @@ -153,7 +153,7 @@ private: temp = name; SubstGlobal(temp, '-', '_'); AddNameWithCheck(temp, code); - + temp = name; SubstGlobal(temp, '_', '-'); AddNameWithCheck(temp, code); @@ -176,8 +176,8 @@ public: AddName(name, e); AddName(xPrefix + name, e); - } - } + } + } } inline ECharset CharsetByName(TStringBuf name) { @@ -204,7 +204,7 @@ ECharset CharsetByNameOrDie(TStringBuf name) { } template <typename TxChar> -static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) { +static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) { if ((*s & 0xFF00) != 0xF000) { rune_len = 1; rune = *s; @@ -214,37 +214,37 @@ static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size rune_len = 0; size_t _len = UTF8RuneLen((unsigned char)(*s)); - if (s + _len > end) - return RECODE_EOINPUT; //[EOINPUT] - if (_len == 0) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte + if (s + _len > end) + return RECODE_EOINPUT; //[EOINPUT] + if (_len == 0) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte - wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX] + wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX] if (_len > 1) { _rune &= UTF8LeadByteMask(_len); wchar32 ch = *s++; if ((ch & 0xFFC0) != 0xF080) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte _rune <<= 6; - _rune |= ch & 0x3F; //[00000XXX XXYYYYYY] + _rune |= ch & 0x3F; //[00000XXX XXYYYYYY] if (_len > 2) { ch = *s++; if ((ch & 0xFFC0) != 0xF080) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte _rune <<= 6; - _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ] + _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ] if (_len > 3) { ch = *s; if ((ch & 0xFFC0) != 0xF080) return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte _rune <<= 6; - _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ] + _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ] } } } rune_len = _len; if (_rune > Max<TxChar>()) - rune = ' '; // maybe put sequence + rune = ' '; // maybe put sequence else rune = TxChar(_rune); return RECODE_OK; @@ -262,16 +262,16 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) { } else if (enc == CODES_UTF8) { TxChar* s; TxChar* d; - - for (s = d = str; s < e;) { + + for (s = d = str; s < e;) { size_t l = 0; - + if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) { d++, s += l; } else { *d++ = BROKEN_RUNE; ++s; - } + } } e = d; } else if (enc == CODES_UNKNOWN) { @@ -289,7 +289,7 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) { size_t read = 0; size_t written = 0; - for (; s < e; ++s) { + for (; s < e; ++s) { if (Hi8(Lo16(*s)) == 0xF0) { buf.push_back(Lo8(Lo16(*s))); } else { @@ -318,28 +318,28 @@ void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) { DoDecodeUnknownPlane(str, ee, enc); } -namespace { +namespace { class THashSetType: public THashSet<TString> { - public: + public: inline void Add(const TString& s) { - insert(s); - } - + insert(s); + } + inline bool Has(const TString& s) const noexcept { - return find(s) != end(); - } - }; -} - + return find(s) != end(); + } + }; +} + class TWindowsPrefixesHashSet: public THashSetType { public: inline TWindowsPrefixesHashSet() { - Add("win"); - Add("wincp"); - Add("window"); + Add("win"); + Add("wincp"); + Add("window"); Add("windowcp"); - Add("windows"); - Add("windowscp"); + Add("windows"); + Add("windowscp"); Add("ansi"); Add("ansicp"); } @@ -364,19 +364,19 @@ public: }; class TLatinToIsoHash: public THashMap<const char*, TString, ci_hash, ci_equal_to> { -public: - inline TLatinToIsoHash() { - insert(value_type("latin1", "iso-8859-1")); - insert(value_type("latin2", "iso-8859-2")); - insert(value_type("latin3", "iso-8859-3")); - insert(value_type("latin4", "iso-8859-4")); - insert(value_type("latin5", "iso-8859-9")); - insert(value_type("latin6", "iso-8859-10")); - insert(value_type("latin7", "iso-8859-13")); - insert(value_type("latin8", "iso-8859-14")); - insert(value_type("latin9", "iso-8859-15")); - insert(value_type("latin10", "iso-8859-16")); - } +public: + inline TLatinToIsoHash() { + insert(value_type("latin1", "iso-8859-1")); + insert(value_type("latin2", "iso-8859-2")); + insert(value_type("latin3", "iso-8859-3")); + insert(value_type("latin4", "iso-8859-4")); + insert(value_type("latin5", "iso-8859-9")); + insert(value_type("latin6", "iso-8859-10")); + insert(value_type("latin7", "iso-8859-13")); + insert(value_type("latin8", "iso-8859-14")); + insert(value_type("latin9", "iso-8859-15")); + insert(value_type("latin10", "iso-8859-16")); + } }; static inline void NormalizeEncodingPrefixes(TString& enc) { @@ -391,14 +391,14 @@ static inline void NormalizeEncodingPrefixes(TString& enc) { } } - if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) { + if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) { enc.remove(0, preflen); enc.prepend("windows-"); return; } - if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) { - if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) { + if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) { + if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) { enc.remove(0, preflen); enc.prepend("windows-"); return; @@ -408,7 +408,7 @@ static inline void NormalizeEncodingPrefixes(TString& enc) { return; } - if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) { + if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) { if (enc.length() == preflen + 1 || enc.length() == preflen + 2) { TString enccopy = enc.substr(preflen); enccopy.prepend("latin"); @@ -428,46 +428,46 @@ static inline void NormalizeEncodingPrefixes(TString& enc) { class TEncodingNamesHashSet: public THashSetType { public: TEncodingNamesHashSet() { - Add("iso-8859-1"); - Add("iso-8859-2"); - Add("iso-8859-3"); - Add("iso-8859-4"); - Add("iso-8859-5"); - Add("iso-8859-6"); - Add("iso-8859-7"); - Add("iso-8859-8"); - Add("iso-8859-8-i"); - Add("iso-8859-9"); - Add("iso-8859-10"); - Add("iso-8859-11"); - Add("iso-8859-12"); - Add("iso-8859-13"); - Add("iso-8859-14"); - Add("iso-8859-15"); - Add("windows-1250"); - Add("windows-1251"); - Add("windows-1252"); - Add("windows-1253"); - Add("windows-1254"); - Add("windows-1255"); - Add("windows-1256"); - Add("windows-1257"); - Add("windows-1258"); - Add("windows-874"); - Add("iso-2022-jp"); - Add("euc-jp"); - Add("shift-jis"); - Add("shiftjis"); - Add("iso-2022-kr"); - Add("euc-kr"); - Add("gb-2312"); - Add("gb2312"); - Add("gb-18030"); - Add("gb18030"); - Add("gbk"); - Add("big5"); - Add("tis-620"); - Add("tis620"); + Add("iso-8859-1"); + Add("iso-8859-2"); + Add("iso-8859-3"); + Add("iso-8859-4"); + Add("iso-8859-5"); + Add("iso-8859-6"); + Add("iso-8859-7"); + Add("iso-8859-8"); + Add("iso-8859-8-i"); + Add("iso-8859-9"); + Add("iso-8859-10"); + Add("iso-8859-11"); + Add("iso-8859-12"); + Add("iso-8859-13"); + Add("iso-8859-14"); + Add("iso-8859-15"); + Add("windows-1250"); + Add("windows-1251"); + Add("windows-1252"); + Add("windows-1253"); + Add("windows-1254"); + Add("windows-1255"); + Add("windows-1256"); + Add("windows-1257"); + Add("windows-1258"); + Add("windows-874"); + Add("iso-2022-jp"); + Add("euc-jp"); + Add("shift-jis"); + Add("shiftjis"); + Add("iso-2022-kr"); + Add("euc-kr"); + Add("gb-2312"); + Add("gb2312"); + Add("gb-18030"); + Add("gb18030"); + Add("gbk"); + Add("big5"); + Add("tis-620"); + Add("tis620"); } }; @@ -494,7 +494,7 @@ ECharset EncodingHintByName(const char* encname) { // Do some normalization TString enc(encname, lastpos - encname + 1); enc.to_lower(); - for (char* p = enc.begin(); p != enc.end(); ++p) { + for (char* p = enc.begin(); p != enc.end(); ++p) { if (*p == ' ' || *p == '=' || *p == '_') *p = '-'; } @@ -505,7 +505,7 @@ ECharset EncodingHintByName(const char* encname) { if (hint != CODES_UNKNOWN) return hint; - if (Singleton<TEncodingNamesHashSet>()->Has(enc)) + if (Singleton<TEncodingNamesHashSet>()->Has(enc)) return CODES_UNSUPPORTED; return CODES_UNKNOWN; } diff --git a/library/cpp/charset/codepage.h b/library/cpp/charset/codepage.h index 30a02a4610..2911174dce 100644 --- a/library/cpp/charset/codepage.h +++ b/library/cpp/charset/codepage.h @@ -1,6 +1,6 @@ #pragma once -#include "doccodes.h" +#include "doccodes.h" #include <util/charset/recode_result.h> #include <util/charset/unidata.h> // all wchar32 functions @@ -8,11 +8,11 @@ #include <util/generic/string.h> #include <util/generic/ylimits.h> #include <util/generic/yexception.h> -#include <util/system/yassert.h> -#include <util/system/defaults.h> - -#include <cctype> - +#include <util/system/yassert.h> +#include <util/system/defaults.h> + +#include <cctype> + struct CodePage; struct Recoder; struct Encoder; @@ -21,10 +21,10 @@ struct Encoder; * struct CodePage * \*****************************************************************/ struct CodePage { - ECharset CPEnum; // int MIBEnum; - const char* Names[30]; // name[0] -- preferred mime-name - wchar32 unicode[256]; - const char* DefaultChar; //[CCL_NUM] + ECharset CPEnum; // int MIBEnum; + const char* Names[30]; // name[0] -- preferred mime-name + wchar32 unicode[256]; + const char* DefaultChar; //[CCL_NUM] bool IsLower(unsigned char ch) const { return ::IsLower(unicode[ch]); @@ -38,7 +38,7 @@ struct CodePage { bool IsDigit(unsigned char ch) const { return ::IsDigit(unicode[ch]); } - bool IsXdigit(unsigned char ch) const { + bool IsXdigit(unsigned char ch) const { return ::IsXdigit(unicode[ch]); } bool IsAlnum(unsigned char ch) const { @@ -62,18 +62,18 @@ struct CodePage { bool IsComposed(unsigned char ch) const { return ::IsComposed(unicode[ch]); } - + // return pointer to char after the last char - char* ToLower(const char* begin, const char* end, char* to) const; + char* ToLower(const char* begin, const char* end, char* to) const; char* ToLower(const char* begin, char* to) const; - // return pointer to char after the last char - char* ToUpper(const char* begin, const char* end, char* to) const; + // return pointer to char after the last char + char* ToUpper(const char* begin, const char* end, char* to) const; char* ToUpper(const char* begin, char* to) const; - - int stricmp(const char* s1, const char* s2) const; - int strnicmp(const char* s1, const char* s2, size_t len) const; - + + int stricmp(const char* s1, const char* s2) const; + int strnicmp(const char* s1, const char* s2, size_t len) const; + inline unsigned char ToUpper(unsigned char ch) const; inline unsigned char ToLower(unsigned char ch) const; inline unsigned char ToTitle(unsigned char ch) const; @@ -131,18 +131,18 @@ namespace NCodepagePrivate { return GetPrivate(e)->Names[0]; } - static const TCodepagesMap& Instance(); - + static const TCodepagesMap& Instance(); + friend class ::TCodePageHash; }; inline bool NativeCodepage(ECharset e) { - return ::NCodepagePrivate::TCodepagesMap::Instance().NativeCodepage(e); + return ::NCodepagePrivate::TCodepagesMap::Instance().NativeCodepage(e); } } inline bool SingleByteCodepage(ECharset e) { - return ::NCodepagePrivate::TCodepagesMap::Instance().SingleByteCodepage(e); + return ::NCodepagePrivate::TCodepagesMap::Instance().SingleByteCodepage(e); } inline bool ValidCodepage(ECharset e) { @@ -150,7 +150,7 @@ inline bool ValidCodepage(ECharset e) { } inline const CodePage* CodePageByCharset(ECharset e) { - return ::NCodepagePrivate::TCodepagesMap::Instance().Get(e); + return ::NCodepagePrivate::TCodepagesMap::Instance().Get(e); } ECharset CharsetByName(TStringBuf name); @@ -163,12 +163,12 @@ inline ECharset CharsetByCodePage(const CodePage* CP) { } inline const char* NameByCharset(ECharset e) { - return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e); + return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e); } inline const char* NameByCharsetSafe(ECharset e) { if (CODES_UNKNOWN < e && e < CODES_MAX) - return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e); + return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e); else ythrow yexception() << "unknown encoding: " << (int)e; } @@ -194,21 +194,21 @@ struct Encoder { char* Table[256]; const char* DefaultChar; - inline char Code(wchar32 ch) const { + inline char Code(wchar32 ch) const { if (ch > 0xFFFF) return 0; - return (unsigned char)Table[(ch >> 8) & 255][ch & 255]; + return (unsigned char)Table[(ch >> 8) & 255][ch & 255]; } - inline char Tr(wchar32 ch) const { + inline char Tr(wchar32 ch) const { char code = Code(ch); if (code == 0 && ch != 0) - code = DefaultChar[NUnicode::CharType(ch)]; + code = DefaultChar[NUnicode::CharType(ch)]; Y_ASSERT(code != 0 || ch == 0); return code; } - inline unsigned char operator[](wchar32 ch) const { + inline unsigned char operator[](wchar32 ch) const { return Tr(ch); } @@ -223,25 +223,25 @@ struct Encoder { struct Recoder { unsigned char Table[257]; - void Create(const CodePage& source, const CodePage& target); - void Create(const CodePage& source, const Encoder* wideTarget); + void Create(const CodePage& source, const CodePage& target); + void Create(const CodePage& source, const Encoder* wideTarget); - void Create(const CodePage& page, wchar32 (*mapper)(wchar32)); - void Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapper)(wchar32)); + void Create(const CodePage& page, wchar32 (*mapper)(wchar32)); + void Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapper)(wchar32)); - inline unsigned char Tr(unsigned char c) const { + inline unsigned char Tr(unsigned char c) const { return Table[c]; } - inline unsigned char operator[](unsigned char c) const { + inline unsigned char operator[](unsigned char c) const { return Table[c]; } - void Tr(const char* in, char* out, size_t len) const; - void Tr(const char* in, char* out) const; - void Tr(char* in_out, size_t len) const; - void Tr(char* in_out) const; + void Tr(const char* in, char* out, size_t len) const; + void Tr(const char* in, char* out) const; + void Tr(char* in_out, size_t len) const; + void Tr(char* in_out) const; }; -extern const struct Encoder& WideCharToYandex; +extern const struct Encoder& WideCharToYandex; const Encoder& EncoderByCharset(ECharset enc); @@ -255,7 +255,7 @@ namespace NCodepagePrivate { static const Recoder rcdr_to_lower[]; static const Recoder rcdr_to_upper[]; static const Recoder rcdr_to_title[]; - + static const Encoder* const EncodeTo[]; friend struct ::CodePage; @@ -264,7 +264,7 @@ namespace NCodepagePrivate { friend RECODE_RESULT _recodeFromYandex(ECharset, const char*, char*, size_t, size_t, size_t&, size_t&); friend const Encoder& ::EncoderByCharset(ECharset enc); }; -} +} inline const Encoder& EncoderByCharset(ECharset enc) { if (!SingleByteCodepage(enc)) { @@ -319,6 +319,6 @@ inline TString ToTitle(TString s, const CodePage& cp, size_t pos = 0, size_t n = return i == pos ? cp.ToTitle(c) : cp.ToLower(c); }, pos, - n); + n); return s; } diff --git a/library/cpp/charset/codepage_ut.cpp b/library/cpp/charset/codepage_ut.cpp index c3ac3ac478..1a572cac44 100644 --- a/library/cpp/charset/codepage_ut.cpp +++ b/library/cpp/charset/codepage_ut.cpp @@ -1,47 +1,47 @@ #include "codepage.h" #include "recyr.hh" -#include "wide.h" - +#include "wide.h" + #include <library/cpp/testing/unittest/registar.h> - + #include <util/charset/utf8.h> #include <util/system/yassert.h> -#if defined(_MSC_VER) -#pragma warning(disable : 4309) /*truncation of constant value*/ +#if defined(_MSC_VER) +#pragma warning(disable : 4309) /*truncation of constant value*/ #endif namespace { const char yandexUpperCase[] = - "\x81\x82\x83\x84\x85\x86\x87" - "\x8E" - "\xA1\xA2\xA3\xA4\xA5\xA6" - "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF" + "\x81\x82\x83\x84\x85\x86\x87" + "\x8E" + "\xA1\xA2\xA3\xA4\xA5\xA6" + "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF" "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF" "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"; const char yandexLowerCase[] = - "\x91\x92\x93\x94\x95\x96\x97" - "\x9E" - "\xB1\xB2\xB3\xB4\xB5\xB6" - "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF" + "\x91\x92\x93\x94\x95\x96\x97" + "\x9E" + "\xB1\xB2\xB3\xB4\xB5\xB6" + "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF" "\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF" "\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF"; } -class TCodepageTest: public TTestBase { +class TCodepageTest: public TTestBase { private: UNIT_TEST_SUITE(TCodepageTest); - UNIT_TEST(TestUTF); - UNIT_TEST(TestUTFFromUnknownPlane); - UNIT_TEST(TestBrokenMultibyte); - UNIT_TEST(TestSurrogatePairs); - UNIT_TEST(TestEncodingHints); - UNIT_TEST(TestToLower); - UNIT_TEST(TestToUpper); - UNIT_TEST(TestUpperLower); - UNIT_TEST(TestBrokenRune); - UNIT_TEST(TestCanEncode); + UNIT_TEST(TestUTF); + UNIT_TEST(TestUTFFromUnknownPlane); + UNIT_TEST(TestBrokenMultibyte); + UNIT_TEST(TestSurrogatePairs); + UNIT_TEST(TestEncodingHints); + UNIT_TEST(TestToLower); + UNIT_TEST(TestToUpper); + UNIT_TEST(TestUpperLower); + UNIT_TEST(TestBrokenRune); + UNIT_TEST(TestCanEncode); UNIT_TEST_SUITE_END(); public: @@ -55,18 +55,18 @@ public: void TestCanEncode(); - inline void TestUpperLower() { + inline void TestUpperLower() { const CodePage* cp = CodePageByCharset(CODES_ASCII); - char tmp[100]; - + char tmp[100]; + TStringBuf s = "abcde"; - - TStringBuf upper(tmp, cp->ToUpper(s.begin(), s.end(), tmp)); + + TStringBuf upper(tmp, cp->ToUpper(s.begin(), s.end(), tmp)); UNIT_ASSERT_VALUES_EQUAL(upper, TStringBuf("ABCDE")); - - TStringBuf lower(tmp, cp->ToLower(upper.begin(), upper.end(), tmp)); + + TStringBuf lower(tmp, cp->ToLower(upper.begin(), upper.end(), tmp)); UNIT_ASSERT_VALUES_EQUAL(lower, TStringBuf("abcde")); - } + } void TestBrokenRune() { UNIT_ASSERT_VALUES_EQUAL(BROKEN_RUNE, 0xFFFDu); @@ -78,7 +78,7 @@ UNIT_TEST_SUITE_REGISTRATION(TCodepageTest); void TCodepageTest::TestUTF() { for (wchar32 i = 0; i <= 0x10FFFF; i++) { unsigned char buffer[32]; - Zero(buffer); + Zero(buffer); size_t rune_len; size_t ref_len = 0; @@ -120,7 +120,7 @@ void TCodepageTest::TestUTF() { UNIT_ASSERT(res == RECODE_BROKENSYMBOL); } } - const char* badStrings[] = { + const char* badStrings[] = { "\xfe", "\xff", "\xcc\xc0", @@ -153,7 +153,7 @@ void TCodepageTest::TestUTF() { }; for (size_t i = 0; i < Y_ARRAY_SIZE(badStrings); ++i) { wchar32 rune; - const ui8* p = (const ui8*)badStrings[i]; + const ui8* p = (const ui8*)badStrings[i]; size_t len; RECODE_RESULT res = SafeReadUTF8Char(rune, len, p, p + strlen(badStrings[i])); UNIT_ASSERT(res == RECODE_BROKENSYMBOL); @@ -174,17 +174,17 @@ void TCodepageTest::TestBrokenMultibyte() { UNIT_ASSERT(nread == 1); UNIT_ASSERT(nwritten == 0); - const char bigSample[] = {'\xC3', '\x87', '\xC3', '\x8E', '\xC2', '\xB0', '\xC3', '\x85', '\xC3', '\x85', '\xC3', '\xB8'}; + const char bigSample[] = {'\xC3', '\x87', '\xC3', '\x8E', '\xC2', '\xB0', '\xC3', '\x85', '\xC3', '\x85', '\xC3', '\xB8'}; res = RecodeToUnicode(cp, bigSample, recodeResult, Y_ARRAY_SIZE(bigSample), Y_ARRAY_SIZE(recodeResult), nread, nwritten); UNIT_ASSERT(res == RECODE_OK); UNIT_ASSERT(nread == Y_ARRAY_SIZE(bigSample)); } void TCodepageTest::TestUTFFromUnknownPlane() { - static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20, - 0x430, 0x431, 0x432, 0x20, - 0x1001, 0x1002, 0x1003, 0x20, - 0x10001, 0x10002, 0x10003}; + static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20, + 0x430, 0x431, 0x432, 0x20, + 0x1001, 0x1002, 0x1003, 0x20, + 0x10001, 0x10002, 0x10003}; static const size_t BUFFER_SIZE = 1024; char bytebuffer[BUFFER_SIZE]; @@ -192,17 +192,17 @@ void TCodepageTest::TestUTFFromUnknownPlane() { size_t readchars = 0; size_t writtenbytes = 0; size_t samplelen = Y_ARRAY_SIZE(sampletext); + + RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes); - RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes); - - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(samplelen == readchars); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(samplelen == readchars); size_t writtenbytes2 = 0; char bytebuffer2[BUFFER_SIZE]; for (size_t i = 0; i != samplelen; ++i) { size_t nwr = 0; - const int res = RecodeFromUnicode(CODES_UTF8, sampletext[i], bytebuffer2 + writtenbytes2, BUFFER_SIZE - writtenbytes2, nwr); + const int res = RecodeFromUnicode(CODES_UTF8, sampletext[i], bytebuffer2 + writtenbytes2, BUFFER_SIZE - writtenbytes2, nwr); UNIT_ASSERT_VALUES_EQUAL(res, int(RECODE_OK)); writtenbytes2 += nwr; UNIT_ASSERT(BUFFER_SIZE > writtenbytes2); @@ -213,43 +213,43 @@ void TCodepageTest::TestUTFFromUnknownPlane() { size_t readbytes = 0; size_t writtenchars = 0; - res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars); + res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(readbytes == writtenbytes); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(readbytes == writtenbytes); wchar32* charbufferend = charbuffer + writtenchars; - DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8); + DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8); - UNIT_ASSERT(charbufferend == charbuffer + samplelen); - for (size_t i = 0; i < samplelen; ++i) - UNIT_ASSERT(sampletext[i] == charbuffer[i]); + UNIT_ASSERT(charbufferend == charbuffer + samplelen); + for (size_t i = 0; i < samplelen; ++i) + UNIT_ASSERT(sampletext[i] == charbuffer[i]); // Now, concatenate the thing with an explicit character and retest - res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(readbytes == writtenbytes); + res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(readbytes == writtenbytes); charbuffer[writtenchars] = 0x1234; size_t morewrittenchars = 0; - res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars); - UNIT_ASSERT(res == RECODE_OK); - UNIT_ASSERT(readbytes == writtenbytes); - UNIT_ASSERT(writtenchars == morewrittenchars); + res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars); + UNIT_ASSERT(res == RECODE_OK); + UNIT_ASSERT(readbytes == writtenbytes); + UNIT_ASSERT(writtenchars == morewrittenchars); charbuffer[2 * writtenchars + 1] = 0x5678; charbufferend = charbuffer + 2 * writtenchars + 2; - DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8); + DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8); - UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2); + UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2); for (size_t i = 0; i < samplelen; ++i) { - UNIT_ASSERT(sampletext[i] == charbuffer[i]); - UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]); + UNIT_ASSERT(sampletext[i] == charbuffer[i]); + UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]); } - UNIT_ASSERT(0x1234 == charbuffer[samplelen]); - UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]); + UNIT_ASSERT(0x1234 == charbuffer[samplelen]); + UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]); // test TChar version // bytebuffer of len writtenbytes contains sampletext of len samplelen chars in utf8 @@ -261,7 +261,7 @@ void TCodepageTest::TestUTFFromUnknownPlane() { for (size_t i = 0; i < wtr.size(); ++i) { if (sampletext[i] >= 0x10000) { UNIT_ASSERT_VALUES_EQUAL(wtr[i], ' '); - } else { + } else { UNIT_ASSERT_VALUES_EQUAL(wtr[i], sampletext[i]); } } @@ -290,11 +290,11 @@ static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize void TCodepageTest::TestSurrogatePairs() { const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba"; - wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A}; + wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A}; TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy)); const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n"; - wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'}; + wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'}; TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2)); } @@ -356,7 +356,7 @@ static void TestCanEncodeEach(const TWtringBuf& text, ECharset encoding, bool ex for (size_t i = 0; i < text.size(); ++i) { if (CanBeEncoded(text.SubStr(i, 1), encoding) != expectedResult) ythrow yexception() << "assertion failed: encoding " << NameByCharset(encoding) - << " on '" << text.SubStr(i, 1) << "' (expected " << expectedResult << ")"; + << " on '" << text.SubStr(i, 1) << "' (expected " << expectedResult << ")"; } // whole text UNIT_ASSERT_EQUAL(CanBeEncoded(text, encoding), expectedResult); diff --git a/library/cpp/charset/cp_encrec.cpp b/library/cpp/charset/cp_encrec.cpp index e4570cd628..aa68278a04 100644 --- a/library/cpp/charset/cp_encrec.cpp +++ b/library/cpp/charset/cp_encrec.cpp @@ -1,5 +1,5 @@ -#include "codepage.h" - +#include "codepage.h" + #include <util/stream/output.h> void Encoder::Tr(const wchar32* in, char* out, size_t len) const { @@ -13,14 +13,14 @@ void Encoder::Tr(const wchar32* in, char* out) const { } while (*in++); } -void Recoder::Create(const CodePage& source, const Encoder* wideTarget) { - for (size_t i = 0; i != 256; ++i) { +void Recoder::Create(const CodePage& source, const Encoder* wideTarget) { + for (size_t i = 0; i != 256; ++i) { Table[i] = wideTarget->Tr(source.unicode[i]); Y_ASSERT(Table[i] != 0 || i == 0); } } -void Recoder::Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapfunc)(wchar32)) { +void Recoder::Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapfunc)(wchar32)) { for (size_t i = 0; i != 256; ++i) { char c = widePage->Code((*mapfunc)(page.unicode[i])); Table[i] = (c == 0 && i != 0) ? (unsigned char)i : (unsigned char)c; diff --git a/library/cpp/charset/doccodes.cpp b/library/cpp/charset/doccodes.cpp index 1fc17a3275..e0384a7f88 100644 --- a/library/cpp/charset/doccodes.cpp +++ b/library/cpp/charset/doccodes.cpp @@ -1 +1 @@ -#include "doccodes.h" +#include "doccodes.h" diff --git a/library/cpp/charset/doccodes.h b/library/cpp/charset/doccodes.h index 75c87adf9e..0aa7eb2d2b 100644 --- a/library/cpp/charset/doccodes.h +++ b/library/cpp/charset/doccodes.h @@ -1,45 +1,45 @@ #pragma once -enum ECharset { +enum ECharset { CODES_UNSUPPORTED = -2, // valid but unsupported encoding - CODES_UNKNOWN = -1, // invalid or unspecified encoding - CODES_WIN, // [ 0] WINDOWS_1251 Windows - CODES_KOI8, // [ 1] KOI8_U Koi8-u - CODES_ALT, // [ 2] IBM_866 MS DOS, alternative - CODES_MAC, // [ 3] MAC_CYRILLIC Macintosh - CODES_MAIN, // [ 4] ISO_LATIN_CYRILLIC Main - CODES_ASCII, // [ 5] WINDOWS_1252 Latin 1 - CODES_RESERVED_3, // reserved code: use it for new encodings before adding them to the end of the list - CODES_WIN_EAST, // [ 7] WINDOWS_1250 WIN PL - CODES_ISO_EAST, // [ 8] ISO_8859_2 ISO PL + CODES_UNKNOWN = -1, // invalid or unspecified encoding + CODES_WIN, // [ 0] WINDOWS_1251 Windows + CODES_KOI8, // [ 1] KOI8_U Koi8-u + CODES_ALT, // [ 2] IBM_866 MS DOS, alternative + CODES_MAC, // [ 3] MAC_CYRILLIC Macintosh + CODES_MAIN, // [ 4] ISO_LATIN_CYRILLIC Main + CODES_ASCII, // [ 5] WINDOWS_1252 Latin 1 + CODES_RESERVED_3, // reserved code: use it for new encodings before adding them to the end of the list + CODES_WIN_EAST, // [ 7] WINDOWS_1250 WIN PL + CODES_ISO_EAST, // [ 8] ISO_8859_2 ISO PL // our superset of subset of windows-1251 - CODES_YANDEX, // [ 9] YANDEX - CODES_UTF_16BE, // [10] UTF_16BE - CODES_UTF_16LE, // [11] UTF_16LE + CODES_YANDEX, // [ 9] YANDEX + CODES_UTF_16BE, // [10] UTF_16BE + CODES_UTF_16LE, // [11] UTF_16LE // missing standard codepages - CODES_IBM855, // [12] IBM_855 - CODES_UTF8, // [13] UTF8 - CODES_UNKNOWNPLANE, // [14] Unrecognized characters are mapped into the PUA: U+F000..U+F0FF + CODES_IBM855, // [12] IBM_855 + CODES_UTF8, // [13] UTF8 + CODES_UNKNOWNPLANE, // [14] Unrecognized characters are mapped into the PUA: U+F000..U+F0FF - CODES_KAZWIN, // [15] WINDOWS_1251_K Kazakh version of Windows-1251 - CODES_TATWIN, // [16] WINDOWS_1251_T Tatarian version of Windows-1251 - CODES_ARMSCII, // [17] Armenian ASCII - CODES_GEO_ITA, // [18] Academy of Sciences Georgian - CODES_GEO_PS, // [19] Georgian Parliament - CODES_ISO_8859_3, // [20] Latin-3: Turkish, Maltese and Esperanto - CODES_ISO_8859_4, // [21] Latin-4: Estonian, Latvian, Lithuanian, Greenlandic, Sami - CODES_ISO_8859_6, // [22] Latin/Arabic: Arabic - CODES_ISO_8859_7, // [23] Latin/Greek: Greek - CODES_ISO_8859_8, // [24] Latin/Hebrew: Hebrew - CODES_ISO_8859_9, // [25] Latin-5 or Turkish: Turkish - CODES_ISO_8859_13, // [26] Latin-7 or Baltic Rim: Baltic languages - CODES_ISO_8859_15, // [27] Latin-9: Western European languages - CODES_ISO_8859_16, // [28] Latin-10: South-Eastern European languages - CODES_WINDOWS_1253, // [29] for Greek - CODES_WINDOWS_1254, // [30] for Turkish - CODES_WINDOWS_1255, // [31] for Hebrew - CODES_WINDOWS_1256, // [32] for Arabic - CODES_WINDOWS_1257, // [33] for Estonian, Latvian and Lithuanian + CODES_KAZWIN, // [15] WINDOWS_1251_K Kazakh version of Windows-1251 + CODES_TATWIN, // [16] WINDOWS_1251_T Tatarian version of Windows-1251 + CODES_ARMSCII, // [17] Armenian ASCII + CODES_GEO_ITA, // [18] Academy of Sciences Georgian + CODES_GEO_PS, // [19] Georgian Parliament + CODES_ISO_8859_3, // [20] Latin-3: Turkish, Maltese and Esperanto + CODES_ISO_8859_4, // [21] Latin-4: Estonian, Latvian, Lithuanian, Greenlandic, Sami + CODES_ISO_8859_6, // [22] Latin/Arabic: Arabic + CODES_ISO_8859_7, // [23] Latin/Greek: Greek + CODES_ISO_8859_8, // [24] Latin/Hebrew: Hebrew + CODES_ISO_8859_9, // [25] Latin-5 or Turkish: Turkish + CODES_ISO_8859_13, // [26] Latin-7 or Baltic Rim: Baltic languages + CODES_ISO_8859_15, // [27] Latin-9: Western European languages + CODES_ISO_8859_16, // [28] Latin-10: South-Eastern European languages + CODES_WINDOWS_1253, // [29] for Greek + CODES_WINDOWS_1254, // [30] for Turkish + CODES_WINDOWS_1255, // [31] for Hebrew + CODES_WINDOWS_1256, // [32] for Arabic + CODES_WINDOWS_1257, // [33] for Estonian, Latvian and Lithuanian // these codes are all the other 8bit codes known by libiconv // they follow in alphanumeric order diff --git a/library/cpp/charset/iconv.cpp b/library/cpp/charset/iconv.cpp index df43471470..605d0699ef 100644 --- a/library/cpp/charset/iconv.cpp +++ b/library/cpp/charset/iconv.cpp @@ -1,94 +1,94 @@ -#include "iconv.h" - -#include <contrib/libs/libiconv/iconv.h> - -using namespace NICONVPrivate; - -TDescriptor::TDescriptor(const char* from, const char* to) - : Descriptor_(libiconv_open(to, from)) - , From_(from) - , To_(to) -{ - if (!Invalid()) { - int temp = 1; - - libiconvctl(Descriptor_, ICONV_SET_DISCARD_ILSEQ, &temp); - } -} - -TDescriptor::~TDescriptor() { - if (!Invalid()) { - libiconv_close(Descriptor_); - } -} - -size_t NICONVPrivate::RecodeImpl(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { - Y_ASSERT(!descriptor.Invalid()); - Y_ASSERT(in); - Y_ASSERT(out); - +#include "iconv.h" + +#include <contrib/libs/libiconv/iconv.h> + +using namespace NICONVPrivate; + +TDescriptor::TDescriptor(const char* from, const char* to) + : Descriptor_(libiconv_open(to, from)) + , From_(from) + , To_(to) +{ + if (!Invalid()) { + int temp = 1; + + libiconvctl(Descriptor_, ICONV_SET_DISCARD_ILSEQ, &temp); + } +} + +TDescriptor::~TDescriptor() { + if (!Invalid()) { + libiconv_close(Descriptor_); + } +} + +size_t NICONVPrivate::RecodeImpl(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { + Y_ASSERT(!descriptor.Invalid()); + Y_ASSERT(in); + Y_ASSERT(out); + char* inPtr = const_cast<char*>(in); - char* outPtr = out; - size_t inSizeMod = inSize; - size_t outSizeMod = outSize; - size_t res = libiconv(descriptor.Get(), &inPtr, &inSizeMod, &outPtr, &outSizeMod); - - read = inSize - inSizeMod; - written = outSize - outSizeMod; - - return res; -} - -void NICONVPrivate::DoRecode(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { - if (descriptor.Invalid()) { - ythrow yexception() << "Can not convert from " << descriptor.From() << " to " << descriptor.To(); - } - - size_t res = RecodeImpl(descriptor, in, out, inSize, outSize, read, written); - - if (res == static_cast<size_t>(-1)) { - switch (errno) { - case EILSEQ: - read = inSize; - break; - - case EINVAL: - read = inSize; - break; - - case E2BIG: - ythrow yexception() << "Iconv error: output buffer is too small"; - - default: - ythrow yexception() << "Unknown iconv error"; - } - } -} - -RECODE_RESULT NICONVPrivate::DoRecodeNoThrow(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { - if (descriptor.Invalid()) { - return RECODE_ERROR; - } - - size_t res = RecodeImpl(descriptor, in, out, inSize, outSize, read, written); - - if (res == static_cast<size_t>(-1)) { - switch (errno) { - case EILSEQ: - read = inSize; - break; - - case EINVAL: - read = inSize; - break; - - case E2BIG: - return RECODE_EOOUTPUT; - - default: - return RECODE_ERROR; - } - } - - return RECODE_OK; -} + char* outPtr = out; + size_t inSizeMod = inSize; + size_t outSizeMod = outSize; + size_t res = libiconv(descriptor.Get(), &inPtr, &inSizeMod, &outPtr, &outSizeMod); + + read = inSize - inSizeMod; + written = outSize - outSizeMod; + + return res; +} + +void NICONVPrivate::DoRecode(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { + if (descriptor.Invalid()) { + ythrow yexception() << "Can not convert from " << descriptor.From() << " to " << descriptor.To(); + } + + size_t res = RecodeImpl(descriptor, in, out, inSize, outSize, read, written); + + if (res == static_cast<size_t>(-1)) { + switch (errno) { + case EILSEQ: + read = inSize; + break; + + case EINVAL: + read = inSize; + break; + + case E2BIG: + ythrow yexception() << "Iconv error: output buffer is too small"; + + default: + ythrow yexception() << "Unknown iconv error"; + } + } +} + +RECODE_RESULT NICONVPrivate::DoRecodeNoThrow(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { + if (descriptor.Invalid()) { + return RECODE_ERROR; + } + + size_t res = RecodeImpl(descriptor, in, out, inSize, outSize, read, written); + + if (res == static_cast<size_t>(-1)) { + switch (errno) { + case EILSEQ: + read = inSize; + break; + + case EINVAL: + read = inSize; + break; + + case E2BIG: + return RECODE_EOOUTPUT; + + default: + return RECODE_ERROR; + } + } + + return RECODE_OK; +} diff --git a/library/cpp/charset/iconv.h b/library/cpp/charset/iconv.h index ac13539347..58188bb33d 100644 --- a/library/cpp/charset/iconv.h +++ b/library/cpp/charset/iconv.h @@ -10,66 +10,66 @@ namespace NICONVPrivate { inline const char* CharsetName(ECharset code) { return NameByCharset(code); } - + inline const char* CharsetName(const char* code) { return code; } - template <int size> + template <int size> inline const char* UnicodeNameBySize(); - template <> + template <> inline const char* UnicodeNameBySize<1>() { return "UTF-8"; } - - template <> + + template <> inline const char* UnicodeNameBySize<2>() { return "UTF-16LE"; } - - template <> + + template <> inline const char* UnicodeNameBySize<4>() { return "UCS-4LE"; } - template <class C> + template <class C> inline const char* UnicodeName() { return UnicodeNameBySize<sizeof(C)>(); } class TDescriptor : NNonCopyable::TNonCopyable { private: - void* Descriptor_; - const char* From_; - const char* To_; + void* Descriptor_; + const char* From_; + const char* To_; public: template <class TFrom, class TTo> inline TDescriptor(TFrom from, TTo to) - : TDescriptor(CharsetName(from), CharsetName(to)) + : TDescriptor(CharsetName(from), CharsetName(to)) { } - TDescriptor(const char* from, const char* to); - - ~TDescriptor(); - - inline void* Get() const { - return Descriptor_; - } - - inline bool Invalid() const { - return Descriptor_ == (void*)(-1); + TDescriptor(const char* from, const char* to); + + ~TDescriptor(); + + inline void* Get() const { + return Descriptor_; } - inline const char* From() const noexcept { - return From_; + inline bool Invalid() const { + return Descriptor_ == (void*)(-1); } - inline const char* To() const noexcept { - return To_; + inline const char* From() const noexcept { + return From_; } + + inline const char* To() const noexcept { + return To_; + } }; template <class TFrom, class TTo> @@ -79,43 +79,43 @@ namespace NICONVPrivate { return !descriptor.Invalid(); } - size_t RecodeImpl(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written); - void DoRecode(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written); + size_t RecodeImpl(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written); + void DoRecode(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written); template <class TFrom, class TTo> inline void Recode(TFrom from, TTo to, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { TDescriptor descriptor(from, to); - DoRecode(descriptor, in, out, inSize, outSize, read, written); + DoRecode(descriptor, in, out, inSize, outSize, read, written); } - template <class TCharType> - inline void RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { + template <class TCharType> + inline void RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { const size_t charSize = sizeof(TCharType); Recode(from, UnicodeName<TCharType>(), in, reinterpret_cast<char*>(out), inSize, outSize * charSize, read, written); written /= charSize; } - template <class TCharType> - inline void RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { + template <class TCharType> + inline void RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { const size_t charSize = sizeof(TCharType); Recode(UnicodeName<TCharType>(), to, reinterpret_cast<const char*>(in), out, inSize * charSize, outSize, read, written); read /= charSize; } - RECODE_RESULT DoRecodeNoThrow(const TDescriptor& d, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written); - + RECODE_RESULT DoRecodeNoThrow(const TDescriptor& d, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written); + template <class TFrom, class TTo> inline RECODE_RESULT RecodeNoThrow(TFrom from, TTo to, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { TDescriptor descriptor(from, to); - return DoRecodeNoThrow(descriptor, in, out, inSize, outSize, read, written); + return DoRecodeNoThrow(descriptor, in, out, inSize, outSize, read, written); } - template <class TCharType> - inline RECODE_RESULT RecodeToUnicodeNoThrow(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { + template <class TCharType> + inline RECODE_RESULT RecodeToUnicodeNoThrow(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { const size_t charSize = sizeof(TCharType); RECODE_RESULT res = RecodeNoThrow(from, UnicodeName<TCharType>(), in, reinterpret_cast<char*>(out), inSize, outSize * charSize, read, written); @@ -124,8 +124,8 @@ namespace NICONVPrivate { return res; } - template <class TCharType> - inline RECODE_RESULT RecodeFromUnicodeNoThrow(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { + template <class TCharType> + inline RECODE_RESULT RecodeFromUnicodeNoThrow(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) { const size_t charSize = sizeof(TCharType); RECODE_RESULT res = RecodeNoThrow(UnicodeName<TCharType>(), to, reinterpret_cast<const char*>(in), out, inSize * charSize, outSize, read, written); @@ -133,4 +133,4 @@ namespace NICONVPrivate { return res; } -} +} diff --git a/library/cpp/charset/iconv_ut.cpp b/library/cpp/charset/iconv_ut.cpp index e8c56f6d49..f79d76f7c2 100644 --- a/library/cpp/charset/iconv_ut.cpp +++ b/library/cpp/charset/iconv_ut.cpp @@ -1,7 +1,7 @@ -#include "wide.h" -#include "recyr.hh" -#include "codepage.h" - +#include "wide.h" +#include "recyr.hh" +#include "codepage.h" + #include <library/cpp/testing/unittest/registar.h> static void TestIconv(const TString& utf8, const TString& other, ECharset enc) { @@ -38,30 +38,30 @@ static void TestIconv(const TString& utf8, const TString& other, ECharset enc) { UNIT_ASSERT(temp == other); } -class TIconvTest: public TTestBase { - static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) { - size_t sSize = strlen(str); - size_t wSize = sSize * 2; - TArrayHolder<wchar16> w(new wchar16[wSize]); - - size_t read = 0; - size_t written = 0; - NICONVPrivate::RecodeToUnicode(CODES_UTF8, str, w.Get(), sSize, wSize, read, written); - UNIT_ASSERT(read == sSize); - UNIT_ASSERT(written == wideSize); - UNIT_ASSERT(!memcmp(w.Get(), wide, wideSize)); - - TArrayHolder<char> s(new char[sSize]); - NICONVPrivate::RecodeFromUnicode(CODES_UTF8, w.Get(), s.Get(), wideSize, sSize, read, written); - UNIT_ASSERT(read == wideSize); - UNIT_ASSERT(written == sSize); - UNIT_ASSERT(!memcmp(s.Get(), str, sSize)); - } +class TIconvTest: public TTestBase { + static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) { + size_t sSize = strlen(str); + size_t wSize = sSize * 2; + TArrayHolder<wchar16> w(new wchar16[wSize]); + + size_t read = 0; + size_t written = 0; + NICONVPrivate::RecodeToUnicode(CODES_UTF8, str, w.Get(), sSize, wSize, read, written); + UNIT_ASSERT(read == sSize); + UNIT_ASSERT(written == wideSize); + UNIT_ASSERT(!memcmp(w.Get(), wide, wideSize)); + + TArrayHolder<char> s(new char[sSize]); + NICONVPrivate::RecodeFromUnicode(CODES_UTF8, w.Get(), s.Get(), wideSize, sSize, read, written); + UNIT_ASSERT(read == wideSize); + UNIT_ASSERT(written == sSize); + UNIT_ASSERT(!memcmp(s.Get(), str, sSize)); + } private: UNIT_TEST_SUITE(TIconvTest); - UNIT_TEST(TestBig5); - UNIT_TEST(TestSurrogatePairs); + UNIT_TEST(TestBig5); + UNIT_TEST(TestSurrogatePairs); UNIT_TEST_SUITE_END(); public: @@ -75,11 +75,11 @@ public: void TestSurrogatePairs() { const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba"; - wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A}; + wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A}; TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy)); const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n"; - wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'}; + wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'}; TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2)); } }; diff --git a/library/cpp/charset/recyr.hh b/library/cpp/charset/recyr.hh index 5ec8734bcf..c5e752616e 100644 --- a/library/cpp/charset/recyr.hh +++ b/library/cpp/charset/recyr.hh @@ -14,39 +14,39 @@ /////////////////////////////////////////////////////////////////////////////////////// // input buf -> output buf // /////////////////////////////////////////////////////////////////////////////////////// -template <class TCharType> -inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) { - static_assert(sizeof(TCharType) > 1, "expect wide character type"); - +template <class TCharType> +inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) { + static_assert(sizeof(TCharType) > 1, "expect wide character type"); + return NCodepagePrivate::_recodeToUnicode(from, in, out, inSize, outSize, inRead, outWritten); } -template <class TCharType> -inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) { - static_assert(sizeof(TCharType) > 1, "expect wide character type"); - +template <class TCharType> +inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) { + static_assert(sizeof(TCharType) > 1, "expect wide character type"); + return NCodepagePrivate::_recodeFromUnicode(to, in, out, inSize, outSize, inRead, outWritten); } -inline RECODE_RESULT RecodeFromUnicode(ECharset to, wchar32 rune, char* out, size_t outSize, size_t& outWritten) { +inline RECODE_RESULT RecodeFromUnicode(ECharset to, wchar32 rune, char* out, size_t outSize, size_t& outWritten) { return NCodepagePrivate::_recodeFromUnicode(to, rune, out, outSize, outWritten); } -template <class TCharType> +template <class TCharType> inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize) { size_t inRead = 0; size_t outWritten = 0; return RecodeToUnicode(from, in, out, inSize, outSize, inRead, outWritten); } -template <class TCharType> +template <class TCharType> inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize) { size_t inRead = 0; size_t outWritten = 0; return RecodeFromUnicode(to, in, out, inSize, outSize, inRead, outWritten); } -inline RECODE_RESULT RecodeFromUnicode(ECharset theEncoding, const wchar16* chars, size_t length, +inline RECODE_RESULT RecodeFromUnicode(ECharset theEncoding, const wchar16* chars, size_t length, char* bytes, size_t size, size_t* read = nullptr, size_t* written = nullptr) { size_t w = 0, r = 0; RECODE_RESULT rc = ::RecodeFromUnicode(theEncoding, chars, bytes, length, size, r, w); @@ -57,7 +57,7 @@ inline RECODE_RESULT RecodeFromUnicode(ECharset theEncoding, const wchar16* char return rc; } -inline RECODE_RESULT Recode(ECharset from, ECharset to, const char* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) { +inline RECODE_RESULT Recode(ECharset from, ECharset to, const char* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) { inRead = 0; outWritten = 0; @@ -125,7 +125,7 @@ inline bool Recode(ECharset from, ECharset to, const TStringBuf& in, TString& ou Y_ENSURE(RECODE_OK == res, "Recode failed. "); if (outWritten > outSize) ythrow yexception() << "Recode overrun the buffer: size=" - << outSize << " need=" << outWritten; + << outSize << " need=" << outWritten; out.remove(outWritten); return true; @@ -149,10 +149,10 @@ inline TString RecodeToHTMLEntities(ECharset from, const TString& in) { RECODE_RESULT res; size_t outWritten, inRead; TString out; - out.resize(in.length() * (4 + 4)); + out.resize(in.length() * (4 + 4)); res = NCodepagePrivate::_recodeToHTMLEntities(from, in.c_str(), out.begin(), in.length(), out.length(), inRead, outWritten); - if (res == RECODE_EOOUTPUT) { //input contains many 8-byte characters? - out.resize(in.length() * (4 + 8)); + if (res == RECODE_EOOUTPUT) { //input contains many 8-byte characters? + out.resize(in.length() * (4 + 8)); res = NCodepagePrivate::_recodeToHTMLEntities(from, in.c_str(), out.begin(), in.length(), out.length(), inRead, outWritten); } if (res != RECODE_OK) { diff --git a/library/cpp/charset/recyr_int.hh b/library/cpp/charset/recyr_int.hh index 353af53305..c61822037f 100644 --- a/library/cpp/charset/recyr_int.hh +++ b/library/cpp/charset/recyr_int.hh @@ -5,332 +5,332 @@ #include <util/generic/ptr.h> #include <util/generic/string.h> #include <util/system/defaults.h> - + #include "codepage.h" #include "doccodes.h" #include "iconv.h" #include "wide.h" namespace NCodepagePrivate { - inline RECODE_RESULT _recodeCopy(const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - in_readed = in_size; - RECODE_RESULT res = RECODE_OK; - if (in_readed > out_size) { - res = RECODE_EOOUTPUT; - in_readed = out_size; - } - if (in != out) - memcpy(out, in, in_readed); - out_writed = in_readed; - return res; + inline RECODE_RESULT _recodeCopy(const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + in_readed = in_size; + RECODE_RESULT res = RECODE_OK; + if (in_readed > out_size) { + res = RECODE_EOOUTPUT; + in_readed = out_size; + } + if (in != out) + memcpy(out, in, in_readed); + out_writed = in_readed; + return res; } - inline RECODE_RESULT _recodeToUTF8(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - if (From == CODES_UTF8) - return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed); - const CodePage* cp = CodePageByCharset(From); + inline RECODE_RESULT _recodeToUTF8(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + if (From == CODES_UTF8) + return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed); + const CodePage* cp = CodePageByCharset(From); - const unsigned char* in_start = (const unsigned char*)in; - const unsigned char* in_end = in_start + in_size; - const unsigned char* out_start = (unsigned char*)out; - const unsigned char* out_end = out_start + out_size; + const unsigned char* in_start = (const unsigned char*)in; + const unsigned char* in_end = in_start + in_size; + const unsigned char* out_start = (unsigned char*)out; + const unsigned char* out_end = out_start + out_size; - size_t rune_len; - RECODE_RESULT res = RECODE_OK; - while ((unsigned char*)in < in_end && res == RECODE_OK) { + size_t rune_len; + RECODE_RESULT res = RECODE_OK; + while ((unsigned char*)in < in_end && res == RECODE_OK) { res = SafeWriteUTF8Char(cp->unicode[(unsigned char)(*in++)], rune_len, (unsigned char*)out, out_end); - out += rune_len; - } - in_readed = (unsigned char*)in - in_start; - out_writed = (unsigned char*)out - out_start; - return res; + out += rune_len; + } + in_readed = (unsigned char*)in - in_start; + out_writed = (unsigned char*)out - out_start; + return res; } - inline RECODE_RESULT _recodeFromUTF8(ECharset to, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - if (to == CODES_UTF8) - return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed); + inline RECODE_RESULT _recodeFromUTF8(ECharset to, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + if (to == CODES_UTF8) + return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed); Y_ASSERT(CODES_UNKNOWN < to && to < CODES_MAX); - const Encoder* enc = &EncoderByCharset(to); + const Encoder* enc = &EncoderByCharset(to); - const unsigned char* in_start = (const unsigned char*)in; - const unsigned char* in_end = in_start + in_size; - const unsigned char* out_start = (unsigned char*)out; - const unsigned char* out_end = out_start + out_size; + const unsigned char* in_start = (const unsigned char*)in; + const unsigned char* in_end = in_start + in_size; + const unsigned char* out_start = (unsigned char*)out; + const unsigned char* out_end = out_start + out_size; - wchar32 rune; - size_t rune_len; - RECODE_RESULT res = RECODE_OK; - while ((const unsigned char*)in < in_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL)) { + wchar32 rune; + size_t rune_len; + RECODE_RESULT res = RECODE_OK; + while ((const unsigned char*)in < in_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL)) { res = SafeReadUTF8Char(rune, rune_len, (const unsigned char*)in, in_end); - if (res == RECODE_BROKENSYMBOL) - rune_len = 1; - if (res != RECODE_EOINPUT) - *out++ = enc->Tr(rune); - in += rune_len; - if (res == RECODE_OK && (const unsigned char*)in < in_end && (unsigned char*)out >= out_end) - res = RECODE_EOOUTPUT; - } - in_readed = (unsigned char*)in - in_start; - out_writed = (unsigned char*)out - out_start; - return res; + if (res == RECODE_BROKENSYMBOL) + rune_len = 1; + if (res != RECODE_EOINPUT) + *out++ = enc->Tr(rune); + in += rune_len; + if (res == RECODE_OK && (const unsigned char*)in < in_end && (unsigned char*)out >= out_end) + res = RECODE_EOOUTPUT; + } + in_readed = (unsigned char*)in - in_start; + out_writed = (unsigned char*)out - out_start; + return res; } - inline RECODE_RESULT _recodeToYandex(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - if (From == CODES_YANDEX) - return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed); - if (From == CODES_UTF8) - return _recodeFromUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed); - in_readed = (out_size > in_size) ? in_size : out_size; - const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_to_yandex[From]; - rcdr.Tr(in, out, in_readed); - out_writed = in_readed; - if (out_size < in_size) - return RECODE_EOOUTPUT; - return RECODE_OK; - } - inline RECODE_RESULT _recodeFromYandex(ECharset To, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - if (To == CODES_YANDEX) - return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed); - if (To == CODES_UTF8) - return _recodeToUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed); - in_readed = (out_size > in_size) ? in_size : out_size; - const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_from_yandex[To]; - rcdr.Tr(in, out, in_readed); - out_writed = in_readed; - if (out_size < in_size) - return RECODE_EOOUTPUT; - return RECODE_OK; - } - - template <class TCharType> - inline RECODE_RESULT _recodeUTF8ToUnicode(const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - const unsigned char* inp = (const unsigned char*)in; - const unsigned char* in_end = inp + in_size; - TCharType* outp = out; - const TCharType* out_end = outp + out_size; - size_t rune_len; - wchar32 rune; - RECODE_RESULT res = RECODE_OK; - while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp < in_end && outp < out_end) { + inline RECODE_RESULT _recodeToYandex(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + if (From == CODES_YANDEX) + return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed); + if (From == CODES_UTF8) + return _recodeFromUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed); + in_readed = (out_size > in_size) ? in_size : out_size; + const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_to_yandex[From]; + rcdr.Tr(in, out, in_readed); + out_writed = in_readed; + if (out_size < in_size) + return RECODE_EOOUTPUT; + return RECODE_OK; + } + inline RECODE_RESULT _recodeFromYandex(ECharset To, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + if (To == CODES_YANDEX) + return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed); + if (To == CODES_UTF8) + return _recodeToUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed); + in_readed = (out_size > in_size) ? in_size : out_size; + const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_from_yandex[To]; + rcdr.Tr(in, out, in_readed); + out_writed = in_readed; + if (out_size < in_size) + return RECODE_EOOUTPUT; + return RECODE_OK; + } + + template <class TCharType> + inline RECODE_RESULT _recodeUTF8ToUnicode(const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + const unsigned char* inp = (const unsigned char*)in; + const unsigned char* in_end = inp + in_size; + TCharType* outp = out; + const TCharType* out_end = outp + out_size; + size_t rune_len; + wchar32 rune; + RECODE_RESULT res = RECODE_OK; + while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp < in_end && outp < out_end) { res = SafeReadUTF8Char(rune, rune_len, inp, in_end); - if (res == RECODE_BROKENSYMBOL) - rune_len = 1; - if (res == RECODE_OK || res == RECODE_BROKENSYMBOL) { - if (!WriteSymbol(rune, outp, out_end)) { - break; - } - inp += rune_len; + if (res == RECODE_BROKENSYMBOL) + rune_len = 1; + if (res == RECODE_OK || res == RECODE_BROKENSYMBOL) { + if (!WriteSymbol(rune, outp, out_end)) { + break; + } + inp += rune_len; } } - in_readed = inp - (const unsigned char*)in; - out_writed = outp - out; - - if ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && in_readed != in_size) - return RECODE_EOOUTPUT; - - return res; - } - - template <class TCharType> - inline RECODE_RESULT _recodeSBToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - const CodePage* cp = CodePageByCharset(From); - const unsigned char* inp = (const unsigned char*)in; - const unsigned char* in_end = inp + in_size; - TCharType* outp = out; - const TCharType* out_end = outp + out_size; - while (inp < in_end && outp < out_end) - *outp++ = static_cast<TCharType>(cp->unicode[*inp++]); - in_readed = inp - (const unsigned char*)in; - out_writed = outp - out; - if (in_readed != in_size) - return RECODE_EOOUTPUT; - return RECODE_OK; - } - - template <class TCharType> - inline RECODE_RESULT _recodeUnicodeToUTF8Impl(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - const TCharType* inp = in; - const TCharType* in_end = in + in_size; - unsigned char* outp = (unsigned char*)out; - const unsigned char* out_end = outp + out_size; - size_t rune_len; - wchar32 rune; - RECODE_RESULT res = RECODE_OK; - - while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp != in_end) { - rune = ReadSymbolAndAdvance(inp, in_end); + in_readed = inp - (const unsigned char*)in; + out_writed = outp - out; + + if ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && in_readed != in_size) + return RECODE_EOOUTPUT; + + return res; + } + + template <class TCharType> + inline RECODE_RESULT _recodeSBToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + const CodePage* cp = CodePageByCharset(From); + const unsigned char* inp = (const unsigned char*)in; + const unsigned char* in_end = inp + in_size; + TCharType* outp = out; + const TCharType* out_end = outp + out_size; + while (inp < in_end && outp < out_end) + *outp++ = static_cast<TCharType>(cp->unicode[*inp++]); + in_readed = inp - (const unsigned char*)in; + out_writed = outp - out; + if (in_readed != in_size) + return RECODE_EOOUTPUT; + return RECODE_OK; + } + + template <class TCharType> + inline RECODE_RESULT _recodeUnicodeToUTF8Impl(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + const TCharType* inp = in; + const TCharType* in_end = in + in_size; + unsigned char* outp = (unsigned char*)out; + const unsigned char* out_end = outp + out_size; + size_t rune_len; + wchar32 rune; + RECODE_RESULT res = RECODE_OK; + + while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp != in_end) { + rune = ReadSymbolAndAdvance(inp, in_end); res = SafeWriteUTF8Char(rune, rune_len, outp, out_end); - if (outp >= out_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL)) - res = RECODE_EOOUTPUT; - outp += rune_len; - } - in_readed = inp - in; - out_writed = outp - (const unsigned char*)out; - return res; + if (outp >= out_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL)) + res = RECODE_EOOUTPUT; + outp += rune_len; + } + in_readed = inp - in; + out_writed = outp - (const unsigned char*)out; + return res; } - inline RECODE_RESULT _recodeUnicodeToUTF8(wchar32 rune, char* out, size_t out_size, size_t& nwritten) { + inline RECODE_RESULT _recodeUnicodeToUTF8(wchar32 rune, char* out, size_t out_size, size_t& nwritten) { return SafeWriteUTF8Char(rune, nwritten, (unsigned char*)out, out_size); - } + } - template <class TCharType, int Size = sizeof(TCharType)> - struct TCharTypeSwitch; + template <class TCharType, int Size = sizeof(TCharType)> + struct TCharTypeSwitch; - template <class TCharType> - struct TCharTypeSwitch<TCharType, 2> { + template <class TCharType> + struct TCharTypeSwitch<TCharType, 2> { using TRealCharType = wchar16; - }; + }; - template <class TCharType> - struct TCharTypeSwitch<TCharType, 4> { + template <class TCharType> + struct TCharTypeSwitch<TCharType, 4> { using TRealCharType = wchar32; - }; - - template <class TCharType> - inline RECODE_RESULT _recodeUnicodeToUTF8(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - static_assert(sizeof(TCharType) > 1, "expect some wide type"); + }; + template <class TCharType> + inline RECODE_RESULT _recodeUnicodeToUTF8(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + static_assert(sizeof(TCharType) > 1, "expect some wide type"); + using TRealCharType = typename TCharTypeSwitch<TCharType>::TRealCharType; - return _recodeUnicodeToUTF8Impl(reinterpret_cast<const TRealCharType*>(in), out, in_size, out_size, in_readed, out_writed); - } - - template <class TCharType> - inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - const TCharType* inp = in; - const TCharType* in_end = in + in_size; - const char* out_begin = out; - const char* out_end = out + out_size; - - const Encoder* enc = &EncoderByCharset(To); - while (inp != in_end && out != out_end) { - *out++ = enc->Tr(ReadSymbolAndAdvance(inp, in_end)); - } - - in_readed = inp - in; - out_writed = out - out_begin; - - if (in_readed != in_size) - return RECODE_EOOUTPUT; - - return RECODE_OK; - } - - inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) { - if (0 == out_size) - return RECODE_EOOUTPUT; - *out = EncoderByCharset(To).Tr(rune); - nwritten = 1; - return RECODE_OK; - } - - inline RECODE_RESULT _rune2hex(wchar32 in, char* out, size_t out_size, size_t& out_writed) { - static const char hex_digs[] = "0123456789ABCDEF"; - out_writed = 0; - RECODE_RESULT res = RECODE_OK; - for (int i = 7; i >= 0; i--) { - unsigned char h = (unsigned char)(in >> (i * 4) & 0x0F); - if (h || i == 0) { - if (out_writed + 1 >= out_size) { - res = RECODE_EOOUTPUT; - break; - } - out[out_writed++] = hex_digs[h]; + return _recodeUnicodeToUTF8Impl(reinterpret_cast<const TRealCharType*>(in), out, in_size, out_size, in_readed, out_writed); + } + + template <class TCharType> + inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + const TCharType* inp = in; + const TCharType* in_end = in + in_size; + const char* out_begin = out; + const char* out_end = out + out_size; + + const Encoder* enc = &EncoderByCharset(To); + while (inp != in_end && out != out_end) { + *out++ = enc->Tr(ReadSymbolAndAdvance(inp, in_end)); + } + + in_readed = inp - in; + out_writed = out - out_begin; + + if (in_readed != in_size) + return RECODE_EOOUTPUT; + + return RECODE_OK; + } + + inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) { + if (0 == out_size) + return RECODE_EOOUTPUT; + *out = EncoderByCharset(To).Tr(rune); + nwritten = 1; + return RECODE_OK; + } + + inline RECODE_RESULT _rune2hex(wchar32 in, char* out, size_t out_size, size_t& out_writed) { + static const char hex_digs[] = "0123456789ABCDEF"; + out_writed = 0; + RECODE_RESULT res = RECODE_OK; + for (int i = 7; i >= 0; i--) { + unsigned char h = (unsigned char)(in >> (i * 4) & 0x0F); + if (h || i == 0) { + if (out_writed + 1 >= out_size) { + res = RECODE_EOOUTPUT; + break; + } + out[out_writed++] = hex_digs[h]; } } - return res; + return res; } - inline RECODE_RESULT _recodeUnicodeToHTMLEntities(const wchar32* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - const wchar32* in_end = in + in_size; - const char* out_beg = out; - const wchar32* in_beg = in; - RECODE_RESULT res = RECODE_OK; - - const char* out_end = out + out_size - 1; - while (in < in_end && out < out_end) { - if (*in < 0x80 && *in != '<' && *in != '&' && *in != '>') { //ascii - *out++ = char(*in & 0x00FF); - } else { //entity - char* ent = out; - size_t ent_writed; - if (ent > out_end - 6) { - res = RECODE_EOOUTPUT; - break; - } - memcpy(ent, "&#x", 3); - ent += 3; - res = _rune2hex(*in, ent, out_end - 1 - ent, ent_writed); - if (res != RECODE_OK) - break; - ent += ent_writed; - *ent++ = ';'; - out = ent; + inline RECODE_RESULT _recodeUnicodeToHTMLEntities(const wchar32* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + const wchar32* in_end = in + in_size; + const char* out_beg = out; + const wchar32* in_beg = in; + RECODE_RESULT res = RECODE_OK; + + const char* out_end = out + out_size - 1; + while (in < in_end && out < out_end) { + if (*in < 0x80 && *in != '<' && *in != '&' && *in != '>') { //ascii + *out++ = char(*in & 0x00FF); + } else { //entity + char* ent = out; + size_t ent_writed; + if (ent > out_end - 6) { + res = RECODE_EOOUTPUT; + break; + } + memcpy(ent, "&#x", 3); + ent += 3; + res = _rune2hex(*in, ent, out_end - 1 - ent, ent_writed); + if (res != RECODE_OK) + break; + ent += ent_writed; + *ent++ = ';'; + out = ent; } - in++; + in++; } - *out++ = '\x00'; - out_writed = out - out_beg; - in_readed = in - in_beg; - return res; + *out++ = '\x00'; + out_writed = out - out_beg; + in_readed = in - in_beg; + return res; } - template <class TCharType> - inline RECODE_RESULT _recodeToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - if (!ValidCodepage(From)) - return RECODE_ERROR; + template <class TCharType> + inline RECODE_RESULT _recodeToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + if (!ValidCodepage(From)) + return RECODE_ERROR; - if (!NCodepagePrivate::NativeCodepage(From)) - return NICONVPrivate::RecodeToUnicodeNoThrow(From, in, out, in_size, out_size, in_readed, out_writed); + if (!NCodepagePrivate::NativeCodepage(From)) + return NICONVPrivate::RecodeToUnicodeNoThrow(From, in, out, in_size, out_size, in_readed, out_writed); - if (From == CODES_UTF8) - return _recodeUTF8ToUnicode(in, out, in_size, out_size, in_readed, out_writed); + if (From == CODES_UTF8) + return _recodeUTF8ToUnicode(in, out, in_size, out_size, in_readed, out_writed); - return _recodeSBToUnicode(From, in, out, in_size, out_size, in_readed, out_writed); - } + return _recodeSBToUnicode(From, in, out, in_size, out_size, in_readed, out_writed); + } - template <class TCharType> - inline RECODE_RESULT _recodeFromUnicode(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - if (!ValidCodepage(To)) - return RECODE_ERROR; + template <class TCharType> + inline RECODE_RESULT _recodeFromUnicode(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + if (!ValidCodepage(To)) + return RECODE_ERROR; - if (!NCodepagePrivate::NativeCodepage(To)) - return NICONVPrivate::RecodeFromUnicodeNoThrow(To, in, out, in_size, out_size, in_readed, out_writed); + if (!NCodepagePrivate::NativeCodepage(To)) + return NICONVPrivate::RecodeFromUnicodeNoThrow(To, in, out, in_size, out_size, in_readed, out_writed); - if (To == CODES_UTF8) - return NCodepagePrivate::_recodeUnicodeToUTF8(in, out, in_size, out_size, in_readed, out_writed); + if (To == CODES_UTF8) + return NCodepagePrivate::_recodeUnicodeToUTF8(in, out, in_size, out_size, in_readed, out_writed); - return NCodepagePrivate::_recodeUnicodeToSB(To, in, out, in_size, out_size, in_readed, out_writed); + return NCodepagePrivate::_recodeUnicodeToSB(To, in, out, in_size, out_size, in_readed, out_writed); } - inline RECODE_RESULT _recodeFromUnicode(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) { - if (!ValidCodepage(To)) - return RECODE_ERROR; + inline RECODE_RESULT _recodeFromUnicode(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) { + if (!ValidCodepage(To)) + return RECODE_ERROR; - if (!NCodepagePrivate::NativeCodepage(To)) { - size_t nread = 0; - return NICONVPrivate::RecodeFromUnicodeNoThrow(To, &rune, out, 1, out_size, nread, nwritten); - } + if (!NCodepagePrivate::NativeCodepage(To)) { + size_t nread = 0; + return NICONVPrivate::RecodeFromUnicodeNoThrow(To, &rune, out, 1, out_size, nread, nwritten); + } - if (To == CODES_UTF8) - return NCodepagePrivate::_recodeUnicodeToUTF8(rune, out, out_size, nwritten); + if (To == CODES_UTF8) + return NCodepagePrivate::_recodeUnicodeToUTF8(rune, out, out_size, nwritten); - return NCodepagePrivate::_recodeUnicodeToSB(To, rune, out, out_size, nwritten); - } + return NCodepagePrivate::_recodeUnicodeToSB(To, rune, out, out_size, nwritten); + } - inline RECODE_RESULT _recodeToHTMLEntities(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { - TArrayHolder<wchar32> bufHolder(new wchar32[in_size]); - wchar32* buf = bufHolder.Get(); - size_t unicode_size; - RECODE_RESULT res1, res2; + inline RECODE_RESULT _recodeToHTMLEntities(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { + TArrayHolder<wchar32> bufHolder(new wchar32[in_size]); + wchar32* buf = bufHolder.Get(); + size_t unicode_size; + RECODE_RESULT res1, res2; - //first pass - to unicode - res1 = _recodeToUnicode(From, in, buf, in_size, in_size, in_readed, unicode_size); + //first pass - to unicode + res1 = _recodeToUnicode(From, in, buf, in_size, in_size, in_readed, unicode_size); - //second pass - to entities - res2 = _recodeUnicodeToHTMLEntities(buf, out, in_size, out_size, in_readed, out_writed); + //second pass - to entities + res2 = _recodeUnicodeToHTMLEntities(buf, out, in_size, out_size, in_readed, out_writed); - return (res2 != RECODE_OK) ? res2 : res1; - } + return (res2 != RECODE_OK) ? res2 : res1; + } -} +} diff --git a/library/cpp/charset/wide.h b/library/cpp/charset/wide.h index 32d30e849e..b7a391f0a5 100644 --- a/library/cpp/charset/wide.h +++ b/library/cpp/charset/wide.h @@ -1,15 +1,15 @@ #pragma once #include "codepage.h" -#include "iconv.h" - +#include "iconv.h" + #include <util/charset/recode_result.h> #include <util/charset/unidata.h> #include <util/charset/utf8.h> #include <util/charset/wide.h> #include <util/generic/string.h> #include <util/generic/algorithm.h> -#include <util/generic/yexception.h> +#include <util/generic/yexception.h> #include <util/memory/tempbuf.h> #include <util/system/yassert.h> @@ -19,7 +19,7 @@ template <typename TCharType> inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) { Y_ASSERT(SingleByteCodepage(enc)); - + const char* start = dest; const Encoder* const encoder = &EncoderByCharset(enc); @@ -114,7 +114,7 @@ namespace NDetail { return RecodeMultiByteChar(src, dst, encoding); } - } + } template <typename TCharFrom> struct TRecodeTraits; @@ -124,8 +124,8 @@ namespace NDetail { using TCharTo = wchar16; using TStringBufTo = TWtringBuf; using TStringTo = TUtf16String; - enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case - // Here an unicode character can be converted up to 4 bytes of UTF8 + enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case + // Here an unicode character can be converted up to 4 bytes of UTF8 }; template <> @@ -133,7 +133,7 @@ namespace NDetail { using TCharTo = char; using TStringBufTo = TStringBuf; using TStringTo = TString; - enum { ReserveSize = 2 }; // possible surrogate pairs ? + enum { ReserveSize = 2 }; // possible surrogate pairs ? }; // Operations with destination buffer where recoded string will be written @@ -203,7 +203,7 @@ namespace NDetail { Recode<TCharFrom>(src, res, encoding); return res; } -} +} // Write result into @dst. Return string-buffer pointing to re-coded content of @dst. @@ -291,7 +291,7 @@ inline TString WideToChar(const TWtringBuf w, ECharset enc) { inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) { return CharToWide<false>(s.data(), s.size(), enc); } - + template <bool robust> inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) { return CharToWide<robust>(s.data(), s.size(), enc); diff --git a/library/cpp/charset/wide_ut.cpp b/library/cpp/charset/wide_ut.cpp index 78947d51ba..fc727fb1b4 100644 --- a/library/cpp/charset/wide_ut.cpp +++ b/library/cpp/charset/wide_ut.cpp @@ -1,14 +1,14 @@ -#include "wide.h" -#include "codepage.h" +#include "wide.h" +#include "codepage.h" #include "recyr.hh" - + #include <library/cpp/testing/unittest/registar.h> - + #include <util/charset/utf8.h> -#include <util/digest/numeric.h> +#include <util/digest/numeric.h> #include <util/generic/hash_set.h> -#include <algorithm> +#include <algorithm> namespace { //! three UTF8 encoded russian letters (A, B, V) @@ -21,7 +21,7 @@ namespace { 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, - 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x00}; + 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x00}; const char utf8CyrillicAlphabet[] = "\xd0\x90\xd0\x91\xd0\x92\xd0\x93\xd0\x94\xd0\x95\xd0\x96\xd0\x97" "\xd0\x98\xd0\x99\xd0\x9a\xd0\x9b\xd0\x9c\xd0\x9d\xd0\x9e\xd0\x9f" @@ -34,7 +34,7 @@ namespace { TString CreateYandexText() { const int len = 256; - char text[len] = {0}; + char text[len] = {0}; for (int i = 0; i < len; ++i) { text[i] = static_cast<char>(i); } @@ -61,7 +61,7 @@ namespace { for (int i = 0; i < len; ++i) { if (i <= 0x7F) { // ASCII characters without 0x7 and 0x1B text[i] = static_cast<wchar16>(i); - } else if (i >= 0xC0 && i <= 0xFF) { // russian characters (without YO and yo) + } else if (i >= 0xC0 && i <= 0xFF) { // russian characters (without YO and yo) text[i] = static_cast<wchar16>(i + 0x0350); // 0x0410 - 0x044F } } @@ -94,27 +94,27 @@ namespace { '\xd0', '\xb7', '\xd0', '\xb8', '\xd0', '\xb9', '\xd0', '\xba', '\xd0', '\xbb', '\xd0', '\xbc', '\xd0', '\xbd', '\xd0', '\xbe', '\xd0', '\xbf', '\xd1', '\x80', '\xd1', '\x81', '\xd1', '\x82', '\xd1', '\x83', '\xd1', '\x84', '\xd1', '\x85', '\xd1', '\x86', '\xd1', '\x87', '\xd1', '\x88', '\xd1', '\x89', '\xd1', '\x8a', '\xd1', '\x8b', '\xd1', '\x8c', '\xd1', '\x8d', '\xd1', '\x8e', - '\xd1', '\x8f'}; + '\xd1', '\x8f'}; return TString(text, Y_ARRAY_SIZE(text)); } //! use this function to dump UTF8 text into a file in case of any changes - // void DumpUTF8Text() { + // void DumpUTF8Text() { // TString s = WideToUTF8(UnicodeText); - // std::ofstream f("utf8.txt"); - // f << std::hex; - // for (int i = 0; i < (int)s.size(); ++i) { - // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", "; - // if ((i + 1) % 16 == 0) - // f << std::endl; - // } - // } + // std::ofstream f("utf8.txt"); + // f << std::hex; + // for (int i = 0; i < (int)s.size(); ++i) { + // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", "; + // if ((i + 1) % 16 == 0) + // f << std::endl; + // } + // } } //! this unit tests ensure validity of Yandex-Unicode and UTF8-Unicode conversions //! @note only those conversions are verified because they are used in index -class TConversionTest: public TTestBase { +class TConversionTest: public TTestBase { private: //! @note every of the text can have zeros in the middle const TString YandexText; @@ -123,13 +123,13 @@ private: private: UNIT_TEST_SUITE(TConversionTest); - UNIT_TEST(TestCharToWide); - UNIT_TEST(TestWideToChar); + UNIT_TEST(TestCharToWide); + UNIT_TEST(TestWideToChar); UNIT_TEST(TestYandexEncoding); - UNIT_TEST(TestRecodeIntoString); - UNIT_TEST(TestRecodeAppend); - UNIT_TEST(TestRecode); - UNIT_TEST(TestUnicodeLimit); + UNIT_TEST(TestRecodeIntoString); + UNIT_TEST(TestRecodeAppend); + UNIT_TEST(TestRecode); + UNIT_TEST(TestUnicodeLimit); UNIT_TEST_SUITE_END(); public: @@ -152,23 +152,23 @@ public: UNIT_TEST_SUITE_REGISTRATION(TConversionTest); // test conversions (char -> wchar32), (wchar32 -> char) and (wchar32 -> wchar16) -#define TEST_WCHAR32(sbuf, wbuf, enc) \ - do { \ - /* convert char to wchar32 */ \ - TTempBuf tmpbuf1(sbuf.length() * sizeof(wchar32)); \ +#define TEST_WCHAR32(sbuf, wbuf, enc) \ + do { \ + /* convert char to wchar32 */ \ + TTempBuf tmpbuf1(sbuf.length() * sizeof(wchar32)); \ const TBasicStringBuf<wchar32> s4buf = NDetail::NBaseOps::Recode<char>(sbuf, reinterpret_cast<wchar32*>(tmpbuf1.Data()), enc); \ - \ - /* convert wchar32 to char */ \ - TTempBuf tmpbuf2(s4buf.length() * 4); \ - const TStringBuf s1buf = NDetail::NBaseOps::Recode(s4buf, tmpbuf2.Data(), enc); \ - \ - /* convert wchar32 to wchar16 */ \ - const TUtf16String wstr2 = UTF32ToWide(s4buf.data(), s4buf.length()); \ - \ - /* test conversions */ \ - UNIT_ASSERT_VALUES_EQUAL(sbuf, s1buf); \ - UNIT_ASSERT_VALUES_EQUAL(wbuf, wstr2); \ - } while (false) + \ + /* convert wchar32 to char */ \ + TTempBuf tmpbuf2(s4buf.length() * 4); \ + const TStringBuf s1buf = NDetail::NBaseOps::Recode(s4buf, tmpbuf2.Data(), enc); \ + \ + /* convert wchar32 to wchar16 */ \ + const TUtf16String wstr2 = UTF32ToWide(s4buf.data(), s4buf.length()); \ + \ + /* test conversions */ \ + UNIT_ASSERT_VALUES_EQUAL(sbuf, s1buf); \ + UNIT_ASSERT_VALUES_EQUAL(wbuf, wstr2); \ + } while (false) void TConversionTest::TestCharToWide() { TUtf16String w = CharToWide(YandexText, CODES_YANDEX); @@ -210,7 +210,7 @@ void TConversionTest::TestYandexEncoding() { UNIT_ASSERT(w == wideCyrillicAlphabet); const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n"; - wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'}; + wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'}; TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2), CODES_UTF8); { @@ -232,7 +232,7 @@ void TConversionTest::TestRecodeIntoString() { TString sYandex(UnicodeText.size() * 4, 'x'); const char* sdata = sYandex.data(); TStringBuf sres = NDetail::Recode<wchar16>(UnicodeText, sYandex, CODES_YANDEX); - UNIT_ASSERT(sYandex == YandexText); // same content + UNIT_ASSERT(sYandex == YandexText); // same content UNIT_ASSERT(sYandex.data() == sdata); // reserved buffer reused UNIT_ASSERT(sYandex.data() == sres.data()); // same buffer UNIT_ASSERT(sYandex.size() == sres.size()); // same size @@ -242,7 +242,7 @@ void TConversionTest::TestRecodeIntoString() { sUnicode.reserve(YandexText.size() * 4); const wchar16* wdata = sUnicode.data(); TWtringBuf wres = NDetail::Recode<char>(YandexText, sUnicode, CODES_YANDEX); - UNIT_ASSERT(sUnicode == UnicodeText); // same content + UNIT_ASSERT(sUnicode == UnicodeText); // same content UNIT_ASSERT(sUnicode.data() == wdata); // reserved buffer reused UNIT_ASSERT(sUnicode.data() == wres.data()); // same buffer UNIT_ASSERT(sUnicode.size() == wres.size()); // same size @@ -250,8 +250,8 @@ void TConversionTest::TestRecodeIntoString() { TString sUtf8 = " "; size_t scap = sUtf8.capacity(); sres = NDetail::Recode<wchar16>(UnicodeText, sUtf8, CODES_UTF8); - UNIT_ASSERT(sUtf8 == UTF8Text); // same content - UNIT_ASSERT(sUtf8.capacity() > scap); // increased buffer capacity (supplied was too small) + UNIT_ASSERT(sUtf8 == UTF8Text); // same content + UNIT_ASSERT(sUtf8.capacity() > scap); // increased buffer capacity (supplied was too small) UNIT_ASSERT(sUtf8.data() == sres.data()); // same buffer UNIT_ASSERT(sUtf8.size() == sres.size()); // same size TEST_WCHAR32(sUtf8, UnicodeText, CODES_UTF8); @@ -260,7 +260,7 @@ void TConversionTest::TestRecodeIntoString() { wdata = sUnicode.data(); TUtf16String copy = sUnicode; // increase ref-counter wres = NDetail::Recode<char>(UTF8Text, sUnicode, CODES_UTF8); - UNIT_ASSERT(sUnicode == UnicodeText); // same content + UNIT_ASSERT(sUnicode == UnicodeText); // same content #ifndef TSTRING_IS_STD_STRING UNIT_ASSERT(sUnicode.data() != wdata); // re-allocated (shared buffer supplied) UNIT_ASSERT(sUnicode.data() == wres.data()); // same buffer diff --git a/library/cpp/charset/ya.make b/library/cpp/charset/ya.make index 7565566bf0..8906c507f0 100644 --- a/library/cpp/charset/ya.make +++ b/library/cpp/charset/ya.make @@ -1,10 +1,10 @@ -LIBRARY() +LIBRARY() OWNER(alzobnin) SRCS( - generated/cp_data.cpp - generated/encrec_data.cpp + generated/cp_data.cpp + generated/encrec_data.cpp codepage.cpp cp_encrec.cpp doccodes.cpp |