diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/charset/codepage.cpp | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/charset/codepage.cpp')
-rw-r--r-- | library/cpp/charset/codepage.cpp | 278 |
1 files changed, 139 insertions, 139 deletions
diff --git a/library/cpp/charset/codepage.cpp b/library/cpp/charset/codepage.cpp index 0431bef31b..816f3fec67 100644 --- a/library/cpp/charset/codepage.cpp +++ b/library/cpp/charset/codepage.cpp @@ -1,45 +1,45 @@ #include "ci_string.h" -#include "wide.h" -#include "recyr.hh" -#include "codepage.h" - -#include <util/string/cast.h> +#include "wide.h" +#include "recyr.hh" +#include "codepage.h" + +#include <util/string/cast.h> #include <util/string/subst.h> -#include <util/string/util.h> +#include <util/string/util.h> #include <util/system/hi_lo.h> -#include <util/system/yassert.h> -#include <util/generic/hash.h> +#include <util/system/yassert.h> +#include <util/generic/hash.h> #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/generic/hash_set.h> -#include <util/generic/singleton.h> +#include <util/generic/vector.h> +#include <util/generic/hash_set.h> +#include <util/generic/singleton.h> #include <util/generic/yexception.h> #include <util/memory/pool.h> - -#include <cstring> - -#include <ctype.h> - + +#include <cstring> + +#include <ctype.h> + using namespace NCodepagePrivate; -void Recoder::Create(const CodePage& source, const CodePage& target) { +void Recoder::Create(const CodePage& source, const CodePage& target) { const Encoder* wideTarget = &EncoderByCharset(target.CPEnum); Create(source, wideTarget); } -void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) { +void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) { const Encoder* widePage = &EncoderByCharset(page.CPEnum); Create(page, widePage, mapfunc); } -template <class T, class T1> +template <class T, class T1> static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) { - while (b != e) { - *to++ = mapper.Table[(unsigned char)*b++]; - } - - return to; -} - + while (b != e) { + *to++ = mapper.Table[(unsigned char)*b++]; + } + + return to; +} + template <class T, class T1> static inline T1 Apply(T b, T1 to, const Recoder& mapper) { while (*b != 0) { @@ -49,21 +49,21 @@ static inline T1 Apply(T b, T1 to, const Recoder& mapper) { return to; } -char* CodePage::ToLower(const char* b, const char* e, char* to) const { - return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]); -} +char* CodePage::ToLower(const char* b, const char* e, char* to) const { + return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]); +} char* CodePage::ToLower(const char* b, char* to) const { return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]); } - -char* CodePage::ToUpper(const char* b, const char* e, char* to) const { + +char* CodePage::ToUpper(const char* b, const char* e, char* to) const { return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]); -} +} char* CodePage::ToUpper(const char* b, char* to) const { return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]); } - -int CodePage::stricmp(const char* dst, const char* src) const { + +int CodePage::stricmp(const char* dst, const char* src) const { unsigned char f, l; do { f = ToLower(*dst++); @@ -86,18 +86,18 @@ int CodePage::strnicmp(const char* dst, const char* src, size_t len) const { static const CodePage UNSUPPORTED_CODEPAGE = { CODES_UNSUPPORTED, - { - "unsupported", - }, + { + "unsupported", + }, {}, nullptr, }; static const CodePage UNKNOWN_CODEPAGE = { CODES_UNKNOWN, - { - "unknown", - }, + { + "unknown", + }, {}, nullptr, }; @@ -122,14 +122,14 @@ NCodepagePrivate::TCodepagesMap::TCodepagesMap() { } } -const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() { - return *Singleton<NCodepagePrivate::TCodepagesMap>(); -} - +const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() { + return *Singleton<NCodepagePrivate::TCodepagesMap>(); +} + class TCodePageHash { private: using TData = THashMap<TStringBuf, ECharset, ci_hash, ci_equal_to>; - + TData Data; TMemoryPool Pool; @@ -153,7 +153,7 @@ private: temp = name; SubstGlobal(temp, '-', '_'); AddNameWithCheck(temp, code); - + temp = name; SubstGlobal(temp, '_', '-'); AddNameWithCheck(temp, code); @@ -176,8 +176,8 @@ public: AddName(name, e); AddName(xPrefix + name, e); - } - } + } + } } inline ECharset CharsetByName(TStringBuf name) { @@ -204,7 +204,7 @@ ECharset CharsetByNameOrDie(TStringBuf name) { } template <typename TxChar> -static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) { +static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) { if ((*s & 0xFF00) != 0xF000) { rune_len = 1; rune = *s; @@ -214,37 +214,37 @@ static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size rune_len = 0; size_t _len = UTF8RuneLen((unsigned char)(*s)); - if (s + _len > end) - return RECODE_EOINPUT; //[EOINPUT] - if (_len == 0) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte + if (s + _len > end) + return RECODE_EOINPUT; //[EOINPUT] + if (_len == 0) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte - wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX] + wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX] if (_len > 1) { _rune &= UTF8LeadByteMask(_len); wchar32 ch = *s++; if ((ch & 0xFFC0) != 0xF080) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte _rune <<= 6; - _rune |= ch & 0x3F; //[00000XXX XXYYYYYY] + _rune |= ch & 0x3F; //[00000XXX XXYYYYYY] if (_len > 2) { ch = *s++; if ((ch & 0xFFC0) != 0xF080) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte _rune <<= 6; - _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ] + _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ] if (_len > 3) { ch = *s; if ((ch & 0xFFC0) != 0xF080) return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte _rune <<= 6; - _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ] + _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ] } } } rune_len = _len; if (_rune > Max<TxChar>()) - rune = ' '; // maybe put sequence + rune = ' '; // maybe put sequence else rune = TxChar(_rune); return RECODE_OK; @@ -262,16 +262,16 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) { } else if (enc == CODES_UTF8) { TxChar* s; TxChar* d; - - for (s = d = str; s < e;) { + + for (s = d = str; s < e;) { size_t l = 0; - + if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) { d++, s += l; } else { *d++ = BROKEN_RUNE; ++s; - } + } } e = d; } else if (enc == CODES_UNKNOWN) { @@ -289,7 +289,7 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) { size_t read = 0; size_t written = 0; - for (; s < e; ++s) { + for (; s < e; ++s) { if (Hi8(Lo16(*s)) == 0xF0) { buf.push_back(Lo8(Lo16(*s))); } else { @@ -318,28 +318,28 @@ void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) { DoDecodeUnknownPlane(str, ee, enc); } -namespace { +namespace { class THashSetType: public THashSet<TString> { - public: + public: inline void Add(const TString& s) { - insert(s); - } - + insert(s); + } + inline bool Has(const TString& s) const noexcept { - return find(s) != end(); - } - }; -} - + return find(s) != end(); + } + }; +} + class TWindowsPrefixesHashSet: public THashSetType { public: inline TWindowsPrefixesHashSet() { - Add("win"); - Add("wincp"); - Add("window"); + Add("win"); + Add("wincp"); + Add("window"); Add("windowcp"); - Add("windows"); - Add("windowscp"); + Add("windows"); + Add("windowscp"); Add("ansi"); Add("ansicp"); } @@ -364,19 +364,19 @@ public: }; class TLatinToIsoHash: public THashMap<const char*, TString, ci_hash, ci_equal_to> { -public: - inline TLatinToIsoHash() { - insert(value_type("latin1", "iso-8859-1")); - insert(value_type("latin2", "iso-8859-2")); - insert(value_type("latin3", "iso-8859-3")); - insert(value_type("latin4", "iso-8859-4")); - insert(value_type("latin5", "iso-8859-9")); - insert(value_type("latin6", "iso-8859-10")); - insert(value_type("latin7", "iso-8859-13")); - insert(value_type("latin8", "iso-8859-14")); - insert(value_type("latin9", "iso-8859-15")); - insert(value_type("latin10", "iso-8859-16")); - } +public: + inline TLatinToIsoHash() { + insert(value_type("latin1", "iso-8859-1")); + insert(value_type("latin2", "iso-8859-2")); + insert(value_type("latin3", "iso-8859-3")); + insert(value_type("latin4", "iso-8859-4")); + insert(value_type("latin5", "iso-8859-9")); + insert(value_type("latin6", "iso-8859-10")); + insert(value_type("latin7", "iso-8859-13")); + insert(value_type("latin8", "iso-8859-14")); + insert(value_type("latin9", "iso-8859-15")); + insert(value_type("latin10", "iso-8859-16")); + } }; static inline void NormalizeEncodingPrefixes(TString& enc) { @@ -391,14 +391,14 @@ static inline void NormalizeEncodingPrefixes(TString& enc) { } } - if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) { + if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) { enc.remove(0, preflen); enc.prepend("windows-"); return; } - if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) { - if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) { + if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) { + if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) { enc.remove(0, preflen); enc.prepend("windows-"); return; @@ -408,7 +408,7 @@ static inline void NormalizeEncodingPrefixes(TString& enc) { return; } - if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) { + if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) { if (enc.length() == preflen + 1 || enc.length() == preflen + 2) { TString enccopy = enc.substr(preflen); enccopy.prepend("latin"); @@ -428,46 +428,46 @@ static inline void NormalizeEncodingPrefixes(TString& enc) { class TEncodingNamesHashSet: public THashSetType { public: TEncodingNamesHashSet() { - Add("iso-8859-1"); - Add("iso-8859-2"); - Add("iso-8859-3"); - Add("iso-8859-4"); - Add("iso-8859-5"); - Add("iso-8859-6"); - Add("iso-8859-7"); - Add("iso-8859-8"); - Add("iso-8859-8-i"); - Add("iso-8859-9"); - Add("iso-8859-10"); - Add("iso-8859-11"); - Add("iso-8859-12"); - Add("iso-8859-13"); - Add("iso-8859-14"); - Add("iso-8859-15"); - Add("windows-1250"); - Add("windows-1251"); - Add("windows-1252"); - Add("windows-1253"); - Add("windows-1254"); - Add("windows-1255"); - Add("windows-1256"); - Add("windows-1257"); - Add("windows-1258"); - Add("windows-874"); - Add("iso-2022-jp"); - Add("euc-jp"); - Add("shift-jis"); - Add("shiftjis"); - Add("iso-2022-kr"); - Add("euc-kr"); - Add("gb-2312"); - Add("gb2312"); - Add("gb-18030"); - Add("gb18030"); - Add("gbk"); - Add("big5"); - Add("tis-620"); - Add("tis620"); + Add("iso-8859-1"); + Add("iso-8859-2"); + Add("iso-8859-3"); + Add("iso-8859-4"); + Add("iso-8859-5"); + Add("iso-8859-6"); + Add("iso-8859-7"); + Add("iso-8859-8"); + Add("iso-8859-8-i"); + Add("iso-8859-9"); + Add("iso-8859-10"); + Add("iso-8859-11"); + Add("iso-8859-12"); + Add("iso-8859-13"); + Add("iso-8859-14"); + Add("iso-8859-15"); + Add("windows-1250"); + Add("windows-1251"); + Add("windows-1252"); + Add("windows-1253"); + Add("windows-1254"); + Add("windows-1255"); + Add("windows-1256"); + Add("windows-1257"); + Add("windows-1258"); + Add("windows-874"); + Add("iso-2022-jp"); + Add("euc-jp"); + Add("shift-jis"); + Add("shiftjis"); + Add("iso-2022-kr"); + Add("euc-kr"); + Add("gb-2312"); + Add("gb2312"); + Add("gb-18030"); + Add("gb18030"); + Add("gbk"); + Add("big5"); + Add("tis-620"); + Add("tis620"); } }; @@ -494,7 +494,7 @@ ECharset EncodingHintByName(const char* encname) { // Do some normalization TString enc(encname, lastpos - encname + 1); enc.to_lower(); - for (char* p = enc.begin(); p != enc.end(); ++p) { + for (char* p = enc.begin(); p != enc.end(); ++p) { if (*p == ' ' || *p == '=' || *p == '_') *p = '-'; } @@ -505,7 +505,7 @@ ECharset EncodingHintByName(const char* encname) { if (hint != CODES_UNKNOWN) return hint; - if (Singleton<TEncodingNamesHashSet>()->Has(enc)) + if (Singleton<TEncodingNamesHashSet>()->Has(enc)) return CODES_UNSUPPORTED; return CODES_UNKNOWN; } |