diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/charset/codepage.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/charset/codepage.cpp')
-rw-r--r-- | library/cpp/charset/codepage.cpp | 511 |
1 files changed, 511 insertions, 0 deletions
diff --git a/library/cpp/charset/codepage.cpp b/library/cpp/charset/codepage.cpp new file mode 100644 index 0000000000..0431bef31b --- /dev/null +++ b/library/cpp/charset/codepage.cpp @@ -0,0 +1,511 @@ +#include "ci_string.h" +#include "wide.h" +#include "recyr.hh" +#include "codepage.h" + +#include <util/string/cast.h> +#include <util/string/subst.h> +#include <util/string/util.h> +#include <util/system/hi_lo.h> +#include <util/system/yassert.h> +#include <util/generic/hash.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/generic/hash_set.h> +#include <util/generic/singleton.h> +#include <util/generic/yexception.h> +#include <util/memory/pool.h> + +#include <cstring> + +#include <ctype.h> + +using namespace NCodepagePrivate; + +void Recoder::Create(const CodePage& source, const CodePage& target) { + const Encoder* wideTarget = &EncoderByCharset(target.CPEnum); + Create(source, wideTarget); +} +void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) { + const Encoder* widePage = &EncoderByCharset(page.CPEnum); + Create(page, widePage, mapfunc); +} + +template <class T, class T1> +static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) { + while (b != e) { + *to++ = mapper.Table[(unsigned char)*b++]; + } + + return to; +} + +template <class T, class T1> +static inline T1 Apply(T b, T1 to, const Recoder& mapper) { + while (*b != 0) { + *to++ = mapper.Table[(unsigned char)*b++]; + } + + return to; +} + +char* CodePage::ToLower(const char* b, const char* e, char* to) const { + return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]); +} +char* CodePage::ToLower(const char* b, char* to) const { + return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]); +} + +char* CodePage::ToUpper(const char* b, const char* e, char* to) const { + return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]); +} +char* CodePage::ToUpper(const char* b, char* to) const { + return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]); +} + +int CodePage::stricmp(const char* dst, const char* src) const { + unsigned char f, l; + do { + f = ToLower(*dst++); + l = ToLower(*src++); + } while (f && (f == l)); + return f - l; +} + +int CodePage::strnicmp(const char* dst, const char* src, size_t len) const { + unsigned char f, l; + if (len) { + do { + f = ToLower(*dst++); + l = ToLower(*src++); + } while (--len && f && (f == l)); + return f - l; + } + return 0; +} + +static const CodePage UNSUPPORTED_CODEPAGE = { + CODES_UNSUPPORTED, + { + "unsupported", + }, + {}, + nullptr, +}; + +static const CodePage UNKNOWN_CODEPAGE = { + CODES_UNKNOWN, + { + "unknown", + }, + {}, + nullptr, +}; + +void NCodepagePrivate::TCodepagesMap::SetData(const CodePage* cp) { + Y_ASSERT(cp); + int code = static_cast<int>(cp->CPEnum) + DataShift; + + Y_ASSERT(code >= 0 && code < DataSize); + Y_ASSERT(Data[code] == nullptr); + + Data[code] = cp; +} + +NCodepagePrivate::TCodepagesMap::TCodepagesMap() { + memset(Data, 0, sizeof(const CodePage*) * DataSize); + SetData(&UNSUPPORTED_CODEPAGE); + SetData(&UNKNOWN_CODEPAGE); + + for (size_t i = 0; i != CODES_MAX; ++i) { + SetData(TCodePageData::AllCodePages[i]); + } +} + +const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() { + return *Singleton<NCodepagePrivate::TCodepagesMap>(); +} + +class TCodePageHash { +private: + using TData = THashMap<TStringBuf, ECharset, ci_hash, ci_equal_to>; + + TData Data; + TMemoryPool Pool; + +private: + inline void AddNameWithCheck(const TString& name, ECharset code) { + if (Data.find(name.c_str()) == Data.end()) { + Data.insert(TData::value_type(Pool.Append(name.data(), name.size() + 1), code)); + } else { + Y_ASSERT(Data.find(name.c_str())->second == code); + } + } + + inline void AddName(const TString& name, ECharset code) { + AddNameWithCheck(name, code); + + TString temp = name; + RemoveAll(temp, '-'); + RemoveAll(temp, '_'); + AddNameWithCheck(temp, code); + + temp = name; + SubstGlobal(temp, '-', '_'); + AddNameWithCheck(temp, code); + + temp = name; + SubstGlobal(temp, '_', '-'); + AddNameWithCheck(temp, code); + } + +public: + inline TCodePageHash() + : Pool(20 * 1024) /* Currently used: 17KB. */ + { + TString xPrefix = "x-"; + const char* name; + + for (size_t i = 0; i != CODES_MAX; ++i) { + ECharset e = static_cast<ECharset>(i); + const CodePage* page = Singleton<NCodepagePrivate::TCodepagesMap>()->GetPrivate(e); + + AddName(ToString(static_cast<int>(i)), e); + + for (size_t j = 0; (name = page->Names[j]) != nullptr && name[0]; ++j) { + AddName(name, e); + + AddName(xPrefix + name, e); + } + } + } + + inline ECharset CharsetByName(TStringBuf name) { + if (!name) + return CODES_UNKNOWN; + + TData::const_iterator it = Data.find(name); + if (it == Data.end()) + return CODES_UNKNOWN; + + return it->second; + } +}; + +ECharset CharsetByName(TStringBuf name) { + return Singleton<TCodePageHash>()->CharsetByName(name); +} + +ECharset CharsetByNameOrDie(TStringBuf name) { + ECharset result = CharsetByName(name); + if (result == CODES_UNKNOWN) + ythrow yexception() << "CharsetByNameOrDie: unknown charset '" << name << "'"; + return result; +} + +template <typename TxChar> +static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) { + if ((*s & 0xFF00) != 0xF000) { + rune_len = 1; + rune = *s; + return RECODE_OK; + } + + rune_len = 0; + + size_t _len = UTF8RuneLen((unsigned char)(*s)); + if (s + _len > end) + return RECODE_EOINPUT; //[EOINPUT] + if (_len == 0) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte + + wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX] + if (_len > 1) { + _rune &= UTF8LeadByteMask(_len); + wchar32 ch = *s++; + if ((ch & 0xFFC0) != 0xF080) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte + _rune <<= 6; + _rune |= ch & 0x3F; //[00000XXX XXYYYYYY] + if (_len > 2) { + ch = *s++; + if ((ch & 0xFFC0) != 0xF080) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte + _rune <<= 6; + _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ] + if (_len > 3) { + ch = *s; + if ((ch & 0xFFC0) != 0xF080) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte + _rune <<= 6; + _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ] + } + } + } + rune_len = _len; + if (_rune > Max<TxChar>()) + rune = ' '; // maybe put sequence + else + rune = TxChar(_rune); + return RECODE_OK; +} + +template <typename TxChar> +void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) { + TxChar* e = ee; + if (SingleByteCodepage(enc)) { + const CodePage* cp = CodePageByCharset(enc); + for (TxChar* s = str; s < e; s++) { + if (Hi8(Lo16(*s)) == 0xF0) + *s = (TxChar)cp->unicode[Lo8(Lo16(*s))]; // NOT mb compliant + } + } else if (enc == CODES_UTF8) { + TxChar* s; + TxChar* d; + + for (s = d = str; s < e;) { + size_t l = 0; + + if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) { + d++, s += l; + } else { + *d++ = BROKEN_RUNE; + ++s; + } + } + e = d; + } else if (enc == CODES_UNKNOWN) { + for (TxChar* s = str; s < e; s++) { + if (Hi8(Lo16(*s)) == 0xF0) + *s = Lo8(Lo16(*s)); + } + } else { + Y_ASSERT(!SingleByteCodepage(enc)); + + TxChar* s = str; + TxChar* d = str; + + TVector<char> buf; + + size_t read = 0; + size_t written = 0; + for (; s < e; ++s) { + if (Hi8(Lo16(*s)) == 0xF0) { + buf.push_back(Lo8(Lo16(*s))); + } else { + if (!buf.empty()) { + if (RecodeToUnicode(enc, buf.data(), d, buf.size(), e - d, read, written) == RECODE_OK) { + Y_ASSERT(read == buf.size()); + d += written; + } else { // just copying broken symbols + Y_ASSERT(buf.size() <= static_cast<size_t>(e - d)); + Copy(buf.data(), buf.size(), d); + d += buf.size(); + } + buf.clear(); + } + *d++ = *s; + } + } + } + ee = e; +} + +void DecodeUnknownPlane(wchar16* str, wchar16*& ee, const ECharset enc) { + DoDecodeUnknownPlane(str, ee, enc); +} +void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) { + DoDecodeUnknownPlane(str, ee, enc); +} + +namespace { + class THashSetType: public THashSet<TString> { + public: + inline void Add(const TString& s) { + insert(s); + } + + inline bool Has(const TString& s) const noexcept { + return find(s) != end(); + } + }; +} + +class TWindowsPrefixesHashSet: public THashSetType { +public: + inline TWindowsPrefixesHashSet() { + Add("win"); + Add("wincp"); + Add("window"); + Add("windowcp"); + Add("windows"); + Add("windowscp"); + Add("ansi"); + Add("ansicp"); + } +}; + +class TCpPrefixesHashSet: public THashSetType { +public: + inline TCpPrefixesHashSet() { + Add("microsoft"); + Add("microsoftcp"); + Add("cp"); + } +}; + +class TIsoPrefixesHashSet: public THashSetType { +public: + inline TIsoPrefixesHashSet() { + Add("iso"); + Add("isolatin"); + Add("latin"); + } +}; + +class TLatinToIsoHash: public THashMap<const char*, TString, ci_hash, ci_equal_to> { +public: + inline TLatinToIsoHash() { + insert(value_type("latin1", "iso-8859-1")); + insert(value_type("latin2", "iso-8859-2")); + insert(value_type("latin3", "iso-8859-3")); + insert(value_type("latin4", "iso-8859-4")); + insert(value_type("latin5", "iso-8859-9")); + insert(value_type("latin6", "iso-8859-10")); + insert(value_type("latin7", "iso-8859-13")); + insert(value_type("latin8", "iso-8859-14")); + insert(value_type("latin9", "iso-8859-15")); + insert(value_type("latin10", "iso-8859-16")); + } +}; + +static inline void NormalizeEncodingPrefixes(TString& enc) { + size_t preflen = enc.find_first_of("0123456789"); + if (preflen == TString::npos) + return; + + TString prefix = enc.substr(0, preflen); + for (size_t i = 0; i < prefix.length(); ++i) { + if (prefix[i] == '-') { + prefix.remove(i--); + } + } + + if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) { + enc.remove(0, preflen); + enc.prepend("windows-"); + return; + } + + if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) { + if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) { + enc.remove(0, preflen); + enc.prepend("windows-"); + return; + } + enc.remove(0, preflen); + enc.prepend("cp"); + return; + } + + if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) { + if (enc.length() == preflen + 1 || enc.length() == preflen + 2) { + TString enccopy = enc.substr(preflen); + enccopy.prepend("latin"); + const TLatinToIsoHash* latinhash = Singleton<TLatinToIsoHash>(); + TLatinToIsoHash::const_iterator it = latinhash->find(enccopy.data()); + if (it != latinhash->end()) + enc.assign(it->second); + return; + } else if (enc.length() > preflen + 5 && enc[preflen] == '8') { + enc.remove(0, preflen); + enc.prepend("iso-"); + return; + } + } +} + +class TEncodingNamesHashSet: public THashSetType { +public: + TEncodingNamesHashSet() { + Add("iso-8859-1"); + Add("iso-8859-2"); + Add("iso-8859-3"); + Add("iso-8859-4"); + Add("iso-8859-5"); + Add("iso-8859-6"); + Add("iso-8859-7"); + Add("iso-8859-8"); + Add("iso-8859-8-i"); + Add("iso-8859-9"); + Add("iso-8859-10"); + Add("iso-8859-11"); + Add("iso-8859-12"); + Add("iso-8859-13"); + Add("iso-8859-14"); + Add("iso-8859-15"); + Add("windows-1250"); + Add("windows-1251"); + Add("windows-1252"); + Add("windows-1253"); + Add("windows-1254"); + Add("windows-1255"); + Add("windows-1256"); + Add("windows-1257"); + Add("windows-1258"); + Add("windows-874"); + Add("iso-2022-jp"); + Add("euc-jp"); + Add("shift-jis"); + Add("shiftjis"); + Add("iso-2022-kr"); + Add("euc-kr"); + Add("gb-2312"); + Add("gb2312"); + Add("gb-18030"); + Add("gb18030"); + Add("gbk"); + Add("big5"); + Add("tis-620"); + Add("tis620"); + } +}; + +ECharset EncodingHintByName(const char* encname) { + if (!encname) + return CODES_UNKNOWN; // safety check + + // Common trouble: spurious "charset=" in the encoding name + if (!strnicmp(encname, "charset=", 8)) { + encname += 8; + } + + // Strip everything up to the first alphanumeric, and after the last one + while (*encname && !isalnum(*encname)) + ++encname; + + if (!*encname) + return CODES_UNKNOWN; + + const char* lastpos = encname + strlen(encname) - 1; + while (lastpos > encname && !isalnum(*lastpos)) + --lastpos; + + // Do some normalization + TString enc(encname, lastpos - encname + 1); + enc.to_lower(); + for (char* p = enc.begin(); p != enc.end(); ++p) { + if (*p == ' ' || *p == '=' || *p == '_') + *p = '-'; + } + + NormalizeEncodingPrefixes(enc); + + ECharset hint = CharsetByName(enc.c_str()); + if (hint != CODES_UNKNOWN) + return hint; + + if (Singleton<TEncodingNamesHashSet>()->Has(enc)) + return CODES_UNSUPPORTED; + return CODES_UNKNOWN; +} |