diff options
author | sereglond <sereglond@yandex-team.ru> | 2022-02-10 16:47:47 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:47 +0300 |
commit | 73bb02f2495181e0719a800f979df508924f4b71 (patch) | |
tree | c0748b5dcbade83af788c0abfa89c0383d6b779c /library/cpp/charset/codepage.cpp | |
parent | eb3d925534734c808602b31b38b953677f0a279f (diff) | |
download | ydb-73bb02f2495181e0719a800f979df508924f4b71.tar.gz |
Restoring authorship annotation for <sereglond@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/charset/codepage.cpp')
-rw-r--r-- | library/cpp/charset/codepage.cpp | 294 |
1 files changed, 147 insertions, 147 deletions
diff --git a/library/cpp/charset/codepage.cpp b/library/cpp/charset/codepage.cpp index 368ed73f9b..0431bef31b 100644 --- a/library/cpp/charset/codepage.cpp +++ b/library/cpp/charset/codepage.cpp @@ -20,8 +20,8 @@ #include <ctype.h> -using namespace NCodepagePrivate; - +using namespace NCodepagePrivate; + void Recoder::Create(const CodePage& source, const CodePage& target) { const Encoder* wideTarget = &EncoderByCharset(target.CPEnum); Create(source, wideTarget); @@ -32,7 +32,7 @@ void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) { } template <class T, class T1> -static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) { +static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) { while (b != e) { *to++ = mapper.Table[(unsigned char)*b++]; } @@ -40,34 +40,34 @@ static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) { return to; } -template <class T, class T1> -static inline T1 Apply(T b, T1 to, const Recoder& mapper) { - while (*b != 0) { - *to++ = mapper.Table[(unsigned char)*b++]; - } - - return to; -} - +template <class T, class T1> +static inline T1 Apply(T b, T1 to, const Recoder& mapper) { + while (*b != 0) { + *to++ = mapper.Table[(unsigned char)*b++]; + } + + return to; +} + char* CodePage::ToLower(const char* b, const char* e, char* to) const { return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]); } -char* CodePage::ToLower(const char* b, char* to) const { - return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]); -} +char* CodePage::ToLower(const char* b, char* to) const { + return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]); +} char* CodePage::ToUpper(const char* b, const char* e, char* to) const { - return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]); + return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]); +} +char* CodePage::ToUpper(const char* b, char* to) const { + return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]); } -char* CodePage::ToUpper(const char* b, char* to) const { - return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]); -} int CodePage::stricmp(const char* dst, const char* src) const { unsigned char f, l; do { - f = ToLower(*dst++); - l = ToLower(*src++); + f = ToLower(*dst++); + l = ToLower(*src++); } while (f && (f == l)); return f - l; } @@ -76,122 +76,122 @@ int CodePage::strnicmp(const char* dst, const char* src, size_t len) const { unsigned char f, l; if (len) { do { - f = ToLower(*dst++); - l = ToLower(*src++); + f = ToLower(*dst++); + l = ToLower(*src++); } while (--len && f && (f == l)); return f - l; } return 0; } -static const CodePage UNSUPPORTED_CODEPAGE = { - CODES_UNSUPPORTED, +static const CodePage UNSUPPORTED_CODEPAGE = { + CODES_UNSUPPORTED, { "unsupported", }, - {}, + {}, nullptr, -}; - -static const CodePage UNKNOWN_CODEPAGE = { - CODES_UNKNOWN, +}; + +static const CodePage UNKNOWN_CODEPAGE = { + CODES_UNKNOWN, { "unknown", }, - {}, + {}, nullptr, -}; - -void NCodepagePrivate::TCodepagesMap::SetData(const CodePage* cp) { +}; + +void NCodepagePrivate::TCodepagesMap::SetData(const CodePage* cp) { Y_ASSERT(cp); - int code = static_cast<int>(cp->CPEnum) + DataShift; - + int code = static_cast<int>(cp->CPEnum) + DataShift; + Y_ASSERT(code >= 0 && code < DataSize); Y_ASSERT(Data[code] == nullptr); - - Data[code] = cp; -} - -NCodepagePrivate::TCodepagesMap::TCodepagesMap() { - memset(Data, 0, sizeof(const CodePage*) * DataSize); - SetData(&UNSUPPORTED_CODEPAGE); - SetData(&UNKNOWN_CODEPAGE); - - for (size_t i = 0; i != CODES_MAX; ++i) { - SetData(TCodePageData::AllCodePages[i]); - } -} - + + Data[code] = cp; +} + +NCodepagePrivate::TCodepagesMap::TCodepagesMap() { + memset(Data, 0, sizeof(const CodePage*) * DataSize); + SetData(&UNSUPPORTED_CODEPAGE); + SetData(&UNKNOWN_CODEPAGE); + + for (size_t i = 0; i != CODES_MAX; ++i) { + SetData(TCodePageData::AllCodePages[i]); + } +} + const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() { return *Singleton<NCodepagePrivate::TCodepagesMap>(); } -class TCodePageHash { -private: +class TCodePageHash { +private: using TData = THashMap<TStringBuf, ECharset, ci_hash, ci_equal_to>; - TData Data; + TData Data; TMemoryPool Pool; -private: +private: inline void AddNameWithCheck(const TString& name, ECharset code) { - if (Data.find(name.c_str()) == Data.end()) { + if (Data.find(name.c_str()) == Data.end()) { Data.insert(TData::value_type(Pool.Append(name.data(), name.size() + 1), code)); - } else { + } else { Y_ASSERT(Data.find(name.c_str())->second == code); - } - } + } + } inline void AddName(const TString& name, ECharset code) { - AddNameWithCheck(name, code); - + AddNameWithCheck(name, code); + TString temp = name; RemoveAll(temp, '-'); RemoveAll(temp, '_'); - AddNameWithCheck(temp, code); - - temp = name; + AddNameWithCheck(temp, code); + + temp = name; SubstGlobal(temp, '-', '_'); - AddNameWithCheck(temp, code); + AddNameWithCheck(temp, code); - temp = name; + temp = name; SubstGlobal(temp, '_', '-'); - AddNameWithCheck(temp, code); - } - -public: + AddNameWithCheck(temp, code); + } + +public: inline TCodePageHash() : Pool(20 * 1024) /* Currently used: 17KB. */ { TString xPrefix = "x-"; - const char* name; - - for (size_t i = 0; i != CODES_MAX; ++i) { + const char* name; + + for (size_t i = 0; i != CODES_MAX; ++i) { ECharset e = static_cast<ECharset>(i); - const CodePage* page = Singleton<NCodepagePrivate::TCodepagesMap>()->GetPrivate(e); - - AddName(ToString(static_cast<int>(i)), e); - + const CodePage* page = Singleton<NCodepagePrivate::TCodepagesMap>()->GetPrivate(e); + + AddName(ToString(static_cast<int>(i)), e); + for (size_t j = 0; (name = page->Names[j]) != nullptr && name[0]; ++j) { - AddName(name, e); - - AddName(xPrefix + name, e); + AddName(name, e); + + AddName(xPrefix + name, e); } } - } + } inline ECharset CharsetByName(TStringBuf name) { - if (!name) - return CODES_UNKNOWN; - - TData::const_iterator it = Data.find(name); - if (it == Data.end()) - return CODES_UNKNOWN; - - return it->second; + if (!name) + return CODES_UNKNOWN; + + TData::const_iterator it = Data.find(name); + if (it == Data.end()) + return CODES_UNKNOWN; + + return it->second; } -}; - +}; + ECharset CharsetByName(TStringBuf name) { return Singleton<TCodePageHash>()->CharsetByName(name); } @@ -205,55 +205,55 @@ ECharset CharsetByNameOrDie(TStringBuf name) { template <typename TxChar> static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) { - if ((*s & 0xFF00) != 0xF000) { - rune_len = 1; - rune = *s; - return RECODE_OK; - } - - rune_len = 0; - + if ((*s & 0xFF00) != 0xF000) { + rune_len = 1; + rune = *s; + return RECODE_OK; + } + + rune_len = 0; + size_t _len = UTF8RuneLen((unsigned char)(*s)); if (s + _len > end) return RECODE_EOINPUT; //[EOINPUT] if (_len == 0) return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte - + wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX] - if (_len > 1) { + if (_len > 1) { _rune &= UTF8LeadByteMask(_len); - wchar32 ch = *s++; - if ((ch & 0xFFC0) != 0xF080) + wchar32 ch = *s++; + if ((ch & 0xFFC0) != 0xF080) return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte - _rune <<= 6; + _rune <<= 6; _rune |= ch & 0x3F; //[00000XXX XXYYYYYY] - if (_len > 2) { - ch = *s++; - if ((ch & 0xFFC0) != 0xF080) + if (_len > 2) { + ch = *s++; + if ((ch & 0xFFC0) != 0xF080) return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte - _rune <<= 6; + _rune <<= 6; _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ] - if (_len > 3) { - ch = *s; - if ((ch & 0xFFC0) != 0xF080) - return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte - _rune <<= 6; + if (_len > 3) { + ch = *s; + if ((ch & 0xFFC0) != 0xF080) + return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte + _rune <<= 6; _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ] - } - } - } - rune_len = _len; - if (_rune > Max<TxChar>()) + } + } + } + rune_len = _len; + if (_rune > Max<TxChar>()) rune = ' '; // maybe put sequence - else - rune = TxChar(_rune); - return RECODE_OK; -} - -template <typename TxChar> + else + rune = TxChar(_rune); + return RECODE_OK; +} + +template <typename TxChar> void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) { TxChar* e = ee; - if (SingleByteCodepage(enc)) { + if (SingleByteCodepage(enc)) { const CodePage* cp = CodePageByCharset(enc); for (TxChar* s = str; s < e; s++) { if (Hi8(Lo16(*s)) == 0xF0) @@ -268,45 +268,45 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) { if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) { d++, s += l; - } else { - *d++ = BROKEN_RUNE; - ++s; + } else { + *d++ = BROKEN_RUNE; + ++s; } } e = d; - } else if (enc == CODES_UNKNOWN) { + } else if (enc == CODES_UNKNOWN) { for (TxChar* s = str; s < e; s++) { if (Hi8(Lo16(*s)) == 0xF0) *s = Lo8(Lo16(*s)); } - } else { + } else { Y_ASSERT(!SingleByteCodepage(enc)); - - TxChar* s = str; - TxChar* d = str; - + + TxChar* s = str; + TxChar* d = str; + TVector<char> buf; - - size_t read = 0; - size_t written = 0; + + size_t read = 0; + size_t written = 0; for (; s < e; ++s) { if (Hi8(Lo16(*s)) == 0xF0) { buf.push_back(Lo8(Lo16(*s))); - } else { - if (!buf.empty()) { + } else { + if (!buf.empty()) { if (RecodeToUnicode(enc, buf.data(), d, buf.size(), e - d, read, written) == RECODE_OK) { Y_ASSERT(read == buf.size()); - d += written; - } else { // just copying broken symbols + d += written; + } else { // just copying broken symbols Y_ASSERT(buf.size() <= static_cast<size_t>(e - d)); Copy(buf.data(), buf.size(), d); - d += buf.size(); - } - buf.clear(); - } - *d++ = *s; - } - } + d += buf.size(); + } + buf.clear(); + } + *d++ = *s; + } + } } ee = e; } |