aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/charset/codepage.cpp
diff options
context:
space:
mode:
authorsereglond <sereglond@yandex-team.ru>2022-02-10 16:47:47 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:47:47 +0300
commit73bb02f2495181e0719a800f979df508924f4b71 (patch)
treec0748b5dcbade83af788c0abfa89c0383d6b779c /library/cpp/charset/codepage.cpp
parenteb3d925534734c808602b31b38b953677f0a279f (diff)
downloadydb-73bb02f2495181e0719a800f979df508924f4b71.tar.gz
Restoring authorship annotation for <sereglond@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/charset/codepage.cpp')
-rw-r--r--library/cpp/charset/codepage.cpp294
1 files changed, 147 insertions, 147 deletions
diff --git a/library/cpp/charset/codepage.cpp b/library/cpp/charset/codepage.cpp
index 368ed73f9b..0431bef31b 100644
--- a/library/cpp/charset/codepage.cpp
+++ b/library/cpp/charset/codepage.cpp
@@ -20,8 +20,8 @@
#include <ctype.h>
-using namespace NCodepagePrivate;
-
+using namespace NCodepagePrivate;
+
void Recoder::Create(const CodePage& source, const CodePage& target) {
const Encoder* wideTarget = &EncoderByCharset(target.CPEnum);
Create(source, wideTarget);
@@ -32,7 +32,7 @@ void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) {
}
template <class T, class T1>
-static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) {
+static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) {
while (b != e) {
*to++ = mapper.Table[(unsigned char)*b++];
}
@@ -40,34 +40,34 @@ static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) {
return to;
}
-template <class T, class T1>
-static inline T1 Apply(T b, T1 to, const Recoder& mapper) {
- while (*b != 0) {
- *to++ = mapper.Table[(unsigned char)*b++];
- }
-
- return to;
-}
-
+template <class T, class T1>
+static inline T1 Apply(T b, T1 to, const Recoder& mapper) {
+ while (*b != 0) {
+ *to++ = mapper.Table[(unsigned char)*b++];
+ }
+
+ return to;
+}
+
char* CodePage::ToLower(const char* b, const char* e, char* to) const {
return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]);
}
-char* CodePage::ToLower(const char* b, char* to) const {
- return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]);
-}
+char* CodePage::ToLower(const char* b, char* to) const {
+ return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]);
+}
char* CodePage::ToUpper(const char* b, const char* e, char* to) const {
- return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]);
+ return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]);
+}
+char* CodePage::ToUpper(const char* b, char* to) const {
+ return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]);
}
-char* CodePage::ToUpper(const char* b, char* to) const {
- return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]);
-}
int CodePage::stricmp(const char* dst, const char* src) const {
unsigned char f, l;
do {
- f = ToLower(*dst++);
- l = ToLower(*src++);
+ f = ToLower(*dst++);
+ l = ToLower(*src++);
} while (f && (f == l));
return f - l;
}
@@ -76,122 +76,122 @@ int CodePage::strnicmp(const char* dst, const char* src, size_t len) const {
unsigned char f, l;
if (len) {
do {
- f = ToLower(*dst++);
- l = ToLower(*src++);
+ f = ToLower(*dst++);
+ l = ToLower(*src++);
} while (--len && f && (f == l));
return f - l;
}
return 0;
}
-static const CodePage UNSUPPORTED_CODEPAGE = {
- CODES_UNSUPPORTED,
+static const CodePage UNSUPPORTED_CODEPAGE = {
+ CODES_UNSUPPORTED,
{
"unsupported",
},
- {},
+ {},
nullptr,
-};
-
-static const CodePage UNKNOWN_CODEPAGE = {
- CODES_UNKNOWN,
+};
+
+static const CodePage UNKNOWN_CODEPAGE = {
+ CODES_UNKNOWN,
{
"unknown",
},
- {},
+ {},
nullptr,
-};
-
-void NCodepagePrivate::TCodepagesMap::SetData(const CodePage* cp) {
+};
+
+void NCodepagePrivate::TCodepagesMap::SetData(const CodePage* cp) {
Y_ASSERT(cp);
- int code = static_cast<int>(cp->CPEnum) + DataShift;
-
+ int code = static_cast<int>(cp->CPEnum) + DataShift;
+
Y_ASSERT(code >= 0 && code < DataSize);
Y_ASSERT(Data[code] == nullptr);
-
- Data[code] = cp;
-}
-
-NCodepagePrivate::TCodepagesMap::TCodepagesMap() {
- memset(Data, 0, sizeof(const CodePage*) * DataSize);
- SetData(&UNSUPPORTED_CODEPAGE);
- SetData(&UNKNOWN_CODEPAGE);
-
- for (size_t i = 0; i != CODES_MAX; ++i) {
- SetData(TCodePageData::AllCodePages[i]);
- }
-}
-
+
+ Data[code] = cp;
+}
+
+NCodepagePrivate::TCodepagesMap::TCodepagesMap() {
+ memset(Data, 0, sizeof(const CodePage*) * DataSize);
+ SetData(&UNSUPPORTED_CODEPAGE);
+ SetData(&UNKNOWN_CODEPAGE);
+
+ for (size_t i = 0; i != CODES_MAX; ++i) {
+ SetData(TCodePageData::AllCodePages[i]);
+ }
+}
+
const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() {
return *Singleton<NCodepagePrivate::TCodepagesMap>();
}
-class TCodePageHash {
-private:
+class TCodePageHash {
+private:
using TData = THashMap<TStringBuf, ECharset, ci_hash, ci_equal_to>;
- TData Data;
+ TData Data;
TMemoryPool Pool;
-private:
+private:
inline void AddNameWithCheck(const TString& name, ECharset code) {
- if (Data.find(name.c_str()) == Data.end()) {
+ if (Data.find(name.c_str()) == Data.end()) {
Data.insert(TData::value_type(Pool.Append(name.data(), name.size() + 1), code));
- } else {
+ } else {
Y_ASSERT(Data.find(name.c_str())->second == code);
- }
- }
+ }
+ }
inline void AddName(const TString& name, ECharset code) {
- AddNameWithCheck(name, code);
-
+ AddNameWithCheck(name, code);
+
TString temp = name;
RemoveAll(temp, '-');
RemoveAll(temp, '_');
- AddNameWithCheck(temp, code);
-
- temp = name;
+ AddNameWithCheck(temp, code);
+
+ temp = name;
SubstGlobal(temp, '-', '_');
- AddNameWithCheck(temp, code);
+ AddNameWithCheck(temp, code);
- temp = name;
+ temp = name;
SubstGlobal(temp, '_', '-');
- AddNameWithCheck(temp, code);
- }
-
-public:
+ AddNameWithCheck(temp, code);
+ }
+
+public:
inline TCodePageHash()
: Pool(20 * 1024) /* Currently used: 17KB. */
{
TString xPrefix = "x-";
- const char* name;
-
- for (size_t i = 0; i != CODES_MAX; ++i) {
+ const char* name;
+
+ for (size_t i = 0; i != CODES_MAX; ++i) {
ECharset e = static_cast<ECharset>(i);
- const CodePage* page = Singleton<NCodepagePrivate::TCodepagesMap>()->GetPrivate(e);
-
- AddName(ToString(static_cast<int>(i)), e);
-
+ const CodePage* page = Singleton<NCodepagePrivate::TCodepagesMap>()->GetPrivate(e);
+
+ AddName(ToString(static_cast<int>(i)), e);
+
for (size_t j = 0; (name = page->Names[j]) != nullptr && name[0]; ++j) {
- AddName(name, e);
-
- AddName(xPrefix + name, e);
+ AddName(name, e);
+
+ AddName(xPrefix + name, e);
}
}
- }
+ }
inline ECharset CharsetByName(TStringBuf name) {
- if (!name)
- return CODES_UNKNOWN;
-
- TData::const_iterator it = Data.find(name);
- if (it == Data.end())
- return CODES_UNKNOWN;
-
- return it->second;
+ if (!name)
+ return CODES_UNKNOWN;
+
+ TData::const_iterator it = Data.find(name);
+ if (it == Data.end())
+ return CODES_UNKNOWN;
+
+ return it->second;
}
-};
-
+};
+
ECharset CharsetByName(TStringBuf name) {
return Singleton<TCodePageHash>()->CharsetByName(name);
}
@@ -205,55 +205,55 @@ ECharset CharsetByNameOrDie(TStringBuf name) {
template <typename TxChar>
static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) {
- if ((*s & 0xFF00) != 0xF000) {
- rune_len = 1;
- rune = *s;
- return RECODE_OK;
- }
-
- rune_len = 0;
-
+ if ((*s & 0xFF00) != 0xF000) {
+ rune_len = 1;
+ rune = *s;
+ return RECODE_OK;
+ }
+
+ rune_len = 0;
+
size_t _len = UTF8RuneLen((unsigned char)(*s));
if (s + _len > end)
return RECODE_EOINPUT; //[EOINPUT]
if (_len == 0)
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
-
+
wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX]
- if (_len > 1) {
+ if (_len > 1) {
_rune &= UTF8LeadByteMask(_len);
- wchar32 ch = *s++;
- if ((ch & 0xFFC0) != 0xF080)
+ wchar32 ch = *s++;
+ if ((ch & 0xFFC0) != 0xF080)
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
- _rune <<= 6;
+ _rune <<= 6;
_rune |= ch & 0x3F; //[00000XXX XXYYYYYY]
- if (_len > 2) {
- ch = *s++;
- if ((ch & 0xFFC0) != 0xF080)
+ if (_len > 2) {
+ ch = *s++;
+ if ((ch & 0xFFC0) != 0xF080)
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
- _rune <<= 6;
+ _rune <<= 6;
_rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ]
- if (_len > 3) {
- ch = *s;
- if ((ch & 0xFFC0) != 0xF080)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
- _rune <<= 6;
+ if (_len > 3) {
+ ch = *s;
+ if ((ch & 0xFFC0) != 0xF080)
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
+ _rune <<= 6;
_rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ]
- }
- }
- }
- rune_len = _len;
- if (_rune > Max<TxChar>())
+ }
+ }
+ }
+ rune_len = _len;
+ if (_rune > Max<TxChar>())
rune = ' '; // maybe put sequence
- else
- rune = TxChar(_rune);
- return RECODE_OK;
-}
-
-template <typename TxChar>
+ else
+ rune = TxChar(_rune);
+ return RECODE_OK;
+}
+
+template <typename TxChar>
void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
TxChar* e = ee;
- if (SingleByteCodepage(enc)) {
+ if (SingleByteCodepage(enc)) {
const CodePage* cp = CodePageByCharset(enc);
for (TxChar* s = str; s < e; s++) {
if (Hi8(Lo16(*s)) == 0xF0)
@@ -268,45 +268,45 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) {
d++, s += l;
- } else {
- *d++ = BROKEN_RUNE;
- ++s;
+ } else {
+ *d++ = BROKEN_RUNE;
+ ++s;
}
}
e = d;
- } else if (enc == CODES_UNKNOWN) {
+ } else if (enc == CODES_UNKNOWN) {
for (TxChar* s = str; s < e; s++) {
if (Hi8(Lo16(*s)) == 0xF0)
*s = Lo8(Lo16(*s));
}
- } else {
+ } else {
Y_ASSERT(!SingleByteCodepage(enc));
-
- TxChar* s = str;
- TxChar* d = str;
-
+
+ TxChar* s = str;
+ TxChar* d = str;
+
TVector<char> buf;
-
- size_t read = 0;
- size_t written = 0;
+
+ size_t read = 0;
+ size_t written = 0;
for (; s < e; ++s) {
if (Hi8(Lo16(*s)) == 0xF0) {
buf.push_back(Lo8(Lo16(*s)));
- } else {
- if (!buf.empty()) {
+ } else {
+ if (!buf.empty()) {
if (RecodeToUnicode(enc, buf.data(), d, buf.size(), e - d, read, written) == RECODE_OK) {
Y_ASSERT(read == buf.size());
- d += written;
- } else { // just copying broken symbols
+ d += written;
+ } else { // just copying broken symbols
Y_ASSERT(buf.size() <= static_cast<size_t>(e - d));
Copy(buf.data(), buf.size(), d);
- d += buf.size();
- }
- buf.clear();
- }
- *d++ = *s;
- }
- }
+ d += buf.size();
+ }
+ buf.clear();
+ }
+ *d++ = *s;
+ }
+ }
}
ee = e;
}