aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/charset/codepage.cpp
diff options
context:
space:
mode:
authorAnton Samokhvalov <pg83@yandex.ru>2022-02-10 16:45:15 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:45:15 +0300
commit72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch)
treeda2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/charset/codepage.cpp
parent778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff)
downloadydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/charset/codepage.cpp')
-rw-r--r--library/cpp/charset/codepage.cpp278
1 files changed, 139 insertions, 139 deletions
diff --git a/library/cpp/charset/codepage.cpp b/library/cpp/charset/codepage.cpp
index 0431bef31b..816f3fec67 100644
--- a/library/cpp/charset/codepage.cpp
+++ b/library/cpp/charset/codepage.cpp
@@ -1,45 +1,45 @@
#include "ci_string.h"
-#include "wide.h"
-#include "recyr.hh"
-#include "codepage.h"
-
-#include <util/string/cast.h>
+#include "wide.h"
+#include "recyr.hh"
+#include "codepage.h"
+
+#include <util/string/cast.h>
#include <util/string/subst.h>
-#include <util/string/util.h>
+#include <util/string/util.h>
#include <util/system/hi_lo.h>
-#include <util/system/yassert.h>
-#include <util/generic/hash.h>
+#include <util/system/yassert.h>
+#include <util/generic/hash.h>
#include <util/generic/string.h>
-#include <util/generic/vector.h>
-#include <util/generic/hash_set.h>
-#include <util/generic/singleton.h>
+#include <util/generic/vector.h>
+#include <util/generic/hash_set.h>
+#include <util/generic/singleton.h>
#include <util/generic/yexception.h>
#include <util/memory/pool.h>
-
-#include <cstring>
-
-#include <ctype.h>
-
+
+#include <cstring>
+
+#include <ctype.h>
+
using namespace NCodepagePrivate;
-void Recoder::Create(const CodePage& source, const CodePage& target) {
+void Recoder::Create(const CodePage& source, const CodePage& target) {
const Encoder* wideTarget = &EncoderByCharset(target.CPEnum);
Create(source, wideTarget);
}
-void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) {
+void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) {
const Encoder* widePage = &EncoderByCharset(page.CPEnum);
Create(page, widePage, mapfunc);
}
-template <class T, class T1>
+template <class T, class T1>
static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) {
- while (b != e) {
- *to++ = mapper.Table[(unsigned char)*b++];
- }
-
- return to;
-}
-
+ while (b != e) {
+ *to++ = mapper.Table[(unsigned char)*b++];
+ }
+
+ return to;
+}
+
template <class T, class T1>
static inline T1 Apply(T b, T1 to, const Recoder& mapper) {
while (*b != 0) {
@@ -49,21 +49,21 @@ static inline T1 Apply(T b, T1 to, const Recoder& mapper) {
return to;
}
-char* CodePage::ToLower(const char* b, const char* e, char* to) const {
- return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]);
-}
+char* CodePage::ToLower(const char* b, const char* e, char* to) const {
+ return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]);
+}
char* CodePage::ToLower(const char* b, char* to) const {
return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]);
}
-
-char* CodePage::ToUpper(const char* b, const char* e, char* to) const {
+
+char* CodePage::ToUpper(const char* b, const char* e, char* to) const {
return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]);
-}
+}
char* CodePage::ToUpper(const char* b, char* to) const {
return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]);
}
-
-int CodePage::stricmp(const char* dst, const char* src) const {
+
+int CodePage::stricmp(const char* dst, const char* src) const {
unsigned char f, l;
do {
f = ToLower(*dst++);
@@ -86,18 +86,18 @@ int CodePage::strnicmp(const char* dst, const char* src, size_t len) const {
static const CodePage UNSUPPORTED_CODEPAGE = {
CODES_UNSUPPORTED,
- {
- "unsupported",
- },
+ {
+ "unsupported",
+ },
{},
nullptr,
};
static const CodePage UNKNOWN_CODEPAGE = {
CODES_UNKNOWN,
- {
- "unknown",
- },
+ {
+ "unknown",
+ },
{},
nullptr,
};
@@ -122,14 +122,14 @@ NCodepagePrivate::TCodepagesMap::TCodepagesMap() {
}
}
-const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() {
- return *Singleton<NCodepagePrivate::TCodepagesMap>();
-}
-
+const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() {
+ return *Singleton<NCodepagePrivate::TCodepagesMap>();
+}
+
class TCodePageHash {
private:
using TData = THashMap<TStringBuf, ECharset, ci_hash, ci_equal_to>;
-
+
TData Data;
TMemoryPool Pool;
@@ -153,7 +153,7 @@ private:
temp = name;
SubstGlobal(temp, '-', '_');
AddNameWithCheck(temp, code);
-
+
temp = name;
SubstGlobal(temp, '_', '-');
AddNameWithCheck(temp, code);
@@ -176,8 +176,8 @@ public:
AddName(name, e);
AddName(xPrefix + name, e);
- }
- }
+ }
+ }
}
inline ECharset CharsetByName(TStringBuf name) {
@@ -204,7 +204,7 @@ ECharset CharsetByNameOrDie(TStringBuf name) {
}
template <typename TxChar>
-static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) {
+static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) {
if ((*s & 0xFF00) != 0xF000) {
rune_len = 1;
rune = *s;
@@ -214,37 +214,37 @@ static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size
rune_len = 0;
size_t _len = UTF8RuneLen((unsigned char)(*s));
- if (s + _len > end)
- return RECODE_EOINPUT; //[EOINPUT]
- if (_len == 0)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+ if (s + _len > end)
+ return RECODE_EOINPUT; //[EOINPUT]
+ if (_len == 0)
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
- wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX]
+ wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX]
if (_len > 1) {
_rune &= UTF8LeadByteMask(_len);
wchar32 ch = *s++;
if ((ch & 0xFFC0) != 0xF080)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
_rune <<= 6;
- _rune |= ch & 0x3F; //[00000XXX XXYYYYYY]
+ _rune |= ch & 0x3F; //[00000XXX XXYYYYYY]
if (_len > 2) {
ch = *s++;
if ((ch & 0xFFC0) != 0xF080)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
_rune <<= 6;
- _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ]
+ _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ]
if (_len > 3) {
ch = *s;
if ((ch & 0xFFC0) != 0xF080)
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
_rune <<= 6;
- _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ]
+ _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ]
}
}
}
rune_len = _len;
if (_rune > Max<TxChar>())
- rune = ' '; // maybe put sequence
+ rune = ' '; // maybe put sequence
else
rune = TxChar(_rune);
return RECODE_OK;
@@ -262,16 +262,16 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
} else if (enc == CODES_UTF8) {
TxChar* s;
TxChar* d;
-
- for (s = d = str; s < e;) {
+
+ for (s = d = str; s < e;) {
size_t l = 0;
-
+
if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) {
d++, s += l;
} else {
*d++ = BROKEN_RUNE;
++s;
- }
+ }
}
e = d;
} else if (enc == CODES_UNKNOWN) {
@@ -289,7 +289,7 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
size_t read = 0;
size_t written = 0;
- for (; s < e; ++s) {
+ for (; s < e; ++s) {
if (Hi8(Lo16(*s)) == 0xF0) {
buf.push_back(Lo8(Lo16(*s)));
} else {
@@ -318,28 +318,28 @@ void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) {
DoDecodeUnknownPlane(str, ee, enc);
}
-namespace {
+namespace {
class THashSetType: public THashSet<TString> {
- public:
+ public:
inline void Add(const TString& s) {
- insert(s);
- }
-
+ insert(s);
+ }
+
inline bool Has(const TString& s) const noexcept {
- return find(s) != end();
- }
- };
-}
-
+ return find(s) != end();
+ }
+ };
+}
+
class TWindowsPrefixesHashSet: public THashSetType {
public:
inline TWindowsPrefixesHashSet() {
- Add("win");
- Add("wincp");
- Add("window");
+ Add("win");
+ Add("wincp");
+ Add("window");
Add("windowcp");
- Add("windows");
- Add("windowscp");
+ Add("windows");
+ Add("windowscp");
Add("ansi");
Add("ansicp");
}
@@ -364,19 +364,19 @@ public:
};
class TLatinToIsoHash: public THashMap<const char*, TString, ci_hash, ci_equal_to> {
-public:
- inline TLatinToIsoHash() {
- insert(value_type("latin1", "iso-8859-1"));
- insert(value_type("latin2", "iso-8859-2"));
- insert(value_type("latin3", "iso-8859-3"));
- insert(value_type("latin4", "iso-8859-4"));
- insert(value_type("latin5", "iso-8859-9"));
- insert(value_type("latin6", "iso-8859-10"));
- insert(value_type("latin7", "iso-8859-13"));
- insert(value_type("latin8", "iso-8859-14"));
- insert(value_type("latin9", "iso-8859-15"));
- insert(value_type("latin10", "iso-8859-16"));
- }
+public:
+ inline TLatinToIsoHash() {
+ insert(value_type("latin1", "iso-8859-1"));
+ insert(value_type("latin2", "iso-8859-2"));
+ insert(value_type("latin3", "iso-8859-3"));
+ insert(value_type("latin4", "iso-8859-4"));
+ insert(value_type("latin5", "iso-8859-9"));
+ insert(value_type("latin6", "iso-8859-10"));
+ insert(value_type("latin7", "iso-8859-13"));
+ insert(value_type("latin8", "iso-8859-14"));
+ insert(value_type("latin9", "iso-8859-15"));
+ insert(value_type("latin10", "iso-8859-16"));
+ }
};
static inline void NormalizeEncodingPrefixes(TString& enc) {
@@ -391,14 +391,14 @@ static inline void NormalizeEncodingPrefixes(TString& enc) {
}
}
- if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) {
+ if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) {
enc.remove(0, preflen);
enc.prepend("windows-");
return;
}
- if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) {
- if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) {
+ if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) {
+ if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) {
enc.remove(0, preflen);
enc.prepend("windows-");
return;
@@ -408,7 +408,7 @@ static inline void NormalizeEncodingPrefixes(TString& enc) {
return;
}
- if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) {
+ if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) {
if (enc.length() == preflen + 1 || enc.length() == preflen + 2) {
TString enccopy = enc.substr(preflen);
enccopy.prepend("latin");
@@ -428,46 +428,46 @@ static inline void NormalizeEncodingPrefixes(TString& enc) {
class TEncodingNamesHashSet: public THashSetType {
public:
TEncodingNamesHashSet() {
- Add("iso-8859-1");
- Add("iso-8859-2");
- Add("iso-8859-3");
- Add("iso-8859-4");
- Add("iso-8859-5");
- Add("iso-8859-6");
- Add("iso-8859-7");
- Add("iso-8859-8");
- Add("iso-8859-8-i");
- Add("iso-8859-9");
- Add("iso-8859-10");
- Add("iso-8859-11");
- Add("iso-8859-12");
- Add("iso-8859-13");
- Add("iso-8859-14");
- Add("iso-8859-15");
- Add("windows-1250");
- Add("windows-1251");
- Add("windows-1252");
- Add("windows-1253");
- Add("windows-1254");
- Add("windows-1255");
- Add("windows-1256");
- Add("windows-1257");
- Add("windows-1258");
- Add("windows-874");
- Add("iso-2022-jp");
- Add("euc-jp");
- Add("shift-jis");
- Add("shiftjis");
- Add("iso-2022-kr");
- Add("euc-kr");
- Add("gb-2312");
- Add("gb2312");
- Add("gb-18030");
- Add("gb18030");
- Add("gbk");
- Add("big5");
- Add("tis-620");
- Add("tis620");
+ Add("iso-8859-1");
+ Add("iso-8859-2");
+ Add("iso-8859-3");
+ Add("iso-8859-4");
+ Add("iso-8859-5");
+ Add("iso-8859-6");
+ Add("iso-8859-7");
+ Add("iso-8859-8");
+ Add("iso-8859-8-i");
+ Add("iso-8859-9");
+ Add("iso-8859-10");
+ Add("iso-8859-11");
+ Add("iso-8859-12");
+ Add("iso-8859-13");
+ Add("iso-8859-14");
+ Add("iso-8859-15");
+ Add("windows-1250");
+ Add("windows-1251");
+ Add("windows-1252");
+ Add("windows-1253");
+ Add("windows-1254");
+ Add("windows-1255");
+ Add("windows-1256");
+ Add("windows-1257");
+ Add("windows-1258");
+ Add("windows-874");
+ Add("iso-2022-jp");
+ Add("euc-jp");
+ Add("shift-jis");
+ Add("shiftjis");
+ Add("iso-2022-kr");
+ Add("euc-kr");
+ Add("gb-2312");
+ Add("gb2312");
+ Add("gb-18030");
+ Add("gb18030");
+ Add("gbk");
+ Add("big5");
+ Add("tis-620");
+ Add("tis620");
}
};
@@ -494,7 +494,7 @@ ECharset EncodingHintByName(const char* encname) {
// Do some normalization
TString enc(encname, lastpos - encname + 1);
enc.to_lower();
- for (char* p = enc.begin(); p != enc.end(); ++p) {
+ for (char* p = enc.begin(); p != enc.end(); ++p) {
if (*p == ' ' || *p == '=' || *p == '_')
*p = '-';
}
@@ -505,7 +505,7 @@ ECharset EncodingHintByName(const char* encname) {
if (hint != CODES_UNKNOWN)
return hint;
- if (Singleton<TEncodingNamesHashSet>()->Has(enc))
+ if (Singleton<TEncodingNamesHashSet>()->Has(enc))
return CODES_UNSUPPORTED;
return CODES_UNKNOWN;
}