about summary refs log tree commit diff stats
path: root/library/cpp/charset
diff options
context:
space:
mode:
author Anton Samokhvalov <pg83@yandex.ru> 2022-02-10 16:45:15 +0300
committer Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:45:15 +0300
commit 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch)
tree da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/charset
parent 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff)
download ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/charset')
-rw-r--r-- library/cpp/charset/codepage.cpp 278
-rw-r--r-- library/cpp/charset/codepage.h 90
-rw-r--r-- library/cpp/charset/codepage_ut.cpp 138
-rw-r--r-- library/cpp/charset/cp_encrec.cpp 10
-rw-r--r-- library/cpp/charset/doccodes.cpp 2
-rw-r--r-- library/cpp/charset/doccodes.h 72
-rw-r--r-- library/cpp/charset/iconv.cpp 186
-rw-r--r-- library/cpp/charset/iconv.h 82
-rw-r--r-- library/cpp/charset/iconv_ut.cpp 54
-rw-r--r-- library/cpp/charset/recyr.hh 34
-rw-r--r-- library/cpp/charset/recyr_int.hh 540
-rw-r--r-- library/cpp/charset/wide.h 20
-rw-r--r-- library/cpp/charset/wide_ut.cpp 96
-rw-r--r-- library/cpp/charset/ya.make 6
14 files changed, 804 insertions, 804 deletions
diff --git a/library/cpp/charset/codepage.cpp b/library/cpp/charset/codepage.cpp
index 0431bef31b..816f3fec67 100644
--- a/library/cpp/charset/codepage.cpp
+++ b/library/cpp/charset/codepage.cpp
@@ -1,45 +1,45 @@
#include "ci_string.h"
-#include "wide.h"
-#include "recyr.hh"
-#include "codepage.h"
-
-#include <util/string/cast.h>
+#include "wide.h"
+#include "recyr.hh"
+#include "codepage.h"
+
+#include <util/string/cast.h>
#include <util/string/subst.h>
-#include <util/string/util.h>
+#include <util/string/util.h>
#include <util/system/hi_lo.h>
-#include <util/system/yassert.h>
-#include <util/generic/hash.h>
+#include <util/system/yassert.h>
+#include <util/generic/hash.h>
#include <util/generic/string.h>
-#include <util/generic/vector.h>
-#include <util/generic/hash_set.h>
-#include <util/generic/singleton.h>
+#include <util/generic/vector.h>
+#include <util/generic/hash_set.h>
+#include <util/generic/singleton.h>
#include <util/generic/yexception.h>
#include <util/memory/pool.h>
-
-#include <cstring>
-
-#include <ctype.h>
-
+
+#include <cstring>
+
+#include <ctype.h>
+
using namespace NCodepagePrivate;
-void Recoder::Create(const CodePage& source, const CodePage& target) {
+void Recoder::Create(const CodePage& source, const CodePage& target) {
const Encoder* wideTarget = &EncoderByCharset(target.CPEnum);
Create(source, wideTarget);
}
-void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) {
+void Recoder::Create(const CodePage& page, wchar32 (*mapfunc)(wchar32)) {
const Encoder* widePage = &EncoderByCharset(page.CPEnum);
Create(page, widePage, mapfunc);
}
-template <class T, class T1>
+template <class T, class T1>
static inline T1 Apply(T b, T e, T1 to, const Recoder& mapper) {
- while (b != e) {
- *to++ = mapper.Table[(unsigned char)*b++];
- }
-
- return to;
-}
-
+ while (b != e) {
+ *to++ = mapper.Table[(unsigned char)*b++];
+ }
+
+ return to;
+}
+
template <class T, class T1>
static inline T1 Apply(T b, T1 to, const Recoder& mapper) {
while (*b != 0) {
@@ -49,21 +49,21 @@ static inline T1 Apply(T b, T1 to, const Recoder& mapper) {
return to;
}
-char* CodePage::ToLower(const char* b, const char* e, char* to) const {
- return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]);
-}
+char* CodePage::ToLower(const char* b, const char* e, char* to) const {
+ return Apply(b, e, to, TCodePageData::rcdr_to_lower[CPEnum]);
+}
char* CodePage::ToLower(const char* b, char* to) const {
return Apply(b, to, TCodePageData::rcdr_to_lower[CPEnum]);
}
-
-char* CodePage::ToUpper(const char* b, const char* e, char* to) const {
+
+char* CodePage::ToUpper(const char* b, const char* e, char* to) const {
return Apply(b, e, to, TCodePageData::rcdr_to_upper[CPEnum]);
-}
+}
char* CodePage::ToUpper(const char* b, char* to) const {
return Apply(b, to, TCodePageData::rcdr_to_upper[CPEnum]);
}
-
-int CodePage::stricmp(const char* dst, const char* src) const {
+
+int CodePage::stricmp(const char* dst, const char* src) const {
unsigned char f, l;
do {
f = ToLower(*dst++);
@@ -86,18 +86,18 @@ int CodePage::strnicmp(const char* dst, const char* src, size_t len) const {
static const CodePage UNSUPPORTED_CODEPAGE = {
CODES_UNSUPPORTED,
- {
- "unsupported",
- },
+ {
+ "unsupported",
+ },
{},
nullptr,
};
static const CodePage UNKNOWN_CODEPAGE = {
CODES_UNKNOWN,
- {
- "unknown",
- },
+ {
+ "unknown",
+ },
{},
nullptr,
};
@@ -122,14 +122,14 @@ NCodepagePrivate::TCodepagesMap::TCodepagesMap() {
}
}
-const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() {
- return *Singleton<NCodepagePrivate::TCodepagesMap>();
-}
-
+const NCodepagePrivate::TCodepagesMap& NCodepagePrivate::TCodepagesMap::Instance() {
+ return *Singleton<NCodepagePrivate::TCodepagesMap>();
+}
+
class TCodePageHash {
private:
using TData = THashMap<TStringBuf, ECharset, ci_hash, ci_equal_to>;
-
+
TData Data;
TMemoryPool Pool;
@@ -153,7 +153,7 @@ private:
temp = name;
SubstGlobal(temp, '-', '_');
AddNameWithCheck(temp, code);
-
+
temp = name;
SubstGlobal(temp, '_', '-');
AddNameWithCheck(temp, code);
@@ -176,8 +176,8 @@ public:
AddName(name, e);
AddName(xPrefix + name, e);
- }
- }
+ }
+ }
}
inline ECharset CharsetByName(TStringBuf name) {
@@ -204,7 +204,7 @@ ECharset CharsetByNameOrDie(TStringBuf name) {
}
template <typename TxChar>
-static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) {
+static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) {
if ((*s & 0xFF00) != 0xF000) {
rune_len = 1;
rune = *s;
@@ -214,37 +214,37 @@ static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size
rune_len = 0;
size_t _len = UTF8RuneLen((unsigned char)(*s));
- if (s + _len > end)
- return RECODE_EOINPUT; //[EOINPUT]
- if (_len == 0)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
+ if (s + _len > end)
+ return RECODE_EOINPUT; //[EOINPUT]
+ if (_len == 0)
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
- wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX]
+ wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX]
if (_len > 1) {
_rune &= UTF8LeadByteMask(_len);
wchar32 ch = *s++;
if ((ch & 0xFFC0) != 0xF080)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
_rune <<= 6;
- _rune |= ch & 0x3F; //[00000XXX XXYYYYYY]
+ _rune |= ch & 0x3F; //[00000XXX XXYYYYYY]
if (_len > 2) {
ch = *s++;
if ((ch & 0xFFC0) != 0xF080)
- return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
+ return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
_rune <<= 6;
- _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ]
+ _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ]
if (_len > 3) {
ch = *s;
if ((ch & 0xFFC0) != 0xF080)
return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
_rune <<= 6;
- _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ]
+ _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ]
}
}
}
rune_len = _len;
if (_rune > Max<TxChar>())
- rune = ' '; // maybe put sequence
+ rune = ' '; // maybe put sequence
else
rune = TxChar(_rune);
return RECODE_OK;
@@ -262,16 +262,16 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
} else if (enc == CODES_UTF8) {
TxChar* s;
TxChar* d;
-
- for (s = d = str; s < e;) {
+
+ for (s = d = str; s < e;) {
size_t l = 0;
-
+
if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) {
d++, s += l;
} else {
*d++ = BROKEN_RUNE;
++s;
- }
+ }
}
e = d;
} else if (enc == CODES_UNKNOWN) {
@@ -289,7 +289,7 @@ void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
size_t read = 0;
size_t written = 0;
- for (; s < e; ++s) {
+ for (; s < e; ++s) {
if (Hi8(Lo16(*s)) == 0xF0) {
buf.push_back(Lo8(Lo16(*s)));
} else {
@@ -318,28 +318,28 @@ void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) {
DoDecodeUnknownPlane(str, ee, enc);
}
-namespace {
+namespace {
class THashSetType: public THashSet<TString> {
- public:
+ public:
inline void Add(const TString& s) {
- insert(s);
- }
-
+ insert(s);
+ }
+
inline bool Has(const TString& s) const noexcept {
- return find(s) != end();
- }
- };
-}
-
+ return find(s) != end();
+ }
+ };
+}
+
class TWindowsPrefixesHashSet: public THashSetType {
public:
inline TWindowsPrefixesHashSet() {
- Add("win");
- Add("wincp");
- Add("window");
+ Add("win");
+ Add("wincp");
+ Add("window");
Add("windowcp");
- Add("windows");
- Add("windowscp");
+ Add("windows");
+ Add("windowscp");
Add("ansi");
Add("ansicp");
}
@@ -364,19 +364,19 @@ public:
};
class TLatinToIsoHash: public THashMap<const char*, TString, ci_hash, ci_equal_to> {
-public:
- inline TLatinToIsoHash() {
- insert(value_type("latin1", "iso-8859-1"));
- insert(value_type("latin2", "iso-8859-2"));
- insert(value_type("latin3", "iso-8859-3"));
- insert(value_type("latin4", "iso-8859-4"));
- insert(value_type("latin5", "iso-8859-9"));
- insert(value_type("latin6", "iso-8859-10"));
- insert(value_type("latin7", "iso-8859-13"));
- insert(value_type("latin8", "iso-8859-14"));
- insert(value_type("latin9", "iso-8859-15"));
- insert(value_type("latin10", "iso-8859-16"));
- }
+public:
+ inline TLatinToIsoHash() {
+ insert(value_type("latin1", "iso-8859-1"));
+ insert(value_type("latin2", "iso-8859-2"));
+ insert(value_type("latin3", "iso-8859-3"));
+ insert(value_type("latin4", "iso-8859-4"));
+ insert(value_type("latin5", "iso-8859-9"));
+ insert(value_type("latin6", "iso-8859-10"));
+ insert(value_type("latin7", "iso-8859-13"));
+ insert(value_type("latin8", "iso-8859-14"));
+ insert(value_type("latin9", "iso-8859-15"));
+ insert(value_type("latin10", "iso-8859-16"));
+ }
};
static inline void NormalizeEncodingPrefixes(TString& enc) {
@@ -391,14 +391,14 @@ static inline void NormalizeEncodingPrefixes(TString& enc) {
}
}
- if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) {
+ if (Singleton<TWindowsPrefixesHashSet>()->Has(prefix)) {
enc.remove(0, preflen);
enc.prepend("windows-");
return;
}
- if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) {
- if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) {
+ if (Singleton<TCpPrefixesHashSet>()->Has(prefix)) {
+ if (enc.length() > preflen + 3 && !strncmp(enc.c_str() + preflen, "125", 3) && isdigit(enc[preflen + 3])) {
enc.remove(0, preflen);
enc.prepend("windows-");
return;
@@ -408,7 +408,7 @@ static inline void NormalizeEncodingPrefixes(TString& enc) {
return;
}
- if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) {
+ if (Singleton<TIsoPrefixesHashSet>()->Has(prefix)) {
if (enc.length() == preflen + 1 || enc.length() == preflen + 2) {
TString enccopy = enc.substr(preflen);
enccopy.prepend("latin");
@@ -428,46 +428,46 @@ static inline void NormalizeEncodingPrefixes(TString& enc) {
class TEncodingNamesHashSet: public THashSetType {
public:
TEncodingNamesHashSet() {
- Add("iso-8859-1");
- Add("iso-8859-2");
- Add("iso-8859-3");
- Add("iso-8859-4");
- Add("iso-8859-5");
- Add("iso-8859-6");
- Add("iso-8859-7");
- Add("iso-8859-8");
- Add("iso-8859-8-i");
- Add("iso-8859-9");
- Add("iso-8859-10");
- Add("iso-8859-11");
- Add("iso-8859-12");
- Add("iso-8859-13");
- Add("iso-8859-14");
- Add("iso-8859-15");
- Add("windows-1250");
- Add("windows-1251");
- Add("windows-1252");
- Add("windows-1253");
- Add("windows-1254");
- Add("windows-1255");
- Add("windows-1256");
- Add("windows-1257");
- Add("windows-1258");
- Add("windows-874");
- Add("iso-2022-jp");
- Add("euc-jp");
- Add("shift-jis");
- Add("shiftjis");
- Add("iso-2022-kr");
- Add("euc-kr");
- Add("gb-2312");
- Add("gb2312");
- Add("gb-18030");
- Add("gb18030");
- Add("gbk");
- Add("big5");
- Add("tis-620");
- Add("tis620");
+ Add("iso-8859-1");
+ Add("iso-8859-2");
+ Add("iso-8859-3");
+ Add("iso-8859-4");
+ Add("iso-8859-5");
+ Add("iso-8859-6");
+ Add("iso-8859-7");
+ Add("iso-8859-8");
+ Add("iso-8859-8-i");
+ Add("iso-8859-9");
+ Add("iso-8859-10");
+ Add("iso-8859-11");
+ Add("iso-8859-12");
+ Add("iso-8859-13");
+ Add("iso-8859-14");
+ Add("iso-8859-15");
+ Add("windows-1250");
+ Add("windows-1251");
+ Add("windows-1252");
+ Add("windows-1253");
+ Add("windows-1254");
+ Add("windows-1255");
+ Add("windows-1256");
+ Add("windows-1257");
+ Add("windows-1258");
+ Add("windows-874");
+ Add("iso-2022-jp");
+ Add("euc-jp");
+ Add("shift-jis");
+ Add("shiftjis");
+ Add("iso-2022-kr");
+ Add("euc-kr");
+ Add("gb-2312");
+ Add("gb2312");
+ Add("gb-18030");
+ Add("gb18030");
+ Add("gbk");
+ Add("big5");
+ Add("tis-620");
+ Add("tis620");
}
};
@@ -494,7 +494,7 @@ ECharset EncodingHintByName(const char* encname) {
// Do some normalization
TString enc(encname, lastpos - encname + 1);
enc.to_lower();
- for (char* p = enc.begin(); p != enc.end(); ++p) {
+ for (char* p = enc.begin(); p != enc.end(); ++p) {
if (*p == ' ' || *p == '=' || *p == '_')
*p = '-';
}
@@ -505,7 +505,7 @@ ECharset EncodingHintByName(const char* encname) {
if (hint != CODES_UNKNOWN)
return hint;
- if (Singleton<TEncodingNamesHashSet>()->Has(enc))
+ if (Singleton<TEncodingNamesHashSet>()->Has(enc))
return CODES_UNSUPPORTED;
return CODES_UNKNOWN;
}
diff --git a/library/cpp/charset/codepage.h b/library/cpp/charset/codepage.h
index 30a02a4610..2911174dce 100644
--- a/library/cpp/charset/codepage.h
+++ b/library/cpp/charset/codepage.h
@@ -1,6 +1,6 @@
#pragma once
-#include "doccodes.h"
+#include "doccodes.h"
#include <util/charset/recode_result.h>
#include <util/charset/unidata.h> // all wchar32 functions
@@ -8,11 +8,11 @@
#include <util/generic/string.h>
#include <util/generic/ylimits.h>
#include <util/generic/yexception.h>
-#include <util/system/yassert.h>
-#include <util/system/defaults.h>
-
-#include <cctype>
-
+#include <util/system/yassert.h>
+#include <util/system/defaults.h>
+
+#include <cctype>
+
struct CodePage;
struct Recoder;
struct Encoder;
@@ -21,10 +21,10 @@ struct Encoder;
* struct CodePage *
\*****************************************************************/
struct CodePage {
- ECharset CPEnum; // int MIBEnum;
- const char* Names[30]; // name[0] -- preferred mime-name
- wchar32 unicode[256];
- const char* DefaultChar; //[CCL_NUM]
+ ECharset CPEnum; // int MIBEnum;
+ const char* Names[30]; // name[0] -- preferred mime-name
+ wchar32 unicode[256];
+ const char* DefaultChar; //[CCL_NUM]
bool IsLower(unsigned char ch) const {
return ::IsLower(unicode[ch]);
@@ -38,7 +38,7 @@ struct CodePage {
bool IsDigit(unsigned char ch) const {
return ::IsDigit(unicode[ch]);
}
- bool IsXdigit(unsigned char ch) const {
+ bool IsXdigit(unsigned char ch) const {
return ::IsXdigit(unicode[ch]);
}
bool IsAlnum(unsigned char ch) const {
@@ -62,18 +62,18 @@ struct CodePage {
bool IsComposed(unsigned char ch) const {
return ::IsComposed(unicode[ch]);
}
-
+
// return pointer to char after the last char
- char* ToLower(const char* begin, const char* end, char* to) const;
+ char* ToLower(const char* begin, const char* end, char* to) const;
char* ToLower(const char* begin, char* to) const;
- // return pointer to char after the last char
- char* ToUpper(const char* begin, const char* end, char* to) const;
+ // return pointer to char after the last char
+ char* ToUpper(const char* begin, const char* end, char* to) const;
char* ToUpper(const char* begin, char* to) const;
-
- int stricmp(const char* s1, const char* s2) const;
- int strnicmp(const char* s1, const char* s2, size_t len) const;
-
+
+ int stricmp(const char* s1, const char* s2) const;
+ int strnicmp(const char* s1, const char* s2, size_t len) const;
+
inline unsigned char ToUpper(unsigned char ch) const;
inline unsigned char ToLower(unsigned char ch) const;
inline unsigned char ToTitle(unsigned char ch) const;
@@ -131,18 +131,18 @@ namespace NCodepagePrivate {
return GetPrivate(e)->Names[0];
}
- static const TCodepagesMap& Instance();
-
+ static const TCodepagesMap& Instance();
+
friend class ::TCodePageHash;
};
inline bool NativeCodepage(ECharset e) {
- return ::NCodepagePrivate::TCodepagesMap::Instance().NativeCodepage(e);
+ return ::NCodepagePrivate::TCodepagesMap::Instance().NativeCodepage(e);
}
}
inline bool SingleByteCodepage(ECharset e) {
- return ::NCodepagePrivate::TCodepagesMap::Instance().SingleByteCodepage(e);
+ return ::NCodepagePrivate::TCodepagesMap::Instance().SingleByteCodepage(e);
}
inline bool ValidCodepage(ECharset e) {
@@ -150,7 +150,7 @@ inline bool ValidCodepage(ECharset e) {
}
inline const CodePage* CodePageByCharset(ECharset e) {
- return ::NCodepagePrivate::TCodepagesMap::Instance().Get(e);
+ return ::NCodepagePrivate::TCodepagesMap::Instance().Get(e);
}
ECharset CharsetByName(TStringBuf name);
@@ -163,12 +163,12 @@ inline ECharset CharsetByCodePage(const CodePage* CP) {
}
inline const char* NameByCharset(ECharset e) {
- return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e);
+ return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e);
}
inline const char* NameByCharsetSafe(ECharset e) {
if (CODES_UNKNOWN < e && e < CODES_MAX)
- return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e);
+ return ::NCodepagePrivate::TCodepagesMap::Instance().NameByCharset(e);
else
ythrow yexception() << "unknown encoding: " << (int)e;
}
@@ -194,21 +194,21 @@ struct Encoder {
char* Table[256];
const char* DefaultChar;
- inline char Code(wchar32 ch) const {
+ inline char Code(wchar32 ch) const {
if (ch > 0xFFFF)
return 0;
- return (unsigned char)Table[(ch >> 8) & 255][ch & 255];
+ return (unsigned char)Table[(ch >> 8) & 255][ch & 255];
}
- inline char Tr(wchar32 ch) const {
+ inline char Tr(wchar32 ch) const {
char code = Code(ch);
if (code == 0 && ch != 0)
- code = DefaultChar[NUnicode::CharType(ch)];
+ code = DefaultChar[NUnicode::CharType(ch)];
Y_ASSERT(code != 0 || ch == 0);
return code;
}
- inline unsigned char operator[](wchar32 ch) const {
+ inline unsigned char operator[](wchar32 ch) const {
return Tr(ch);
}
@@ -223,25 +223,25 @@ struct Encoder {
struct Recoder {
unsigned char Table[257];
- void Create(const CodePage& source, const CodePage& target);
- void Create(const CodePage& source, const Encoder* wideTarget);
+ void Create(const CodePage& source, const CodePage& target);
+ void Create(const CodePage& source, const Encoder* wideTarget);
- void Create(const CodePage& page, wchar32 (*mapper)(wchar32));
- void Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapper)(wchar32));
+ void Create(const CodePage& page, wchar32 (*mapper)(wchar32));
+ void Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapper)(wchar32));
- inline unsigned char Tr(unsigned char c) const {
+ inline unsigned char Tr(unsigned char c) const {
return Table[c];
}
- inline unsigned char operator[](unsigned char c) const {
+ inline unsigned char operator[](unsigned char c) const {
return Table[c];
}
- void Tr(const char* in, char* out, size_t len) const;
- void Tr(const char* in, char* out) const;
- void Tr(char* in_out, size_t len) const;
- void Tr(char* in_out) const;
+ void Tr(const char* in, char* out, size_t len) const;
+ void Tr(const char* in, char* out) const;
+ void Tr(char* in_out, size_t len) const;
+ void Tr(char* in_out) const;
};
-extern const struct Encoder& WideCharToYandex;
+extern const struct Encoder& WideCharToYandex;
const Encoder& EncoderByCharset(ECharset enc);
@@ -255,7 +255,7 @@ namespace NCodepagePrivate {
static const Recoder rcdr_to_lower[];
static const Recoder rcdr_to_upper[];
static const Recoder rcdr_to_title[];
-
+
static const Encoder* const EncodeTo[];
friend struct ::CodePage;
@@ -264,7 +264,7 @@ namespace NCodepagePrivate {
friend RECODE_RESULT _recodeFromYandex(ECharset, const char*, char*, size_t, size_t, size_t&, size_t&);
friend const Encoder& ::EncoderByCharset(ECharset enc);
};
-}
+}
inline const Encoder& EncoderByCharset(ECharset enc) {
if (!SingleByteCodepage(enc)) {
@@ -319,6 +319,6 @@ inline TString ToTitle(TString s, const CodePage& cp, size_t pos = 0, size_t n =
return i == pos ? cp.ToTitle(c) : cp.ToLower(c);
},
pos,
- n);
+ n);
return s;
}
diff --git a/library/cpp/charset/codepage_ut.cpp b/library/cpp/charset/codepage_ut.cpp
index c3ac3ac478..1a572cac44 100644
--- a/library/cpp/charset/codepage_ut.cpp
+++ b/library/cpp/charset/codepage_ut.cpp
@@ -1,47 +1,47 @@
#include "codepage.h"
#include "recyr.hh"
-#include "wide.h"
-
+#include "wide.h"
+
#include <library/cpp/testing/unittest/registar.h>
-
+
#include <util/charset/utf8.h>
#include <util/system/yassert.h>
-#if defined(_MSC_VER)
-#pragma warning(disable : 4309) /*truncation of constant value*/
+#if defined(_MSC_VER)
+#pragma warning(disable : 4309) /*truncation of constant value*/
#endif
namespace {
const char yandexUpperCase[] =
- "\x81\x82\x83\x84\x85\x86\x87"
- "\x8E"
- "\xA1\xA2\xA3\xA4\xA5\xA6"
- "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
+ "\x81\x82\x83\x84\x85\x86\x87"
+ "\x8E"
+ "\xA1\xA2\xA3\xA4\xA5\xA6"
+ "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
"\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"
"\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF";
const char yandexLowerCase[] =
- "\x91\x92\x93\x94\x95\x96\x97"
- "\x9E"
- "\xB1\xB2\xB3\xB4\xB5\xB6"
- "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
+ "\x91\x92\x93\x94\x95\x96\x97"
+ "\x9E"
+ "\xB1\xB2\xB3\xB4\xB5\xB6"
+ "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
"\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"
"\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
}
-class TCodepageTest: public TTestBase {
+class TCodepageTest: public TTestBase {
private:
UNIT_TEST_SUITE(TCodepageTest);
- UNIT_TEST(TestUTF);
- UNIT_TEST(TestUTFFromUnknownPlane);
- UNIT_TEST(TestBrokenMultibyte);
- UNIT_TEST(TestSurrogatePairs);
- UNIT_TEST(TestEncodingHints);
- UNIT_TEST(TestToLower);
- UNIT_TEST(TestToUpper);
- UNIT_TEST(TestUpperLower);
- UNIT_TEST(TestBrokenRune);
- UNIT_TEST(TestCanEncode);
+ UNIT_TEST(TestUTF);
+ UNIT_TEST(TestUTFFromUnknownPlane);
+ UNIT_TEST(TestBrokenMultibyte);
+ UNIT_TEST(TestSurrogatePairs);
+ UNIT_TEST(TestEncodingHints);
+ UNIT_TEST(TestToLower);
+ UNIT_TEST(TestToUpper);
+ UNIT_TEST(TestUpperLower);
+ UNIT_TEST(TestBrokenRune);
+ UNIT_TEST(TestCanEncode);
UNIT_TEST_SUITE_END();
public:
@@ -55,18 +55,18 @@ public:
void TestCanEncode();
- inline void TestUpperLower() {
+ inline void TestUpperLower() {
const CodePage* cp = CodePageByCharset(CODES_ASCII);
- char tmp[100];
-
+ char tmp[100];
+
TStringBuf s = "abcde";
-
- TStringBuf upper(tmp, cp->ToUpper(s.begin(), s.end(), tmp));
+
+ TStringBuf upper(tmp, cp->ToUpper(s.begin(), s.end(), tmp));
UNIT_ASSERT_VALUES_EQUAL(upper, TStringBuf("ABCDE"));
-
- TStringBuf lower(tmp, cp->ToLower(upper.begin(), upper.end(), tmp));
+
+ TStringBuf lower(tmp, cp->ToLower(upper.begin(), upper.end(), tmp));
UNIT_ASSERT_VALUES_EQUAL(lower, TStringBuf("abcde"));
- }
+ }
void TestBrokenRune() {
UNIT_ASSERT_VALUES_EQUAL(BROKEN_RUNE, 0xFFFDu);
@@ -78,7 +78,7 @@ UNIT_TEST_SUITE_REGISTRATION(TCodepageTest);
void TCodepageTest::TestUTF() {
for (wchar32 i = 0; i <= 0x10FFFF; i++) {
unsigned char buffer[32];
- Zero(buffer);
+ Zero(buffer);
size_t rune_len;
size_t ref_len = 0;
@@ -120,7 +120,7 @@ void TCodepageTest::TestUTF() {
UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
}
}
- const char* badStrings[] = {
+ const char* badStrings[] = {
"\xfe",
"\xff",
"\xcc\xc0",
@@ -153,7 +153,7 @@ void TCodepageTest::TestUTF() {
};
for (size_t i = 0; i < Y_ARRAY_SIZE(badStrings); ++i) {
wchar32 rune;
- const ui8* p = (const ui8*)badStrings[i];
+ const ui8* p = (const ui8*)badStrings[i];
size_t len;
RECODE_RESULT res = SafeReadUTF8Char(rune, len, p, p + strlen(badStrings[i]));
UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
@@ -174,17 +174,17 @@ void TCodepageTest::TestBrokenMultibyte() {
UNIT_ASSERT(nread == 1);
UNIT_ASSERT(nwritten == 0);
- const char bigSample[] = {'\xC3', '\x87', '\xC3', '\x8E', '\xC2', '\xB0', '\xC3', '\x85', '\xC3', '\x85', '\xC3', '\xB8'};
+ const char bigSample[] = {'\xC3', '\x87', '\xC3', '\x8E', '\xC2', '\xB0', '\xC3', '\x85', '\xC3', '\x85', '\xC3', '\xB8'};
res = RecodeToUnicode(cp, bigSample, recodeResult, Y_ARRAY_SIZE(bigSample), Y_ARRAY_SIZE(recodeResult), nread, nwritten);
UNIT_ASSERT(res == RECODE_OK);
UNIT_ASSERT(nread == Y_ARRAY_SIZE(bigSample));
}
void TCodepageTest::TestUTFFromUnknownPlane() {
- static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20,
- 0x430, 0x431, 0x432, 0x20,
- 0x1001, 0x1002, 0x1003, 0x20,
- 0x10001, 0x10002, 0x10003};
+ static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20,
+ 0x430, 0x431, 0x432, 0x20,
+ 0x1001, 0x1002, 0x1003, 0x20,
+ 0x10001, 0x10002, 0x10003};
static const size_t BUFFER_SIZE = 1024;
char bytebuffer[BUFFER_SIZE];
@@ -192,17 +192,17 @@ void TCodepageTest::TestUTFFromUnknownPlane() {
size_t readchars = 0;
size_t writtenbytes = 0;
size_t samplelen = Y_ARRAY_SIZE(sampletext);
+
+ RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes);
- RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes);
-
- UNIT_ASSERT(res == RECODE_OK);
- UNIT_ASSERT(samplelen == readchars);
+ UNIT_ASSERT(res == RECODE_OK);
+ UNIT_ASSERT(samplelen == readchars);
size_t writtenbytes2 = 0;
char bytebuffer2[BUFFER_SIZE];
for (size_t i = 0; i != samplelen; ++i) {
size_t nwr = 0;
- const int res = RecodeFromUnicode(CODES_UTF8, sampletext[i], bytebuffer2 + writtenbytes2, BUFFER_SIZE - writtenbytes2, nwr);
+ const int res = RecodeFromUnicode(CODES_UTF8, sampletext[i], bytebuffer2 + writtenbytes2, BUFFER_SIZE - writtenbytes2, nwr);
UNIT_ASSERT_VALUES_EQUAL(res, int(RECODE_OK));
writtenbytes2 += nwr;
UNIT_ASSERT(BUFFER_SIZE > writtenbytes2);
@@ -213,43 +213,43 @@ void TCodepageTest::TestUTFFromUnknownPlane() {
size_t readbytes = 0;
size_t writtenchars = 0;
- res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
+ res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
- UNIT_ASSERT(res == RECODE_OK);
- UNIT_ASSERT(readbytes == writtenbytes);
+ UNIT_ASSERT(res == RECODE_OK);
+ UNIT_ASSERT(readbytes == writtenbytes);
wchar32* charbufferend = charbuffer + writtenchars;
- DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
+ DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
- UNIT_ASSERT(charbufferend == charbuffer + samplelen);
- for (size_t i = 0; i < samplelen; ++i)
- UNIT_ASSERT(sampletext[i] == charbuffer[i]);
+ UNIT_ASSERT(charbufferend == charbuffer + samplelen);
+ for (size_t i = 0; i < samplelen; ++i)
+ UNIT_ASSERT(sampletext[i] == charbuffer[i]);
// Now, concatenate the thing with an explicit character and retest
- res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
- UNIT_ASSERT(res == RECODE_OK);
- UNIT_ASSERT(readbytes == writtenbytes);
+ res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
+ UNIT_ASSERT(res == RECODE_OK);
+ UNIT_ASSERT(readbytes == writtenbytes);
charbuffer[writtenchars] = 0x1234;
size_t morewrittenchars = 0;
- res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars);
- UNIT_ASSERT(res == RECODE_OK);
- UNIT_ASSERT(readbytes == writtenbytes);
- UNIT_ASSERT(writtenchars == morewrittenchars);
+ res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars);
+ UNIT_ASSERT(res == RECODE_OK);
+ UNIT_ASSERT(readbytes == writtenbytes);
+ UNIT_ASSERT(writtenchars == morewrittenchars);
charbuffer[2 * writtenchars + 1] = 0x5678;
charbufferend = charbuffer + 2 * writtenchars + 2;
- DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
+ DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
- UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2);
+ UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2);
for (size_t i = 0; i < samplelen; ++i) {
- UNIT_ASSERT(sampletext[i] == charbuffer[i]);
- UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]);
+ UNIT_ASSERT(sampletext[i] == charbuffer[i]);
+ UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]);
}
- UNIT_ASSERT(0x1234 == charbuffer[samplelen]);
- UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]);
+ UNIT_ASSERT(0x1234 == charbuffer[samplelen]);
+ UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]);
// test TChar version
// bytebuffer of len writtenbytes contains sampletext of len samplelen chars in utf8
@@ -261,7 +261,7 @@ void TCodepageTest::TestUTFFromUnknownPlane() {
for (size_t i = 0; i < wtr.size(); ++i) {
if (sampletext[i] >= 0x10000) {
UNIT_ASSERT_VALUES_EQUAL(wtr[i], ' ');
- } else {
+ } else {
UNIT_ASSERT_VALUES_EQUAL(wtr[i], sampletext[i]);
}
}
@@ -290,11 +290,11 @@ static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize
void TCodepageTest::TestSurrogatePairs() {
const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba";
- wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
+ wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy));
const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n";
- wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
+ wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2));
}
@@ -356,7 +356,7 @@ static void TestCanEncodeEach(const TWtringBuf& text, ECharset encoding, bool ex
for (size_t i = 0; i < text.size(); ++i) {
if (CanBeEncoded(text.SubStr(i, 1), encoding) != expectedResult)
ythrow yexception() << "assertion failed: encoding " << NameByCharset(encoding)
- << " on '" << text.SubStr(i, 1) << "' (expected " << expectedResult << ")";
+ << " on '" << text.SubStr(i, 1) << "' (expected " << expectedResult << ")";
}
// whole text
UNIT_ASSERT_EQUAL(CanBeEncoded(text, encoding), expectedResult);
diff --git a/library/cpp/charset/cp_encrec.cpp b/library/cpp/charset/cp_encrec.cpp
index e4570cd628..aa68278a04 100644
--- a/library/cpp/charset/cp_encrec.cpp
+++ b/library/cpp/charset/cp_encrec.cpp
@@ -1,5 +1,5 @@
-#include "codepage.h"
-
+#include "codepage.h"
+
#include <util/stream/output.h>
void Encoder::Tr(const wchar32* in, char* out, size_t len) const {
@@ -13,14 +13,14 @@ void Encoder::Tr(const wchar32* in, char* out) const {
} while (*in++);
}
-void Recoder::Create(const CodePage& source, const Encoder* wideTarget) {
- for (size_t i = 0; i != 256; ++i) {
+void Recoder::Create(const CodePage& source, const Encoder* wideTarget) {
+ for (size_t i = 0; i != 256; ++i) {
Table[i] = wideTarget->Tr(source.unicode[i]);
Y_ASSERT(Table[i] != 0 || i == 0);
}
}
-void Recoder::Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapfunc)(wchar32)) {
+void Recoder::Create(const CodePage& page, const Encoder* widePage, wchar32 (*mapfunc)(wchar32)) {
for (size_t i = 0; i != 256; ++i) {
char c = widePage->Code((*mapfunc)(page.unicode[i]));
Table[i] = (c == 0 && i != 0) ? (unsigned char)i : (unsigned char)c;
diff --git a/library/cpp/charset/doccodes.cpp b/library/cpp/charset/doccodes.cpp
index 1fc17a3275..e0384a7f88 100644
--- a/library/cpp/charset/doccodes.cpp
+++ b/library/cpp/charset/doccodes.cpp
@@ -1 +1 @@
-#include "doccodes.h"
+#include "doccodes.h"
diff --git a/library/cpp/charset/doccodes.h b/library/cpp/charset/doccodes.h
index 75c87adf9e..0aa7eb2d2b 100644
--- a/library/cpp/charset/doccodes.h
+++ b/library/cpp/charset/doccodes.h
@@ -1,45 +1,45 @@
#pragma once
-enum ECharset {
+enum ECharset {
CODES_UNSUPPORTED = -2, // valid but unsupported encoding
- CODES_UNKNOWN = -1, // invalid or unspecified encoding
- CODES_WIN, // [ 0] WINDOWS_1251 Windows
- CODES_KOI8, // [ 1] KOI8_U Koi8-u
- CODES_ALT, // [ 2] IBM_866 MS DOS, alternative
- CODES_MAC, // [ 3] MAC_CYRILLIC Macintosh
- CODES_MAIN, // [ 4] ISO_LATIN_CYRILLIC Main
- CODES_ASCII, // [ 5] WINDOWS_1252 Latin 1
- CODES_RESERVED_3, // reserved code: use it for new encodings before adding them to the end of the list
- CODES_WIN_EAST, // [ 7] WINDOWS_1250 WIN PL
- CODES_ISO_EAST, // [ 8] ISO_8859_2 ISO PL
+ CODES_UNKNOWN = -1, // invalid or unspecified encoding
+ CODES_WIN, // [ 0] WINDOWS_1251 Windows
+ CODES_KOI8, // [ 1] KOI8_U Koi8-u
+ CODES_ALT, // [ 2] IBM_866 MS DOS, alternative
+ CODES_MAC, // [ 3] MAC_CYRILLIC Macintosh
+ CODES_MAIN, // [ 4] ISO_LATIN_CYRILLIC Main
+ CODES_ASCII, // [ 5] WINDOWS_1252 Latin 1
+ CODES_RESERVED_3, // reserved code: use it for new encodings before adding them to the end of the list
+ CODES_WIN_EAST, // [ 7] WINDOWS_1250 WIN PL
+ CODES_ISO_EAST, // [ 8] ISO_8859_2 ISO PL
// our superset of subset of windows-1251
- CODES_YANDEX, // [ 9] YANDEX
- CODES_UTF_16BE, // [10] UTF_16BE
- CODES_UTF_16LE, // [11] UTF_16LE
+ CODES_YANDEX, // [ 9] YANDEX
+ CODES_UTF_16BE, // [10] UTF_16BE
+ CODES_UTF_16LE, // [11] UTF_16LE
// missing standard codepages
- CODES_IBM855, // [12] IBM_855
- CODES_UTF8, // [13] UTF8
- CODES_UNKNOWNPLANE, // [14] Unrecognized characters are mapped into the PUA: U+F000..U+F0FF
+ CODES_IBM855, // [12] IBM_855
+ CODES_UTF8, // [13] UTF8
+ CODES_UNKNOWNPLANE, // [14] Unrecognized characters are mapped into the PUA: U+F000..U+F0FF
- CODES_KAZWIN, // [15] WINDOWS_1251_K Kazakh version of Windows-1251
- CODES_TATWIN, // [16] WINDOWS_1251_T Tatarian version of Windows-1251
- CODES_ARMSCII, // [17] Armenian ASCII
- CODES_GEO_ITA, // [18] Academy of Sciences Georgian
- CODES_GEO_PS, // [19] Georgian Parliament
- CODES_ISO_8859_3, // [20] Latin-3: Turkish, Maltese and Esperanto
- CODES_ISO_8859_4, // [21] Latin-4: Estonian, Latvian, Lithuanian, Greenlandic, Sami
- CODES_ISO_8859_6, // [22] Latin/Arabic: Arabic
- CODES_ISO_8859_7, // [23] Latin/Greek: Greek
- CODES_ISO_8859_8, // [24] Latin/Hebrew: Hebrew
- CODES_ISO_8859_9, // [25] Latin-5 or Turkish: Turkish
- CODES_ISO_8859_13, // [26] Latin-7 or Baltic Rim: Baltic languages
- CODES_ISO_8859_15, // [27] Latin-9: Western European languages
- CODES_ISO_8859_16, // [28] Latin-10: South-Eastern European languages
- CODES_WINDOWS_1253, // [29] for Greek
- CODES_WINDOWS_1254, // [30] for Turkish
- CODES_WINDOWS_1255, // [31] for Hebrew
- CODES_WINDOWS_1256, // [32] for Arabic
- CODES_WINDOWS_1257, // [33] for Estonian, Latvian and Lithuanian
+ CODES_KAZWIN, // [15] WINDOWS_1251_K Kazakh version of Windows-1251
+ CODES_TATWIN, // [16] WINDOWS_1251_T Tatarian version of Windows-1251
+ CODES_ARMSCII, // [17] Armenian ASCII
+ CODES_GEO_ITA, // [18] Academy of Sciences Georgian
+ CODES_GEO_PS, // [19] Georgian Parliament
+ CODES_ISO_8859_3, // [20] Latin-3: Turkish, Maltese and Esperanto
+ CODES_ISO_8859_4, // [21] Latin-4: Estonian, Latvian, Lithuanian, Greenlandic, Sami
+ CODES_ISO_8859_6, // [22] Latin/Arabic: Arabic
+ CODES_ISO_8859_7, // [23] Latin/Greek: Greek
+ CODES_ISO_8859_8, // [24] Latin/Hebrew: Hebrew
+ CODES_ISO_8859_9, // [25] Latin-5 or Turkish: Turkish
+ CODES_ISO_8859_13, // [26] Latin-7 or Baltic Rim: Baltic languages
+ CODES_ISO_8859_15, // [27] Latin-9: Western European languages
+ CODES_ISO_8859_16, // [28] Latin-10: South-Eastern European languages
+ CODES_WINDOWS_1253, // [29] for Greek
+ CODES_WINDOWS_1254, // [30] for Turkish
+ CODES_WINDOWS_1255, // [31] for Hebrew
+ CODES_WINDOWS_1256, // [32] for Arabic
+ CODES_WINDOWS_1257, // [33] for Estonian, Latvian and Lithuanian
// these codes are all the other 8bit codes known by libiconv
// they follow in alphanumeric order
diff --git a/library/cpp/charset/iconv.cpp b/library/cpp/charset/iconv.cpp
index df43471470..605d0699ef 100644
--- a/library/cpp/charset/iconv.cpp
+++ b/library/cpp/charset/iconv.cpp
@@ -1,94 +1,94 @@
-#include "iconv.h"
-
-#include <contrib/libs/libiconv/iconv.h>
-
-using namespace NICONVPrivate;
-
-TDescriptor::TDescriptor(const char* from, const char* to)
- : Descriptor_(libiconv_open(to, from))
- , From_(from)
- , To_(to)
-{
- if (!Invalid()) {
- int temp = 1;
-
- libiconvctl(Descriptor_, ICONV_SET_DISCARD_ILSEQ, &temp);
- }
-}
-
-TDescriptor::~TDescriptor() {
- if (!Invalid()) {
- libiconv_close(Descriptor_);
- }
-}
-
-size_t NICONVPrivate::RecodeImpl(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
- Y_ASSERT(!descriptor.Invalid());
- Y_ASSERT(in);
- Y_ASSERT(out);
-
+#include "iconv.h"
+
+#include <contrib/libs/libiconv/iconv.h>
+
+using namespace NICONVPrivate;
+
+TDescriptor::TDescriptor(const char* from, const char* to)
+ : Descriptor_(libiconv_open(to, from))
+ , From_(from)
+ , To_(to)
+{
+ if (!Invalid()) {
+ int temp = 1;
+
+ libiconvctl(Descriptor_, ICONV_SET_DISCARD_ILSEQ, &temp);
+ }
+}
+
+TDescriptor::~TDescriptor() {
+ if (!Invalid()) {
+ libiconv_close(Descriptor_);
+ }
+}
+
+size_t NICONVPrivate::RecodeImpl(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
+ Y_ASSERT(!descriptor.Invalid());
+ Y_ASSERT(in);
+ Y_ASSERT(out);
+
char* inPtr = const_cast<char*>(in);
- char* outPtr = out;
- size_t inSizeMod = inSize;
- size_t outSizeMod = outSize;
- size_t res = libiconv(descriptor.Get(), &inPtr, &inSizeMod, &outPtr, &outSizeMod);
-
- read = inSize - inSizeMod;
- written = outSize - outSizeMod;
-
- return res;
-}
-
-void NICONVPrivate::DoRecode(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
- if (descriptor.Invalid()) {
- ythrow yexception() << "Can not convert from " << descriptor.From() << " to " << descriptor.To();
- }
-
- size_t res = RecodeImpl(descriptor, in, out, inSize, outSize, read, written);
-
- if (res == static_cast<size_t>(-1)) {
- switch (errno) {
- case EILSEQ:
- read = inSize;
- break;
-
- case EINVAL:
- read = inSize;
- break;
-
- case E2BIG:
- ythrow yexception() << "Iconv error: output buffer is too small";
-
- default:
- ythrow yexception() << "Unknown iconv error";
- }
- }
-}
-
-RECODE_RESULT NICONVPrivate::DoRecodeNoThrow(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
- if (descriptor.Invalid()) {
- return RECODE_ERROR;
- }
-
- size_t res = RecodeImpl(descriptor, in, out, inSize, outSize, read, written);
-
- if (res == static_cast<size_t>(-1)) {
- switch (errno) {
- case EILSEQ:
- read = inSize;
- break;
-
- case EINVAL:
- read = inSize;
- break;
-
- case E2BIG:
- return RECODE_EOOUTPUT;
-
- default:
- return RECODE_ERROR;
- }
- }
-
- return RECODE_OK;
-}
+ char* outPtr = out;
+ size_t inSizeMod = inSize;
+ size_t outSizeMod = outSize;
+ size_t res = libiconv(descriptor.Get(), &inPtr, &inSizeMod, &outPtr, &outSizeMod);
+
+ read = inSize - inSizeMod;
+ written = outSize - outSizeMod;
+
+ return res;
+}
+
+void NICONVPrivate::DoRecode(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
+ if (descriptor.Invalid()) {
+ ythrow yexception() << "Can not convert from " << descriptor.From() << " to " << descriptor.To();
+ }
+
+ size_t res = RecodeImpl(descriptor, in, out, inSize, outSize, read, written);
+
+ if (res == static_cast<size_t>(-1)) {
+ switch (errno) {
+ case EILSEQ:
+ read = inSize;
+ break;
+
+ case EINVAL:
+ read = inSize;
+ break;
+
+ case E2BIG:
+ ythrow yexception() << "Iconv error: output buffer is too small";
+
+ default:
+ ythrow yexception() << "Unknown iconv error";
+ }
+ }
+}
+
+RECODE_RESULT NICONVPrivate::DoRecodeNoThrow(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
+ if (descriptor.Invalid()) {
+ return RECODE_ERROR;
+ }
+
+ size_t res = RecodeImpl(descriptor, in, out, inSize, outSize, read, written);
+
+ if (res == static_cast<size_t>(-1)) {
+ switch (errno) {
+ case EILSEQ:
+ read = inSize;
+ break;
+
+ case EINVAL:
+ read = inSize;
+ break;
+
+ case E2BIG:
+ return RECODE_EOOUTPUT;
+
+ default:
+ return RECODE_ERROR;
+ }
+ }
+
+ return RECODE_OK;
+}
diff --git a/library/cpp/charset/iconv.h b/library/cpp/charset/iconv.h
index ac13539347..58188bb33d 100644
--- a/library/cpp/charset/iconv.h
+++ b/library/cpp/charset/iconv.h
@@ -10,66 +10,66 @@ namespace NICONVPrivate {
inline const char* CharsetName(ECharset code) {
return NameByCharset(code);
}
-
+
inline const char* CharsetName(const char* code) {
return code;
}
- template <int size>
+ template <int size>
inline const char* UnicodeNameBySize();
- template <>
+ template <>
inline const char* UnicodeNameBySize<1>() {
return "UTF-8";
}
-
- template <>
+
+ template <>
inline const char* UnicodeNameBySize<2>() {
return "UTF-16LE";
}
-
- template <>
+
+ template <>
inline const char* UnicodeNameBySize<4>() {
return "UCS-4LE";
}
- template <class C>
+ template <class C>
inline const char* UnicodeName() {
return UnicodeNameBySize<sizeof(C)>();
}
class TDescriptor : NNonCopyable::TNonCopyable {
private:
- void* Descriptor_;
- const char* From_;
- const char* To_;
+ void* Descriptor_;
+ const char* From_;
+ const char* To_;
public:
template <class TFrom, class TTo>
inline TDescriptor(TFrom from, TTo to)
- : TDescriptor(CharsetName(from), CharsetName(to))
+ : TDescriptor(CharsetName(from), CharsetName(to))
{
}
- TDescriptor(const char* from, const char* to);
-
- ~TDescriptor();
-
- inline void* Get() const {
- return Descriptor_;
- }
-
- inline bool Invalid() const {
- return Descriptor_ == (void*)(-1);
+ TDescriptor(const char* from, const char* to);
+
+ ~TDescriptor();
+
+ inline void* Get() const {
+ return Descriptor_;
}
- inline const char* From() const noexcept {
- return From_;
+ inline bool Invalid() const {
+ return Descriptor_ == (void*)(-1);
}
- inline const char* To() const noexcept {
- return To_;
+ inline const char* From() const noexcept {
+ return From_;
}
+
+ inline const char* To() const noexcept {
+ return To_;
+ }
};
template <class TFrom, class TTo>
@@ -79,43 +79,43 @@ namespace NICONVPrivate {
return !descriptor.Invalid();
}
- size_t RecodeImpl(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written);
- void DoRecode(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written);
+ size_t RecodeImpl(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written);
+ void DoRecode(const TDescriptor& descriptor, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written);
template <class TFrom, class TTo>
inline void Recode(TFrom from, TTo to, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
TDescriptor descriptor(from, to);
- DoRecode(descriptor, in, out, inSize, outSize, read, written);
+ DoRecode(descriptor, in, out, inSize, outSize, read, written);
}
- template <class TCharType>
- inline void RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
+ template <class TCharType>
+ inline void RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
const size_t charSize = sizeof(TCharType);
Recode(from, UnicodeName<TCharType>(), in, reinterpret_cast<char*>(out), inSize, outSize * charSize, read, written);
written /= charSize;
}
- template <class TCharType>
- inline void RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
+ template <class TCharType>
+ inline void RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
const size_t charSize = sizeof(TCharType);
Recode(UnicodeName<TCharType>(), to, reinterpret_cast<const char*>(in), out, inSize * charSize, outSize, read, written);
read /= charSize;
}
- RECODE_RESULT DoRecodeNoThrow(const TDescriptor& d, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written);
-
+ RECODE_RESULT DoRecodeNoThrow(const TDescriptor& d, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written);
+
template <class TFrom, class TTo>
inline RECODE_RESULT RecodeNoThrow(TFrom from, TTo to, const char* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
TDescriptor descriptor(from, to);
- return DoRecodeNoThrow(descriptor, in, out, inSize, outSize, read, written);
+ return DoRecodeNoThrow(descriptor, in, out, inSize, outSize, read, written);
}
- template <class TCharType>
- inline RECODE_RESULT RecodeToUnicodeNoThrow(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
+ template <class TCharType>
+ inline RECODE_RESULT RecodeToUnicodeNoThrow(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
const size_t charSize = sizeof(TCharType);
RECODE_RESULT res = RecodeNoThrow(from, UnicodeName<TCharType>(), in, reinterpret_cast<char*>(out), inSize, outSize * charSize, read, written);
@@ -124,8 +124,8 @@ namespace NICONVPrivate {
return res;
}
- template <class TCharType>
- inline RECODE_RESULT RecodeFromUnicodeNoThrow(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
+ template <class TCharType>
+ inline RECODE_RESULT RecodeFromUnicodeNoThrow(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& read, size_t& written) {
const size_t charSize = sizeof(TCharType);
RECODE_RESULT res = RecodeNoThrow(UnicodeName<TCharType>(), to, reinterpret_cast<const char*>(in), out, inSize * charSize, outSize, read, written);
@@ -133,4 +133,4 @@ namespace NICONVPrivate {
return res;
}
-}
+}
diff --git a/library/cpp/charset/iconv_ut.cpp b/library/cpp/charset/iconv_ut.cpp
index e8c56f6d49..f79d76f7c2 100644
--- a/library/cpp/charset/iconv_ut.cpp
+++ b/library/cpp/charset/iconv_ut.cpp
@@ -1,7 +1,7 @@
-#include "wide.h"
-#include "recyr.hh"
-#include "codepage.h"
-
+#include "wide.h"
+#include "recyr.hh"
+#include "codepage.h"
+
#include <library/cpp/testing/unittest/registar.h>
static void TestIconv(const TString& utf8, const TString& other, ECharset enc) {
@@ -38,30 +38,30 @@ static void TestIconv(const TString& utf8, const TString& other, ECharset enc) {
UNIT_ASSERT(temp == other);
}
-class TIconvTest: public TTestBase {
- static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) {
- size_t sSize = strlen(str);
- size_t wSize = sSize * 2;
- TArrayHolder<wchar16> w(new wchar16[wSize]);
-
- size_t read = 0;
- size_t written = 0;
- NICONVPrivate::RecodeToUnicode(CODES_UTF8, str, w.Get(), sSize, wSize, read, written);
- UNIT_ASSERT(read == sSize);
- UNIT_ASSERT(written == wideSize);
- UNIT_ASSERT(!memcmp(w.Get(), wide, wideSize));
-
- TArrayHolder<char> s(new char[sSize]);
- NICONVPrivate::RecodeFromUnicode(CODES_UTF8, w.Get(), s.Get(), wideSize, sSize, read, written);
- UNIT_ASSERT(read == wideSize);
- UNIT_ASSERT(written == sSize);
- UNIT_ASSERT(!memcmp(s.Get(), str, sSize));
- }
+class TIconvTest: public TTestBase {
+ static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) {
+ size_t sSize = strlen(str);
+ size_t wSize = sSize * 2;
+ TArrayHolder<wchar16> w(new wchar16[wSize]);
+
+ size_t read = 0;
+ size_t written = 0;
+ NICONVPrivate::RecodeToUnicode(CODES_UTF8, str, w.Get(), sSize, wSize, read, written);
+ UNIT_ASSERT(read == sSize);
+ UNIT_ASSERT(written == wideSize);
+ UNIT_ASSERT(!memcmp(w.Get(), wide, wideSize));
+
+ TArrayHolder<char> s(new char[sSize]);
+ NICONVPrivate::RecodeFromUnicode(CODES_UTF8, w.Get(), s.Get(), wideSize, sSize, read, written);
+ UNIT_ASSERT(read == wideSize);
+ UNIT_ASSERT(written == sSize);
+ UNIT_ASSERT(!memcmp(s.Get(), str, sSize));
+ }
private:
UNIT_TEST_SUITE(TIconvTest);
- UNIT_TEST(TestBig5);
- UNIT_TEST(TestSurrogatePairs);
+ UNIT_TEST(TestBig5);
+ UNIT_TEST(TestSurrogatePairs);
UNIT_TEST_SUITE_END();
public:
@@ -75,11 +75,11 @@ public:
void TestSurrogatePairs() {
const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba";
- wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
+ wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy));
const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n";
- wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
+ wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2));
}
};
diff --git a/library/cpp/charset/recyr.hh b/library/cpp/charset/recyr.hh
index 5ec8734bcf..c5e752616e 100644
--- a/library/cpp/charset/recyr.hh
+++ b/library/cpp/charset/recyr.hh
@@ -14,39 +14,39 @@
///////////////////////////////////////////////////////////////////////////////////////
// input buf -> output buf //
///////////////////////////////////////////////////////////////////////////////////////
-template <class TCharType>
-inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
- static_assert(sizeof(TCharType) > 1, "expect wide character type");
-
+template <class TCharType>
+inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
+ static_assert(sizeof(TCharType) > 1, "expect wide character type");
+
return NCodepagePrivate::_recodeToUnicode(from, in, out, inSize, outSize, inRead, outWritten);
}
-template <class TCharType>
-inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
- static_assert(sizeof(TCharType) > 1, "expect wide character type");
-
+template <class TCharType>
+inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
+ static_assert(sizeof(TCharType) > 1, "expect wide character type");
+
return NCodepagePrivate::_recodeFromUnicode(to, in, out, inSize, outSize, inRead, outWritten);
}
-inline RECODE_RESULT RecodeFromUnicode(ECharset to, wchar32 rune, char* out, size_t outSize, size_t& outWritten) {
+inline RECODE_RESULT RecodeFromUnicode(ECharset to, wchar32 rune, char* out, size_t outSize, size_t& outWritten) {
return NCodepagePrivate::_recodeFromUnicode(to, rune, out, outSize, outWritten);
}
-template <class TCharType>
+template <class TCharType>
inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize) {
size_t inRead = 0;
size_t outWritten = 0;
return RecodeToUnicode(from, in, out, inSize, outSize, inRead, outWritten);
}
-template <class TCharType>
+template <class TCharType>
inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize) {
size_t inRead = 0;
size_t outWritten = 0;
return RecodeFromUnicode(to, in, out, inSize, outSize, inRead, outWritten);
}
-inline RECODE_RESULT RecodeFromUnicode(ECharset theEncoding, const wchar16* chars, size_t length,
+inline RECODE_RESULT RecodeFromUnicode(ECharset theEncoding, const wchar16* chars, size_t length,
char* bytes, size_t size, size_t* read = nullptr, size_t* written = nullptr) {
size_t w = 0, r = 0;
RECODE_RESULT rc = ::RecodeFromUnicode(theEncoding, chars, bytes, length, size, r, w);
@@ -57,7 +57,7 @@ inline RECODE_RESULT RecodeFromUnicode(ECharset theEncoding, const wchar16* char
return rc;
}
-inline RECODE_RESULT Recode(ECharset from, ECharset to, const char* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
+inline RECODE_RESULT Recode(ECharset from, ECharset to, const char* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
inRead = 0;
outWritten = 0;
@@ -125,7 +125,7 @@ inline bool Recode(ECharset from, ECharset to, const TStringBuf& in, TString& ou
Y_ENSURE(RECODE_OK == res, "Recode failed. ");
if (outWritten > outSize)
ythrow yexception() << "Recode overrun the buffer: size="
- << outSize << " need=" << outWritten;
+ << outSize << " need=" << outWritten;
out.remove(outWritten);
return true;
@@ -149,10 +149,10 @@ inline TString RecodeToHTMLEntities(ECharset from, const TString& in) {
RECODE_RESULT res;
size_t outWritten, inRead;
TString out;
- out.resize(in.length() * (4 + 4));
+ out.resize(in.length() * (4 + 4));
res = NCodepagePrivate::_recodeToHTMLEntities(from, in.c_str(), out.begin(), in.length(), out.length(), inRead, outWritten);
- if (res == RECODE_EOOUTPUT) { //input contains many 8-byte characters?
- out.resize(in.length() * (4 + 8));
+ if (res == RECODE_EOOUTPUT) { //input contains many 8-byte characters?
+ out.resize(in.length() * (4 + 8));
res = NCodepagePrivate::_recodeToHTMLEntities(from, in.c_str(), out.begin(), in.length(), out.length(), inRead, outWritten);
}
if (res != RECODE_OK) {
diff --git a/library/cpp/charset/recyr_int.hh b/library/cpp/charset/recyr_int.hh
index 353af53305..c61822037f 100644
--- a/library/cpp/charset/recyr_int.hh
+++ b/library/cpp/charset/recyr_int.hh
@@ -5,332 +5,332 @@
#include <util/generic/ptr.h>
#include <util/generic/string.h>
#include <util/system/defaults.h>
-
+
#include "codepage.h"
#include "doccodes.h"
#include "iconv.h"
#include "wide.h"
namespace NCodepagePrivate {
- inline RECODE_RESULT _recodeCopy(const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- in_readed = in_size;
- RECODE_RESULT res = RECODE_OK;
- if (in_readed > out_size) {
- res = RECODE_EOOUTPUT;
- in_readed = out_size;
- }
- if (in != out)
- memcpy(out, in, in_readed);
- out_writed = in_readed;
- return res;
+ inline RECODE_RESULT _recodeCopy(const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ in_readed = in_size;
+ RECODE_RESULT res = RECODE_OK;
+ if (in_readed > out_size) {
+ res = RECODE_EOOUTPUT;
+ in_readed = out_size;
+ }
+ if (in != out)
+ memcpy(out, in, in_readed);
+ out_writed = in_readed;
+ return res;
}
- inline RECODE_RESULT _recodeToUTF8(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- if (From == CODES_UTF8)
- return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
- const CodePage* cp = CodePageByCharset(From);
+ inline RECODE_RESULT _recodeToUTF8(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ if (From == CODES_UTF8)
+ return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
+ const CodePage* cp = CodePageByCharset(From);
- const unsigned char* in_start = (const unsigned char*)in;
- const unsigned char* in_end = in_start + in_size;
- const unsigned char* out_start = (unsigned char*)out;
- const unsigned char* out_end = out_start + out_size;
+ const unsigned char* in_start = (const unsigned char*)in;
+ const unsigned char* in_end = in_start + in_size;
+ const unsigned char* out_start = (unsigned char*)out;
+ const unsigned char* out_end = out_start + out_size;
- size_t rune_len;
- RECODE_RESULT res = RECODE_OK;
- while ((unsigned char*)in < in_end && res == RECODE_OK) {
+ size_t rune_len;
+ RECODE_RESULT res = RECODE_OK;
+ while ((unsigned char*)in < in_end && res == RECODE_OK) {
res = SafeWriteUTF8Char(cp->unicode[(unsigned char)(*in++)], rune_len, (unsigned char*)out, out_end);
- out += rune_len;
- }
- in_readed = (unsigned char*)in - in_start;
- out_writed = (unsigned char*)out - out_start;
- return res;
+ out += rune_len;
+ }
+ in_readed = (unsigned char*)in - in_start;
+ out_writed = (unsigned char*)out - out_start;
+ return res;
}
- inline RECODE_RESULT _recodeFromUTF8(ECharset to, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- if (to == CODES_UTF8)
- return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
+ inline RECODE_RESULT _recodeFromUTF8(ECharset to, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ if (to == CODES_UTF8)
+ return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
Y_ASSERT(CODES_UNKNOWN < to && to < CODES_MAX);
- const Encoder* enc = &EncoderByCharset(to);
+ const Encoder* enc = &EncoderByCharset(to);
- const unsigned char* in_start = (const unsigned char*)in;
- const unsigned char* in_end = in_start + in_size;
- const unsigned char* out_start = (unsigned char*)out;
- const unsigned char* out_end = out_start + out_size;
+ const unsigned char* in_start = (const unsigned char*)in;
+ const unsigned char* in_end = in_start + in_size;
+ const unsigned char* out_start = (unsigned char*)out;
+ const unsigned char* out_end = out_start + out_size;
- wchar32 rune;
- size_t rune_len;
- RECODE_RESULT res = RECODE_OK;
- while ((const unsigned char*)in < in_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL)) {
+ wchar32 rune;
+ size_t rune_len;
+ RECODE_RESULT res = RECODE_OK;
+ while ((const unsigned char*)in < in_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL)) {
res = SafeReadUTF8Char(rune, rune_len, (const unsigned char*)in, in_end);
- if (res == RECODE_BROKENSYMBOL)
- rune_len = 1;
- if (res != RECODE_EOINPUT)
- *out++ = enc->Tr(rune);
- in += rune_len;
- if (res == RECODE_OK && (const unsigned char*)in < in_end && (unsigned char*)out >= out_end)
- res = RECODE_EOOUTPUT;
- }
- in_readed = (unsigned char*)in - in_start;
- out_writed = (unsigned char*)out - out_start;
- return res;
+ if (res == RECODE_BROKENSYMBOL)
+ rune_len = 1;
+ if (res != RECODE_EOINPUT)
+ *out++ = enc->Tr(rune);
+ in += rune_len;
+ if (res == RECODE_OK && (const unsigned char*)in < in_end && (unsigned char*)out >= out_end)
+ res = RECODE_EOOUTPUT;
+ }
+ in_readed = (unsigned char*)in - in_start;
+ out_writed = (unsigned char*)out - out_start;
+ return res;
}
- inline RECODE_RESULT _recodeToYandex(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- if (From == CODES_YANDEX)
- return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
- if (From == CODES_UTF8)
- return _recodeFromUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed);
- in_readed = (out_size > in_size) ? in_size : out_size;
- const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_to_yandex[From];
- rcdr.Tr(in, out, in_readed);
- out_writed = in_readed;
- if (out_size < in_size)
- return RECODE_EOOUTPUT;
- return RECODE_OK;
- }
- inline RECODE_RESULT _recodeFromYandex(ECharset To, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- if (To == CODES_YANDEX)
- return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
- if (To == CODES_UTF8)
- return _recodeToUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed);
- in_readed = (out_size > in_size) ? in_size : out_size;
- const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_from_yandex[To];
- rcdr.Tr(in, out, in_readed);
- out_writed = in_readed;
- if (out_size < in_size)
- return RECODE_EOOUTPUT;
- return RECODE_OK;
- }
-
- template <class TCharType>
- inline RECODE_RESULT _recodeUTF8ToUnicode(const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- const unsigned char* inp = (const unsigned char*)in;
- const unsigned char* in_end = inp + in_size;
- TCharType* outp = out;
- const TCharType* out_end = outp + out_size;
- size_t rune_len;
- wchar32 rune;
- RECODE_RESULT res = RECODE_OK;
- while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp < in_end && outp < out_end) {
+ inline RECODE_RESULT _recodeToYandex(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ if (From == CODES_YANDEX)
+ return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
+ if (From == CODES_UTF8)
+ return _recodeFromUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed);
+ in_readed = (out_size > in_size) ? in_size : out_size;
+ const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_to_yandex[From];
+ rcdr.Tr(in, out, in_readed);
+ out_writed = in_readed;
+ if (out_size < in_size)
+ return RECODE_EOOUTPUT;
+ return RECODE_OK;
+ }
+ inline RECODE_RESULT _recodeFromYandex(ECharset To, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ if (To == CODES_YANDEX)
+ return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
+ if (To == CODES_UTF8)
+ return _recodeToUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed);
+ in_readed = (out_size > in_size) ? in_size : out_size;
+ const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_from_yandex[To];
+ rcdr.Tr(in, out, in_readed);
+ out_writed = in_readed;
+ if (out_size < in_size)
+ return RECODE_EOOUTPUT;
+ return RECODE_OK;
+ }
+
+ template <class TCharType>
+ inline RECODE_RESULT _recodeUTF8ToUnicode(const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ const unsigned char* inp = (const unsigned char*)in;
+ const unsigned char* in_end = inp + in_size;
+ TCharType* outp = out;
+ const TCharType* out_end = outp + out_size;
+ size_t rune_len;
+ wchar32 rune;
+ RECODE_RESULT res = RECODE_OK;
+ while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp < in_end && outp < out_end) {
res = SafeReadUTF8Char(rune, rune_len, inp, in_end);
- if (res == RECODE_BROKENSYMBOL)
- rune_len = 1;
- if (res == RECODE_OK || res == RECODE_BROKENSYMBOL) {
- if (!WriteSymbol(rune, outp, out_end)) {
- break;
- }
- inp += rune_len;
+ if (res == RECODE_BROKENSYMBOL)
+ rune_len = 1;
+ if (res == RECODE_OK || res == RECODE_BROKENSYMBOL) {
+ if (!WriteSymbol(rune, outp, out_end)) {
+ break;
+ }
+ inp += rune_len;
}
}
- in_readed = inp - (const unsigned char*)in;
- out_writed = outp - out;
-
- if ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && in_readed != in_size)
- return RECODE_EOOUTPUT;
-
- return res;
- }
-
- template <class TCharType>
- inline RECODE_RESULT _recodeSBToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- const CodePage* cp = CodePageByCharset(From);
- const unsigned char* inp = (const unsigned char*)in;
- const unsigned char* in_end = inp + in_size;
- TCharType* outp = out;
- const TCharType* out_end = outp + out_size;
- while (inp < in_end && outp < out_end)
- *outp++ = static_cast<TCharType>(cp->unicode[*inp++]);
- in_readed = inp - (const unsigned char*)in;
- out_writed = outp - out;
- if (in_readed != in_size)
- return RECODE_EOOUTPUT;
- return RECODE_OK;
- }
-
- template <class TCharType>
- inline RECODE_RESULT _recodeUnicodeToUTF8Impl(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- const TCharType* inp = in;
- const TCharType* in_end = in + in_size;
- unsigned char* outp = (unsigned char*)out;
- const unsigned char* out_end = outp + out_size;
- size_t rune_len;
- wchar32 rune;
- RECODE_RESULT res = RECODE_OK;
-
- while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp != in_end) {
- rune = ReadSymbolAndAdvance(inp, in_end);
+ in_readed = inp - (const unsigned char*)in;
+ out_writed = outp - out;
+
+ if ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && in_readed != in_size)
+ return RECODE_EOOUTPUT;
+
+ return res;
+ }
+
+ template <class TCharType>
+ inline RECODE_RESULT _recodeSBToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ const CodePage* cp = CodePageByCharset(From);
+ const unsigned char* inp = (const unsigned char*)in;
+ const unsigned char* in_end = inp + in_size;
+ TCharType* outp = out;
+ const TCharType* out_end = outp + out_size;
+ while (inp < in_end && outp < out_end)
+ *outp++ = static_cast<TCharType>(cp->unicode[*inp++]);
+ in_readed = inp - (const unsigned char*)in;
+ out_writed = outp - out;
+ if (in_readed != in_size)
+ return RECODE_EOOUTPUT;
+ return RECODE_OK;
+ }
+
+ template <class TCharType>
+ inline RECODE_RESULT _recodeUnicodeToUTF8Impl(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ const TCharType* inp = in;
+ const TCharType* in_end = in + in_size;
+ unsigned char* outp = (unsigned char*)out;
+ const unsigned char* out_end = outp + out_size;
+ size_t rune_len;
+ wchar32 rune;
+ RECODE_RESULT res = RECODE_OK;
+
+ while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp != in_end) {
+ rune = ReadSymbolAndAdvance(inp, in_end);
res = SafeWriteUTF8Char(rune, rune_len, outp, out_end);
- if (outp >= out_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL))
- res = RECODE_EOOUTPUT;
- outp += rune_len;
- }
- in_readed = inp - in;
- out_writed = outp - (const unsigned char*)out;
- return res;
+ if (outp >= out_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL))
+ res = RECODE_EOOUTPUT;
+ outp += rune_len;
+ }
+ in_readed = inp - in;
+ out_writed = outp - (const unsigned char*)out;
+ return res;
}
- inline RECODE_RESULT _recodeUnicodeToUTF8(wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
+ inline RECODE_RESULT _recodeUnicodeToUTF8(wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
return SafeWriteUTF8Char(rune, nwritten, (unsigned char*)out, out_size);
- }
+ }
- template <class TCharType, int Size = sizeof(TCharType)>
- struct TCharTypeSwitch;
+ template <class TCharType, int Size = sizeof(TCharType)> // Selects wchar16 or wchar32 for a character type based on its size in bytes.
+ struct TCharTypeSwitch; // primary template is only declared; the specializations below cover sizes 2 and 4
- template <class TCharType>
- struct TCharTypeSwitch<TCharType, 2> {
+ template <class TCharType>
+ struct TCharTypeSwitch<TCharType, 2> { // 2-byte character types map to wchar16
using TRealCharType = wchar16;
- };
+ };
- template <class TCharType>
- struct TCharTypeSwitch<TCharType, 4> {
+ template <class TCharType>
+ struct TCharTypeSwitch<TCharType, 4> { // 4-byte character types map to wchar32
using TRealCharType = wchar32;
- };
-
- template <class TCharType>
- inline RECODE_RESULT _recodeUnicodeToUTF8(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- static_assert(sizeof(TCharType) > 1, "expect some wide type");
+ };
+ template <class TCharType> // Buffer overload: reinterprets the input as the same-sized wchar16/wchar32 type and forwards to _recodeUnicodeToUTF8Impl.
+ inline RECODE_RESULT _recodeUnicodeToUTF8(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ static_assert(sizeof(TCharType) > 1, "expect some wide type"); // single-byte character types are rejected at compile time
+
using TRealCharType = typename TCharTypeSwitch<TCharType>::TRealCharType;
- return _recodeUnicodeToUTF8Impl(reinterpret_cast<const TRealCharType*>(in), out, in_size, out_size, in_readed, out_writed);
- }
-
- template <class TCharType>
- inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- const TCharType* inp = in;
- const TCharType* in_end = in + in_size;
- const char* out_begin = out;
- const char* out_end = out + out_size;
-
- const Encoder* enc = &EncoderByCharset(To);
- while (inp != in_end && out != out_end) {
- *out++ = enc->Tr(ReadSymbolAndAdvance(inp, in_end));
- }
-
- in_readed = inp - in;
- out_writed = out - out_begin;
-
- if (in_readed != in_size)
- return RECODE_EOOUTPUT;
-
- return RECODE_OK;
- }
-
- inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
- if (0 == out_size)
- return RECODE_EOOUTPUT;
- *out = EncoderByCharset(To).Tr(rune);
- nwritten = 1;
- return RECODE_OK;
- }
-
- inline RECODE_RESULT _rune2hex(wchar32 in, char* out, size_t out_size, size_t& out_writed) {
- static const char hex_digs[] = "0123456789ABCDEF";
- out_writed = 0;
- RECODE_RESULT res = RECODE_OK;
- for (int i = 7; i >= 0; i--) {
- unsigned char h = (unsigned char)(in >> (i * 4) & 0x0F);
- if (h || i == 0) {
- if (out_writed + 1 >= out_size) {
- res = RECODE_EOOUTPUT;
- break;
- }
- out[out_writed++] = hex_digs[h];
+ return _recodeUnicodeToUTF8Impl(reinterpret_cast<const TRealCharType*>(in), out, in_size, out_size, in_readed, out_writed);
+ }
+
+ template <class TCharType> // Converts wide characters to the charset To, writing exactly one output byte per symbol read.
+ inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ const TCharType* inp = in;
+ const TCharType* in_end = in + in_size;
+ const char* out_begin = out;
+ const char* out_end = out + out_size;
+
+ const Encoder* enc = &EncoderByCharset(To); // looked up once, outside the loop
+ while (inp != in_end && out != out_end) { // stops when the input is consumed or the output buffer is full
+ *out++ = enc->Tr(ReadSymbolAndAdvance(inp, in_end));
+ }
+
+ in_readed = inp - in;
+ out_writed = out - out_begin;
+
+ if (in_readed != in_size) // the loop ended with input remaining, i.e. the output buffer filled up first
+ return RECODE_EOOUTPUT;
+
+ return RECODE_OK;
+ }
+
+ inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) { // Single-rune overload: writes one byte to out and sets nwritten to 1.
+ if (0 == out_size)
+ return RECODE_EOOUTPUT; // nwritten is left untouched on this path
+ *out = EncoderByCharset(To).Tr(rune);
+ nwritten = 1;
+ return RECODE_OK;
+ }
+
+ inline RECODE_RESULT _rune2hex(wchar32 in, char* out, size_t out_size, size_t& out_writed) { // Writes the upper-case hex digits of `in` to out (no terminator) and stores the digit count in out_writed.
+ static const char hex_digs[] = "0123456789ABCDEF";
+ out_writed = 0;
+ RECODE_RESULT res = RECODE_OK;
+ for (int i = 7; i >= 0; i--) { // walks the eight 4-bit groups from most to least significant
+ unsigned char h = (unsigned char)(in >> (i * 4) & 0x0F);
+ if (h || i == 0) { // NOTE(review): every zero nibble with i != 0 is skipped, not only leading ones (0x101 would emit "11") - confirm intended
+ if (out_writed + 1 >= out_size) { // NOTE(review): needs two free bytes per digit, so the last byte of out is never written - confirm intended
+ res = RECODE_EOOUTPUT;
+ break;
+ }
+ out[out_writed++] = hex_digs[h];
}
}
- return res;
+ return res;
}
- inline RECODE_RESULT _recodeUnicodeToHTMLEntities(const wchar32* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- const wchar32* in_end = in + in_size;
- const char* out_beg = out;
- const wchar32* in_beg = in;
- RECODE_RESULT res = RECODE_OK;
-
- const char* out_end = out + out_size - 1;
- while (in < in_end && out < out_end) {
- if (*in < 0x80 && *in != '<' && *in != '&' && *in != '>') { //ascii
- *out++ = char(*in & 0x00FF);
- } else { //entity
- char* ent = out;
- size_t ent_writed;
- if (ent > out_end - 6) {
- res = RECODE_EOOUTPUT;
- break;
- }
- memcpy(ent, "&#x", 3);
- ent += 3;
- res = _rune2hex(*in, ent, out_end - 1 - ent, ent_writed);
- if (res != RECODE_OK)
- break;
- ent += ent_writed;
- *ent++ = ';';
- out = ent;
+ inline RECODE_RESULT _recodeUnicodeToHTMLEntities(const wchar32* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { // Copies plain ASCII, emits every other symbol as a "&#x<HEX>;" entity, then appends a terminating NUL.
+ const wchar32* in_end = in + in_size;
+ const char* out_beg = out;
+ const wchar32* in_beg = in;
+ RECODE_RESULT res = RECODE_OK;
+
+ const char* out_end = out + out_size - 1; // holds one byte back for the NUL written after the loop; NOTE(review): with out_size == 0 this points before out and the NUL is still written - confirm callers never pass 0
+ while (in < in_end && out < out_end) {
+ if (*in < 0x80 && *in != '<' && *in != '&' && *in != '>') { // ASCII other than '<', '&', '>' is copied as-is
+ *out++ = char(*in & 0x00FF);
+ } else { // everything else becomes a hexadecimal numeric entity
+ char* ent = out;
+ size_t ent_writed;
+ if (ent > out_end - 6) { // refuse to start an entity with fewer than 6 bytes left before out_end
+ res = RECODE_EOOUTPUT;
+ break;
+ }
+ memcpy(ent, "&#x", 3);
+ ent += 3;
+ res = _rune2hex(*in, ent, out_end - 1 - ent, ent_writed); // the capacity passed leaves one byte for the ';' that follows
+ if (res != RECODE_OK)
+ break; // out was not advanced, so the NUL written below overwrites the partial "&#x"
+ ent += ent_writed;
+ *ent++ = ';';
+ out = ent;
}
- in++;
+ in++;
}
- *out++ = '\x00';
- out_writed = out - out_beg;
- in_readed = in - in_beg;
- return res;
+ *out++ = '\x00';
+ out_writed = out - out_beg; // includes the terminating NUL
+ in_readed = in - in_beg; // counted in wchar32 elements
+ return res;
}
- template <class TCharType>
- inline RECODE_RESULT _recodeToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- if (!ValidCodepage(From))
- return RECODE_ERROR;
+ template <class TCharType> // Dispatches a bytes-to-Unicode conversion by source charset: NICONVPrivate for non-native codepages, a UTF-8 routine, or the single-byte routine.
+ inline RECODE_RESULT _recodeToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ if (!ValidCodepage(From))
+ return RECODE_ERROR; // nothing is read or written for an invalid charset
- if (!NCodepagePrivate::NativeCodepage(From))
- return NICONVPrivate::RecodeToUnicodeNoThrow(From, in, out, in_size, out_size, in_readed, out_writed);
+ if (!NCodepagePrivate::NativeCodepage(From))
+ return NICONVPrivate::RecodeToUnicodeNoThrow(From, in, out, in_size, out_size, in_readed, out_writed);
- if (From == CODES_UTF8)
- return _recodeUTF8ToUnicode(in, out, in_size, out_size, in_readed, out_writed);
+ if (From == CODES_UTF8)
+ return _recodeUTF8ToUnicode(in, out, in_size, out_size, in_readed, out_writed);
- return _recodeSBToUnicode(From, in, out, in_size, out_size, in_readed, out_writed);
- }
+ return _recodeSBToUnicode(From, in, out, in_size, out_size, in_readed, out_writed); // every remaining native charset takes the single-byte path
+ }
- template <class TCharType>
- inline RECODE_RESULT _recodeFromUnicode(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- if (!ValidCodepage(To))
- return RECODE_ERROR;
+ template <class TCharType> // Dispatches a Unicode-to-bytes conversion by target charset: NICONVPrivate for non-native codepages, the UTF-8 routine, or the single-byte routine.
+ inline RECODE_RESULT _recodeFromUnicode(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
+ if (!ValidCodepage(To))
+ return RECODE_ERROR; // nothing is read or written for an invalid charset
- if (!NCodepagePrivate::NativeCodepage(To))
- return NICONVPrivate::RecodeFromUnicodeNoThrow(To, in, out, in_size, out_size, in_readed, out_writed);
+ if (!NCodepagePrivate::NativeCodepage(To))
+ return NICONVPrivate::RecodeFromUnicodeNoThrow(To, in, out, in_size, out_size, in_readed, out_writed);
- if (To == CODES_UTF8)
- return NCodepagePrivate::_recodeUnicodeToUTF8(in, out, in_size, out_size, in_readed, out_writed);
+ if (To == CODES_UTF8)
+ return NCodepagePrivate::_recodeUnicodeToUTF8(in, out, in_size, out_size, in_readed, out_writed);
- return NCodepagePrivate::_recodeUnicodeToSB(To, in, out, in_size, out_size, in_readed, out_writed);
+ return NCodepagePrivate::_recodeUnicodeToSB(To, in, out, in_size, out_size, in_readed, out_writed); // every remaining native charset takes the single-byte path
}
- inline RECODE_RESULT _recodeFromUnicode(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
- if (!ValidCodepage(To))
- return RECODE_ERROR;
+ inline RECODE_RESULT _recodeFromUnicode(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) { // Single-rune overload with the same charset dispatch as the buffer overload.
+ if (!ValidCodepage(To))
+ return RECODE_ERROR;
- if (!NCodepagePrivate::NativeCodepage(To)) {
- size_t nread = 0;
- return NICONVPrivate::RecodeFromUnicodeNoThrow(To, &rune, out, 1, out_size, nread, nwritten);
- }
+ if (!NCodepagePrivate::NativeCodepage(To)) {
+ size_t nread = 0; // required by the callee's signature; its value is discarded
+ return NICONVPrivate::RecodeFromUnicodeNoThrow(To, &rune, out, 1, out_size, nread, nwritten); // the rune is passed as a one-element buffer
+ }
- if (To == CODES_UTF8)
- return NCodepagePrivate::_recodeUnicodeToUTF8(rune, out, out_size, nwritten);
+ if (To == CODES_UTF8)
+ return NCodepagePrivate::_recodeUnicodeToUTF8(rune, out, out_size, nwritten);
- return NCodepagePrivate::_recodeUnicodeToSB(To, rune, out, out_size, nwritten);
- }
+ return NCodepagePrivate::_recodeUnicodeToSB(To, rune, out, out_size, nwritten);
+ }
- inline RECODE_RESULT _recodeToHTMLEntities(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
- TArrayHolder<wchar32> bufHolder(new wchar32[in_size]);
- wchar32* buf = bufHolder.Get();
- size_t unicode_size;
- RECODE_RESULT res1, res2;
+ inline RECODE_RESULT _recodeToHTMLEntities(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) { // Two-pass conversion: bytes in charset From -> wchar32 buffer -> HTML entity text.
+ TArrayHolder<wchar32> bufHolder(new wchar32[in_size]); // intermediate buffer with one wchar32 slot per input byte
+ wchar32* buf = bufHolder.Get();
+ size_t unicode_size;
+ RECODE_RESULT res1, res2;
- //first pass - to unicode
- res1 = _recodeToUnicode(From, in, buf, in_size, in_size, in_readed, unicode_size);
+ //first pass - to unicode
+ res1 = _recodeToUnicode(From, in, buf, in_size, in_size, in_readed, unicode_size); // unicode_size is passed in the out_writed position
- //second pass - to entities
- res2 = _recodeUnicodeToHTMLEntities(buf, out, in_size, out_size, in_readed, out_writed);
+ //second pass - to entities
+ res2 = _recodeUnicodeToHTMLEntities(buf, out, in_size, out_size, in_readed, out_writed); // NOTE(review): the element count passed is in_size, not unicode_size, and in_readed from the first pass is overwritten - confirm intended, since slots of buf beyond unicode_size are presumably never filled
- return (res2 != RECODE_OK) ? res2 : res1;
- }
+ return (res2 != RECODE_OK) ? res2 : res1; // a second-pass failure takes precedence over the first-pass result
+ }
-}
+}
diff --git a/library/cpp/charset/wide.h b/library/cpp/charset/wide.h
index 32d30e849e..b7a391f0a5 100644
--- a/library/cpp/charset/wide.h
+++ b/library/cpp/charset/wide.h
@@ -1,15 +1,15 @@
#pragma once
#include "codepage.h"
-#include "iconv.h"
-
+#include "iconv.h"
+
#include <util/charset/recode_result.h>
#include <util/charset/unidata.h>
#include <util/charset/utf8.h>
#include <util/charset/wide.h>
#include <util/generic/string.h>
#include <util/generic/algorithm.h>
-#include <util/generic/yexception.h>
+#include <util/generic/yexception.h>
#include <util/memory/tempbuf.h>
#include <util/system/yassert.h>
@@ -19,7 +19,7 @@
template <typename TCharType>
inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) {
Y_ASSERT(SingleByteCodepage(enc));
-
+
const char* start = dest;
const Encoder* const encoder = &EncoderByCharset(enc);
@@ -114,7 +114,7 @@ namespace NDetail {
return RecodeMultiByteChar(src, dst, encoding);
}
- }
+ }
template <typename TCharFrom>
struct TRecodeTraits;
@@ -124,8 +124,8 @@ namespace NDetail {
using TCharTo = wchar16;
using TStringBufTo = TWtringBuf;
using TStringTo = TUtf16String;
- enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case
- // Here an unicode character can be converted up to 4 bytes of UTF8
+ enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case
+ // Here an unicode character can be converted up to 4 bytes of UTF8
};
template <>
@@ -133,7 +133,7 @@ namespace NDetail {
using TCharTo = char;
using TStringBufTo = TStringBuf;
using TStringTo = TString;
- enum { ReserveSize = 2 }; // possible surrogate pairs ?
+ enum { ReserveSize = 2 }; // possible surrogate pairs ?
};
// Operations with destination buffer where recoded string will be written
@@ -203,7 +203,7 @@ namespace NDetail {
Recode<TCharFrom>(src, res, encoding);
return res;
}
-}
+}
// Write result into @dst. Return string-buffer pointing to re-coded content of @dst.
@@ -291,7 +291,7 @@ inline TString WideToChar(const TWtringBuf w, ECharset enc) {
inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
return CharToWide<false>(s.data(), s.size(), enc);
}
-
+
template <bool robust>
inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
return CharToWide<robust>(s.data(), s.size(), enc);
diff --git a/library/cpp/charset/wide_ut.cpp b/library/cpp/charset/wide_ut.cpp
index 78947d51ba..fc727fb1b4 100644
--- a/library/cpp/charset/wide_ut.cpp
+++ b/library/cpp/charset/wide_ut.cpp
@@ -1,14 +1,14 @@
-#include "wide.h"
-#include "codepage.h"
+#include "wide.h"
+#include "codepage.h"
#include "recyr.hh"
-
+
#include <library/cpp/testing/unittest/registar.h>
-
+
#include <util/charset/utf8.h>
-#include <util/digest/numeric.h>
+#include <util/digest/numeric.h>
#include <util/generic/hash_set.h>
-#include <algorithm>
+#include <algorithm>
namespace {
//! three UTF8 encoded russian letters (A, B, V)
@@ -21,7 +21,7 @@ namespace {
0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
- 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x00};
+ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x00};
const char utf8CyrillicAlphabet[] =
"\xd0\x90\xd0\x91\xd0\x92\xd0\x93\xd0\x94\xd0\x95\xd0\x96\xd0\x97"
"\xd0\x98\xd0\x99\xd0\x9a\xd0\x9b\xd0\x9c\xd0\x9d\xd0\x9e\xd0\x9f"
@@ -34,7 +34,7 @@ namespace {
TString CreateYandexText() {
const int len = 256;
- char text[len] = {0};
+ char text[len] = {0};
for (int i = 0; i < len; ++i) {
text[i] = static_cast<char>(i);
}
@@ -61,7 +61,7 @@ namespace {
for (int i = 0; i < len; ++i) {
if (i <= 0x7F) { // ASCII characters without 0x7 and 0x1B
text[i] = static_cast<wchar16>(i);
- } else if (i >= 0xC0 && i <= 0xFF) { // russian characters (without YO and yo)
+ } else if (i >= 0xC0 && i <= 0xFF) { // russian characters (without YO and yo)
text[i] = static_cast<wchar16>(i + 0x0350); // 0x0410 - 0x044F
}
}
@@ -94,27 +94,27 @@ namespace {
'\xd0', '\xb7', '\xd0', '\xb8', '\xd0', '\xb9', '\xd0', '\xba', '\xd0', '\xbb', '\xd0', '\xbc', '\xd0', '\xbd', '\xd0', '\xbe',
'\xd0', '\xbf', '\xd1', '\x80', '\xd1', '\x81', '\xd1', '\x82', '\xd1', '\x83', '\xd1', '\x84', '\xd1', '\x85', '\xd1', '\x86',
'\xd1', '\x87', '\xd1', '\x88', '\xd1', '\x89', '\xd1', '\x8a', '\xd1', '\x8b', '\xd1', '\x8c', '\xd1', '\x8d', '\xd1', '\x8e',
- '\xd1', '\x8f'};
+ '\xd1', '\x8f'};
return TString(text, Y_ARRAY_SIZE(text));
}
//! use this function to dump UTF8 text into a file in case of any changes
- // void DumpUTF8Text() {
+ // void DumpUTF8Text() {
// TString s = WideToUTF8(UnicodeText);
- // std::ofstream f("utf8.txt");
- // f << std::hex;
- // for (int i = 0; i < (int)s.size(); ++i) {
- // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", ";
- // if ((i + 1) % 16 == 0)
- // f << std::endl;
- // }
- // }
+ // std::ofstream f("utf8.txt");
+ // f << std::hex;
+ // for (int i = 0; i < (int)s.size(); ++i) {
+ // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", ";
+ // if ((i + 1) % 16 == 0)
+ // f << std::endl;
+ // }
+ // }
}
//! these unit tests ensure the validity of Yandex-Unicode and UTF8-Unicode conversions
//! @note only those conversions are verified because they are used in index
-class TConversionTest: public TTestBase {
+class TConversionTest: public TTestBase {
private:
//! @note each of the texts can have zeros in the middle
const TString YandexText;
@@ -123,13 +123,13 @@ private:
private:
UNIT_TEST_SUITE(TConversionTest);
- UNIT_TEST(TestCharToWide);
- UNIT_TEST(TestWideToChar);
+ UNIT_TEST(TestCharToWide);
+ UNIT_TEST(TestWideToChar);
UNIT_TEST(TestYandexEncoding);
- UNIT_TEST(TestRecodeIntoString);
- UNIT_TEST(TestRecodeAppend);
- UNIT_TEST(TestRecode);
- UNIT_TEST(TestUnicodeLimit);
+ UNIT_TEST(TestRecodeIntoString);
+ UNIT_TEST(TestRecodeAppend);
+ UNIT_TEST(TestRecode);
+ UNIT_TEST(TestUnicodeLimit);
UNIT_TEST_SUITE_END();
public:
@@ -152,23 +152,23 @@ public:
UNIT_TEST_SUITE_REGISTRATION(TConversionTest);
// test conversions (char -> wchar32), (wchar32 -> char) and (wchar32 -> wchar16)
-#define TEST_WCHAR32(sbuf, wbuf, enc) \
- do { \
- /* convert char to wchar32 */ \
- TTempBuf tmpbuf1(sbuf.length() * sizeof(wchar32)); \
+#define TEST_WCHAR32(sbuf, wbuf, enc) \
+ do { \
+ /* convert char to wchar32 */ \
+ TTempBuf tmpbuf1(sbuf.length() * sizeof(wchar32)); \
const TBasicStringBuf<wchar32> s4buf = NDetail::NBaseOps::Recode<char>(sbuf, reinterpret_cast<wchar32*>(tmpbuf1.Data()), enc); \
- \
- /* convert wchar32 to char */ \
- TTempBuf tmpbuf2(s4buf.length() * 4); \
- const TStringBuf s1buf = NDetail::NBaseOps::Recode(s4buf, tmpbuf2.Data(), enc); \
- \
- /* convert wchar32 to wchar16 */ \
- const TUtf16String wstr2 = UTF32ToWide(s4buf.data(), s4buf.length()); \
- \
- /* test conversions */ \
- UNIT_ASSERT_VALUES_EQUAL(sbuf, s1buf); \
- UNIT_ASSERT_VALUES_EQUAL(wbuf, wstr2); \
- } while (false)
+ \
+ /* convert wchar32 to char */ \
+ TTempBuf tmpbuf2(s4buf.length() * 4); \
+ const TStringBuf s1buf = NDetail::NBaseOps::Recode(s4buf, tmpbuf2.Data(), enc); \
+ \
+ /* convert wchar32 to wchar16 */ \
+ const TUtf16String wstr2 = UTF32ToWide(s4buf.data(), s4buf.length()); \
+ \
+ /* test conversions */ \
+ UNIT_ASSERT_VALUES_EQUAL(sbuf, s1buf); \
+ UNIT_ASSERT_VALUES_EQUAL(wbuf, wstr2); \
+ } while (false)
void TConversionTest::TestCharToWide() {
TUtf16String w = CharToWide(YandexText, CODES_YANDEX);
@@ -210,7 +210,7 @@ void TConversionTest::TestYandexEncoding() {
UNIT_ASSERT(w == wideCyrillicAlphabet);
const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n";
- wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
+ wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2), CODES_UTF8);
{
@@ -232,7 +232,7 @@ void TConversionTest::TestRecodeIntoString() {
TString sYandex(UnicodeText.size() * 4, 'x');
const char* sdata = sYandex.data();
TStringBuf sres = NDetail::Recode<wchar16>(UnicodeText, sYandex, CODES_YANDEX);
- UNIT_ASSERT(sYandex == YandexText); // same content
+ UNIT_ASSERT(sYandex == YandexText); // same content
UNIT_ASSERT(sYandex.data() == sdata); // reserved buffer reused
UNIT_ASSERT(sYandex.data() == sres.data()); // same buffer
UNIT_ASSERT(sYandex.size() == sres.size()); // same size
@@ -242,7 +242,7 @@ void TConversionTest::TestRecodeIntoString() {
sUnicode.reserve(YandexText.size() * 4);
const wchar16* wdata = sUnicode.data();
TWtringBuf wres = NDetail::Recode<char>(YandexText, sUnicode, CODES_YANDEX);
- UNIT_ASSERT(sUnicode == UnicodeText); // same content
+ UNIT_ASSERT(sUnicode == UnicodeText); // same content
UNIT_ASSERT(sUnicode.data() == wdata); // reserved buffer reused
UNIT_ASSERT(sUnicode.data() == wres.data()); // same buffer
UNIT_ASSERT(sUnicode.size() == wres.size()); // same size
@@ -250,8 +250,8 @@ void TConversionTest::TestRecodeIntoString() {
TString sUtf8 = " ";
size_t scap = sUtf8.capacity();
sres = NDetail::Recode<wchar16>(UnicodeText, sUtf8, CODES_UTF8);
- UNIT_ASSERT(sUtf8 == UTF8Text); // same content
- UNIT_ASSERT(sUtf8.capacity() > scap); // increased buffer capacity (supplied was too small)
+ UNIT_ASSERT(sUtf8 == UTF8Text); // same content
+ UNIT_ASSERT(sUtf8.capacity() > scap); // increased buffer capacity (supplied was too small)
UNIT_ASSERT(sUtf8.data() == sres.data()); // same buffer
UNIT_ASSERT(sUtf8.size() == sres.size()); // same size
TEST_WCHAR32(sUtf8, UnicodeText, CODES_UTF8);
@@ -260,7 +260,7 @@ void TConversionTest::TestRecodeIntoString() {
wdata = sUnicode.data();
TUtf16String copy = sUnicode; // increase ref-counter
wres = NDetail::Recode<char>(UTF8Text, sUnicode, CODES_UTF8);
- UNIT_ASSERT(sUnicode == UnicodeText); // same content
+ UNIT_ASSERT(sUnicode == UnicodeText); // same content
#ifndef TSTRING_IS_STD_STRING
UNIT_ASSERT(sUnicode.data() != wdata); // re-allocated (shared buffer supplied)
UNIT_ASSERT(sUnicode.data() == wres.data()); // same buffer
diff --git a/library/cpp/charset/ya.make b/library/cpp/charset/ya.make
index 7565566bf0..8906c507f0 100644
--- a/library/cpp/charset/ya.make
+++ b/library/cpp/charset/ya.make
@@ -1,10 +1,10 @@
-LIBRARY()
+LIBRARY()
OWNER(alzobnin)
SRCS(
- generated/cp_data.cpp
- generated/encrec_data.cpp
+ generated/cp_data.cpp
+ generated/encrec_data.cpp
codepage.cpp
cp_encrec.cpp
doccodes.cpp