aboutsummaryrefslogtreecommitdiffstats
path: root/util/charset
diff options
context:
space:
mode:
authorsmalov <smalov@yandex-team.ru>2022-02-10 16:47:36 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:47:36 +0300
commitcfadda92ca195da3ad68d721a58872a4f1ced696 (patch)
treec0748b5dcbade83af788c0abfa89c0383d6b779c /util/charset
parentf70d9720e13aef3a935e3f405b0eac554529e76e (diff)
downloadydb-cfadda92ca195da3ad68d721a58872a4f1ced696.tar.gz
Restoring authorship annotation for <smalov@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'util/charset')
-rw-r--r--util/charset/wide.cpp110
-rw-r--r--util/charset/wide.h104
-rw-r--r--util/charset/wide_ut.cpp868
3 files changed, 541 insertions, 541 deletions
diff --git a/util/charset/wide.cpp b/util/charset/wide.cpp
index e71abf0cfe..a287438ddd 100644
--- a/util/charset/wide.cpp
+++ b/util/charset/wide.cpp
@@ -3,17 +3,17 @@
#include <util/generic/mem_copy.h>
#include <util/string/strip.h>
-namespace {
- //! the constants are not zero-terminated
+namespace {
+ //! the constants are not zero-terminated
const wchar16 LT[] = {'&', 'l', 't', ';'};
const wchar16 GT[] = {'&', 'g', 't', ';'};
const wchar16 AMP[] = {'&', 'a', 'm', 'p', ';'};
const wchar16 BR[] = {'<', 'B', 'R', '>'};
const wchar16 QUOT[] = {'&', 'q', 'u', 'o', 't', ';'};
-
+
template <bool insertBr>
- inline size_t EscapedLen(wchar16 c) {
- switch (c) {
+ inline size_t EscapedLen(wchar16 c) {
+ switch (c) {
case '<':
return Y_ARRAY_SIZE(LT);
case '>':
@@ -27,37 +27,37 @@ namespace {
return Y_ARRAY_SIZE(BR);
else
return 1;
- }
- }
-}
-
+ }
+ }
+}
+
void Collapse(TUtf16String& w) {
CollapseImpl(w, w, 0, IsWhitespace);
-}
-
-size_t Collapse(wchar16* s, size_t n) {
+}
+
+size_t Collapse(wchar16* s, size_t n) {
return CollapseImpl(s, n, IsWhitespace);
-}
-
+}
+
TWtringBuf StripLeft(const TWtringBuf text) noexcept {
const auto* p = text.data();
const auto* const pe = text.data() + text.size();
-
+
for (; p != pe && IsWhitespace(*p); ++p) {
}
-
+
return {p, pe};
}
-
+
void StripLeft(TUtf16String& text) {
const auto stripped = StripLeft(TWtringBuf(text));
if (stripped.size() == text.size()) {
return;
- }
-
+ }
+
text = stripped;
-}
-
+}
+
TWtringBuf StripRight(const TWtringBuf text) noexcept {
if (!text) {
return {};
@@ -574,53 +574,53 @@ void EscapeHtmlChars(TUtf16String& str) {
static const TUtf16String amp(AMP, Y_ARRAY_SIZE(AMP));
static const TUtf16String br(BR, Y_ARRAY_SIZE(BR));
static const TUtf16String quot(QUOT, Y_ARRAY_SIZE(QUOT));
-
- size_t escapedLen = 0;
-
+
+ size_t escapedLen = 0;
+
const TUtf16String& cs = str;
-
- for (size_t i = 0; i < cs.size(); ++i)
+
+ for (size_t i = 0; i < cs.size(); ++i)
escapedLen += EscapedLen<insertBr>(cs[i]);
-
- if (escapedLen == cs.size())
- return;
-
+
+ if (escapedLen == cs.size())
+ return;
+
TUtf16String res;
- res.reserve(escapedLen);
-
- size_t start = 0;
-
+ res.reserve(escapedLen);
+
+ size_t start = 0;
+
for (size_t i = 0; i < cs.size(); ++i) {
const TUtf16String* ent = nullptr;
switch (cs[i]) {
- case '<':
- ent = &lt;
- break;
- case '>':
- ent = &gt;
- break;
- case '&':
- ent = &amp;
- break;
+ case '<':
+ ent = &lt;
+ break;
+ case '>':
+ ent = &gt;
+ break;
+ case '&':
+ ent = &amp;
+ break;
case '\"':
ent = &quot;
break;
- default:
+ default:
if (insertBr && (cs[i] == '\r' || cs[i] == '\n')) {
ent = &br;
break;
} else
continue;
- }
-
- res.append(cs.begin() + start, cs.begin() + i);
- res.append(ent->begin(), ent->end());
- start = i + 1;
- }
-
- res.append(cs.begin() + start, cs.end());
- res.swap(str);
-}
-
+ }
+
+ res.append(cs.begin() + start, cs.begin() + i);
+ res.append(ent->begin(), ent->end());
+ start = i + 1;
+ }
+
+ res.append(cs.begin() + start, cs.end());
+ res.swap(str);
+}
+
template void EscapeHtmlChars<false>(TUtf16String& str);
template void EscapeHtmlChars<true>(TUtf16String& str);
diff --git a/util/charset/wide.h b/util/charset/wide.h
index f8d63d4289..04e6928aab 100644
--- a/util/charset/wide.h
+++ b/util/charset/wide.h
@@ -23,7 +23,7 @@ template <class T>
class TTempArray;
using TCharTemp = TTempArray<wchar16>;
-namespace NDetail {
+namespace NDetail {
inline TString InStringMsg(const char* s, size_t len) {
return (len <= 50) ? " in string " + TString(s, len).Quote() : TString();
}
@@ -301,7 +301,7 @@ namespace NDetail {
//! @return len if robust and position where encoding stopped if not
template <bool robust, typename TCharType>
inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
- const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
+ const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
const unsigned char* last = cur + len;
TCharType* p = dest;
#ifdef _sse_ //can't check for sse4, as we build most of arcadia without sse4 support even on platforms that support it
@@ -311,10 +311,10 @@ inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size
#endif
::NDetail::UTF8ToWideImplScalar<robust>(cur, last, p);
- written = p - dest;
+ written = p - dest;
return cur - reinterpret_cast<const unsigned char*>(text);
-}
-
+}
+
template <typename TCharType>
inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size_t& written) {
return UTF8ToWideImpl<false>(text, len, dest, written);
@@ -421,13 +421,13 @@ inline TStringBuf WideToUTF8(const TWtringBuf src, TString& dst) {
inline TString WideToUTF8(const wchar16* text, size_t len) {
TString s = TString::Uninitialized(WideToUTF8BufferSize(len));
- size_t written = 0;
- WideToUTF8(text, len, s.begin(), written);
+ size_t written = 0;
+ WideToUTF8(text, len, s.begin(), written);
Y_ASSERT(s.size() >= written);
- s.remove(written);
- return s;
-}
-
+ s.remove(written);
+ return s;
+}
+
inline TString WideToUTF8(const wchar32* text, size_t len) {
TString s = TString::Uninitialized(WideToUTF8BufferSize(len));
size_t written = 0;
@@ -439,8 +439,8 @@ inline TString WideToUTF8(const wchar32* text, size_t len) {
inline TString WideToUTF8(const TWtringBuf w) {
return WideToUTF8(w.data(), w.size());
-}
-
+}
+
inline TString WideToUTF8(const TUtf32StringBuf w) {
return WideToUTF8(w.data(), w.size());
}
@@ -594,11 +594,11 @@ namespace NDetail {
}
-//! returns @c true if character sequence has no symbols with value greater than 0x7F
-template <typename TChar>
-inline bool IsStringASCII(const TChar* first, const TChar* last) {
+//! returns @c true if character sequence has no symbols with value greater than 0x7F
+template <typename TChar>
+inline bool IsStringASCII(const TChar* first, const TChar* last) {
return ::NDetail::DoIsStringASCII(first, last);
-}
+}
#ifdef _sse2_
template <>
@@ -618,76 +618,76 @@ inline void Copy(const TChar* first, size_t len, TChar* result) {
memcpy(result, first, len * sizeof(TChar));
}
-template <typename TChar1, typename TChar2>
-inline void Copy(const TChar1* first, size_t len, TChar2* result) {
- Copy(first, first + len, result);
-}
+template <typename TChar1, typename TChar2>
+inline void Copy(const TChar1* first, size_t len, TChar2* result) {
+ Copy(first, first + len, result);
+}
-//! copies symbols from one character sequence to another without any conversion
-//! @note this function can be used instead of the template constructor of @c std::basic_string:
-//! template <typename InputIterator>
-//! basic_string(InputIterator begin, InputIterator end, const Allocator& a = Allocator());
-//! and the family of template member functions: append, assign, insert, replace.
+//! copies symbols from one character sequence to another without any conversion
+//! @note this function can be used instead of the template constructor of @c std::basic_string:
+//! template <typename InputIterator>
+//! basic_string(InputIterator begin, InputIterator end, const Allocator& a = Allocator());
+//! and the family of template member functions: append, assign, insert, replace.
template <typename TStringType, typename TChar>
inline TStringType CopyTo(const TChar* first, const TChar* last) {
Y_ASSERT(first <= last);
TStringType str = TStringType::Uninitialized(last - first);
- Copy(first, last, str.begin());
- return str;
-}
+ Copy(first, last, str.begin());
+ return str;
+}
template <typename TStringType, typename TChar>
inline TStringType CopyTo(const TChar* s, size_t n) {
TStringType str = TStringType::Uninitialized(n);
- Copy(s, n, str.begin());
- return str;
-}
-
+ Copy(s, n, str.begin());
+ return str;
+}
+
inline TString WideToASCII(const TWtringBuf w) {
Y_ASSERT(IsStringASCII(w.begin(), w.end()));
return CopyTo<TString>(w.begin(), w.end());
-}
-
+}
+
inline TUtf16String ASCIIToWide(const TStringBuf s) {
Y_ASSERT(IsStringASCII(s.begin(), s.end()));
return CopyTo<TUtf16String>(s.begin(), s.end());
-}
-
+}
+
inline TUtf32String ASCIIToUTF32(const TStringBuf s) {
Y_ASSERT(IsStringASCII(s.begin(), s.end()));
return CopyTo<TUtf32String>(s.begin(), s.end());
}
-//! returns @c true if string contains whitespace characters only
+//! returns @c true if string contains whitespace characters only
inline bool IsSpace(const wchar16* s, size_t n) {
if (n == 0)
return false;
Y_ASSERT(s);
- const wchar16* const e = s + n;
- for (const wchar16* p = s; p != e; ++p) {
+ const wchar16* const e = s + n;
+ for (const wchar16* p = s; p != e; ++p) {
if (!IsWhitespace(*p))
- return false;
- }
- return true;
-}
-
+ return false;
+ }
+ return true;
+}
+
//! returns @c true if string contains whitespace characters only
inline bool IsSpace(const TWtringBuf s) {
return IsSpace(s.data(), s.length());
}
-//! replaces multiple sequential whitespace characters with a single space character
+//! replaces multiple sequential whitespace characters with a single space character
void Collapse(TUtf16String& w);
-
-//! @return new length
-size_t Collapse(wchar16* s, size_t n);
-
+
+//! @return new length
+size_t Collapse(wchar16* s, size_t n);
+
//! Removes leading whitespace characters
TWtringBuf StripLeft(const TWtringBuf text) noexcept Y_WARN_UNUSED_RESULT;
void StripLeft(TUtf16String& text);
-
+
//! Removes trailing whitespace characters
TWtringBuf StripRight(const TWtringBuf text) noexcept Y_WARN_UNUSED_RESULT;
void StripRight(TUtf16String& text);
@@ -807,7 +807,7 @@ TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos = 0, size_t count
TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
-//! replaces the '<', '>' and '&' characters in string with '&lt;', '&gt;' and '&amp;' respectively
+//! replaces the '<', '>' and '&' characters in string with '&lt;', '&gt;' and '&amp;' respectively
// insertBr=true - replace '\r' and '\n' with "<BR>"
template <bool insertBr>
void EscapeHtmlChars(TUtf16String& str);
diff --git a/util/charset/wide_ut.cpp b/util/charset/wide_ut.cpp
index 995d3d1d85..d8f3233e73 100644
--- a/util/charset/wide_ut.cpp
+++ b/util/charset/wide_ut.cpp
@@ -7,18 +7,18 @@
#include <algorithm>
-namespace {
- //! three UTF8 encoded russian letters (A, B, V)
+namespace {
+ //! three UTF8 encoded russian letters (A, B, V)
const char utext[] = "\xd0\x90\xd0\x91\xd0\x92";
-
- const char asciiLatinAlphabet[] = "ABCDEFGHIGKLMNOPQRSTUVWXYZabcdefghigklmnopqrstuvwxyz";
- const wchar16 wideLatinAlphabet[] = {
- 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'G', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+
+ const char asciiLatinAlphabet[] = "ABCDEFGHIGKLMNOPQRSTUVWXYZabcdefghigklmnopqrstuvwxyz";
+ const wchar16 wideLatinAlphabet[] = {
+ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'G', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'g', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0};
- const wchar16 wideCyrillicAlphabet[] = {
- 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
- 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
- 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
+ const wchar16 wideCyrillicAlphabet[] = {
+ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
+ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
+ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x00};
const char utf8CyrillicAlphabet[] =
"\xd0\x90\xd0\x91\xd0\x92\xd0\x93\xd0\x94\xd0\x95\xd0\x96\xd0\x97"
@@ -29,22 +29,22 @@ namespace {
"\xd0\xb8\xd0\xb9\xd0\xba\xd0\xbb\xd0\xbc\xd0\xbd\xd0\xbe\xd0\xbf"
"\xd1\x80\xd1\x81\xd1\x82\xd1\x83\xd1\x84\xd1\x85\xd1\x86\xd1\x87"
"\xd1\x88\xd1\x89\xd1\x8a\xd1\x8b\xd1\x8c\xd1\x8d\xd1\x8e\xd1\x8f";
-
- const wchar32 LEAD_BITS_MASK_2_BYTES = 0x1F;
- const wchar32 LEAD_BITS_MASK_3_BYTES = 0x0F;
+
+ const wchar32 LEAD_BITS_MASK_2_BYTES = 0x1F;
+ const wchar32 LEAD_BITS_MASK_3_BYTES = 0x0F;
const wchar32 LEAD_BITS_MASK_4_BYTES = 0x07;
-
- wchar16 ws[] = {
+
+ wchar16 ws[] = {
0x0009,
0x000A, 0x2028, 0x2029,
0x000B,
0x000C,
0x000D,
0x0020, 0x1680,
- 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B,
- 0x202F, 0x205F, 0x3000,
+ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B,
+ 0x202F, 0x205F, 0x3000,
0x00A0};
-
+
const size_t CaseTestDataSize = 10;
wchar32 WideStringTestData[][CaseTestDataSize] = {
{0x01C4, 0x10428, 0x10429, 0x10447, 0x10441, 0x1C03, 0x00A0, 0x10400, 0x10415, 0x10437}, // original
@@ -54,32 +54,32 @@ namespace {
};
TUtf16String CreateUnicodeText() {
- const int len = 256;
- wchar16 text[len] = {
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x00 - 0x0F
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x10 - 0x1F
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x20 - 0x2F
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x30 - 0x3F
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x40 - 0x4F
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x50 - 0x5F
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x60 - 0x6F
- 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x70 - 0x7F
-
- 0x0301, 0x00C4, 0x00D6, 0x00DC, 0x0104, 0x0106, 0x0118, 0x0141, 0x00E0, 0x00E2, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x0490, 0x00AD, // 0x80 - 0x8F
- 0x00DF, 0x00E4, 0x00F6, 0x00FC, 0x0105, 0x0107, 0x0119, 0x0142, 0x00EB, 0x00EE, 0x00EF, 0x00F4, 0x00F9, 0x00FB, 0x0491, 0x92CF, // 0x90 - 0x9F
- 0x00A0, 0x0143, 0x00D3, 0x015A, 0x017B, 0x0179, 0x046C, 0x00A7, 0x0401, 0x0462, 0x0472, 0x0474, 0x040E, 0x0406, 0x0404, 0x0407, // 0xA0 - 0xAF
- 0x00B0, 0x0144, 0x00F3, 0x015B, 0x017C, 0x017A, 0x046D, 0x2116, 0x0451, 0x0463, 0x0473, 0x0475, 0x045E, 0x0456, 0x0454, 0x0457 // 0xB0 - 0xBF
- };
- for (int i = 0; i < len; ++i) {
- if (i <= 0x7F) { // ASCII characters without 0x7 and 0x1B
- text[i] = static_cast<wchar16>(i);
+ const int len = 256;
+ wchar16 text[len] = {
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x00 - 0x0F
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x10 - 0x1F
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x20 - 0x2F
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x30 - 0x3F
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x40 - 0x4F
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x50 - 0x5F
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x60 - 0x6F
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x70 - 0x7F
+
+ 0x0301, 0x00C4, 0x00D6, 0x00DC, 0x0104, 0x0106, 0x0118, 0x0141, 0x00E0, 0x00E2, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x0490, 0x00AD, // 0x80 - 0x8F
+ 0x00DF, 0x00E4, 0x00F6, 0x00FC, 0x0105, 0x0107, 0x0119, 0x0142, 0x00EB, 0x00EE, 0x00EF, 0x00F4, 0x00F9, 0x00FB, 0x0491, 0x92CF, // 0x90 - 0x9F
+ 0x00A0, 0x0143, 0x00D3, 0x015A, 0x017B, 0x0179, 0x046C, 0x00A7, 0x0401, 0x0462, 0x0472, 0x0474, 0x040E, 0x0406, 0x0404, 0x0407, // 0xA0 - 0xAF
+ 0x00B0, 0x0144, 0x00F3, 0x015B, 0x017C, 0x017A, 0x046D, 0x2116, 0x0451, 0x0463, 0x0473, 0x0475, 0x045E, 0x0456, 0x0454, 0x0457 // 0xB0 - 0xBF
+ };
+ for (int i = 0; i < len; ++i) {
+ if (i <= 0x7F) { // ASCII characters without 0x7 and 0x1B
+ text[i] = static_cast<wchar16>(i);
} else if (i >= 0xC0 && i <= 0xFF) { // russian characters (without YO and yo)
- text[i] = static_cast<wchar16>(i + 0x0350); // 0x0410 - 0x044F
- }
- }
+ text[i] = static_cast<wchar16>(i + 0x0350); // 0x0410 - 0x044F
+ }
+ }
return TUtf16String(text, len);
- }
-
+ }
+
TString CreateUTF8Text() {
char text[] = {
'\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
@@ -108,9 +108,9 @@ namespace {
'\xd1', '\x87', '\xd1', '\x88', '\xd1', '\x89', '\xd1', '\x8a', '\xd1', '\x8b', '\xd1', '\x8c', '\xd1', '\x8d', '\xd1', '\x8e',
'\xd1', '\x8f'};
return TString(text, Y_ARRAY_SIZE(text));
- }
-
- //! use this function to dump UTF8 text into a file in case of any changes
+ }
+
+ //! use this function to dump UTF8 text into a file in case of any changes
// void DumpUTF8Text() {
// TString s = WideToUTF8(UnicodeText);
// std::ofstream f("utf8.txt");
@@ -121,53 +121,53 @@ namespace {
// f << std::endl;
// }
// }
-
- void CheckRecodeOK(wchar32 expected, unsigned char* first, size_t n) {
- wchar32 w = 0;
- const unsigned char* p = first;
-
+
+ void CheckRecodeOK(wchar32 expected, unsigned char* first, size_t n) {
+ wchar32 w = 0;
+ const unsigned char* p = first;
+
RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, first + n);
- UNIT_ASSERT(w == expected);
- UNIT_ASSERT(size_t(p - first) == n);
- UNIT_ASSERT(r == RECODE_OK);
- }
-
- void CheckBrokenSymbol(unsigned char* first, unsigned char* last) {
- wchar32 w = 0;
- const unsigned char* p = first;
-
+ UNIT_ASSERT(w == expected);
+ UNIT_ASSERT(size_t(p - first) == n);
+ UNIT_ASSERT(r == RECODE_OK);
+ }
+
+ void CheckBrokenSymbol(unsigned char* first, unsigned char* last) {
+ wchar32 w = 0;
+ const unsigned char* p = first;
+
RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, last);
- UNIT_ASSERT(w == BROKEN_RUNE);
- UNIT_ASSERT(p - first == 0);
- UNIT_ASSERT(r == RECODE_BROKENSYMBOL);
- }
-
- void CheckEndOfInput(unsigned char* first, size_t n) {
- wchar32 w = 0;
- const unsigned char* p = first;
-
+ UNIT_ASSERT(w == BROKEN_RUNE);
+ UNIT_ASSERT(p - first == 0);
+ UNIT_ASSERT(r == RECODE_BROKENSYMBOL);
+ }
+
+ void CheckEndOfInput(unsigned char* first, size_t n) {
+ wchar32 w = 0;
+ const unsigned char* p = first;
+
RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, first + n);
- (void)w;
- UNIT_ASSERT(p - first == 0);
- UNIT_ASSERT(r == RECODE_EOINPUT);
- }
-
- void CheckCharLen(unsigned char* first, unsigned char* last, size_t len, RECODE_RESULT result) {
- size_t n = 0;
+ (void)w;
+ UNIT_ASSERT(p - first == 0);
+ UNIT_ASSERT(r == RECODE_EOINPUT);
+ }
+
+ void CheckCharLen(unsigned char* first, unsigned char* last, size_t len, RECODE_RESULT result) {
+ size_t n = 0;
RECODE_RESULT r = GetUTF8CharLen(n, first, last);
- UNIT_ASSERT(n == len);
- UNIT_ASSERT(r == result);
- }
-}
-
+ UNIT_ASSERT(n == len);
+ UNIT_ASSERT(r == result);
+ }
+}
+
class TConversionTest: public TTestBase {
-private:
- //! @note every of the text can have zeros in the middle
+private:
+ //! @note every of the text can have zeros in the middle
const TUtf16String UnicodeText_;
const TString Utf8Text_;
-
-private:
- UNIT_TEST_SUITE(TConversionTest);
+
+private:
+ UNIT_TEST_SUITE(TConversionTest);
UNIT_TEST(TestReadUTF8Char);
UNIT_TEST(TestGetUTF8CharLen);
UNIT_TEST(TestWriteUTF8Char);
@@ -178,29 +178,29 @@ private:
UNIT_TEST(TestUnicodeCase);
UNIT_TEST(TestUnicodeDetails);
UNIT_TEST(TestHexConversion);
- UNIT_TEST_SUITE_END();
-
-public:
- TConversionTest()
+ UNIT_TEST_SUITE_END();
+
+public:
+ TConversionTest()
: UnicodeText_(CreateUnicodeText())
, Utf8Text_(CreateUTF8Text())
- {
- }
-
- void TestReadUTF8Char();
- void TestGetUTF8CharLen();
- void TestWriteUTF8Char();
- void TestUTF8ToWide();
- void TestWideToUTF8();
- void TestGetNumOfUTF8Chars();
+ {
+ }
+
+ void TestReadUTF8Char();
+ void TestGetUTF8CharLen();
+ void TestWriteUTF8Char();
+ void TestUTF8ToWide();
+ void TestWideToUTF8();
+ void TestGetNumOfUTF8Chars();
void TestSubstrUTF8();
void TestUnicodeCase();
void TestUnicodeDetails();
void TestHexConversion();
-};
-
-UNIT_TEST_SUITE_REGISTRATION(TConversionTest);
-
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TConversionTest);
+
void TConversionTest::TestHexConversion() {
for (char ch = '0'; ch <= '9'; ++ch) {
UNIT_ASSERT(isxdigit(ch));
@@ -224,25 +224,25 @@ void TConversionTest::TestHexConversion() {
}
}
-void TConversionTest::TestReadUTF8Char() {
- wchar32 e; // expected unicode char
- wchar32 c;
- unsigned long u; // single UTF8 encoded character
- unsigned char* const first = reinterpret_cast<unsigned char*>(&u);
- unsigned char* const last = first + sizeof(u);
-
- // all ASCII characters are converted with no change (zero converted successfully as well)
- for (c = 0; c <= 0x7F; ++c) {
- u = c;
- CheckRecodeOK(c, first, 1);
- }
-
- // broken symbols from the second half of ASCII table (1000 0000 - 1011 1111)
- for (c = 0x80; c <= 0xBF; ++c) {
- u = c;
- CheckBrokenSymbol(first, last);
- }
-
+void TConversionTest::TestReadUTF8Char() {
+ wchar32 e; // expected unicode char
+ wchar32 c;
+ unsigned long u; // single UTF8 encoded character
+ unsigned char* const first = reinterpret_cast<unsigned char*>(&u);
+ unsigned char* const last = first + sizeof(u);
+
+ // all ASCII characters are converted with no change (zero converted successfully as well)
+ for (c = 0; c <= 0x7F; ++c) {
+ u = c;
+ CheckRecodeOK(c, first, 1);
+ }
+
+ // broken symbols from the second half of ASCII table (1000 0000 - 1011 1111)
+ for (c = 0x80; c <= 0xBF; ++c) {
+ u = c;
+ CheckBrokenSymbol(first, last);
+ }
+
// overlong encoding: leading byte of 2-byte symbol: 1100 0000 - 1100 0001
for (c = 0xC0; c <= 0xC1; ++c) {
u = c;
@@ -254,20 +254,20 @@ void TConversionTest::TestReadUTF8Char() {
CheckEndOfInput(first, 1);
}
- // leading byte of 2-byte symbol: 1100 0000 - 1101 1111
+ // leading byte of 2-byte symbol: 1100 0000 - 1101 1111
for (c = 0xC2; c <= 0xDF; ++c) {
- u = c;
- CheckBrokenSymbol(first, last);
-
- u |= 0x8000;
- // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
- e = c & LEAD_BITS_MASK_2_BYTES;
- e <<= 6;
- CheckRecodeOK(e, first, 2);
-
- CheckEndOfInput(first, 1);
- }
-
+ u = c;
+ CheckBrokenSymbol(first, last);
+
+ u |= 0x8000;
+ // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
+ e = c & LEAD_BITS_MASK_2_BYTES;
+ e <<= 6;
+ CheckRecodeOK(e, first, 2);
+
+ CheckEndOfInput(first, 1);
+ }
+
// possible overlong encoding with leading byte 1110 0000
{
u = c = 0xE0;
@@ -286,19 +286,19 @@ void TConversionTest::TestReadUTF8Char() {
// leading byte of 3-byte symbol: 1110 0001 - 1110 1111
for (c = 0xE1; c <= 0xEF; ++c) {
- u = c;
- CheckBrokenSymbol(first, last);
-
- u |= 0x808000;
- // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
- e = c & LEAD_BITS_MASK_3_BYTES;
- e <<= 12;
- CheckRecodeOK(e, first, 3);
-
- CheckEndOfInput(first, 2);
- CheckEndOfInput(first, 1);
- }
-
+ u = c;
+ CheckBrokenSymbol(first, last);
+
+ u |= 0x808000;
+ // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
+ e = c & LEAD_BITS_MASK_3_BYTES;
+ e <<= 12;
+ CheckRecodeOK(e, first, 3);
+
+ CheckEndOfInput(first, 2);
+ CheckEndOfInput(first, 1);
+ }
+
// possible overlong encoding with leading byte 1111 0000
{
u = c = 0xF0;
@@ -318,20 +318,20 @@ void TConversionTest::TestReadUTF8Char() {
// leading byte of 4-byte symbol: 1111 0001 - 1111 0111
for (c = 0xF1; c <= 0xF3; ++c) {
- u = c;
- CheckBrokenSymbol(first, last);
-
- u |= 0x80808000;
- // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
- e = c & LEAD_BITS_MASK_4_BYTES;
- e <<= 18;
- CheckRecodeOK(e, first, 4);
-
- CheckEndOfInput(first, 3);
- CheckEndOfInput(first, 2);
- CheckEndOfInput(first, 1);
- }
-
+ u = c;
+ CheckBrokenSymbol(first, last);
+
+ u |= 0x80808000;
+ // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
+ e = c & LEAD_BITS_MASK_4_BYTES;
+ e <<= 18;
+ CheckRecodeOK(e, first, 4);
+
+ CheckEndOfInput(first, 3);
+ CheckEndOfInput(first, 2);
+ CheckEndOfInput(first, 1);
+ }
+
// possible invalid code points with leading byte 1111 0100
{
c = 0xF4;
@@ -352,108 +352,108 @@ void TConversionTest::TestReadUTF8Char() {
// broken symbols: 1111 0101 - 1111 1111
for (c = 0xF5; c <= 0xFF; ++c) {
- u = c;
- CheckBrokenSymbol(first, last);
- }
-}
-
-void TConversionTest::TestGetUTF8CharLen() {
- wchar32 c;
- unsigned long u; // single UTF8 encoded character
- unsigned char* const first = reinterpret_cast<unsigned char*>(&u);
- unsigned char* const last = first + sizeof(u);
-
- // all ASCII characters are converted with no change (zero converted successfully as well)
- for (c = 0; c <= 0x7F; ++c) {
- u = c;
- CheckCharLen(first, last, 1, RECODE_OK);
- }
-
- // broken symbols from the second half of ASCII table (1000 0000 - 1011 1111)
- for (c = 0x80; c <= 0xBF; ++c) {
- u = c;
- CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
- }
-
- // leading byte of 2-byte symbol: 1100 0000 - 1101 1111
- for (c = 0xC0; c <= 0xDF; ++c) {
- u = c;
- CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
-
- u |= 0x8000;
- // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
- CheckCharLen(first, last, 2, RECODE_OK);
-
- CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
- }
-
- // leading byte of 3-byte symbol: 1110 0000 - 1110 1111
- for (c = 0xE0; c <= 0xEF; ++c) {
- u = c;
- CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
-
- u |= 0x808000;
- // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
- CheckCharLen(first, last, 3, RECODE_OK);
-
- CheckCharLen(first, first + 2, 0, RECODE_EOINPUT);
- CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
- }
-
- // leading byte of 4-byte symbol: 1111 0000 - 1111 0111
- for (c = 0xF0; c <= 0xF3; ++c) {
- u = c;
- CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
-
- u |= 0x80808000;
- // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
- CheckCharLen(first, last, 4, RECODE_OK);
-
- CheckCharLen(first, first + 3, 0, RECODE_EOINPUT);
- CheckCharLen(first, first + 2, 0, RECODE_EOINPUT);
- CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
- }
-
- // broken symbols: 1111 1000 - 1111 1111
- for (c = 0xF8; c <= 0xFF; ++c) {
- u = c;
- CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
- }
-}
-
-void TConversionTest::TestWriteUTF8Char() {
- wchar32 w;
- unsigned long u; // single UTF8 encoded character
- size_t n;
-
- for (w = 0x00; w < 0x80; ++w) {
- u = 0;
+ u = c;
+ CheckBrokenSymbol(first, last);
+ }
+}
+
+void TConversionTest::TestGetUTF8CharLen() {
+ wchar32 c;
+ unsigned long u; // single UTF8 encoded character
+ unsigned char* const first = reinterpret_cast<unsigned char*>(&u);
+ unsigned char* const last = first + sizeof(u);
+
+ // all ASCII characters are converted with no change (zero converted successfully as well)
+ for (c = 0; c <= 0x7F; ++c) {
+ u = c;
+ CheckCharLen(first, last, 1, RECODE_OK);
+ }
+
+ // broken symbols from the second half of ASCII table (1000 0000 - 1011 1111)
+ for (c = 0x80; c <= 0xBF; ++c) {
+ u = c;
+ CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
+ }
+
+ // leading byte of 2-byte symbol: 1100 0000 - 1101 1111
+ for (c = 0xC0; c <= 0xDF; ++c) {
+ u = c;
+ CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
+
+ u |= 0x8000;
+ // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
+ CheckCharLen(first, last, 2, RECODE_OK);
+
+ CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
+ }
+
+ // leading byte of 3-byte symbol: 1110 0000 - 1110 1111
+ for (c = 0xE0; c <= 0xEF; ++c) {
+ u = c;
+ CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
+
+ u |= 0x808000;
+ // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
+ CheckCharLen(first, last, 3, RECODE_OK);
+
+ CheckCharLen(first, first + 2, 0, RECODE_EOINPUT);
+ CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
+ }
+
+ // leading byte of 4-byte symbol: 1111 0000 - 1111 0111
+ for (c = 0xF0; c <= 0xF3; ++c) {
+ u = c;
+ CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
+
+ u |= 0x80808000;
+ // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
+ CheckCharLen(first, last, 4, RECODE_OK);
+
+ CheckCharLen(first, first + 3, 0, RECODE_EOINPUT);
+ CheckCharLen(first, first + 2, 0, RECODE_EOINPUT);
+ CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
+ }
+
+ // broken symbols: 1111 1000 - 1111 1111
+ for (c = 0xF8; c <= 0xFF; ++c) {
+ u = c;
+ CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
+ }
+}
+
+void TConversionTest::TestWriteUTF8Char() {
+ wchar32 w;
+ unsigned long u; // single UTF8 encoded character
+ size_t n;
+
+ for (w = 0x00; w < 0x80; ++w) {
+ u = 0;
WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
- UNIT_ASSERT((u & 0xFFFFFF80) == 0x00000000);
- UNIT_ASSERT(n == 1);
- }
-
- for (w = 0x80; w < 0x800; ++w) {
- u = 0;
+ UNIT_ASSERT((u & 0xFFFFFF80) == 0x00000000);
+ UNIT_ASSERT(n == 1);
+ }
+
+ for (w = 0x80; w < 0x800; ++w) {
+ u = 0;
WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
- UNIT_ASSERT((u & 0xFFFFC000) == 0x00008000); // see constants in ReadUTF8Char
- UNIT_ASSERT(n == 2);
- }
-
- for (w = 0x800; w < 0x10000; ++w) {
- u = 0;
+ UNIT_ASSERT((u & 0xFFFFC000) == 0x00008000); // see constants in ReadUTF8Char
+ UNIT_ASSERT(n == 2);
+ }
+
+ for (w = 0x800; w < 0x10000; ++w) {
+ u = 0;
WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
- UNIT_ASSERT((u & 0xFFC0C000) == 0x00808000); // see constants in ReadUTF8Char
- UNIT_ASSERT(n == 3);
- }
-
- for (w = 0x10000; w < 0x80; ++w) {
+ UNIT_ASSERT((u & 0xFFC0C000) == 0x00808000); // see constants in ReadUTF8Char
+ UNIT_ASSERT(n == 3);
+ }
+
+ for (w = 0x10000; w < 0x80; ++w) {
WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
- UNIT_ASSERT((u & 0xC0C0C000) == 0x80808000); // see constants in ReadUTF8Char
- UNIT_ASSERT(n == 4);
- }
-}
-
+ UNIT_ASSERT((u & 0xC0C0C000) == 0x80808000); // see constants in ReadUTF8Char
+ UNIT_ASSERT(n == 4);
+ }
+}
+
static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) {
TUtf16String w = UTF8ToWide(str);
@@ -465,41 +465,41 @@ static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize
UNIT_ASSERT(s == str);
}
-void TConversionTest::TestUTF8ToWide() {
+void TConversionTest::TestUTF8ToWide() {
TUtf16String w = UTF8ToWide(Utf8Text_);
-
- UNIT_ASSERT(w.size() == 256);
+
+ UNIT_ASSERT(w.size() == 256);
UNIT_ASSERT(w.size() == UnicodeText_.size());
-
- for (int i = 0; i < 256; ++i) {
+
+ for (int i = 0; i < 256; ++i) {
UNIT_ASSERT_VALUES_EQUAL(w[i], UnicodeText_[i]);
- }
-
+ }
+
wchar16 buffer[4] = {0};
- size_t written = 0;
- // the function must extract 2 symbols only
+ size_t written = 0;
+ // the function must extract 2 symbols only
bool result = UTF8ToWide(utext, 5, buffer, written);
- UNIT_ASSERT(!result);
- UNIT_ASSERT(buffer[0] == 0x0410);
- UNIT_ASSERT(buffer[1] == 0x0411);
- UNIT_ASSERT(buffer[2] == 0x0000);
- UNIT_ASSERT(buffer[3] == 0x0000);
- UNIT_ASSERT(written == 2);
-
- memset(buffer, 0, 4);
- written = 0;
+ UNIT_ASSERT(!result);
+ UNIT_ASSERT(buffer[0] == 0x0410);
+ UNIT_ASSERT(buffer[1] == 0x0411);
+ UNIT_ASSERT(buffer[2] == 0x0000);
+ UNIT_ASSERT(buffer[3] == 0x0000);
+ UNIT_ASSERT(written == 2);
+
+ memset(buffer, 0, 4);
+ written = 0;
result = UTF8ToWide(utext, 1, buffer, written);
- UNIT_ASSERT(!result);
- UNIT_ASSERT(buffer[0] == 0x0000);
- UNIT_ASSERT(buffer[1] == 0x0000);
- UNIT_ASSERT(buffer[2] == 0x0000);
- UNIT_ASSERT(buffer[3] == 0x0000);
- UNIT_ASSERT(written == 0);
+ UNIT_ASSERT(!result);
+ UNIT_ASSERT(buffer[0] == 0x0000);
+ UNIT_ASSERT(buffer[1] == 0x0000);
+ UNIT_ASSERT(buffer[2] == 0x0000);
+ UNIT_ASSERT(buffer[3] == 0x0000);
+ UNIT_ASSERT(written == 0);
w = UTF8ToWide(asciiLatinAlphabet, strlen(asciiLatinAlphabet));
- UNIT_ASSERT(w == wideLatinAlphabet);
+ UNIT_ASSERT(w == wideLatinAlphabet);
w = UTF8ToWide(utf8CyrillicAlphabet, strlen(utf8CyrillicAlphabet));
- UNIT_ASSERT(w == wideCyrillicAlphabet);
+ UNIT_ASSERT(w == wideCyrillicAlphabet);
const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba";
wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
@@ -513,42 +513,42 @@ void TConversionTest::TestUTF8ToWide() {
"m\xFB\xB2\xA5\xAA\xAFyeuse.sexwebcamz.com")))),
TString(
"m\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBDyeuse.sexwebcamz.com"));
-}
-
-void TConversionTest::TestWideToUTF8() {
+}
+
+void TConversionTest::TestWideToUTF8() {
TString s = WideToUTF8(UnicodeText_);
size_t len = 0;
for (TUtf16String::const_iterator i = UnicodeText_.begin(), ie = UnicodeText_.end(); i != ie; ++i) {
len += UTF8RuneLenByUCS(*i);
}
-
+
UNIT_ASSERT(s.size() == Utf8Text_.size());
UNIT_ASSERT(s.size() == len);
-
- for (int i = 0; i < static_cast<int>(s.size()); ++i) {
+
+ for (int i = 0; i < static_cast<int>(s.size()); ++i) {
UNIT_ASSERT_VALUES_EQUAL(s[i], Utf8Text_[i]);
- }
-}
-
-void TConversionTest::TestGetNumOfUTF8Chars() {
- size_t n = 0;
+ }
+}
+
+void TConversionTest::TestGetNumOfUTF8Chars() {
+ size_t n = 0;
bool result = GetNumberOfUTF8Chars(Utf8Text_.c_str(), Utf8Text_.size(), n);
- UNIT_ASSERT(result);
- UNIT_ASSERT(n == 256);
-
- n = 0;
+ UNIT_ASSERT(result);
+ UNIT_ASSERT(n == 256);
+
+ n = 0;
result = GetNumberOfUTF8Chars(utext, 5, n);
- UNIT_ASSERT(!result);
- UNIT_ASSERT(n == 2);
-
- n = 0;
+ UNIT_ASSERT(!result);
+ UNIT_ASSERT(n == 2);
+
+ n = 0;
result = GetNumberOfUTF8Chars(utext, 1, n);
- UNIT_ASSERT(!result);
- UNIT_ASSERT(n == 0);
+ UNIT_ASSERT(!result);
+ UNIT_ASSERT(n == 0);
UNIT_ASSERT_EQUAL(GetNumberOfUTF8Chars("привет!"), 7);
-}
-
+}
+
void TConversionTest::TestSubstrUTF8() {
TStringBuf utextBuf(utext, sizeof(utext));
UNIT_ASSERT(SubstrUTF8(utextBuf, 0, 2) == utextBuf.substr(0, 4));
@@ -580,7 +580,7 @@ void TConversionTest::TestUnicodeDetails() {
}
class TWideUtilTest: public TTestBase {
- UNIT_TEST_SUITE(TWideUtilTest);
+ UNIT_TEST_SUITE(TWideUtilTest);
UNIT_TEST(TestCollapse);
UNIT_TEST(TestCollapseBuffer);
UNIT_TEST(TestStrip);
@@ -600,148 +600,148 @@ class TWideUtilTest: public TTestBase {
UNIT_TEST(TestToLowerStr);
UNIT_TEST(TestToUpperStr);
UNIT_TEST(TestToTitleStr);
- UNIT_TEST_SUITE_END();
-
-public:
- void TestCollapse() {
+ UNIT_TEST_SUITE_END();
+
+public:
+ void TestCollapse() {
TUtf16String s;
s.append(ws, Y_ARRAY_SIZE(ws)).append(3, 'a').append(ws, Y_ARRAY_SIZE(ws)).append(3, 'b').append(ws, Y_ARRAY_SIZE(ws));
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide(" aaa bbb "));
- {
+ {
const TUtf16String w(ASCIIToWide(" a b c "));
- s = w;
- Collapse(s);
- UNIT_ASSERT(s == w);
+ s = w;
+ Collapse(s);
+ UNIT_ASSERT(s == w);
#ifndef TSTRING_IS_STD_STRING
- UNIT_ASSERT(s.c_str() == w.c_str()); // Collapse() does not change the string at all
+ UNIT_ASSERT(s.c_str() == w.c_str()); // Collapse() does not change the string at all
#endif
- }
+ }
s = ASCIIToWide(" 123 456 ");
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide(" 123 456 "));
-
+
s = ASCIIToWide(" 1\n\n\n23\t 4\f\f56 ");
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide(" 1 23 4 56 "));
-
+
s = ASCIIToWide(" 1\n\n\n\f\f56 ");
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide(" 1 56 "));
-
+
s = ASCIIToWide(" 1\r\n,\n(\n23\t 4\f\f56 ");
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide(" 1 , ( 23 4 56 "));
-
+
s = ASCIIToWide("1 23 ");
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide("1 23 "));
- {
+ {
const TUtf16String w = ASCIIToWide(" ");
- s = w;
- Collapse(s);
- UNIT_ASSERT(s == w);
+ s = w;
+ Collapse(s);
+ UNIT_ASSERT(s == w);
#ifndef TSTRING_IS_STD_STRING
- UNIT_ASSERT(s.c_str() == w.c_str()); // Collapse() does not change the string at all
+ UNIT_ASSERT(s.c_str() == w.c_str()); // Collapse() does not change the string at all
#endif
- }
+ }
s = ASCIIToWide(" ");
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide(" "));
-
+
s = ASCIIToWide(",\r\n\"");
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide(", \""));
-
+
s = ASCIIToWide("-");
- Collapse(s);
+ Collapse(s);
UNIT_ASSERT(s == ASCIIToWide("-"));
-
- s.clear();
- Collapse(s);
+
+ s.clear();
+ Collapse(s);
UNIT_ASSERT(s == TUtf16String());
- }
-
- void TestCollapseBuffer() {
+ }
+
+ void TestCollapseBuffer() {
TUtf16String s;
s.append(ws, Y_ARRAY_SIZE(ws)).append(3, 'a').append(ws, Y_ARRAY_SIZE(ws)).append(3, 'b').append(ws, Y_ARRAY_SIZE(ws));
- size_t n = Collapse(s.begin(), s.size());
- s.resize(n);
+ size_t n = Collapse(s.begin(), s.size());
+ s.resize(n);
UNIT_ASSERT(s == ASCIIToWide(" aaa bbb "));
-
+
s = ASCIIToWide(" a b c ");
- n = Collapse(s.begin(), s.size());
- UNIT_ASSERT(n == s.size()); // length was not changed
+ n = Collapse(s.begin(), s.size());
+ UNIT_ASSERT(n == s.size()); // length was not changed
UNIT_ASSERT(s == ASCIIToWide(" a b c "));
-
+
s = ASCIIToWide(" 123 456 ");
- n = Collapse(s.begin(), s.size());
- s.resize(n);
+ n = Collapse(s.begin(), s.size());
+ s.resize(n);
UNIT_ASSERT(s == ASCIIToWide(" 123 456 "));
-
+
s = ASCIIToWide(" 1\n\n\n23\t 4\f\f56 ");
- n = Collapse(s.begin(), s.size());
- s.resize(n);
+ n = Collapse(s.begin(), s.size());
+ s.resize(n);
UNIT_ASSERT(s == ASCIIToWide(" 1 23 4 56 "));
-
+
s = ASCIIToWide(" 1\n\n\n\f\f56 ");
- n = Collapse(s.begin(), s.size());
- s.resize(n);
+ n = Collapse(s.begin(), s.size());
+ s.resize(n);
UNIT_ASSERT(s == ASCIIToWide(" 1 56 "));
-
+
s = ASCIIToWide(" 1\r\n,\n(\n23\t 4\f\f56 ");
- n = Collapse(s.begin(), s.size());
- s.resize(n);
+ n = Collapse(s.begin(), s.size());
+ s.resize(n);
UNIT_ASSERT(s == ASCIIToWide(" 1 , ( 23 4 56 "));
-
+
s = ASCIIToWide("1 23 ");
- n = Collapse(s.begin(), s.size());
- s.resize(n);
+ n = Collapse(s.begin(), s.size());
+ s.resize(n);
UNIT_ASSERT(s == ASCIIToWide("1 23 "));
-
+
s = ASCIIToWide(" ");
- n = Collapse(s.begin(), s.size());
- UNIT_ASSERT(n == 1);
+ n = Collapse(s.begin(), s.size());
+ UNIT_ASSERT(n == 1);
UNIT_ASSERT(s == ASCIIToWide(" "));
-
+
s = ASCIIToWide(" ");
- n = Collapse(s.begin(), s.size());
- s.resize(n);
+ n = Collapse(s.begin(), s.size());
+ s.resize(n);
UNIT_ASSERT(s == ASCIIToWide(" "));
-
+
s = ASCIIToWide(",\r\n\"");
- n = Collapse(s.begin(), s.size());
- s.resize(n);
+ n = Collapse(s.begin(), s.size());
+ s.resize(n);
UNIT_ASSERT(s == ASCIIToWide(", \""));
-
+
s = ASCIIToWide("-");
- n = Collapse(s.begin(), s.size());
- UNIT_ASSERT(n == 1);
+ n = Collapse(s.begin(), s.size());
+ UNIT_ASSERT(n == 1);
UNIT_ASSERT(s == ASCIIToWide("-"));
-
+
s = ASCIIToWide("\t");
- n = Collapse(s.begin(), s.size());
- UNIT_ASSERT(n == 1);
+ n = Collapse(s.begin(), s.size());
+ UNIT_ASSERT(n == 1);
UNIT_ASSERT(s == ASCIIToWide(" "));
-
- s.clear();
- n = Collapse(s.begin(), s.size());
- UNIT_ASSERT(n == 0);
+
+ s.clear();
+ n = Collapse(s.begin(), s.size());
+ UNIT_ASSERT(n == 0);
UNIT_ASSERT(s == TUtf16String());
- }
-
- void TestStrip() {
+ }
+
+ void TestStrip() {
TUtf16String s;
-
- Strip(s);
+
+ Strip(s);
UNIT_ASSERT(s == TUtf16String());
StripLeft(s);
UNIT_ASSERT(s == TUtf16String());
StripRight(s);
UNIT_ASSERT(s == TUtf16String());
-
+
s = ASCIIToWide(" \t\r\n");
- Strip(s);
+ Strip(s);
UNIT_ASSERT(s == TUtf16String());
s = ASCIIToWide(" \t\r\n");
StripLeft(s);
@@ -749,9 +749,9 @@ public:
s = ASCIIToWide(" \t\r\n");
StripRight(s);
UNIT_ASSERT(s == TUtf16String());
-
+
s = ASCIIToWide("\t\f\va \r\n");
- Strip(s);
+ Strip(s);
UNIT_ASSERT(s == ASCIIToWide("a"));
s = ASCIIToWide("\t\f\va \r\n");
StripLeft(s);
@@ -759,9 +759,9 @@ public:
s = ASCIIToWide("\t\f\va \r\n");
StripRight(s);
UNIT_ASSERT(s == ASCIIToWide("\t\f\va"));
-
+
s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
- Strip(s);
+ Strip(s);
UNIT_ASSERT(s == ASCIIToWide("a\r\nb\t\tc"));
s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
StripLeft(s);
@@ -769,13 +769,13 @@ public:
s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
StripRight(s);
UNIT_ASSERT(s == ASCIIToWide("\r\na\r\nb\t\tc"));
-
+
const TUtf16String w(ASCIIToWide("a b"));
- s = w;
- Strip(s);
- UNIT_ASSERT(s == w);
+ s = w;
+ Strip(s);
+ UNIT_ASSERT(s == w);
#ifndef TSTRING_IS_STD_STRING
- UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
+ UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
#endif
s = w;
StripLeft(s);
@@ -789,31 +789,31 @@ public:
#ifndef TSTRING_IS_STD_STRING
UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
#endif
- }
-
- void TestIsSpace() {
+ }
+
+ void TestIsSpace() {
UNIT_ASSERT(!IsSpace(TUtf16String()));
UNIT_ASSERT(IsSpace(ws, Y_ARRAY_SIZE(ws)));
-
+
TUtf16String w;
w.assign(ws, Y_ARRAY_SIZE(ws)).append(TUtf16String(1, '!'));
- UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
-
+ UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
+
w.assign(TUtf16String(1, '_')).append(ws, Y_ARRAY_SIZE(ws));
- UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
-
+ UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
+
w.assign(ws, Y_ARRAY_SIZE(ws)).append(TUtf16String(1, '$')).append(ws, Y_ARRAY_SIZE(ws));
- UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
- }
-
- void TestEscapeHtmlChars() {
- // characters from the first half of the ASCII table
+ UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
+ }
+
+ void TestEscapeHtmlChars() {
+ // characters from the first half of the ASCII table
for (wchar16 c = 1; c < 0x7F; ++c) {
TUtf16String w(1, c);
EscapeHtmlChars<false>(w);
-
- switch (c) {
+
+ switch (c) {
case '<':
UNIT_ASSERT(w == ASCIIToWide("&lt;"));
break;
@@ -829,8 +829,8 @@ public:
default:
UNIT_ASSERT(w == TUtf16String(1, c));
break;
- }
- }
+ }
+ }
for (wchar16 c = 1; c < 0x7F; ++c) {
TUtf16String w(1, c);
@@ -858,24 +858,24 @@ public:
break;
}
}
- }
+ }
- void TestToLower() {
- const size_t n = 32;
- wchar16 upperCase[n];
+ void TestToLower() {
+ const size_t n = 32;
+ wchar16 upperCase[n];
std::copy(wideCyrillicAlphabet, wideCyrillicAlphabet + n, upperCase);
- ToLower(upperCase, n);
+ ToLower(upperCase, n);
UNIT_ASSERT(TWtringBuf(upperCase, n) == TWtringBuf(wideCyrillicAlphabet + n, n));
- }
-
- void TestToUpper() {
- const size_t n = 32;
- wchar16 lowerCase[n];
+ }
+
+ void TestToUpper() {
+ const size_t n = 32;
+ wchar16 lowerCase[n];
std::copy(wideCyrillicAlphabet + n, wideCyrillicAlphabet + n * 2, lowerCase);
- ToUpper(lowerCase, n);
+ ToUpper(lowerCase, n);
UNIT_ASSERT(TWtringBuf(lowerCase, n) == TWtringBuf(wideCyrillicAlphabet, n));
- }
-
+ }
+
void TestWideString() {
const TUtf16String original = UTF32ToWide(WideStringTestData[0], CaseTestDataSize);
const TUtf16String lower = UTF32ToWide(WideStringTestData[1], CaseTestDataSize);
@@ -1737,6 +1737,6 @@ public:
UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 3, 100500) == title);
}
}
-};
-
-UNIT_TEST_SUITE_REGISTRATION(TWideUtilTest);
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TWideUtilTest);