about | summary | refs | log | tree | commit | diff | stats
path: root/util/charset
diff options
context:
space:
mode:
author: Ilnur Khuziev <ilnur.khuziev@yandex.ru> 2022-02-10 16:46:13 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:46:13 +0300
commit: 736dcd8ca259457a136f2f9f9168c44643914323 (patch)
tree: ddd46a036d68bfa83aa11b892f31243ea6b068a1 /util/charset
parent: 9bf2fa2b060c9881d3135c2208c624a1dd546ecc (diff)
download: ydb-736dcd8ca259457a136f2f9f9168c44643914323.tar.gz
Restoring authorship annotation for Ilnur Khuziev <ilnur.khuziev@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'util/charset')
-rw-r--r-- util/charset/benchmark/to_lower/main.cpp 2
-rw-r--r-- util/charset/benchmark/utf8_to_wide/main.cpp 104
-rw-r--r-- util/charset/utf8.h 2
-rw-r--r-- util/charset/utf8_ut.cpp 4
-rw-r--r-- util/charset/wide.cpp 344
-rw-r--r-- util/charset/wide.h 240
-rw-r--r-- util/charset/wide_sse41.cpp 352
-rw-r--r-- util/charset/wide_ut.cpp 2
8 files changed, 525 insertions, 525 deletions
diff --git a/util/charset/benchmark/to_lower/main.cpp b/util/charset/benchmark/to_lower/main.cpp
index e95fdc2371..56599b1770 100644
--- a/util/charset/benchmark/to_lower/main.cpp
+++ b/util/charset/benchmark/to_lower/main.cpp
@@ -1,4 +1,4 @@
-#include <library/cpp/testing/benchmark/bench.h>
+#include <library/cpp/testing/benchmark/bench.h>
#include <util/charset/wide.h>
#include <util/generic/singleton.h>
diff --git a/util/charset/benchmark/utf8_to_wide/main.cpp b/util/charset/benchmark/utf8_to_wide/main.cpp
index 09fa567fe5..3a56c34361 100644
--- a/util/charset/benchmark/utf8_to_wide/main.cpp
+++ b/util/charset/benchmark/utf8_to_wide/main.cpp
@@ -1,4 +1,4 @@
-#include <library/cpp/testing/benchmark/bench.h>
+#include <library/cpp/testing/benchmark/bench.h>
#include <util/random/fast.h>
#include <util/random/random.h>
@@ -103,59 +103,59 @@ inline size_t UTF8ToWideImplSSE(const char* text, size_t len, TCharType* dest, s
return UTF8ToWideImpl(text, len, dest, written);
}
-static wchar16 WBUF_UTF16[10000000];
-static wchar32 WBUF_UTF32[10000000];
+static wchar16 WBUF_UTF16[10000000];
+static wchar32 WBUF_UTF32[10000000];
-#define UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(impl, length, to) \
- Y_CPU_BENCHMARK(UTF8ToWideASCII##impl##length##to, iface) { \
- const auto& data = *Singleton<RAS##length>(); \
- for (size_t x = 0; x < iface.Iterations(); ++x) { \
- size_t written = 0; \
- Y_DO_NOT_OPTIMIZE_AWAY(UTF8ToWideImpl##impl<false>(data.begin(), data.size(), WBUF_##to, written)); \
- } \
+#define UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(impl, length, to) \
+ Y_CPU_BENCHMARK(UTF8ToWideASCII##impl##length##to, iface) { \
+ const auto& data = *Singleton<RAS##length>(); \
+ for (size_t x = 0; x < iface.Iterations(); ++x) { \
+ size_t written = 0; \
+ Y_DO_NOT_OPTIMIZE_AWAY(UTF8ToWideImpl##impl<false>(data.begin(), data.size(), WBUF_##to, written)); \
+ } \
}
-#define UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(impl, length, to) \
- Y_CPU_BENCHMARK(UTF8ToWideRU##impl##length##to, iface) { \
- const auto& data = *Singleton<RRS##length>(); \
- for (size_t x = 0; x < iface.Iterations(); ++x) { \
- size_t written = 0; \
- Y_DO_NOT_OPTIMIZE_AWAY(UTF8ToWideImpl##impl<false>(data.begin(), data.size(), WBUF_##to, written)); \
- } \
+#define UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(impl, length, to) \
+ Y_CPU_BENCHMARK(UTF8ToWideRU##impl##length##to, iface) { \
+ const auto& data = *Singleton<RRS##length>(); \
+ for (size_t x = 0; x < iface.Iterations(); ++x) { \
+ size_t written = 0; \
+ Y_DO_NOT_OPTIMIZE_AWAY(UTF8ToWideImpl##impl<false>(data.begin(), data.size(), WBUF_##to, written)); \
+ } \
}
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 10, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 10, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1000, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1000, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1000000, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1000000, UTF16);
-
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 10, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 10, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1000, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1000, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1000000, UTF16);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1000000, UTF16);
-
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 10, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 10, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1000, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1000, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1000000, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1000000, UTF32);
-
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 10, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 10, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1000, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1000, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1000000, UTF32);
-UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1000000, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 10, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 10, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1000, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1000, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1000000, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1000000, UTF16);
+
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 10, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 10, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1000, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1000, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1000000, UTF16);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1000000, UTF16);
+
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 10, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 10, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1000, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1000, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(Scalar, 1000000, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_ASCII(SSE, 1000000, UTF32);
+
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 10, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 10, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1000, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1000, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(Scalar, 1000000, UTF32);
+UTF8_TO_WIDE_SCALAR_BENCHMARK_RU(SSE, 1000000, UTF32);
diff --git a/util/charset/utf8.h b/util/charset/utf8.h
index 5039b46ae9..08499ff77f 100644
--- a/util/charset/utf8.h
+++ b/util/charset/utf8.h
@@ -194,7 +194,7 @@ inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const uns
//! @param c value of the current character
//! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
//! @param e the end of the character sequence
-Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
+Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
switch (UTF8RuneLen(*p)) {
case 0:
diff --git a/util/charset/utf8_ut.cpp b/util/charset/utf8_ut.cpp
index 9e68881cca..7f50134624 100644
--- a/util/charset/utf8_ut.cpp
+++ b/util/charset/utf8_ut.cpp
@@ -4,8 +4,8 @@
#include <util/stream/file.h>
#include <util/ysaveload.h>
-#include <library/cpp/testing/unittest/registar.h>
-#include <library/cpp/testing/unittest/env.h>
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/testing/unittest/env.h>
Y_UNIT_TEST_SUITE(TUtfUtilTest) {
Y_UNIT_TEST(TestUTF8Len) {
diff --git a/util/charset/wide.cpp b/util/charset/wide.cpp
index a287438ddd..7e96349631 100644
--- a/util/charset/wide.cpp
+++ b/util/charset/wide.cpp
@@ -146,8 +146,8 @@ bool IsTitleWord(const TWtringBuf text) noexcept {
return IsLowerWord({p, pe});
}
-template <bool stopOnFirstModification, typename TCharType, typename F>
-static bool ModifySequence(TCharType*& p, const TCharType* const pe, F&& f) {
+template <bool stopOnFirstModification, typename TCharType, typename F>
+static bool ModifySequence(TCharType*& p, const TCharType* const pe, F&& f) {
while (p != pe) {
const auto symbol = ReadSymbol(p, pe);
const auto modified = f(symbol);
@@ -165,8 +165,8 @@ static bool ModifySequence(TCharType*& p, const TCharType* const pe, F&& f) {
return false;
}
-template <bool stopOnFirstModification, typename TCharType, typename F>
-static bool ModifySequence(const TCharType*& p, const TCharType* const pe, TCharType*& out, F&& f) {
+template <bool stopOnFirstModification, typename TCharType, typename F>
+static bool ModifySequence(const TCharType*& p, const TCharType* const pe, TCharType*& out, F&& f) {
while (p != pe) {
const auto symbol = stopOnFirstModification ? ReadSymbol(p, pe) : ReadSymbolAndAdvance(p, pe);
const auto modified = f(symbol);
@@ -193,8 +193,8 @@ static void DetachAndFixPointers(TStringType& text, typename TStringType::value_
pe = p + count;
}
-template <class TStringType, typename F>
-static bool ModifyStringSymbolwise(TStringType& text, size_t pos, size_t count, F&& f) {
+template <class TStringType, typename F>
+static bool ModifyStringSymbolwise(TStringType& text, size_t pos, size_t count, F&& f) {
// TODO(yazevnul): this is done for consistency with `TUtf16String::to_lower` and friends
// at r2914050, maybe worth replacing them with asserts. Also see the same code in `ToTitle`.
pos = pos < text.size() ? pos : text.size();
@@ -225,16 +225,16 @@ bool ToUpper(TUtf16String& text, size_t pos, size_t count) {
return ModifyStringSymbolwise(text, pos, count, f);
}
-bool ToLower(TUtf32String& text, size_t pos, size_t count) {
+bool ToLower(TUtf32String& text, size_t pos, size_t count) {
const auto f = [](const wchar32 s) { return ToLower(s); };
- return ModifyStringSymbolwise(text, pos, count, f);
-}
-
-bool ToUpper(TUtf32String& text, size_t pos, size_t count) {
+ return ModifyStringSymbolwise(text, pos, count, f);
+}
+
+bool ToUpper(TUtf32String& text, size_t pos, size_t count) {
const auto f = [](const wchar32 s) { return ToUpper(s); };
- return ModifyStringSymbolwise(text, pos, count, f);
-}
-
+ return ModifyStringSymbolwise(text, pos, count, f);
+}
+
bool ToTitle(TUtf16String& text, size_t pos, size_t count) {
if (!text) {
return false;
@@ -266,37 +266,37 @@ bool ToTitle(TUtf16String& text, size_t pos, size_t count) {
return false;
}
-bool ToTitle(TUtf32String& text, size_t pos, size_t count) {
- if (!text) {
- return false;
- }
-
- pos = pos < text.size() ? pos : text.size();
- count = count < text.size() - pos ? count : text.size() - pos;
-
+bool ToTitle(TUtf32String& text, size_t pos, size_t count) {
+ if (!text) {
+ return false;
+ }
+
+ pos = pos < text.size() ? pos : text.size();
+ count = count < text.size() - pos ? count : text.size() - pos;
+
const auto toLower = [](const wchar32 s) { return ToLower(s); };
-
- auto* p = const_cast<wchar32*>(text.data() + pos);
- const auto* pe = text.data() + pos + count;
-
- const auto firstSymbol = *p;
- if (firstSymbol == ToTitle(firstSymbol)) {
- p += 1;
- if (ModifySequence<true>(p, pe, toLower)) {
- DetachAndFixPointers(text, p, pe);
- ModifySequence<false>(p, pe, toLower);
- return true;
- }
- } else {
- DetachAndFixPointers(text, p, pe);
- WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
- ModifySequence<false>(p, pe, toLower);
- return true;
- }
-
- return false;
-}
-
+
+ auto* p = const_cast<wchar32*>(text.data() + pos);
+ const auto* pe = text.data() + pos + count;
+
+ const auto firstSymbol = *p;
+ if (firstSymbol == ToTitle(firstSymbol)) {
+ p += 1;
+ if (ModifySequence<true>(p, pe, toLower)) {
+ DetachAndFixPointers(text, p, pe);
+ ModifySequence<false>(p, pe, toLower);
+ return true;
+ }
+ } else {
+ DetachAndFixPointers(text, p, pe);
+ WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
+ ModifySequence<false>(p, pe, toLower);
+ return true;
+ }
+
+ return false;
+}
+
TUtf16String ToLowerRet(TUtf16String text, size_t pos, size_t count) {
ToLower(text, pos, count);
return text;
@@ -312,21 +312,21 @@ TUtf16String ToTitleRet(TUtf16String text, size_t pos, size_t count) {
return text;
}
-TUtf32String ToLowerRet(TUtf32String text, size_t pos, size_t count) {
- ToLower(text, pos, count);
- return text;
-}
-
-TUtf32String ToUpperRet(TUtf32String text, size_t pos, size_t count) {
- ToUpper(text, pos, count);
- return text;
-}
-
-TUtf32String ToTitleRet(TUtf32String text, size_t pos, size_t count) {
- ToTitle(text, pos, count);
- return text;
-}
-
+TUtf32String ToLowerRet(TUtf32String text, size_t pos, size_t count) {
+ ToLower(text, pos, count);
+ return text;
+}
+
+TUtf32String ToUpperRet(TUtf32String text, size_t pos, size_t count) {
+ ToUpper(text, pos, count);
+ return text;
+}
+
+TUtf32String ToTitleRet(TUtf32String text, size_t pos, size_t count) {
+ ToTitle(text, pos, count);
+ return text;
+}
+
bool ToLower(const wchar16* text, size_t length, wchar16* out) noexcept {
// TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
// we can declare text and out as `__restrict__`
@@ -408,87 +408,87 @@ bool ToTitle(wchar16* text, size_t length) noexcept {
return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
}
-bool ToLower(const wchar32* text, size_t length, wchar32* out) noexcept {
- // TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
- // we can declare text and out as `__restrict__`
- Y_ASSERT(text == out || !(out >= text && out < text + length));
+bool ToLower(const wchar32* text, size_t length, wchar32* out) noexcept {
+ // TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
+ // we can declare text and out as `__restrict__`
+ Y_ASSERT(text == out || !(out >= text && out < text + length));
const auto f = [](const wchar32 s) { return ToLower(s); };
- const auto* p = text;
- const auto* const pe = text + length;
- if (ModifySequence<true>(p, pe, out, f)) {
- ModifySequence<false>(p, pe, out, f);
- return true;
- }
- return false;
-}
-
-bool ToUpper(const wchar32* text, size_t length, wchar32* out) noexcept {
- Y_ASSERT(text == out || !(out >= text && out < text + length));
+ const auto* p = text;
+ const auto* const pe = text + length;
+ if (ModifySequence<true>(p, pe, out, f)) {
+ ModifySequence<false>(p, pe, out, f);
+ return true;
+ }
+ return false;
+}
+
+bool ToUpper(const wchar32* text, size_t length, wchar32* out) noexcept {
+ Y_ASSERT(text == out || !(out >= text && out < text + length));
const auto f = [](const wchar32 s) { return ToUpper(s); };
- const auto* p = text;
- const auto* const pe = text + length;
- if (ModifySequence<true>(p, pe, out, f)) {
- ModifySequence<false>(p, pe, out, f);
- return true;
- }
- return false;
-}
-
-bool ToTitle(const wchar32* text, size_t length, wchar32* out) noexcept {
- if (!length) {
- return false;
- }
-
- Y_ASSERT(text == out || !(out >= text && out < text + length));
-
- const auto* const textEnd = text + length;
- const auto firstSymbol = ReadSymbolAndAdvance(text, textEnd);
- const auto firstSymbolTitle = ToTitle(firstSymbol);
-
- WriteSymbol(firstSymbolTitle, out);
-
- return ToLower(text, textEnd - text, out) || firstSymbol != firstSymbolTitle;
-}
-
-bool ToLower(wchar32* text, size_t length) noexcept {
+ const auto* p = text;
+ const auto* const pe = text + length;
+ if (ModifySequence<true>(p, pe, out, f)) {
+ ModifySequence<false>(p, pe, out, f);
+ return true;
+ }
+ return false;
+}
+
+bool ToTitle(const wchar32* text, size_t length, wchar32* out) noexcept {
+ if (!length) {
+ return false;
+ }
+
+ Y_ASSERT(text == out || !(out >= text && out < text + length));
+
+ const auto* const textEnd = text + length;
+ const auto firstSymbol = ReadSymbolAndAdvance(text, textEnd);
+ const auto firstSymbolTitle = ToTitle(firstSymbol);
+
+ WriteSymbol(firstSymbolTitle, out);
+
+ return ToLower(text, textEnd - text, out) || firstSymbol != firstSymbolTitle;
+}
+
+bool ToLower(wchar32* text, size_t length) noexcept {
const auto f = [](const wchar32 s) { return ToLower(s); };
- const auto* const textEnd = text + length;
- if (ModifySequence<true>(text, textEnd, f)) {
- ModifySequence<false>(text, textEnd, f);
- return true;
- }
- return false;
-}
-
-bool ToUpper(wchar32* text, size_t length) noexcept {
+ const auto* const textEnd = text + length;
+ if (ModifySequence<true>(text, textEnd, f)) {
+ ModifySequence<false>(text, textEnd, f);
+ return true;
+ }
+ return false;
+}
+
+bool ToUpper(wchar32* text, size_t length) noexcept {
const auto f = [](const wchar32 s) { return ToUpper(s); };
- const auto* const textEnd = text + length;
- if (ModifySequence<true>(text, textEnd, f)) {
- ModifySequence<false>(text, textEnd, f);
- return true;
- }
- return false;
-}
-
-bool ToTitle(wchar32* text, size_t length) noexcept {
- if (!length) {
- return false;
- }
-
- const auto* textEnd = text + length;
- const auto firstSymbol = ReadSymbol(text, textEnd);
- const auto firstSymbolTitle = ToTitle(firstSymbol);
-
- // avoid unnacessary writes to the memory
- if (firstSymbol != firstSymbolTitle) {
- WriteSymbol(firstSymbolTitle, text);
- } else {
- text = SkipSymbol(text, textEnd);
- }
-
- return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
-}
-
+ const auto* const textEnd = text + length;
+ if (ModifySequence<true>(text, textEnd, f)) {
+ ModifySequence<false>(text, textEnd, f);
+ return true;
+ }
+ return false;
+}
+
+bool ToTitle(wchar32* text, size_t length) noexcept {
+ if (!length) {
+ return false;
+ }
+
+ const auto* textEnd = text + length;
+ const auto firstSymbol = ReadSymbol(text, textEnd);
+ const auto firstSymbolTitle = ToTitle(firstSymbol);
+
+ // avoid unnacessary writes to the memory
+ if (firstSymbol != firstSymbolTitle) {
+ WriteSymbol(firstSymbolTitle, text);
+ } else {
+ text = SkipSymbol(text, textEnd);
+ }
+
+ return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
+}
+
template <typename F>
static TUtf16String ToSmthRet(const TWtringBuf text, size_t pos, size_t count, F&& f) {
pos = pos < text.size() ? pos : text.size();
@@ -510,27 +510,27 @@ static TUtf16String ToSmthRet(const TWtringBuf text, size_t pos, size_t count, F
return res;
}
-template <typename F>
-static TUtf32String ToSmthRet(const TUtf32StringBuf text, size_t pos, size_t count, F&& f) {
- pos = pos < text.size() ? pos : text.size();
- count = count < text.size() - pos ? count : text.size() - pos;
-
- auto res = TUtf32String::Uninitialized(text.size());
- auto* const resBegin = res.Detach();
-
- if (pos) {
- MemCopy(resBegin, text.data(), pos);
- }
-
- f(text.data() + pos, count, resBegin + pos);
-
- if (count - pos != text.size()) {
- MemCopy(resBegin + pos + count, text.data() + pos + count, text.size() - pos - count);
- }
-
- return res;
-}
-
+template <typename F>
+static TUtf32String ToSmthRet(const TUtf32StringBuf text, size_t pos, size_t count, F&& f) {
+ pos = pos < text.size() ? pos : text.size();
+ count = count < text.size() - pos ? count : text.size() - pos;
+
+ auto res = TUtf32String::Uninitialized(text.size());
+ auto* const resBegin = res.Detach();
+
+ if (pos) {
+ MemCopy(resBegin, text.data(), pos);
+ }
+
+ f(text.data() + pos, count, resBegin + pos);
+
+ if (count - pos != text.size()) {
+ MemCopy(resBegin + pos + count, text.data() + pos + count, text.size() - pos - count);
+ }
+
+ return res;
+}
+
TUtf16String ToLowerRet(const TWtringBuf text, size_t pos, size_t count) {
return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
ToLower(theText, length, out);
@@ -549,24 +549,24 @@ TUtf16String ToTitleRet(const TWtringBuf text, size_t pos, size_t count) {
});
}
-TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos, size_t count) {
+TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos, size_t count) {
return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
- ToLower(theText, length, out);
- });
-}
-
-TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos, size_t count) {
+ ToLower(theText, length, out);
+ });
+}
+
+TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos, size_t count) {
return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
- ToUpper(theText, length, out);
- });
-}
-
-TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos, size_t count) {
+ ToUpper(theText, length, out);
+ });
+}
+
+TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos, size_t count) {
return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
- ToTitle(theText, length, out);
- });
-}
-
+ ToTitle(theText, length, out);
+ });
+}
+
template <bool insertBr>
void EscapeHtmlChars(TUtf16String& str) {
static const TUtf16String lt(LT, Y_ARRAY_SIZE(LT));
diff --git a/util/charset/wide.h b/util/charset/wide.h
index 04e6928aab..5a566983fa 100644
--- a/util/charset/wide.h
+++ b/util/charset/wide.h
@@ -34,20 +34,20 @@ namespace NDetail {
template <>
struct TSelector<false> {
template <class T>
- static inline void WriteSymbol(wchar16 s, T& dest) noexcept {
+ static inline void WriteSymbol(wchar16 s, T& dest) noexcept {
dest.push_back(s);
}
};
-
+
template <>
struct TSelector<true> {
template <class T>
- static inline void WriteSymbol(wchar16 s, T& dest) noexcept {
+ static inline void WriteSymbol(wchar16 s, T& dest) noexcept {
*(dest++) = s;
}
};
- inline wchar32 ReadSurrogatePair(const wchar16* chars) noexcept {
+ inline wchar32 ReadSurrogatePair(const wchar16* chars) noexcept {
const wchar32 SURROGATE_OFFSET = static_cast<wchar32>(0x10000 - (0xD800 << 10) - 0xDC00);
wchar16 lead = chars[0];
wchar16 tail = chars[1];
@@ -59,26 +59,26 @@ namespace NDetail {
}
template <class T>
- inline void WriteSurrogatePair(wchar32 s, T& dest) noexcept;
+ inline void WriteSurrogatePair(wchar32 s, T& dest) noexcept;
}
-inline wchar16* SkipSymbol(wchar16* begin, const wchar16* end) noexcept {
+inline wchar16* SkipSymbol(wchar16* begin, const wchar16* end) noexcept {
return begin + W16SymbolSize(begin, end);
}
-inline const wchar16* SkipSymbol(const wchar16* begin, const wchar16* end) noexcept {
+inline const wchar16* SkipSymbol(const wchar16* begin, const wchar16* end) noexcept {
return begin + W16SymbolSize(begin, end);
}
-inline wchar32* SkipSymbol(wchar32* begin, const wchar32* end) noexcept {
- Y_ASSERT(begin < end);
- return begin + 1;
-}
-inline const wchar32* SkipSymbol(const wchar32* begin, const wchar32* end) noexcept {
- Y_ASSERT(begin < end);
- return begin + 1;
-}
+inline wchar32* SkipSymbol(wchar32* begin, const wchar32* end) noexcept {
+ Y_ASSERT(begin < end);
+ return begin + 1;
+}
+inline const wchar32* SkipSymbol(const wchar32* begin, const wchar32* end) noexcept {
+ Y_ASSERT(begin < end);
+ return begin + 1;
+}
-inline wchar32 ReadSymbol(const wchar16* begin, const wchar16* end) noexcept {
+inline wchar32 ReadSymbol(const wchar16* begin, const wchar16* end) noexcept {
Y_ASSERT(begin < end);
if (IsW16SurrogateLead(*begin)) {
if (begin + 1 < end && IsW16SurrogateTail(*(begin + 1)))
@@ -92,13 +92,13 @@ inline wchar32 ReadSymbol(const wchar16* begin, const wchar16* end) noexcept {
return *begin;
}
-inline wchar32 ReadSymbol(const wchar32* begin, const wchar32* end) noexcept {
- Y_ASSERT(begin < end);
- return *begin;
-}
-
+inline wchar32 ReadSymbol(const wchar32* begin, const wchar32* end) noexcept {
+ Y_ASSERT(begin < end);
+ return *begin;
+}
+
//! presuming input data is either big enought of null terminated
-inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
Y_ASSERT(*begin);
if (IsW16SurrogateLead(begin[0])) {
if (IsW16SurrogateTail(begin[1])) {
@@ -116,13 +116,13 @@ inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
return *(begin++);
}
-//! presuming input data is either big enought of null terminated
-inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin) noexcept {
- Y_ASSERT(*begin);
- return *(begin++);
-}
-
-inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin, const wchar16* end) noexcept {
+//! presuming input data is either big enought of null terminated
+inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin) noexcept {
+ Y_ASSERT(*begin);
+ return *(begin++);
+}
+
+inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin, const wchar16* end) noexcept {
Y_ASSERT(begin < end);
if (IsW16SurrogateLead(begin[0])) {
if (begin + 1 != end && IsW16SurrogateTail(begin[1])) {
@@ -139,19 +139,19 @@ inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin, const wchar16* end) n
return *(begin++);
}
-inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin, const wchar32* end) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin, const wchar32* end) noexcept {
Y_ASSERT(begin < end);
return *(begin++);
}
template <class T>
-inline size_t WriteSymbol(wchar16 s, T& dest) noexcept {
+inline size_t WriteSymbol(wchar16 s, T& dest) noexcept {
::NDetail::TSelector<std::is_pointer<T>::value>::WriteSymbol(s, dest);
return 1;
}
template <class T>
-inline size_t WriteSymbol(wchar32 s, T& dest) noexcept {
+inline size_t WriteSymbol(wchar32 s, T& dest) noexcept {
if (s > 0xFFFF) {
if (s >= ::NUnicode::UnicodeInstancesLimit()) {
return WriteSymbol(static_cast<wchar16>(BROKEN_RUNE), dest);
@@ -164,7 +164,7 @@ inline size_t WriteSymbol(wchar32 s, T& dest) noexcept {
return WriteSymbol(static_cast<wchar16>(s), dest);
}
-inline bool WriteSymbol(wchar32 s, wchar16*& dest, const wchar16* destEnd) noexcept {
+inline bool WriteSymbol(wchar32 s, wchar16*& dest, const wchar16* destEnd) noexcept {
Y_ASSERT(dest < destEnd);
if (s > 0xFFFF) {
@@ -184,12 +184,12 @@ inline bool WriteSymbol(wchar32 s, wchar16*& dest, const wchar16* destEnd) noexc
return true;
}
-inline size_t WriteSymbol(wchar32 s, wchar32*& dest) noexcept {
+inline size_t WriteSymbol(wchar32 s, wchar32*& dest) noexcept {
*(dest++) = s;
return 1;
}
-inline bool WriteSymbol(wchar32 s, wchar32*& dest, const wchar32* destEnd) noexcept {
+inline bool WriteSymbol(wchar32 s, wchar32*& dest, const wchar32* destEnd) noexcept {
Y_ASSERT(dest < destEnd);
*(dest++) = s;
@@ -260,7 +260,7 @@ public:
namespace NDetail {
template <bool robust, typename TCharType>
- inline void UTF8ToWideImplScalar(const unsigned char*& cur, const unsigned char* last, TCharType*& dest) noexcept {
+ inline void UTF8ToWideImplScalar(const unsigned char*& cur, const unsigned char* last, TCharType*& dest) noexcept {
wchar32 rune = BROKEN_RUNE;
while (cur != last) {
@@ -278,29 +278,29 @@ namespace NDetail {
}
}
- template <typename TCharType>
- inline void UTF16ToUTF32ImplScalar(const wchar16* cur, const wchar16* last, TCharType*& dest) noexcept {
- wchar32 rune = BROKEN_RUNE;
-
- while (cur != last) {
- rune = ReadSymbolAndAdvance(cur, last);
- Y_ASSERT(cur <= last);
- WriteSymbol(rune, dest);
- }
- }
-
+ template <typename TCharType>
+ inline void UTF16ToUTF32ImplScalar(const wchar16* cur, const wchar16* last, TCharType*& dest) noexcept {
+ wchar32 rune = BROKEN_RUNE;
+
+ while (cur != last) {
+ rune = ReadSymbolAndAdvance(cur, last);
+ Y_ASSERT(cur <= last);
+ WriteSymbol(rune, dest);
+ }
+ }
+
template <class TCharType>
inline void UTF8ToWideImplSSE41(const unsigned char*& /*cur*/, const unsigned char* /*last*/, TCharType*& /*dest*/) noexcept {
}
void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar16*& dest) noexcept;
-
- void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar32*& dest) noexcept;
+
+ void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar32*& dest) noexcept;
}
//! @return len if robust and position where encoding stopped if not
template <bool robust, typename TCharType>
-inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
+inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
const unsigned char* last = cur + len;
TCharType* p = dest;
@@ -333,7 +333,7 @@ inline TUtf16String UTF8ToWide(const char* text, size_t len) {
}
template <bool robust, typename TCharType>
-inline bool UTF8ToWide(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
+inline bool UTF8ToWide(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
return UTF8ToWideImpl<robust>(text, len, dest, written) == len;
}
@@ -342,7 +342,7 @@ inline bool UTF8ToWide(const char* text, size_t len, TCharType* dest, size_t& wr
//! conversion stops if a broken symbol is met
//! @return @c true if all the text converted successfully, @c false - a broken symbol was found
template <typename TCharType>
-inline bool UTF8ToWide(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
+inline bool UTF8ToWide(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
return UTF8ToWide<false>(text, len, dest, written);
}
@@ -350,21 +350,21 @@ template <bool robust>
inline TWtringBuf UTF8ToWide(const TStringBuf src, TUtf16String& dst) {
dst.ReserveAndResize(src.size());
size_t written = 0;
- UTF8ToWideImpl<robust>(src.data(), src.size(), dst.begin(), written);
- dst.resize(written);
- return dst;
-}
-
-//! if not robust will stop at first error position
-template <bool robust>
-inline TUtf32StringBuf UTF8ToUTF32(const TStringBuf src, TUtf32String& dst) {
- dst.ReserveAndResize(src.size());
- size_t written = 0;
- UTF8ToWideImpl<robust>(src.data(), src.size(), dst.begin(), written);
+ UTF8ToWideImpl<robust>(src.data(), src.size(), dst.begin(), written);
dst.resize(written);
return dst;
}
+//! if not robust will stop at first error position
+template <bool robust>
+inline TUtf32StringBuf UTF8ToUTF32(const TStringBuf src, TUtf32String& dst) {
+ dst.ReserveAndResize(src.size());
+ size_t written = 0;
+ UTF8ToWideImpl<robust>(src.data(), src.size(), dst.begin(), written);
+ dst.resize(written);
+ return dst;
+}
+
inline TWtringBuf UTF8ToWide(const TStringBuf src, TUtf16String& dst) {
return UTF8ToWide<false>(src, dst);
}
@@ -378,13 +378,13 @@ inline TUtf16String UTF8ToWide(const TStringBuf s) {
return UTF8ToWide<robust>(s.data(), s.size());
}
-template <bool robust>
-inline TUtf32String UTF8ToUTF32(const TStringBuf s) {
- TUtf32String r;
- UTF8ToUTF32<robust>(s, r);
- return r;
-}
-
+template <bool robust>
+inline TUtf32String UTF8ToUTF32(const TStringBuf s) {
+ TUtf32String r;
+ UTF8ToUTF32<robust>(s, r);
+ return r;
+}
+
inline TUtf16String UTF8ToWide(const TStringBuf s) {
return UTF8ToWide<false>(s.data(), s.size());
}
@@ -428,23 +428,23 @@ inline TString WideToUTF8(const wchar16* text, size_t len) {
return s;
}
-inline TString WideToUTF8(const wchar32* text, size_t len) {
- TString s = TString::Uninitialized(WideToUTF8BufferSize(len));
- size_t written = 0;
- WideToUTF8(text, len, s.begin(), written);
- Y_ASSERT(s.size() >= written);
- s.remove(written);
- return s;
-}
-
+inline TString WideToUTF8(const wchar32* text, size_t len) {
+ TString s = TString::Uninitialized(WideToUTF8BufferSize(len));
+ size_t written = 0;
+ WideToUTF8(text, len, s.begin(), written);
+ Y_ASSERT(s.size() >= written);
+ s.remove(written);
+ return s;
+}
+
inline TString WideToUTF8(const TWtringBuf w) {
return WideToUTF8(w.data(), w.size());
}
-inline TString WideToUTF8(const TUtf32StringBuf w) {
- return WideToUTF8(w.data(), w.size());
-}
-
+inline TString WideToUTF8(const TUtf32StringBuf w) {
+ return WideToUTF8(w.data(), w.size());
+}
+
inline TUtf16String UTF32ToWide(const wchar32* begin, size_t len) {
TUtf16String res;
res.reserve(len);
@@ -653,11 +653,11 @@ inline TUtf16String ASCIIToWide(const TStringBuf s) {
return CopyTo<TUtf16String>(s.begin(), s.end());
}
-inline TUtf32String ASCIIToUTF32(const TStringBuf s) {
- Y_ASSERT(IsStringASCII(s.begin(), s.end()));
- return CopyTo<TUtf32String>(s.begin(), s.end());
-}
-
+inline TUtf32String ASCIIToUTF32(const TStringBuf s) {
+ Y_ASSERT(IsStringASCII(s.begin(), s.end()));
+ return CopyTo<TUtf32String>(s.begin(), s.end());
+}
+
//! returns @c true if string contains whitespace characters only
inline bool IsSpace(const wchar16* s, size_t n) {
if (n == 0)
@@ -739,30 +739,30 @@ bool IsUpper(const TWtringBuf text) noexcept;
bool ToLower(TUtf16String& text, size_t pos = 0, size_t count = TUtf16String::npos);
bool ToUpper(TUtf16String& text, size_t pos = 0, size_t count = TUtf16String::npos);
-/* Lowercase/uppercase given string inplace. Any alphabetic symbol will be converted to a proper
-* case, the rest of the symbols will be kept the same. It is expected that `text` is a correct
-* UTF-32 string.
-*
-* For example `ToLower("heLLo")` will return `"hello"`.
-*
-* @param text String to modify
-* @param pos Position of the first character to modify
-* @param count Length of the substring
-* @returns `true` if `text` was changed
-*
-* NOTE: `pos` and `count` are measured in `wchar16`, not in codepoints.
-*/
-bool ToLower(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
-bool ToUpper(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
-
+/* Lowercase/uppercase given string inplace. Any alphabetic symbol will be converted to a proper
+* case, the rest of the symbols will be kept the same. It is expected that `text` is a correct
+* UTF-32 string.
+*
+* For example `ToLower("heLLo")` will return `"hello"`.
+*
+* @param text String to modify
+* @param pos Position of the first character to modify
+* @param count Length of the substring
+* @returns `true` if `text` was changed
+*
+* NOTE: `pos` and `count` are measured in `wchar16`, not in codepoints.
+*/
+bool ToLower(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
+bool ToUpper(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
+
/* Titlecase first symbol and lowercase the rest, see `ToLower` for more details.
*/
bool ToTitle(TUtf16String& text, size_t pos = 0, size_t count = TUtf16String::npos);
-/* Titlecase first symbol and lowercase the rest, see `ToLower` for more details.
-*/
-bool ToTitle(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
-
+/* Titlecase first symbol and lowercase the rest, see `ToLower` for more details.
+*/
+bool ToTitle(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
+
/* @param text Pointer to the string to modify
* @param length Length of the string to modify
* @param out Pointer to the character array to write to
@@ -776,10 +776,10 @@ bool ToLower(const wchar16* text, size_t length, wchar16* out) noexcept;
bool ToUpper(const wchar16* text, size_t length, wchar16* out) noexcept;
bool ToTitle(const wchar16* text, size_t length, wchar16* out) noexcept;
-bool ToLower(const wchar32* text, size_t length, wchar32* out) noexcept;
-bool ToUpper(const wchar32* text, size_t length, wchar32* out) noexcept;
-bool ToTitle(const wchar32* text, size_t length, wchar32* out) noexcept;
-
+bool ToLower(const wchar32* text, size_t length, wchar32* out) noexcept;
+bool ToUpper(const wchar32* text, size_t length, wchar32* out) noexcept;
+bool ToTitle(const wchar32* text, size_t length, wchar32* out) noexcept;
+
/* @param text Pointer to the string to modify
* @param length Length of the string to modify
*
@@ -789,10 +789,10 @@ bool ToLower(wchar16* text, size_t length) noexcept;
bool ToUpper(wchar16* text, size_t length) noexcept;
bool ToTitle(wchar16* text, size_t length) noexcept;
-bool ToLower(wchar32* text, size_t length) noexcept;
-bool ToUpper(wchar32* text, size_t length) noexcept;
-bool ToTitle(wchar32* text, size_t length) noexcept;
-
+bool ToLower(wchar32* text, size_t length) noexcept;
+bool ToUpper(wchar32* text, size_t length) noexcept;
+bool ToTitle(wchar32* text, size_t length) noexcept;
+
/* Convenience wrappers for `ToLower`, `ToUpper` and `ToTitle`.
*/
TUtf16String ToLowerRet(TUtf16String text, size_t pos = 0, size_t count = TUtf16String::npos) Y_WARN_UNUSED_RESULT;
@@ -803,10 +803,10 @@ TUtf16String ToLowerRet(const TWtringBuf text, size_t pos = 0, size_t count = TW
TUtf16String ToUpperRet(const TWtringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
TUtf16String ToTitleRet(const TWtringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
-TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
-TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
-TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
-
+TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
+TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
+TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
+
//! replaces the '<', '>' and '&' characters in string with '&lt;', '&gt;' and '&amp;' respectively
// insertBr=true - replace '\r' and '\n' with "<BR>"
template <bool insertBr>
diff --git a/util/charset/wide_sse41.cpp b/util/charset/wide_sse41.cpp
index d1f2a74851..a4c0982f56 100644
--- a/util/charset/wide_sse41.cpp
+++ b/util/charset/wide_sse41.cpp
@@ -21,226 +21,226 @@ namespace NDetail {
//processes to the first error, or until less then 16 bytes left
//most code taken from https://woboq.com/blog/utf-8-processing-using-simd.html
-//return dstAdvance 0 in case of problems
-static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned char*& cur, __m128i& utf16Low, __m128i& utf16High) {
- unsigned char curAligned[16];
-
- memcpy(curAligned, cur, sizeof(__m128i));
- __m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i*>(curAligned));
-
- //only ascii characters - simple copy
- if (!_mm_movemask_epi8(chunk)) {
- utf16Low = _mm_unpacklo_epi8(chunk, _mm_setzero_si128());
- utf16High = _mm_unpackhi_epi8(chunk, _mm_setzero_si128());
- cur += 16;
- return 16;
- }
+//return dstAdvance 0 in case of problems
+static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned char*& cur, __m128i& utf16Low, __m128i& utf16High) {
+ unsigned char curAligned[16];
+
+ memcpy(curAligned, cur, sizeof(__m128i));
+ __m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i*>(curAligned));
+
+ //only ascii characters - simple copy
+ if (!_mm_movemask_epi8(chunk)) {
+ utf16Low = _mm_unpacklo_epi8(chunk, _mm_setzero_si128());
+ utf16High = _mm_unpackhi_epi8(chunk, _mm_setzero_si128());
+ cur += 16;
+ return 16;
+ }
- __m128i chunkSigned = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
- __m128i isAsciiMask = _mm_cmpgt_epi8(chunk, _mm_set1_epi8(0));
+ __m128i chunkSigned = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
+ __m128i isAsciiMask = _mm_cmpgt_epi8(chunk, _mm_set1_epi8(0));
- __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunkSigned);
+ __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunkSigned);
__m128i state = _mm_set1_epi8(0x0 | (char)0x80);
- __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunkSigned);
+ __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunkSigned);
state = _mm_blendv_epi8(state, _mm_set1_epi8(0x2 | (char)0xc0), cond2);
- int sourceAdvance;
- __m128i shifts;
- __m128i chunkLow, chunkHigh;
+ int sourceAdvance;
+ __m128i shifts;
+ __m128i chunkLow, chunkHigh;
- if (Y_LIKELY(!_mm_movemask_epi8(cond3))) {
- //main case: no bloks of size 3 or 4
+ if (Y_LIKELY(!_mm_movemask_epi8(cond3))) {
+ //main case: no bloks of size 3 or 4
- //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
- __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
+ //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
+ __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
- __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
+ __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
- shifts = countSub1;
- __m128i continuation1 = _mm_slli_si128(countSub1, 1);
+ shifts = countSub1;
+ __m128i continuation1 = _mm_slli_si128(countSub1, 1);
- shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
- shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));
+ shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
+ shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));
- __m128i counts = _mm_or_si128(count, continuation1);
+ __m128i counts = _mm_or_si128(count, continuation1);
- __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
- __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuation1, _mm_set1_epi8(0));
- __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);
- //each symbol should be exactly one of ascii, continuation or begin
- __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
+ __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
+ __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuation1, _mm_set1_epi8(0));
+ __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);
+ //each symbol should be exactly one of ascii, continuation or begin
+ __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
- if (_mm_movemask_epi8(okMask) != 0xFFFF) {
- return 0;
- }
+ if (_mm_movemask_epi8(okMask) != 0xFFFF) {
+ return 0;
+ }
- shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));
+ shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));
- __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
- shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));
+ __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
+ shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));
- chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits
- shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1
+ chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits
+ shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1
- __m128i chunk_right = _mm_slli_si128(chunk, 1);
- shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
+ __m128i chunk_right = _mm_slli_si128(chunk, 1);
+ shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
_mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));
- chunkLow = _mm_blendv_epi8(chunk,
+ chunkLow = _mm_blendv_epi8(chunk,
_mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))),
_mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));
- chunkHigh = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));
+ chunkHigh = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));
- shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
+ shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
_mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
- chunkHigh = _mm_srli_epi32(chunkHigh, 2);
+ chunkHigh = _mm_srli_epi32(chunkHigh, 2);
- shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
+ shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
_mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
-
- int c = _mm_extract_epi16(counts, 7);
- sourceAdvance = !(c & 0x0200) ? 16 : 15;
-
- } else {
- __m128i mask3 = _mm_slli_si128(cond3, 1);
-
- __m128i cond4 = _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunkSigned);
+
+ int c = _mm_extract_epi16(counts, 7);
+ sourceAdvance = !(c & 0x0200) ? 16 : 15;
+
+ } else {
+ __m128i mask3 = _mm_slli_si128(cond3, 1);
+
+ __m128i cond4 = _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunkSigned);
state = _mm_blendv_epi8(state, _mm_set1_epi8(0x3 | (char)0xe0), cond3);
-
- // 4 bytes sequences are not vectorize. Fall back to the scalar processing
- if (Y_UNLIKELY(_mm_movemask_epi8(cond4))) {
- return 0;
- }
-
- //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
- __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
-
- __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
- __m128i continuation2 = _mm_slli_si128(_mm_subs_epu8(count, _mm_set1_epi8(0x2)), 2);
-
- shifts = countSub1;
- __m128i continuation1 = _mm_slli_si128(countSub1, 1);
-
- shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
- __m128i continuationsRunelen = _mm_or_si128(continuation1, continuation2);
-
- shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));
- __m128i counts = _mm_or_si128(count, continuationsRunelen);
-
- __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
- __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuationsRunelen, _mm_set1_epi8(0));
- __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);
- //each symbol should be exactly one of ascii, continuation or begin
- __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
-
- if (_mm_movemask_epi8(okMask) != 0xFFFF) {
- return 0;
- }
-
- shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));
-
- __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
- shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));
-
- chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits
- shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1
-
- __m128i chunk_right = _mm_slli_si128(chunk, 1);
- shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
+
+ // 4 bytes sequences are not vectorize. Fall back to the scalar processing
+ if (Y_UNLIKELY(_mm_movemask_epi8(cond4))) {
+ return 0;
+ }
+
+ //rune len for start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
+ __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
+
+ __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
+ __m128i continuation2 = _mm_slli_si128(_mm_subs_epu8(count, _mm_set1_epi8(0x2)), 2);
+
+ shifts = countSub1;
+ __m128i continuation1 = _mm_slli_si128(countSub1, 1);
+
+ shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
+ __m128i continuationsRunelen = _mm_or_si128(continuation1, continuation2);
+
+ shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));
+ __m128i counts = _mm_or_si128(count, continuationsRunelen);
+
+ __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
+ __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuationsRunelen, _mm_set1_epi8(0));
+ __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);
+ //each symbol should be exactly one of ascii, continuation or begin
+ __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
+
+ if (_mm_movemask_epi8(okMask) != 0xFFFF) {
+ return 0;
+ }
+
+ shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));
+
+ __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
+ shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));
+
+ chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits
+ shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1
+
+ __m128i chunk_right = _mm_slli_si128(chunk, 1);
+ shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
_mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));
-
- chunkLow = _mm_blendv_epi8(chunk,
+
+ chunkLow = _mm_blendv_epi8(chunk,
_mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))),
_mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));
-
- chunkHigh = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));
-
- shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
+
+ chunkHigh = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));
+
+ shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
_mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
- chunkHigh = _mm_srli_epi32(chunkHigh, 2);
-
- shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
+ chunkHigh = _mm_srli_epi32(chunkHigh, 2);
+
+ shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
_mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
- chunkHigh = _mm_or_si128(chunkHigh,
+ chunkHigh = _mm_or_si128(chunkHigh,
_mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4), _mm_set1_epi8(0xf0)),
mask3));
- int c = _mm_extract_epi16(counts, 7);
+ int c = _mm_extract_epi16(counts, 7);
sourceAdvance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15
: 14;
- }
+ }
- shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
+ shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
_mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));
- chunkHigh = _mm_slli_si128(chunkHigh, 1);
-
- __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
-
- chunkLow = _mm_shuffle_epi8(chunkLow, shuf);
- chunkHigh = _mm_shuffle_epi8(chunkHigh, shuf);
-
- utf16Low = _mm_unpacklo_epi8(chunkLow, chunkHigh);
- utf16High = _mm_unpackhi_epi8(chunkLow, chunkHigh);
-
- ui32 s = _mm_extract_epi32(shifts, 3);
- ui32 destAdvance = sourceAdvance - (0xff & (s >> (8 * (3 - 16 + sourceAdvance))));
- cur += sourceAdvance;
- return destAdvance;
-}
-
-namespace NDetail {
- void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar16*& dest) noexcept {
- alignas(16) wchar16 destAligned[16];
-
- while (cur + 16 <= last) {
- __m128i utf16Low;
- __m128i utf16High;
- ui32 dstAdvance = Unpack16BytesIntoUtf16IfNoSurrogats(cur, utf16Low, utf16High);
-
- if (dstAdvance == 0) {
- break;
- }
-
- _mm_store_si128(reinterpret_cast<__m128i*>(destAligned), utf16Low);
- _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 1, utf16High);
+ chunkHigh = _mm_slli_si128(chunkHigh, 1);
+
+ __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+
+ chunkLow = _mm_shuffle_epi8(chunkLow, shuf);
+ chunkHigh = _mm_shuffle_epi8(chunkHigh, shuf);
+
+ utf16Low = _mm_unpacklo_epi8(chunkLow, chunkHigh);
+ utf16High = _mm_unpackhi_epi8(chunkLow, chunkHigh);
+
+ ui32 s = _mm_extract_epi32(shifts, 3);
+ ui32 destAdvance = sourceAdvance - (0xff & (s >> (8 * (3 - 16 + sourceAdvance))));
+ cur += sourceAdvance;
+ return destAdvance;
+}
+
+namespace NDetail {
+ void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar16*& dest) noexcept {
+ alignas(16) wchar16 destAligned[16];
+
+ while (cur + 16 <= last) {
+ __m128i utf16Low;
+ __m128i utf16High;
+ ui32 dstAdvance = Unpack16BytesIntoUtf16IfNoSurrogats(cur, utf16Low, utf16High);
+
+ if (dstAdvance == 0) {
+ break;
+ }
+
+ _mm_store_si128(reinterpret_cast<__m128i*>(destAligned), utf16Low);
+ _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 1, utf16High);
memcpy(dest, destAligned, sizeof(__m128i) * 2);
- dest += dstAdvance;
- }
- //The rest will be handled sequencially.
- // Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence
- }
-
- void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar32*& dest) noexcept {
- alignas(16) wchar32 destAligned[16];
-
- while (cur + 16 <= last) {
- __m128i utf16Low;
- __m128i utf16High;
- ui32 dstAdvance = Unpack16BytesIntoUtf16IfNoSurrogats(cur, utf16Low, utf16High);
-
- if (dstAdvance == 0) {
- break;
- }
-
- //NOTE: we only work in case without surrogat pairs, so we can make simple copying with zeroes in 2 high bytes
- __m128i utf32_lowlow = _mm_unpacklo_epi16(utf16Low, _mm_set1_epi8(0));
- __m128i utf32_lowhigh = _mm_unpackhi_epi16(utf16Low, _mm_set1_epi8(0));
- __m128i utf32_highlow = _mm_unpacklo_epi16(utf16High, _mm_set1_epi8(0));
- __m128i utf32_highhigh = _mm_unpackhi_epi16(utf16High, _mm_set1_epi8(0));
-
- _mm_store_si128(reinterpret_cast<__m128i*>(destAligned), utf32_lowlow);
- _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 1, utf32_lowhigh);
- _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 2, utf32_highlow);
- _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 3, utf32_highhigh);
-
- memcpy(dest, destAligned, sizeof(__m128i) * 4);
- dest += dstAdvance;
+ dest += dstAdvance;
+ }
+ //The rest will be handled sequencially.
+ // Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence
+ }
+
+ void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar32*& dest) noexcept {
+ alignas(16) wchar32 destAligned[16];
+
+ while (cur + 16 <= last) {
+ __m128i utf16Low;
+ __m128i utf16High;
+ ui32 dstAdvance = Unpack16BytesIntoUtf16IfNoSurrogats(cur, utf16Low, utf16High);
+
+ if (dstAdvance == 0) {
+ break;
+ }
+
+ //NOTE: we only work in case without surrogat pairs, so we can make simple copying with zeroes in 2 high bytes
+ __m128i utf32_lowlow = _mm_unpacklo_epi16(utf16Low, _mm_set1_epi8(0));
+ __m128i utf32_lowhigh = _mm_unpackhi_epi16(utf16Low, _mm_set1_epi8(0));
+ __m128i utf32_highlow = _mm_unpacklo_epi16(utf16High, _mm_set1_epi8(0));
+ __m128i utf32_highhigh = _mm_unpackhi_epi16(utf16High, _mm_set1_epi8(0));
+
+ _mm_store_si128(reinterpret_cast<__m128i*>(destAligned), utf32_lowlow);
+ _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 1, utf32_lowhigh);
+ _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 2, utf32_highlow);
+ _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 3, utf32_highhigh);
+
+ memcpy(dest, destAligned, sizeof(__m128i) * 4);
+ dest += dstAdvance;
}
//The rest will be handled sequencially.
- // Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence
+ // Possible improvement: go back to the vectorized processing after the error or the 4 byte sequence
}
}
diff --git a/util/charset/wide_ut.cpp b/util/charset/wide_ut.cpp
index d8f3233e73..aa1a28f84f 100644
--- a/util/charset/wide_ut.cpp
+++ b/util/charset/wide_ut.cpp
@@ -1,7 +1,7 @@
#include "utf8.h"
#include "wide.h"
-#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/testing/unittest/registar.h>
#include <util/string/reverse.h>