aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorthegeorg <thegeorg@yandex-team.com>2024-01-27 11:11:33 +0300
committerAlexander Smirnov <alex@ydb.tech>2024-01-31 14:23:53 +0300
commit08ab41882130617912c9e8e1580e692d94abfd48 (patch)
treeb1a23cfcbc357b8d9ff065e8f9bd5053ffa9cf19
parent7815d0e5e2775ad6d4979eb0f552350f02f706ae (diff)
downloadydb-08ab41882130617912c9e8e1580e692d94abfd48.tar.gz
Support wide strings (and string_views) in Out<> / IOutputStream
-rw-r--r--util/charset/wide.h47
-rw-r--r--util/stream/output.cpp39
-rw-r--r--util/stream/str_ut.cpp38
3 files changed, 104 insertions, 20 deletions
diff --git a/util/charset/wide.h b/util/charset/wide.h
index 6da3c76252..2b52bb4f12 100644
--- a/util/charset/wide.h
+++ b/util/charset/wide.h
@@ -49,8 +49,8 @@ namespace NDetail {
inline wchar32 ReadSurrogatePair(const wchar16* chars) noexcept {
const wchar32 SURROGATE_OFFSET = static_cast<wchar32>(0x10000 - (0xD800 << 10) - 0xDC00);
- wchar16 lead = chars[0];
- wchar16 tail = chars[1];
+ wchar32 lead = chars[0];
+ wchar32 tail = chars[1];
Y_ASSERT(IsW16SurrogateLead(lead));
Y_ASSERT(IsW16SurrogateTail(tail));
@@ -98,7 +98,7 @@ inline wchar32 ReadSymbol(const wchar32* begin, const wchar32* end) noexcept {
}
//! presuming input data is either big enough or null terminated
-inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const char16_t*& begin) noexcept {
Y_ASSERT(*begin);
if (IsW16SurrogateLead(begin[0])) {
if (IsW16SurrogateTail(begin[1])) {
@@ -117,12 +117,30 @@ inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
}
//! presuming input data is either big enough or null terminated
-inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const char32_t*& begin) noexcept {
Y_ASSERT(*begin);
return *(begin++);
}
-inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin, const wchar16* end) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const wchar_t*& begin) noexcept {
+ // Reads one symbol at `begin` and advances `begin` past it. According to
+ // https://en.cppreference.com/w/cpp/language/types
+ // wchar_t holds UTF-16 on Windows and UTF-32 on Linux / macOS
+ //
+ // Dispatch through a local pointer copy (not a reinterpreted reference to `begin`), then write the advanced position back
+
+#ifdef _win_
+ using TDistinctChar = char16_t;
+#else
+ using TDistinctChar = char32_t;
+#endif
+ const TDistinctChar* distinctBegin = reinterpret_cast<const TDistinctChar*>(begin);
+ wchar32 result = ReadSymbolAndAdvance(distinctBegin);
+ begin = reinterpret_cast<const wchar_t*>(distinctBegin);
+ return result;
+}
+
+inline wchar32 ReadSymbolAndAdvance(const char16_t*& begin, const char16_t* end) noexcept {
Y_ASSERT(begin < end);
if (IsW16SurrogateLead(begin[0])) {
if (begin + 1 != end && IsW16SurrogateTail(begin[1])) {
@@ -144,6 +162,25 @@ inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin, const wchar32* end) n
return *(begin++);
}
+inline wchar32 ReadSymbolAndAdvance(const wchar_t*& begin, const wchar_t* end) noexcept {
+ // Bounded wchar_t overload: reads one symbol from [begin, end) and advances `begin`. According to
+ // https://en.cppreference.com/w/cpp/language/types
+ // wchar_t holds UTF-16 on Windows and UTF-32 on Linux / macOS
+ //
+ // Reinterpret both pointers as the matching fixed-width type, dispatch to that overload, then store the advanced position back into `begin`
+
+#ifdef _win_
+ using TDistinctChar = char16_t;
+#else
+ using TDistinctChar = char32_t;
+#endif
+ const TDistinctChar* distinctBegin = reinterpret_cast<const TDistinctChar*>(begin);
+ const TDistinctChar* distinctEnd = reinterpret_cast<const TDistinctChar*>(end);
+ wchar32 result = ::ReadSymbolAndAdvance(distinctBegin, distinctEnd);
+ begin = reinterpret_cast<const wchar_t*>(distinctBegin);
+ return result;
+}
+
template <class T>
inline size_t WriteSymbol(wchar16 s, T& dest) noexcept {
::NDetail::TSelector<std::is_pointer<T>::value>::WriteSymbol(s, dest);
diff --git a/util/stream/output.cpp b/util/stream/output.cpp
index 719c877764..8b3f78527d 100644
--- a/util/stream/output.cpp
+++ b/util/stream/output.cpp
@@ -70,24 +70,13 @@ void Out<wchar32>(IOutputStream& o, wchar32 ch) {
o.Write(buffer, length);
}
-static void WriteString(IOutputStream& o, const wchar16* w, size_t n) {
+template <typename TCharType>
+static void WriteString(IOutputStream& o, const TCharType* w, size_t n) {
const size_t buflen = (n * MAX_UTF8_BYTES); // * 4 because the conversion functions can convert unicode character into maximum 4 bytes of UTF8
TTempBuf buffer(buflen + 1);
- char* const data = buffer.Data();
size_t written = 0;
- WideToUTF8(w, n, data, written);
- data[written] = 0;
- o.Write(data, written);
-}
-
-static void WriteString(IOutputStream& o, const wchar32* w, size_t n) {
- const size_t buflen = (n * MAX_UTF8_BYTES); // * 4 because the conversion functions can convert unicode character into maximum 4 bytes of UTF8
- TTempBuf buffer(buflen + 1);
- char* const data = buffer.Data();
- size_t written = 0;
- WideToUTF8(w, n, data, written);
- data[written] = 0;
- o.Write(data, written);
+ WideToUTF8(w, n, buffer.Data(), written);
+ o.Write(buffer.Data(), written);
}
template <>
@@ -101,11 +90,31 @@ void Out<std::string>(IOutputStream& o, const std::string& p) {
}
template <>
+void Out<std::wstring>(IOutputStream& o, const std::wstring& p) {
+ WriteString(o, p.data(), p.length());
+}
+
+template <>
+void Out<std::u16string>(IOutputStream& o, const std::u16string& p) {
+ WriteString(o, p.data(), p.length());
+}
+
+template <>
+void Out<std::u32string>(IOutputStream& o, const std::u32string& p) {
+ WriteString(o, p.data(), p.length());
+}
+
+template <>
void Out<std::string_view>(IOutputStream& o, const std::string_view& p) {
o.Write(p.data(), p.length());
}
template <>
+void Out<std::wstring_view>(IOutputStream& o, const std::wstring_view& p) {
+ WriteString(o, p.data(), p.length());
+}
+
+template <>
void Out<std::u16string_view>(IOutputStream& o, const std::u16string_view& p) {
WriteString(o, p.data(), p.length());
}
diff --git a/util/stream/str_ut.cpp b/util/stream/str_ut.cpp
index fc6b46c31a..534b58d71c 100644
--- a/util/stream/str_ut.cpp
+++ b/util/stream/str_ut.cpp
@@ -149,4 +149,42 @@ Y_UNIT_TEST_SUITE(TStringInputOutputTest) {
// Check old stream is in a valid state
output1 << "baz";
}
+
+ // There are no dedicated tests for Out<> via IOutputStream,
+ // so wide string output is tested here.
+ Y_UNIT_TEST(TestWritingWideStrings) {
+ using namespace std::literals::string_literals;
+ TString str;
+ TStringOutput stream(str);
+
+ // char16_t: std::u16string and std::u16string_view must both produce the narrow-literal text
+ const char16_t* utf16Data = u"Быть или не быть? Вот в чём вопрос";
+ stream << std::u16string(utf16Data);
+ UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+ str.clear();
+
+ stream << std::u16string_view(utf16Data);
+ UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+ str.clear();
+
+ // char32_t: std::u32string and std::u32string_view must both produce the narrow-literal text
+ const char32_t* utf32Data = U"Быть или не быть? Вот в чём вопрос";
+ stream << std::u32string(utf32Data);
+ UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+ str.clear();
+
+ stream << std::u32string_view(utf32Data);
+ UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+ str.clear();
+
+ // wchar_t: std::wstring and std::wstring_view must both produce the narrow-literal text
+ const wchar_t* wcharData = L"Быть или не быть? Вот в чём вопрос";
+ stream << std::wstring(wcharData);
+ UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+ str.clear();
+
+ stream << std::wstring_view(wcharData);
+ UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+ str.clear();
+ }
}