diff options
author | thegeorg <thegeorg@yandex-team.com> | 2024-01-27 11:11:33 +0300 |
---|---|---|
committer | Alexander Smirnov <alex@ydb.tech> | 2024-01-31 14:23:53 +0300 |
commit | 08ab41882130617912c9e8e1580e692d94abfd48 (patch) | |
tree | b1a23cfcbc357b8d9ff065e8f9bd5053ffa9cf19 | |
parent | 7815d0e5e2775ad6d4979eb0f552350f02f706ae (diff) | |
download | ydb-08ab41882130617912c9e8e1580e692d94abfd48.tar.gz |
Support wide strings (and string_views) in Out<> / IOutputStream
-rw-r--r-- | util/charset/wide.h | 47 | ||||
-rw-r--r-- | util/stream/output.cpp | 39 | ||||
-rw-r--r-- | util/stream/str_ut.cpp | 38 |
3 files changed, 104 insertions, 20 deletions
diff --git a/util/charset/wide.h b/util/charset/wide.h
index 6da3c76252..2b52bb4f12 100644
--- a/util/charset/wide.h
+++ b/util/charset/wide.h
@@ -49,8 +49,8 @@ namespace NDetail {
     inline wchar32 ReadSurrogatePair(const wchar16* chars) noexcept {
         const wchar32 SURROGATE_OFFSET = static_cast<wchar32>(0x10000 - (0xD800 << 10) - 0xDC00);
 
-        wchar16 lead = chars[0];
-        wchar16 tail = chars[1];
+        wchar32 lead = chars[0];
+        wchar32 tail = chars[1];
 
         Y_ASSERT(IsW16SurrogateLead(lead));
         Y_ASSERT(IsW16SurrogateTail(tail));
@@ -98,7 +98,7 @@ inline wchar32 ReadSymbol(const wchar32* begin, const wchar32* end) noexcept {
 }
 
 //! presuming input data is either big enought of null terminated
-inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const char16_t*& begin) noexcept {
     Y_ASSERT(*begin);
     if (IsW16SurrogateLead(begin[0])) {
         if (IsW16SurrogateTail(begin[1])) {
@@ -117,12 +117,30 @@ inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
 }
 
 //! presuming input data is either big enought of null terminated
-inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const char32_t*& begin) noexcept {
     Y_ASSERT(*begin);
     return *(begin++);
 }
 
-inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin, const wchar16* end) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const wchar_t*& begin) noexcept {
+    // According to
+    // https://en.cppreference.com/w/cpp/language/types
+    // wchar_t holds UTF-16 on Windows and UTF-32 on Linux / macOS
+    //
+    // Apply reinterpret cast and dispatch to a proper type
+
+#ifdef _win_
+    using TDistinctChar = char16_t;
+#else
+    using TDistinctChar = char32_t;
+#endif
+    const TDistinctChar*& distinctBegin = reinterpret_cast<const TDistinctChar*&>(begin);
+    wchar32 result = ReadSymbolAndAdvance(distinctBegin);
+    begin = reinterpret_cast<const wchar_t*&>(distinctBegin);
+    return result;
+}
+
+inline wchar32 ReadSymbolAndAdvance(const char16_t*& begin, const char16_t* end) noexcept {
     Y_ASSERT(begin < end);
     if (IsW16SurrogateLead(begin[0])) {
         if (begin + 1 != end && IsW16SurrogateTail(begin[1])) {
@@ -144,6 +162,25 @@ inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin, const wchar32* end) n
     return *(begin++);
 }
 
+inline wchar32 ReadSymbolAndAdvance(const wchar_t*& begin, const wchar_t* end) noexcept {
+    // According to
+    // https://en.cppreference.com/w/cpp/language/types
+    // wchar_t holds UTF-16 on Windows and UTF-32 on Linux / macOS
+    //
+    // Apply reinterpret cast and dispatch to a proper type
+
+#ifdef _win_
+    using TDistinctChar = char16_t;
+#else
+    using TDistinctChar = char32_t;
+#endif
+    const TDistinctChar* distinctBegin = reinterpret_cast<const TDistinctChar*>(begin);
+    const TDistinctChar* distinctEnd = reinterpret_cast<const TDistinctChar*>(end);
+    wchar32 result = ::ReadSymbolAndAdvance(distinctBegin, distinctEnd);
+    begin = reinterpret_cast<const wchar_t*>(distinctBegin);
+    return result;
+}
+
 template <class T>
 inline size_t WriteSymbol(wchar16 s, T& dest) noexcept {
     ::NDetail::TSelector<std::is_pointer<T>::value>::WriteSymbol(s, dest);
diff --git a/util/stream/output.cpp b/util/stream/output.cpp
index 719c877764..8b3f78527d 100644
--- a/util/stream/output.cpp
+++ b/util/stream/output.cpp
@@ -70,24 +70,13 @@ void Out<wchar32>(IOutputStream& o, wchar32 ch) {
     o.Write(buffer, length);
 }
 
-static void WriteString(IOutputStream& o, const wchar16* w, size_t n) {
+template <typename TCharType>
+static void WriteString(IOutputStream& o, const TCharType* w, size_t n) {
     const size_t buflen = (n * MAX_UTF8_BYTES); // * 4 because the conversion functions can convert unicode character into maximum 4 bytes of UTF8
     TTempBuf buffer(buflen + 1);
-    char* const data = buffer.Data();
     size_t written = 0;
-    WideToUTF8(w, n, data, written);
-    data[written] = 0;
-    o.Write(data, written);
-}
-
-static void WriteString(IOutputStream& o, const wchar32* w, size_t n) {
-    const size_t buflen = (n * MAX_UTF8_BYTES); // * 4 because the conversion functions can convert unicode character into maximum 4 bytes of UTF8
-    TTempBuf buffer(buflen + 1);
-    char* const data = buffer.Data();
-    size_t written = 0;
-    WideToUTF8(w, n, data, written);
-    data[written] = 0;
-    o.Write(data, written);
+    WideToUTF8(w, n, buffer.Data(), written);
+    o.Write(buffer.Data(), written);
 }
 
 template <>
@@ -101,11 +90,31 @@ void Out<std::string>(IOutputStream& o, const std::string& p) {
 }
 
 template <>
+void Out<std::wstring>(IOutputStream& o, const std::wstring& p) {
+    WriteString(o, p.data(), p.length());
+}
+
+template <>
+void Out<std::u16string>(IOutputStream& o, const std::u16string& p) {
+    WriteString(o, p.data(), p.length());
+}
+
+template <>
+void Out<std::u32string>(IOutputStream& o, const std::u32string& p) {
+    WriteString(o, p.data(), p.length());
+}
+
+template <>
 void Out<std::string_view>(IOutputStream& o, const std::string_view& p) {
     o.Write(p.data(), p.length());
 }
 
 template <>
+void Out<std::wstring_view>(IOutputStream& o, const std::wstring_view& p) {
+    WriteString(o, p.data(), p.length());
+}
+
+template <>
 void Out<std::u16string_view>(IOutputStream& o, const std::u16string_view& p) {
     WriteString(o, p.data(), p.length());
 }
diff --git a/util/stream/str_ut.cpp b/util/stream/str_ut.cpp
index fc6b46c31a..534b58d71c 100644
--- a/util/stream/str_ut.cpp
+++ b/util/stream/str_ut.cpp
@@ -149,4 +149,42 @@ Y_UNIT_TEST_SUITE(TStringInputOutputTest) {
         // Check old stream is in a valid state
         output1 << "baz";
     }
+
+    // There is no distinct tests for Out<> via IOutputStream.
+    // Let's tests strings output here.
+    Y_UNIT_TEST(TestWritingWideStrings) {
+        using namespace std::literals::string_literals;
+        TString str;
+        TStringOutput stream(str);
+
+        // test char16_t
+        const char16_t* utf16Data = u"Быть или не быть? Вот в чём вопрос";
+        stream << std::u16string(utf16Data);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        stream << std::u16string_view(utf16Data);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        // test char32_t
+        const char32_t* utf32Data = U"Быть или не быть? Вот в чём вопрос";
+        stream << std::u32string(utf32Data);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        stream << std::u32string_view(utf32Data);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        // test wchar_t
+        const wchar_t* wcharData = L"Быть или не быть? Вот в чём вопрос";
+        stream << std::wstring(wcharData);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        stream << std::wstring_view(wcharData);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+    }
 }
|