diff options
author | rolton1999 <rolton1999@yandex-team.com> | 2023-11-04 11:06:26 +0300 |
---|---|---|
committer | rolton1999 <rolton1999@yandex-team.com> | 2023-11-04 11:24:30 +0300 |
commit | 2aade25e5d8359b411bd3cf5307d0f886d2c92dd (patch) | |
tree | 243b94bcd88f027d383b63b49fb0283f25dc728b | |
parent | 848502fd9d51f59443646970e735e9f0b7d4c988 (diff) | |
download | ydb-2aade25e5d8359b411bd3cf5307d0f886d2c92dd.tar.gz |
YT: Allow to print utf-8 strings w/o escaping in TYsonWriter
-rw-r--r-- | yt/yt/core/yson/unittests/yson_writer_ut.cpp | 16 | ||||
-rw-r--r-- | yt/yt/core/yson/writer.cpp | 32 | ||||
-rw-r--r-- | yt/yt/core/yson/writer.h | 4 |
3 files changed, 48 insertions, 4 deletions
diff --git a/yt/yt/core/yson/unittests/yson_writer_ut.cpp b/yt/yt/core/yson/unittests/yson_writer_ut.cpp index e58be45da6..99414f0d94 100644 --- a/yt/yt/core/yson/unittests/yson_writer_ut.cpp +++ b/yt/yt/core/yson/unittests/yson_writer_ut.cpp @@ -97,7 +97,6 @@ TEST_F(TYsonWriterTest, NaN) TEST_F(TYsonWriterTest, EmptyMap) { - InSequence dummy; EXPECT_CALL(Mock, OnBeginMap()); EXPECT_CALL(Mock, OnEndMap()); @@ -194,6 +193,21 @@ TEST_F(TYsonWriterTest, MapWithAttributes) Run(); } +TEST_F(TYsonWriterTest, UtfString) +{ + const TString utfString("строка ютф и специальные символы - \n \t \b"); + TStringStream output; + TYsonWriter writer( + &output, + EYsonFormat::Text, + EYsonType::Node, + true, + TYsonWriter::DefaultIndent, + /*passThroughUtf8Characters*/ true); + writer.OnStringScalar(TStringBuf(utfString)); + EXPECT_EQ("\"\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD1\x8E\xD1\x82\xD1\x84 \xD0\xB8 \xD1\x81\xD0\xBF\xD0\xB5\xD1\x86\xD0\xB8\xD0\xB0\xD0\xBB\xD1\x8C\xD0\xBD\xD1\x8B\xD0\xB5 \xD1\x81\xD0\xB8\xD0\xBC\xD0\xB2\xD0\xBE\xD0\xBB\xD1\x8B - \\n \\t \\x08\"", output.Str()); +} + TEST_F(TYsonWriterTest, Escaping) { TStringStream outputStream; diff --git a/yt/yt/core/yson/writer.cpp b/yt/yt/core/yson/writer.cpp index 903d138561..4db08ee279 100644 --- a/yt/yt/core/yson/writer.cpp +++ b/yt/yt/core/yson/writer.cpp @@ -4,6 +4,8 @@ #include <library/cpp/yt/coding/varint.h> +#include <util/charset/utf8.h> + #include <util/stream/buffer.h> #include <util/system/unaligned_mem.h> @@ -110,6 +112,26 @@ void EscapeC(const char* str, size_t len, IOutputStream& output) { } } +void WriteUtf8String(const char* str, size_t len, IOutputStream& output) +{ + char buffer[ESCAPE_C_BUFFER_SIZE]; + const auto* ustr = reinterpret_cast<const unsigned char*>(str); + for (size_t i = 0; i < len;) { + size_t runeLen; + YT_VERIFY(RECODE_OK == GetUTF8CharLen(runeLen, ustr + i, ustr + len)); + if (runeLen > 1) { + output.Write(ustr + i, runeLen); + i += runeLen; + } else { + YT_ASSERT(1 == 
runeLen); + // `EscapeC` must still be called for single-byte (ASCII) characters, since control characters like `\t` and `\n` need escaping. + runeLen = EscapeC(ustr[i], (i + 1 < len ? ustr[i + 1] : 0), buffer); + output.Write(buffer, runeLen); + ++i; + } + } +} + size_t FloatToStringWithNanInf(double value, char* buf, size_t size) { if (std::isfinite(value)) { @@ -143,12 +165,14 @@ TYsonWriter::TYsonWriter( EYsonFormat format, EYsonType type, bool enableRaw, - int indent) + int indent, + bool passThroughUtf8Characters) : Stream_(stream) , Format_(format) , Type_(type) , EnableRaw_(enableRaw) , IndentSize_(indent) + , PassThroughUtf8Characters_(passThroughUtf8Characters) { YT_ASSERT(Stream_); } @@ -208,7 +232,11 @@ void TYsonWriter::WriteStringScalar(TStringBuf value) Stream_->Write(value.begin(), value.length()); } else { Stream_->Write('"'); - EscapeC(value.data(), value.length(), *Stream_); + if (PassThroughUtf8Characters_ && IsUtf(value)) { + WriteUtf8String(value.data(), value.length(), *Stream_); + } else { + EscapeC(value.data(), value.length(), *Stream_); + } Stream_->Write('"'); } } diff --git a/yt/yt/core/yson/writer.h b/yt/yt/core/yson/writer.h index 6c309c80f4..daba204889 100644 --- a/yt/yt/core/yson/writer.h +++ b/yt/yt/core/yson/writer.h @@ -28,7 +28,8 @@ public: EYsonFormat format = EYsonFormat::Binary, EYsonType type = EYsonType::Node, bool enableRaw = false, - int indent = DefaultIndent); + int indent = DefaultIndent, + bool passThroughUtf8Characters = false); // IYsonConsumer overrides. void OnStringScalar(TStringBuf value) override; @@ -62,6 +63,7 @@ protected: const EYsonType Type_; const bool EnableRaw_; const int IndentSize_; + const bool PassThroughUtf8Characters_; int Depth_ = 0; bool EmptyCollection_ = true; |