aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrolton1999 <rolton1999@yandex-team.com>2023-11-04 11:06:26 +0300
committerrolton1999 <rolton1999@yandex-team.com>2023-11-04 11:24:30 +0300
commit2aade25e5d8359b411bd3cf5307d0f886d2c92dd (patch)
tree243b94bcd88f027d383b63b49fb0283f25dc728b
parent848502fd9d51f59443646970e735e9f0b7d4c988 (diff)
downloadydb-2aade25e5d8359b411bd3cf5307d0f886d2c92dd.tar.gz
YT: Allow to print utf-8 strings w/o escaping in TYsonWriter
-rw-r--r--yt/yt/core/yson/unittests/yson_writer_ut.cpp16
-rw-r--r--yt/yt/core/yson/writer.cpp32
-rw-r--r--yt/yt/core/yson/writer.h4
3 files changed, 48 insertions, 4 deletions
diff --git a/yt/yt/core/yson/unittests/yson_writer_ut.cpp b/yt/yt/core/yson/unittests/yson_writer_ut.cpp
index e58be45da6..99414f0d94 100644
--- a/yt/yt/core/yson/unittests/yson_writer_ut.cpp
+++ b/yt/yt/core/yson/unittests/yson_writer_ut.cpp
@@ -97,7 +97,6 @@ TEST_F(TYsonWriterTest, NaN)
TEST_F(TYsonWriterTest, EmptyMap)
{
-
InSequence dummy;
EXPECT_CALL(Mock, OnBeginMap());
EXPECT_CALL(Mock, OnEndMap());
@@ -194,6 +193,21 @@ TEST_F(TYsonWriterTest, MapWithAttributes)
Run();
}
+TEST_F(TYsonWriterTest, UtfString) // Text-format writer constructed with UTF-8 pass-through enabled.
+{
+ const TString utfString("строка ютф и специальные символы - \n \t \b"); // Cyrillic text followed by LF, TAB and BS control characters.
+ TStringStream output;
+ TYsonWriter writer(
+ &output,
+ EYsonFormat::Text,
+ EYsonType::Node,
+ /*enableRaw*/ true,
+ TYsonWriter::DefaultIndent,
+ /*passThroughUtf8Characters*/ true);
+ writer.OnStringScalar(TStringBuf(utfString)); // Expectation below: Cyrillic bytes appear unescaped inside quotes; LF, TAB and BS appear in escaped form.
+ EXPECT_EQ("\"\xD1\x81\xD1\x82\xD1\x80\xD0\xBE\xD0\xBA\xD0\xB0 \xD1\x8E\xD1\x82\xD1\x84 \xD0\xB8 \xD1\x81\xD0\xBF\xD0\xB5\xD1\x86\xD0\xB8\xD0\xB0\xD0\xBB\xD1\x8C\xD0\xBD\xD1\x8B\xD0\xB5 \xD1\x81\xD0\xB8\xD0\xBC\xD0\xB2\xD0\xBE\xD0\xBB\xD1\x8B - \\n \\t \\x08\"", output.Str());
+}
+
TEST_F(TYsonWriterTest, Escaping)
{
TStringStream outputStream;
diff --git a/yt/yt/core/yson/writer.cpp b/yt/yt/core/yson/writer.cpp
index 903d138561..4db08ee279 100644
--- a/yt/yt/core/yson/writer.cpp
+++ b/yt/yt/core/yson/writer.cpp
@@ -4,6 +4,8 @@
#include <library/cpp/yt/coding/varint.h>
+#include <util/charset/utf8.h>
+
#include <util/stream/buffer.h>
#include <util/system/unaligned_mem.h>
@@ -110,6 +112,26 @@ void EscapeC(const char* str, size_t len, IOutputStream& output) {
}
}
+void WriteUtf8String(const char* str, size_t len, IOutputStream& output) // Writes str[0, len) to output: multi-byte UTF-8 sequences verbatim, single-byte characters via `EscapeC`.
+{
+ char buffer[ESCAPE_C_BUFFER_SIZE]; // Scratch space for the escaped form of one single-byte character.
+ const auto* ustr = reinterpret_cast<const unsigned char*>(str); // Unsigned byte view of the input, used for decoding and escaping.
+ for (size_t i = 0; i < len;) { // `i` is advanced inside the body by the number of input bytes consumed.
+ size_t runeLen;
+ YT_VERIFY(RECODE_OK == GetUTF8CharLen(runeLen, ustr + i, ustr + len)); // Requires a well-formed sequence at `i`; the visible caller checks `IsUtf(value)` first.
+ if (runeLen > 1) {
+ output.Write(ustr + i, runeLen); // Multi-byte sequence: copy its bytes unchanged.
+ i += runeLen;
+ } else {
+ YT_ASSERT(1 == runeLen);
+ // Single-byte (ASCII) characters still go through `EscapeC` so that control characters like `\t` and `\n` are escaped.
+ runeLen = EscapeC(ustr[i], (i + 1 < len ? ustr[i + 1] : 0), buffer); // Second argument is the following byte (0 at end of input); `runeLen` is reused for the escaped length.
+ output.Write(buffer, runeLen);
+ ++i;
+ }
+ }
+}
+
size_t FloatToStringWithNanInf(double value, char* buf, size_t size)
{
if (std::isfinite(value)) {
@@ -143,12 +165,14 @@ TYsonWriter::TYsonWriter(
EYsonFormat format,
EYsonType type,
bool enableRaw,
- int indent)
+ int indent,
+ bool passThroughUtf8Characters)
: Stream_(stream)
, Format_(format)
, Type_(type)
, EnableRaw_(enableRaw)
, IndentSize_(indent)
+ , PassThroughUtf8Characters_(passThroughUtf8Characters)
{
YT_ASSERT(Stream_);
}
@@ -208,7 +232,11 @@ void TYsonWriter::WriteStringScalar(TStringBuf value)
Stream_->Write(value.begin(), value.length());
} else {
Stream_->Write('"');
- EscapeC(value.data(), value.length(), *Stream_);
+ if (PassThroughUtf8Characters_ && IsUtf(value)) {
+ WriteUtf8String(value.data(), value.length(), *Stream_);
+ } else {
+ EscapeC(value.data(), value.length(), *Stream_);
+ }
Stream_->Write('"');
}
}
diff --git a/yt/yt/core/yson/writer.h b/yt/yt/core/yson/writer.h
index 6c309c80f4..daba204889 100644
--- a/yt/yt/core/yson/writer.h
+++ b/yt/yt/core/yson/writer.h
@@ -28,7 +28,8 @@ public:
EYsonFormat format = EYsonFormat::Binary,
EYsonType type = EYsonType::Node,
bool enableRaw = false,
- int indent = DefaultIndent);
+ int indent = DefaultIndent,
+ bool passThroughUtf8Characters = false);
// IYsonConsumer overrides.
void OnStringScalar(TStringBuf value) override;
@@ -62,6 +63,7 @@ protected:
const EYsonType Type_;
const bool EnableRaw_;
const int IndentSize_;
+ const bool PassThroughUtf8Characters_;
int Depth_ = 0;
bool EmptyCollection_ = true;