diff options
author | ionagamed <ionagamed@yandex-team.com> | 2024-04-15 10:18:45 +0300 |
---|---|---|
committer | ionagamed <ionagamed@yandex-team.com> | 2024-04-15 10:28:06 +0300 |
commit | c015541a60f8d93070c53511daaff81db730d194 (patch) | |
tree | 248d7d962c718e75801036109fdef147bc1880ed /library/cpp | |
parent | 7930380b354abe9969174901a4e8a730ab1d0906 (diff) | |
download | ydb-c015541a60f8d93070c53511daaff81db730d194.tar.gz |
YT: Add NodeFromYsonStreamNonGreedy; use it in TNode::Load
As part of
3547980204d51d6eba4c3b56989a916379526673
Diffstat (limited to 'library/cpp')
-rw-r--r-- | library/cpp/yson/node/benchmark/saveload.cpp | 57 | ||||
-rw-r--r-- | library/cpp/yson/node/benchmark/ya.make | 1 | ||||
-rw-r--r-- | library/cpp/yson/node/node.cpp | 2 | ||||
-rw-r--r-- | library/cpp/yson/node/node_io.cpp | 33 | ||||
-rw-r--r-- | library/cpp/yson/node/node_io.h | 7 | ||||
-rw-r--r-- | library/cpp/yson/node/node_ut.cpp | 31 | ||||
-rw-r--r-- | library/cpp/yson/parser.cpp | 14 | ||||
-rw-r--r-- | library/cpp/yson/parser.h | 2 | ||||
-rw-r--r-- | library/cpp/yson/parser_detail.h | 33 |
9 files changed, 160 insertions, 20 deletions
diff --git a/library/cpp/yson/node/benchmark/saveload.cpp b/library/cpp/yson/node/benchmark/saveload.cpp new file mode 100644 index 0000000000..838075f2e4 --- /dev/null +++ b/library/cpp/yson/node/benchmark/saveload.cpp @@ -0,0 +1,57 @@ +#include <benchmark/benchmark.h> + +#include <library/cpp/yson/node/node_io.h> + +using namespace NYT; + +namespace { + +static NYT::TNode GenerateList(size_t size) +{ + NYT::TNode result = NYT::TNode::CreateList(); + + for (size_t i = 0; i < size; ++i) { + result.AsList().emplace_back(NYT::TNode("val")); + } + + return result; +} + +} // namespace + +static void BM_SaveLoadGreedy(benchmark::State& state, size_t size) +{ + auto list = GenerateList(size); + + TString bytes; + TStringOutput outputStream{bytes}; + NodeToYsonStream(list, &outputStream, ::NYson::EYsonFormat::Binary); + + for (const auto& _ : state) { + TStringInput inputStream{bytes}; + NodeFromYsonStream(&inputStream); + } +} + +static void BM_SaveLoadNonGreedy(benchmark::State& state, size_t size) +{ + auto list = GenerateList(size); + + TString bytes; + TStringOutput outputStream{bytes}; + NodeToYsonStream(list, &outputStream, ::NYson::EYsonFormat::Binary); + + for (const auto& _ : state) { + TStringInput inputStream{bytes}; + NodeFromYsonStreamNonGreedy(&inputStream); + } +} + +BENCHMARK_CAPTURE(BM_SaveLoadGreedy, greedy_10, 10ul); +BENCHMARK_CAPTURE(BM_SaveLoadNonGreedy, non_greedy_10, 10ul); +BENCHMARK_CAPTURE(BM_SaveLoadGreedy, greedy_100, 100ul); +BENCHMARK_CAPTURE(BM_SaveLoadNonGreedy, non_greedy_100, 100ul); +BENCHMARK_CAPTURE(BM_SaveLoadGreedy, greedy_1000, 1000ul); +BENCHMARK_CAPTURE(BM_SaveLoadNonGreedy, non_greedy_1000, 1000ul); +BENCHMARK_CAPTURE(BM_SaveLoadGreedy, greedy_10000, 10000ul); +BENCHMARK_CAPTURE(BM_SaveLoadNonGreedy, non_greedy_10000, 10000ul); diff --git a/library/cpp/yson/node/benchmark/ya.make b/library/cpp/yson/node/benchmark/ya.make index dd2035b1fa..53a6e5f48c 100644 --- a/library/cpp/yson/node/benchmark/ya.make +++ 
b/library/cpp/yson/node/benchmark/ya.make @@ -2,6 +2,7 @@ G_BENCHMARK() SRCS( reserve.cpp + saveload.cpp ) PEERDIR( diff --git a/library/cpp/yson/node/node.cpp b/library/cpp/yson/node/node.cpp index 5156033cfe..f142ae0f42 100644 --- a/library/cpp/yson/node/node.cpp +++ b/library/cpp/yson/node/node.cpp @@ -861,7 +861,7 @@ void TNode::Save(IOutputStream* out) const void TNode::Load(IInputStream* in) { Clear(); - *this = NodeFromYsonStream(in, ::NYson::EYsonType::Node); + *this = NodeFromYsonStreamNonGreedy(in, ::NYson::EYsonType::Node); } //////////////////////////////////////////////////////////////////////////////// diff --git a/library/cpp/yson/node/node_io.cpp b/library/cpp/yson/node/node_io.cpp index 2e191d8d48..d8a05ec995 100644 --- a/library/cpp/yson/node/node_io.cpp +++ b/library/cpp/yson/node/node_io.cpp @@ -11,6 +11,7 @@ #include <library/cpp/json/json_reader.h> #include <library/cpp/json/json_value.h> +#include <util/generic/size_literals.h> #include <util/stream/input.h> #include <util/stream/output.h> #include <util/stream/str.h> @@ -82,6 +83,28 @@ static TNode CreateEmptyNodeByType(::NYson::EYsonType type) return result; } +static TNode NodeFromYsonStream(IInputStream* input, ::NYson::EYsonType type, bool consumeUntilEof) +{ + TNode result = CreateEmptyNodeByType(type); + + ui64 bufferSizeLimit = 64_KB; + if (!consumeUntilEof) { + // Other values might be in the stream, so reading one symbol at a time. 
+ bufferSizeLimit = 1; + } + + TNodeBuilder builder(&result); + ::NYson::TYsonParser parser( + &builder, + input, + type, + /*enableLinePositionInfo*/ false, + bufferSizeLimit, + consumeUntilEof); + parser.Parse(); + return result; +} + TNode NodeFromYsonString(const TStringBuf input, ::NYson::EYsonType type) { TMemoryInput stream(input); @@ -104,12 +127,12 @@ TString NodeToCanonicalYsonString(const TNode& node, NYson::EYsonFormat format) TNode NodeFromYsonStream(IInputStream* input, ::NYson::EYsonType type) { - TNode result = CreateEmptyNodeByType(type); + return NodeFromYsonStream(input, type, /*consumeUntilEof*/ true); +} - TNodeBuilder builder(&result); - ::NYson::TYsonParser parser(&builder, input, type); - parser.Parse(); - return result; +TNode NodeFromYsonStreamNonGreedy(IInputStream* input, ::NYson::EYsonType type) +{ + return NodeFromYsonStream(input, type, /*consumeUntilEof*/ false); } void NodeToYsonStream(const TNode& node, IOutputStream* output, NYson::EYsonFormat format) diff --git a/library/cpp/yson/node/node_io.h b/library/cpp/yson/node/node_io.h index 1348d88bbb..2db1318db4 100644 --- a/library/cpp/yson/node/node_io.h +++ b/library/cpp/yson/node/node_io.h @@ -23,6 +23,13 @@ TString NodeToCanonicalYsonString(const TNode& node, ::NYson::EYsonFormat format // Parse TNode from stream in YSON format TNode NodeFromYsonStream(IInputStream* input, ::NYson::EYsonType type = ::NYson::EYsonType::Node); +// Parse TNode from stream in YSON format. +// NB: This is substantially slower (1.5-2x using the benchmark from `./benchmark/saveload.cpp`) than using +// the original `NodeFromYsonStream`. +// Stops reading from `input` as soon as some valid YSON was read, leaving the remainder of the stream unread. +// Used in TNode::Load to support cases of saveloading multiple values after the TNode from/to the same stream. 
+TNode NodeFromYsonStreamNonGreedy(IInputStream* input, ::NYson::EYsonType type = ::NYson::EYsonType::Node); + // Serialize TNode to stream in one of YSON formats with random order of maps' keys (don't use in tests) void NodeToYsonStream(const TNode& node, IOutputStream* output, ::NYson::EYsonFormat format = ::NYson::EYsonFormat::Text); diff --git a/library/cpp/yson/node/node_ut.cpp b/library/cpp/yson/node/node_ut.cpp index 728a926283..80d231cd09 100644 --- a/library/cpp/yson/node/node_ut.cpp +++ b/library/cpp/yson/node/node_ut.cpp @@ -279,6 +279,37 @@ Y_UNIT_TEST_SUITE(YtNodeTest) { UNIT_ASSERT_VALUES_EQUAL(node, nodeCopy); } + Y_UNIT_TEST(TestSaveLoadWithNeighbours) { + TString stringBefore = "before"; + + TNode node = TNode()("foo", "bar")("baz", 42); + node.Attributes()["attr_name"] = "attr_value"; + + TString stringAfter = "after"; + + TString bytes; + { + TStringOutput s(bytes); + ::Save(&s, stringBefore); + ::Save(&s, node); + ::Save(&s, stringAfter); + } + + TString deserializedStringBefore; + TString deserializedStringAfter; + TNode nodeCopy; + { + TStringInput s(bytes); + ::Load(&s, deserializedStringBefore); + ::Load(&s, nodeCopy); + ::Load(&s, deserializedStringAfter); + } + + UNIT_ASSERT_VALUES_EQUAL(stringBefore, deserializedStringBefore); + UNIT_ASSERT_VALUES_EQUAL(node, nodeCopy); + UNIT_ASSERT_VALUES_EQUAL(stringAfter, deserializedStringAfter); + } + Y_UNIT_TEST(TestIntCast) { TNode node = 1ull << 31; UNIT_ASSERT(node.IsUint64()); diff --git a/library/cpp/yson/parser.cpp b/library/cpp/yson/parser.cpp index 783f9b9047..934a56ee21 100644 --- a/library/cpp/yson/parser.cpp +++ b/library/cpp/yson/parser.cpp @@ -16,22 +16,27 @@ namespace NYson { IInputStream* stream, EYsonType type, bool enableLinePositionInfo, + ui64 bufferSizeLimit, + bool consumeUntilEof, TMaybe<ui64> memoryLimit = Nothing()) : Consumer_(consumer) , Stream_(stream) , Type_(type) , EnableLinePositionInfo_(enableLinePositionInfo) + , BufferSizeLimit_(bufferSizeLimit) + , 
ConsumeUntilEof_(consumeUntilEof) , MemoryLimit_(memoryLimit) { } void Parse() { - TBuffer buffer(64 << 10); + TBuffer buffer(BufferSizeLimit_); ParseYsonStreamImpl<NYT::NYson::IYsonConsumer, TStreamReader>( TStreamReader(Stream_, buffer.Data(), buffer.Capacity()), Consumer_, Type_, EnableLinePositionInfo_, + ConsumeUntilEof_, MemoryLimit_); } @@ -40,6 +45,8 @@ namespace NYson { IInputStream* Stream_; EYsonType Type_; bool EnableLinePositionInfo_; + ui64 BufferSizeLimit_; + bool ConsumeUntilEof_; TMaybe<ui64> MemoryLimit_; }; @@ -50,8 +57,10 @@ namespace NYson { IInputStream* stream, EYsonType type, bool enableLinePositionInfo, + ui64 bufferSizeLimit, + bool consumeUntilEof, TMaybe<ui64> memoryLimit) - : Impl(new TImpl(consumer, stream, type, enableLinePositionInfo, memoryLimit)) + : Impl(new TImpl(consumer, stream, type, enableLinePositionInfo, bufferSizeLimit, consumeUntilEof, memoryLimit)) { } @@ -115,6 +124,7 @@ namespace NYson { consumer, type, enableLinePositionInfo, + true, memoryLimit); } diff --git a/library/cpp/yson/parser.h b/library/cpp/yson/parser.h index dce35a8cd4..b6d8b110a1 100644 --- a/library/cpp/yson/parser.h +++ b/library/cpp/yson/parser.h @@ -21,6 +21,8 @@ namespace NYson { IInputStream* stream, EYsonType type = ::NYson::EYsonType::Node, bool enableLinePositionInfo = false, + ui64 bufferSizeLimit = 64 << 10, + bool consumeUntilEof = true, TMaybe<ui64> memoryLimit = Nothing()); ~TYsonParser(); diff --git a/library/cpp/yson/parser_detail.h b/library/cpp/yson/parser_detail.h index 9d8315dd60..fb72affed5 100644 --- a/library/cpp/yson/parser_detail.h +++ b/library/cpp/yson/parser_detail.h @@ -12,11 +12,17 @@ namespace NYson { private: using TBase = TLexerBase<TBlockStream, EnableLinePositionInfo>; TConsumer* Consumer; + bool ConsumeUntilEof_; public: - TParser(const TBlockStream& blockStream, TConsumer* consumer, TMaybe<ui64> memoryLimit) - : TBase(blockStream, memoryLimit) - , Consumer(consumer) + TParser( + const TBlockStream& blockStream, + 
TConsumer* consumer, + bool consumeUntilEof, + TMaybe<ui64> memoryLimit) + : TBase(blockStream, memoryLimit) + , Consumer(consumer) + , ConsumeUntilEof_(consumeUntilEof) { } @@ -38,11 +44,13 @@ namespace NYson { Y_ABORT("unreachable"); } - while (!(TBase::IsFinished() && TBase::IsEmpty())) { - if (TBase::template SkipSpaceAndGetChar<true>() != EndSymbol) { - ythrow TYsonException() << "Stray '" << (*TBase::Begin()) << "' found"; - } else if (!TBase::IsEmpty()) { - TBase::Advance(1); + if (ConsumeUntilEof_) { + while (!(TBase::IsFinished() && TBase::IsEmpty())) { + if (TBase::template SkipSpaceAndGetChar<true>() != EndSymbol) { + ythrow TYsonException() << "Stray '" << (*TBase::Begin()) << "' found"; + } else if (!TBase::IsEmpty()) { + TBase::Advance(1); + } } } } @@ -308,14 +316,15 @@ namespace NYson { NYT::NYson::IYsonConsumer* consumer, EYsonType parsingMode, bool enableLinePositionInfo, + bool consumeUntilEof, TMaybe<ui64> memoryLimit) { if (enableLinePositionInfo) { using TImpl = NDetail::TParser<TConsumer, TBlockStream, true>; - TImpl impl(blockStream, consumer, memoryLimit); + TImpl impl(blockStream, consumer, consumeUntilEof, memoryLimit); impl.DoParse(parsingMode); } else { using TImpl = NDetail::TParser<TConsumer, TBlockStream, false>; - TImpl impl(blockStream, consumer, memoryLimit); + TImpl impl(blockStream, consumer, consumeUntilEof, memoryLimit); impl.DoParse(parsingMode); } } @@ -337,7 +346,7 @@ namespace NYson { public: TStatelessYsonParserImpl(TConsumer* consumer, TMaybe<ui64> memoryLimit) - : Parser(TStringReader(), consumer, memoryLimit) + : Parser(TStringReader(), consumer, true, memoryLimit) { } @@ -365,7 +374,7 @@ namespace NYson { public: TYsonListParserImpl(const TBlockStream& blockStream, TConsumer* consumer, TMaybe<ui64> memoryLimit) - : Parser(blockStream, consumer, memoryLimit) + : Parser(blockStream, consumer, true, memoryLimit) { } |