diff options
author | Andrey Chulkov <achulkov2@nebius.com> | 2024-03-28 15:54:14 +0300 |
---|---|---|
committer | robot-piglet <robot-piglet@yandex-team.com> | 2024-03-28 16:06:28 +0300 |
commit | 23e15f39ac8b7c106f70a921fcaa95f60c65162a (patch) | |
tree | bd4c8d014d299790e5f50681fbdd64fe95683cb8 | |
parent | ed2b9ff756f5d7b9c0a847033c3023bcfd771970 (diff) | |
download | ydb-23e15f39ac8b7c106f70a921fcaa95f60c65162a.tar.gz |
Truncate and improve samples of composite values
This PR fixes #405. Composite values are now truncated while maintaining comparability with the original value.
Also generalized some code working with samples, since the differences in size accounting and truncation approaches could have produced less usable samples.
---
22f45766281679cd7b28e717647369ee885c2313
Pull Request resolved: https://github.com/ytsaurus/ytsaurus/pull/460
-rw-r--r-- | yt/yt/client/table_client/composite_compare.cpp | 153 | ||||
-rw-r--r-- | yt/yt/client/table_client/composite_compare.h | 11 | ||||
-rw-r--r-- | yt/yt/client/table_client/helpers.cpp | 58 | ||||
-rw-r--r-- | yt/yt/client/table_client/helpers.h | 33 | ||||
-rw-r--r-- | yt/yt/client/unittests/composite_compare_ut.cpp | 75 | ||||
-rw-r--r-- | yt/yt/core/yson/pull_parser-inl.h | 44 | ||||
-rw-r--r-- | yt/yt/core/yson/pull_parser.h | 1 |
7 files changed, 373 insertions, 2 deletions
diff --git a/yt/yt/client/table_client/composite_compare.cpp b/yt/yt/client/table_client/composite_compare.cpp index 5dc4c74fc0..b2ddd498ec 100644 --- a/yt/yt/client/table_client/composite_compare.cpp +++ b/yt/yt/client/table_client/composite_compare.cpp @@ -1,9 +1,12 @@ #include "composite_compare.h" #include <yt/yt/core/yson/pull_parser.h> +#include <yt/yt/core/yson/token_writer.h> + #include <yt/yt/library/numeric/util.h> #include <library/cpp/yt/farmhash/farm_hash.h> +#include <library/cpp/yt/logging/logger.h> #include <util/stream/mem.h> @@ -13,6 +16,10 @@ using namespace NYson; //////////////////////////////////////////////////////////////////////////////// +static const auto Logger = NLogging::TLogger{"YsonCompositveCompare"}; + +//////////////////////////////////////////////////////////////////////////////// + namespace { // This file implements comparison for composite values. @@ -148,6 +155,37 @@ Y_FORCE_INLINE static int CompareYsonItems(const TYsonItem& lhs, const TYsonItem return ComparePrimitive(static_cast<ui32>(lhsClass), static_cast<ui32>(rhsClass)); } +// Returns the minimum binary size needed to represent a potentially truncated version of the this item. +// We should not rely on the return value for Map or Attribute-related items, there is no special handling for them. +i64 GetMinResultingSize(const TYsonItem& item, bool isInsideList) +{ + // These bytes were already accounted when handling the corresponding BeginList. + if (item.GetType() == EYsonItemType::EndList) { + return 0; + } + + auto resultingSize = item.GetBinarySize(); + + // Strings can be truncated, so we only count the remaining bytes for now. + // In practice that is the single flag byte and the length of the string as a varint. + if (item.GetType() == EYsonItemType::StringValue) { + resultingSize -= item.UncheckedAsString().size(); + YT_VERIFY(resultingSize >= 0); + } + + // Accounting for EndList, since we need to accomodate the list ending within the size limit. + if (item.GetType() == EYsonItemType::BeginList) { + resultingSize += TYsonItem::Simple(EYsonItemType::EndList).GetBinarySize(); + } + + // All items inside any enclosing list are currently required to be followed by an item separator, which has the size of one byte. + if (isInsideList) { + resultingSize += 1; + } + + return resultingSize; +} + } // namespace int CompareCompositeValues(TYsonStringBuf lhs, TYsonStringBuf rhs) @@ -276,4 +314,119 @@ TFingerprint CompositeFarmHash(TYsonStringBuf value) //////////////////////////////////////////////////////////////////////////////// +std::optional<TYsonString> TruncateCompositeValue(TYsonStringBuf value, i64 size) +{ + YT_VERIFY(value.GetType() == EYsonType::Node); + + YT_VERIFY(size >= 0); + if (!size) { + return {}; + } + + TMemoryInput valueIn(value.AsStringBuf()); + TYsonPullParser valueParser(&valueIn, EYsonType::Node); + + TString truncatedYson; + TStringOutput output(truncatedYson); + output.Reserve(std::min(size, std::ssize(value.AsStringBuf()))); + TCheckedInDebugYsonTokenWriter writer(&output); + + i64 unclosedListCount = 0; + bool emptyResult = true; + + for (auto remainingBytes = size;;) { + const auto item = valueParser.Next(); + + // We don't handle the case of the last EndList, since the function below returns 0 for EndList anyway. + bool isInsideList = unclosedListCount > 0; + auto resultingItemSize = GetMinResultingSize(item, isInsideList); + + if (resultingItemSize > remainingBytes) { + break; + } + + bool isEof = false; + + switch (item.GetType()) { + case EYsonItemType::BeginList: + ++unclosedListCount; + writer.WriteBeginList(); + break; + case EYsonItemType::EndList: + --unclosedListCount; + writer.WriteEndList(); + break; + case EYsonItemType::EntityValue: + writer.WriteEntity(); + break; + case EYsonItemType::Int64Value: + writer.WriteBinaryInt64(item.UncheckedAsInt64()); + break; + case EYsonItemType::Uint64Value: + writer.WriteBinaryUint64(item.UncheckedAsUint64()); + break; + case EYsonItemType::DoubleValue: + writer.WriteBinaryDouble(item.UncheckedAsDouble()); + break; + case EYsonItemType::BooleanValue: + writer.WriteBinaryBoolean(item.UncheckedAsBoolean()); + break; + case EYsonItemType::StringValue: { + auto truncatedString = item.UncheckedAsString().Trunc(remainingBytes - resultingItemSize); + writer.WriteBinaryString(truncatedString); + // This is a slight overestimation, since a smaller string might have a shorter varint part storing its length. + resultingItemSize += truncatedString.size(); + break; + } + // Maps and attributes are not comparable. + // However, we can store everything up to this point into the truncated string. + case EYsonItemType::BeginAttributes: + case EYsonItemType::EndAttributes: + case EYsonItemType::BeginMap: + case EYsonItemType::EndMap: + case EYsonItemType::EndOfStream: + isEof = true; + break; + default: + YT_ABORT(); + } + + if (isEof) { + break; + } + + emptyResult = false; + + if (unclosedListCount && item.GetType() != EYsonItemType::BeginList) { + writer.WriteItemSeparator(); + } + + remainingBytes -= resultingItemSize; + } + + YT_VERIFY(unclosedListCount >= 0); + while (unclosedListCount) { + writer.WriteEndList(); + if (--unclosedListCount) { + writer.WriteItemSeparator(); + } + } + + if (emptyResult) { + return {}; + } else { + writer.Finish(); + } + + YT_LOG_ALERT_IF( + std::ssize(truncatedYson) > size, + "Composite YSON truncation increased the value's binary size (OriginalValue: %v, TruncatedValue: %v)", + value.AsStringBuf(), + truncatedYson); + + return TYsonString(std::move(truncatedYson)); +} + +//////////////////////////////////////////////////////////////////////////////// + } // namespace NYT::NTableClient diff --git a/yt/yt/client/table_client/composite_compare.h b/yt/yt/client/table_client/composite_compare.h index 75556472ad..04737d8b7b 100644 --- a/yt/yt/client/table_client/composite_compare.h +++ b/yt/yt/client/table_client/composite_compare.h @@ -13,6 +13,17 @@ namespace NYT::NTableClient { int CompareCompositeValues(NYson::TYsonStringBuf lhs, NYson::TYsonStringBuf rhs); TFingerprint CompositeFarmHash(NYson::TYsonStringBuf compositeValue); +//! Returns a new yson value containing a truncated expression which can be compared with the original value +//! by using the comparison function above. +//! Returns null if the result is empty, since empty values are not suported in YSON. +//! Effectively, it returns a prefix of the original value up to the first uncomparable item, such as a map or yson attributes, +//! or up until the point when the size limit is hit. +//! The size parameter limits the binary size of the output, i.e. the length of the returned string. +//! +//! NB: The current implementation guarantees that the size of the returned string is not larger then the provided limit. However, +//! this might be hard to maintain and is not something one should rely on. It is better to think of this function as an approximate one. +std::optional<NYson::TYsonString> TruncateCompositeValue(NYson::TYsonStringBuf value, i64 size); + //////////////////////////////////////////////////////////////////////////////// } // namespace NYT::NTableClient diff --git a/yt/yt/client/table_client/helpers.cpp b/yt/yt/client/table_client/helpers.cpp index 288bd4ad73..54733a18b4 100644 --- a/yt/yt/client/table_client/helpers.cpp +++ b/yt/yt/client/table_client/helpers.cpp @@ -2,6 +2,7 @@ #include "schema.h" #include "name_table.h" #include "key_bound.h" +#include "composite_compare.h" #include <yt/yt_proto/yt/client/table_chunk_format/proto/chunk_meta.pb.h> @@ -1568,4 +1569,61 @@ REGISTER_INTERMEDIATE_PROTO_INTEROP_BYTES_FIELD_REPRESENTATION( //////////////////////////////////////////////////////////////////////////////// +TUnversionedValueRangeTruncationResult TruncateUnversionedValues( + TUnversionedValueRange values, + const TRowBufferPtr& rowBuffer, + const TUnversionedValueRangeTruncationOptions& options) +{ + std::vector<TUnversionedValue> truncatedValues; + truncatedValues.reserve(values.size()); + + int truncatableValueCount = 0; + i64 remainingSize = options.MaxTotalSize; + for (const auto& value : values) { + if (IsStringLikeType(value.Type)) { + ++truncatableValueCount; + } else { + remainingSize -= EstimateRowValueSize(value); + } + } + + auto maxSizePerValue = std::max<i64>(0, remainingSize) / std::max(truncatableValueCount, 1); + + i64 resultSize = 0; + bool clipped = false; + + for (const auto& value : values) { + truncatedValues.push_back(value); + auto& truncatedValue = truncatedValues.back(); + + if (clipped || value.Type == EValueType::Any) { + truncatedValue = MakeUnversionedNullValue(value.Id, value.Flags); + } else if (value.Type == EValueType::Composite) { + if (auto truncatedCompositeValue = TruncateCompositeValue(TYsonStringBuf(value.AsStringBuf()), maxSizePerValue)) { + truncatedValue = rowBuffer->CaptureValue(MakeUnversionedCompositeValue(truncatedCompositeValue->AsStringBuf(), value.Id, value.Flags)); + } else { + truncatedValue = MakeUnversionedNullValue(value.Id, value.Flags); + } + } else if (value.Type == EValueType::String) { + truncatedValue.Length = std::min<ui32>(truncatedValue.Length, maxSizePerValue); + } + + if (options.ClipAfterOverflow && IsStringLikeType(value.Type) && (truncatedValue.Type == EValueType::Null || truncatedValue.Length < value.Length)) { + clipped = true; + } + + // This funciton also accounts for the representation of the id and type of the unversioned value. + // The limit can be slightly exceeded this way. + resultSize += EstimateRowValueSize(truncatedValue); + + if (options.ClipAfterOverflow && resultSize >= options.MaxTotalSize) { + clipped = true; + } + } + + return {MakeSharedRange(std::move(truncatedValues), rowBuffer), resultSize, clipped}; +} + +//////////////////////////////////////////////////////////////////////////////// + } // namespace NYT::NTableClient diff --git a/yt/yt/client/table_client/helpers.h b/yt/yt/client/table_client/helpers.h index 872543527e..cb0e131199 100644 --- a/yt/yt/client/table_client/helpers.h +++ b/yt/yt/client/table_client/helpers.h @@ -342,6 +342,39 @@ TUnversionedValue TryDecodeUnversionedAnyValue( //////////////////////////////////////////////////////////////////////////////// +struct TUnversionedValueRangeTruncationResult +{ + // Newly formed values are owned by the underlying row buffer. + TSharedRange<TUnversionedValue> Values; + //! Estimation of the total size based on the binary representation of unversioned values. + i64 Size; + //! If clipping was requested, signifies whether the resulting value range is actually equal to the input value range. + bool Clipped; +}; + +struct TUnversionedValueRangeTruncationOptions +{ + //! If true, the result will form a comparable prefix of the original values. + //! I.e. if rangeA is smaller than rangeB, then truncatedRangeA <= truncatedRangeB. + //! This is achieved by replacing all values after the first truncated or size-limit-overflowing value with a Null value. + //! + //! Otherwise, all values of primitive (not string-like) types are preserved and the remaining size + //! is uniformely distributed between truncated versions of the remaining string-like values. + bool ClipAfterOverflow = false; + //! Limits the total size of the resulting value range. + //! See value-preservation rules described above. + i64 MaxTotalSize = NTableClient::MaxSampleSize; +}; + +//! Captures and returns a new list of values truncated to roughly fit the provided size and form a comparable prefix. +//! The resulting value runge has exactly the same length as the input value range. +//! See the option descriptions above for more details on how values are truncated and what comparability guarantees are provided. +//! NB: Newly generated values are captured into the provided row buffer, however, the lifetime of unchanged values remains the responsibility of the caller. +//! NB: The resulting total binary size can be slightly larger than the limit, since even Null filler values take up some space. +TUnversionedValueRangeTruncationResult TruncateUnversionedValues(TUnversionedValueRange values, const TRowBufferPtr& rowBuffer, const TUnversionedValueRangeTruncationOptions& options); + +//////////////////////////////////////////////////////////////////////////////// + } // namespace NYT::NTableClient #define HELPERS_INL_H_ diff --git a/yt/yt/client/unittests/composite_compare_ut.cpp b/yt/yt/client/unittests/composite_compare_ut.cpp index 5332abe182..5e792919fa 100644 --- a/yt/yt/client/unittests/composite_compare_ut.cpp +++ b/yt/yt/client/unittests/composite_compare_ut.cpp @@ -8,12 +8,14 @@ namespace NYT::NTableClient { namespace { +using namespace NYson; + //////////////////////////////////////////////////////////////////////////////// TEST(TCompositeCompare, Simple) { auto compare = [] (TStringBuf lhs, TStringBuf rhs) { - return CompareCompositeValues(NYson::TYsonStringBuf(lhs), NYson::TYsonStringBuf(rhs)); + return CompareCompositeValues(TYsonStringBuf(lhs), TYsonStringBuf(rhs)); }; EXPECT_EQ(-1, compare("-4", "42")); @@ -59,7 +61,7 @@ TEST(TCompositeCompare, Simple) TEST(TCompositeCompare, CompositeFingerprint) { auto getFarmHash = [] (TStringBuf value) { - return CompositeFarmHash(NYson::TYsonStringBuf(value)); + return CompositeFarmHash(TYsonStringBuf(value)); }; EXPECT_EQ(getFarmHash("-42"), GetFarmFingerprint(MakeUnversionedInt64Value(-42))); @@ -70,6 +72,75 @@ TEST(TCompositeCompare, CompositeFingerprint) EXPECT_EQ(getFarmHash("#"), GetFarmFingerprint(MakeUnversionedNullValue())); } +TEST(TCompositeCompare, TruncateCompositeValue) +{ + auto normalizeYson = [] (TStringBuf yson) { + return yson.empty() ? TString(yson) : ConvertToYsonString(TYsonString(yson), EYsonFormat::Binary).ToString(); + }; + + auto getTruncatedYson = [&] (TStringBuf original, i64 size) { + auto truncatedCompositeValue = TruncateCompositeValue(TYsonString(original), size); + return truncatedCompositeValue ? truncatedCompositeValue->ToString() : ""; + }; + + // When we rebuild the whole string during truncation, we should produce the correct normalized binary YSON version of the string as output. + auto checkFullStringIdempotence = [&] (TStringBuf yson) { + auto normalizedYson = normalizeYson(yson); + EXPECT_EQ(normalizedYson, getTruncatedYson(yson, std::numeric_limits<i64>::max())); + EXPECT_EQ(normalizedYson, getTruncatedYson(yson, std::ssize(normalizedYson))); + }; + + auto checkTruncatedYson = [&] (TStringBuf expectedTruncatedYson, TStringBuf originalYson, i64 size) { + auto normalizedExpectedTruncatedYson = normalizeYson(expectedTruncatedYson); + auto truncatedYson = getTruncatedYson(originalYson, size); + + // For easier debugging. + EXPECT_EQ(normalizedExpectedTruncatedYson.size(), truncatedYson.size()); + EXPECT_EQ(normalizedExpectedTruncatedYson, truncatedYson); + }; + + checkFullStringIdempotence("[[5; 7]; [1; 5];]"); + checkFullStringIdempotence("[[5; 7]; [1; 5; 4; 3]; [2; 0; 0; 7]]"); + checkFullStringIdempotence("[[5; 7]; [1; 5; 4; 3; [g; r; i; t; #; k; #; n]]; [%true; [%false; 0;];]; [2; 0; 0; 7]]"); + checkFullStringIdempotence("this-string-desperately-wants-to-be-filled-with-some-funny-references-but-i-have-no-ideas"); + checkFullStringIdempotence("%true"); + checkFullStringIdempotence("#"); + checkFullStringIdempotence("\"\""); + + checkTruncatedYson("[[5; 7]; [1; 5];]", "[[5; 7]; [1; 5; 4; 3]; [2; 0; 0; 7]]", 20); + checkTruncatedYson("[[5; 7]; [1; 5];]", "[[5; 7]; [1; 5; 4; 3]; [2; 0; 0; 7]]", 21); + checkTruncatedYson("[[5; 7]; [1; 5];]", "[[5; 7]; [1; 5; 4; 3]; [2; 0; 0; 7]]", 22); + // We need 3 more bytes for the next integer: 1 for the type flag, 1 for the varint, 1 for the item separator. + checkTruncatedYson("[[5; 7]; [1; 5; 4;];]", "[[5; 7]; [1; 5; 4; 3]; [2; 0; 0; 7]]", 23); + // The value 1543 takes up 4 extra bytes, since it is represented as 2 varint bytes. + checkTruncatedYson("[[5; 7]; [1; 5; 4; 1543];]", "[[5; 7]; [1; 5; 4; 1543]; [2; 0; 0; 7]]", 27); + + checkTruncatedYson("[[#; 0;];]", "[[#; 0; %true; x; #]; 1;]", 10); + // We need 2 more bytes for the boolean: 1 for the type flag which encodes the value itself, 1 for the item serpator. + checkTruncatedYson("[[#; 0; %true];]", "[[#; 0; %true; #; x]; 1;]", 12); + // Same for entities. + checkTruncatedYson("[[#; 0; %true; #;];]", "[[#; 0; %true; #; x]; 1;]", 14); + + // NB: "" is actually std::nullopt returned from the function, it is just easier to visualize this way. + checkTruncatedYson("", "abacaba", 1); + checkTruncatedYson("", "1", 1); + checkTruncatedYson("", "[]", 1); + // Entity takes up only 1 byte! + checkTruncatedYson("#", "#", 1); + + checkTruncatedYson("this-string", "this-string-desperately-wants-to-be-filled-with-some-funny-references-but-i-have-no-ideas", 14); + checkTruncatedYson("this-string-", "this-string-desperately-wants-to-be-filled-with-some-funny-references-but-i-have-no-ideas", 15); + checkTruncatedYson("this-string-desperatel", "this-string-desperately-wants-to-be-filled-with-some-funny-references-but-i-have-no-ideas", 25); + checkTruncatedYson("[please; [take; [me; ha;];];]", "[please; [take; [me; haha; too; late]]]", 34); + // The actual size of the resulting yson is only 4 bytes, but during truncation it is too hard to account for the fact that longer strings + // take up more bytes for their length, since it is represented as a varint. + checkTruncatedYson("aa", TString(1000, 'a'), 5); + checkTruncatedYson("\"\"", "erase-me", 2); + + checkTruncatedYson("[[5; 7]; [1; 5; 4; 3]; [];]", "[[5; 7]; [1; 5; 4; 3]; [{hello=darkness}; 0; 0; 7]]", 10000); + checkTruncatedYson("[[5; 7];]", "[[5; 7]; <my-name=borat>[1; 5; 4; 3]; [{greetings=xoxo}; 0; 0; 7]]", 10000); +} + //////////////////////////////////////////////////////////////////////////////// } // namespace diff --git a/yt/yt/core/yson/pull_parser-inl.h b/yt/yt/core/yson/pull_parser-inl.h index e5ad9f8d06..e947992dba 100644 --- a/yt/yt/core/yson/pull_parser-inl.h +++ b/yt/yt/core/yson/pull_parser-inl.h @@ -133,6 +133,50 @@ bool TYsonItem::IsEndOfStream() const return GetType() == EYsonItemType::EndOfStream; } +// NB: Keep in sync with yson token writer. +i64 TYsonItem::GetBinarySize() const +{ + // Temporary buffer for calculating actual size of varints. + std::array<char, MaxVarUint64Size> buffer; + + // Each token requires at least 1 byte for the marker. + i64 result = 1; + switch (GetType()) { + case EYsonItemType::EndOfStream: + // This isn't a real token. + return 0; + case EYsonItemType::BeginList: + case EYsonItemType::EndList: + case EYsonItemType::BeginMap: + case EYsonItemType::EndMap: + case EYsonItemType::BeginAttributes: + case EYsonItemType::EndAttributes: + // These system values are represented solely by the marker. + break; + case EYsonItemType::EntityValue: + case EYsonItemType::BooleanValue: + // These values are represented solely by the marker. + break; + case EYsonItemType::Int64Value: + result += WriteVarInt64(buffer.data(), UncheckedAsInt64()); + break; + case EYsonItemType::Uint64Value: + result += WriteVarUint64(buffer.data(), UncheckedAsUint64()); + break; + case EYsonItemType::DoubleValue: + result += sizeof(double); + break; + case EYsonItemType::StringValue: + result += WriteVarInt32(buffer.data(), UncheckedAsString().size()); + result += UncheckedAsString().size(); + break; + default: + YT_ABORT(); + } + + return result; +} + //////////////////////////////////////////////////////////////////////////////// void NDetail::TZeroCopyInputStreamReader::RefreshBlock() diff --git a/yt/yt/core/yson/pull_parser.h b/yt/yt/core/yson/pull_parser.h index 5a632491e0..ecced8da63 100644 --- a/yt/yt/core/yson/pull_parser.h +++ b/yt/yt/core/yson/pull_parser.h @@ -36,6 +36,7 @@ public: template <typename T> Y_FORCE_INLINE T UncheckedAs() const; Y_FORCE_INLINE bool IsEndOfStream() const; + Y_FORCE_INLINE i64 GetBinarySize() const; private: TYsonItem() = default; |