diff options
author | galtsev <galtsev@yandex-team.com> | 2025-03-04 00:13:46 +0300 |
---|---|---|
committer | galtsev <galtsev@yandex-team.com> | 2025-03-04 00:33:09 +0300 |
commit | efbdb4de027d8001c6aadb50d3766edc5e34d1d1 (patch) | |
tree | b073d9f644be760e6f48d07d1fb8fe723eb45a21 | |
parent | 1cda8004d0370389c3f09e832a85d82b156844bf (diff) | |
download | ydb-efbdb4de027d8001c6aadb50d3766edc5e34d1d1.tar.gz |
Save the original data weights in the samples extension
commit_hash:cc604e42eba2d12718be2f0bb1746c9c06df94ef
-rw-r--r-- | yt/yt/client/table_client/config.cpp | 3 |
-rw-r--r-- | yt/yt/client/table_client/config.h | 1 |
-rw-r--r-- | yt/yt/client/table_client/helpers.cpp | 9 |
-rw-r--r-- | yt/yt/client/table_client/helpers.h | 1 |
4 files changed, 12 insertions, 2 deletions
diff --git a/yt/yt/client/table_client/config.cpp b/yt/yt/client/table_client/config.cpp index b8bb96c1647..64f4502e08f 100644 --- a/yt/yt/client/table_client/config.cpp +++ b/yt/yt/client/table_client/config.cpp @@ -153,6 +153,9 @@ void TChunkWriterConfig::Register(TRegistrar registrar) .InRange(0.0, 0.001) .Default(0.0001); + registrar.Parameter("use_original_data_weight_in_samples", &TThis::UseOriginalDataWeightInSamples) + .Default(false); + registrar.Parameter("chunk_indexes", &TThis::ChunkIndexes) .DefaultNew(); diff --git a/yt/yt/client/table_client/config.h b/yt/yt/client/table_client/config.h index fa096afa53b..0b1c9700124 100644 --- a/yt/yt/client/table_client/config.h +++ b/yt/yt/client/table_client/config.h @@ -148,6 +148,7 @@ struct TChunkWriterConfig i64 MaxDataWeightBetweenBlocks; double SampleRate; + bool UseOriginalDataWeightInSamples; bool EnableLargeColumnarStatistics; diff --git a/yt/yt/client/table_client/helpers.cpp b/yt/yt/client/table_client/helpers.cpp index 857893c458b..b2ef61558f6 100644 --- a/yt/yt/client/table_client/helpers.cpp +++ b/yt/yt/client/table_client/helpers.cpp @@ -1607,13 +1607,16 @@ TUnversionedValueRangeTruncationResult TruncateUnversionedValues( std::vector<TUnversionedValue> truncatedValues; truncatedValues.reserve(values.size()); + i64 inputSize = 0; int truncatableValueCount = 0; i64 remainingSize = options.MaxTotalSize; for (const auto& value : values) { + auto valueSize = EstimateRowValueSize(value); + inputSize += valueSize; if (IsStringLikeType(value.Type)) { ++truncatableValueCount; } else { - remainingSize -= EstimateRowValueSize(value); + remainingSize -= valueSize; } } @@ -1656,7 +1659,9 @@ TUnversionedValueRangeTruncationResult TruncateUnversionedValues( } } - return {MakeSharedRange(std::move(truncatedValues), rowBuffer), resultSize, clipped}; + auto sampleSize = options.UseOriginalDataWeightInSamples ? 
inputSize : resultSize; + + return {MakeSharedRange(std::move(truncatedValues), rowBuffer), sampleSize, clipped}; } //////////////////////////////////////////////////////////////////////////////// diff --git a/yt/yt/client/table_client/helpers.h b/yt/yt/client/table_client/helpers.h index 36bbff34981..3b85e1edfb0 100644 --- a/yt/yt/client/table_client/helpers.h +++ b/yt/yt/client/table_client/helpers.h @@ -377,6 +377,7 @@ struct TUnversionedValueRangeTruncationOptions //! Otherwise, all values of primitive (not string-like) types are preserved and the remaining size //! is uniformely distributed between truncated versions of the remaining string-like values. bool ClipAfterOverflow = false; + bool UseOriginalDataWeightInSamples = false; //! Limits the total size of the resulting value range. //! See value-preservation rules described above. i64 MaxTotalSize = NTableClient::MaxSampleSize; |