about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author galtsev <galtsev@yandex-team.com> 2025-03-04 00:13:46 +0300
committer galtsev <galtsev@yandex-team.com> 2025-03-04 00:33:09 +0300
commit efbdb4de027d8001c6aadb50d3766edc5e34d1d1 (patch)
tree b073d9f644be760e6f48d07d1fb8fe723eb45a21
parent 1cda8004d0370389c3f09e832a85d82b156844bf (diff)
download ydb-efbdb4de027d8001c6aadb50d3766edc5e34d1d1.tar.gz
Save the original data weights in the samples extension
commit_hash:cc604e42eba2d12718be2f0bb1746c9c06df94ef
-rw-r--r-- yt/yt/client/table_client/config.cpp 3
-rw-r--r-- yt/yt/client/table_client/config.h 1
-rw-r--r-- yt/yt/client/table_client/helpers.cpp 9
-rw-r--r-- yt/yt/client/table_client/helpers.h 1
4 files changed, 12 insertions, 2 deletions
diff --git a/yt/yt/client/table_client/config.cpp b/yt/yt/client/table_client/config.cpp
index b8bb96c1647..64f4502e08f 100644
--- a/yt/yt/client/table_client/config.cpp
+++ b/yt/yt/client/table_client/config.cpp
@@ -153,6 +153,9 @@ void TChunkWriterConfig::Register(TRegistrar registrar)
.InRange(0.0, 0.001)
.Default(0.0001);
+ registrar.Parameter("use_original_data_weight_in_samples", &TThis::UseOriginalDataWeightInSamples)
+ .Default(false);
+
registrar.Parameter("chunk_indexes", &TThis::ChunkIndexes)
.DefaultNew();
diff --git a/yt/yt/client/table_client/config.h b/yt/yt/client/table_client/config.h
index fa096afa53b..0b1c9700124 100644
--- a/yt/yt/client/table_client/config.h
+++ b/yt/yt/client/table_client/config.h
@@ -148,6 +148,7 @@ struct TChunkWriterConfig
i64 MaxDataWeightBetweenBlocks;
double SampleRate;
+ bool UseOriginalDataWeightInSamples;
bool EnableLargeColumnarStatistics;
diff --git a/yt/yt/client/table_client/helpers.cpp b/yt/yt/client/table_client/helpers.cpp
index 857893c458b..b2ef61558f6 100644
--- a/yt/yt/client/table_client/helpers.cpp
+++ b/yt/yt/client/table_client/helpers.cpp
@@ -1607,13 +1607,16 @@ TUnversionedValueRangeTruncationResult TruncateUnversionedValues(
std::vector<TUnversionedValue> truncatedValues;
truncatedValues.reserve(values.size());
+ i64 inputSize = 0;
int truncatableValueCount = 0;
i64 remainingSize = options.MaxTotalSize;
for (const auto& value : values) {
+ auto valueSize = EstimateRowValueSize(value);
+ inputSize += valueSize;
if (IsStringLikeType(value.Type)) {
++truncatableValueCount;
} else {
- remainingSize -= EstimateRowValueSize(value);
+ remainingSize -= valueSize;
}
}
@@ -1656,7 +1659,9 @@ TUnversionedValueRangeTruncationResult TruncateUnversionedValues(
}
}
- return {MakeSharedRange(std::move(truncatedValues), rowBuffer), resultSize, clipped};
+ auto sampleSize = options.UseOriginalDataWeightInSamples ? inputSize : resultSize;
+
+ return {MakeSharedRange(std::move(truncatedValues), rowBuffer), sampleSize, clipped};
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/yt/yt/client/table_client/helpers.h b/yt/yt/client/table_client/helpers.h
index 36bbff34981..3b85e1edfb0 100644
--- a/yt/yt/client/table_client/helpers.h
+++ b/yt/yt/client/table_client/helpers.h
@@ -377,6 +377,7 @@ struct TUnversionedValueRangeTruncationOptions
//! Otherwise, all values of primitive (not string-like) types are preserved and the remaining size
//! is uniformly distributed between truncated versions of the remaining string-like values.
bool ClipAfterOverflow = false;
+ bool UseOriginalDataWeightInSamples = false;
//! Limits the total size of the resulting value range.
//! See value-preservation rules described above.
i64 MaxTotalSize = NTableClient::MaxSampleSize;