aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author orlovorlov <orlovorlov@yandex-team.com> 2024-03-18 13:15:49 +0300
committer orlovorlov <orlovorlov@yandex-team.com> 2024-03-18 15:02:17 +0300
commit 29127cb44e4ff8ecde3924e1af7bb4213fda2a9f (patch)
tree 1fb2dad483a4877ed5110e9df388e0380f661f5b
parent d7c2d188d2464ce858465870a39220c4dfbb5509 (diff)
download ydb-29127cb44e4ff8ecde3924e1af7bb4213fda2a9f.tar.gz
YT-21141 Avoid content deduplication for files under 10MB
febae4e49cd0f600bf21616025f210e99235cfdc
-rw-r--r-- yt/cpp/mapreduce/client/operation_preparer.cpp 11
-rw-r--r-- yt/cpp/mapreduce/client/operation_preparer.h 2
-rw-r--r-- yt/cpp/mapreduce/interface/config.cpp 1
-rw-r--r-- yt/cpp/mapreduce/interface/config.h 3
4 files changed, 12 insertions, 5 deletions
diff --git a/yt/cpp/mapreduce/client/operation_preparer.cpp b/yt/cpp/mapreduce/client/operation_preparer.cpp
index eb30eed000..ec822e607e 100644
--- a/yt/cpp/mapreduce/client/operation_preparer.cpp
+++ b/yt/cpp/mapreduce/client/operation_preparer.cpp
@@ -315,7 +315,7 @@ public:
return FileName_;
}
- ui64 GetDataSize() const override
+ i64 GetDataSize() const override
{
return GetFileLength(FileName_);
}
@@ -353,9 +353,9 @@ public:
return Description_;
}
- ui64 GetDataSize() const override
+ i64 GetDataSize() const override
{
- return Data_.size();
+ return std::ssize(Data_);
}
private:
@@ -694,7 +694,10 @@ TString TJobPreparer::UploadToCacheUsingApi(const IItemToUpload& itemToUpload) c
itemToUpload.GetDescription(),
OperationPreparer_.GetPreparationId());
- if (OperationPreparer_.GetContext().Config->CacheUploadDeduplicationMode != EUploadDeduplicationMode::Disabled) {
+ const auto& config = OperationPreparer_.GetContext().Config;
+
+ if (config->CacheUploadDeduplicationMode != EUploadDeduplicationMode::Disabled &&
+ itemToUpload.GetDataSize() > config->CacheUploadDeduplicationThreshold) {
if (auto path = TryUploadWithDeduplication(itemToUpload)) {
return *path;
}
diff --git a/yt/cpp/mapreduce/client/operation_preparer.h b/yt/cpp/mapreduce/client/operation_preparer.h
index e1b9d59b1d..67eb28b31a 100644
--- a/yt/cpp/mapreduce/client/operation_preparer.h
+++ b/yt/cpp/mapreduce/client/operation_preparer.h
@@ -56,7 +56,7 @@ struct IItemToUpload
virtual TString CalculateMD5() const = 0;
virtual THolder<IInputStream> CreateInputStream() const = 0;
virtual TString GetDescription() const = 0;
- virtual ui64 GetDataSize() const = 0;
+ virtual i64 GetDataSize() const = 0;
};
////////////////////////////////////////////////////////////////////////////////
diff --git a/yt/cpp/mapreduce/interface/config.cpp b/yt/cpp/mapreduce/interface/config.cpp
index 407c213226..c012386cc3 100644
--- a/yt/cpp/mapreduce/interface/config.cpp
+++ b/yt/cpp/mapreduce/interface/config.cpp
@@ -212,6 +212,7 @@ void TConfig::Reset()
LoadTimings();
CacheUploadDeduplicationMode = GetUploadingDeduplicationMode("YT_UPLOAD_DEDUPLICATION", EUploadDeduplicationMode::Host);
+ CacheUploadDeduplicationThreshold = 10_MB;
RetryCount = Max(GetInt("YT_RETRY_COUNT", 10), 1);
ReadRetryCount = Max(GetInt("YT_READ_RETRY_COUNT", 30), 1);
diff --git a/yt/cpp/mapreduce/interface/config.h b/yt/cpp/mapreduce/interface/config.h
index b6d34f8895..de5f5ab7fc 100644
--- a/yt/cpp/mapreduce/interface/config.h
+++ b/yt/cpp/mapreduce/interface/config.h
@@ -155,6 +155,9 @@ struct TConfig
/// NB: Each mode affects only users with the same mode enabled.
EUploadDeduplicationMode CacheUploadDeduplicationMode;
+ /// @brief Size threshold in bytes: only files strictly larger than this undergo deduplication at upload.
+ i64 CacheUploadDeduplicationThreshold;
+
bool MountSandboxInTmpfs;
/// @brief Set upload options (e.g.) for files created by library.