diff options
| author | robot-piglet <[email protected]> | 2024-02-02 09:29:53 +0300 | 
|---|---|---|
| committer | Alexander Smirnov <[email protected]> | 2024-02-09 19:17:12 +0300 | 
| commit | 0460c44a8d10b6286eeafba80720664bcd5e9ee1 (patch) | |
| tree | 60dc85342ea90b10b17d257c55d840d193793cb1 | |
| parent | 5bb6b881894a01666f5143835c8fc02264ff008e (diff) | |
Intermediate changes
| -rw-r--r-- | yt/yt/client/chunk_client/data_statistics.cpp | 13 | ||||
| -rw-r--r-- | yt/yt/client/chunk_client/data_statistics.h | 3 | ||||
| -rw-r--r-- | yt/yt/client/table_client/config.cpp | 10 | ||||
| -rw-r--r-- | yt/yt/client/table_client/config.h | 9 | ||||
| -rw-r--r-- | yt/yt/client/table_client/public.h | 7 | ||||
| -rw-r--r-- | yt/yt/client/table_client/row_base.h | 6 | ||||
| -rw-r--r-- | yt/yt_proto/yt/client/chunk_client/proto/chunk_meta.proto | 4 | ||||
| -rw-r--r-- | yt/yt_proto/yt/client/table_chunk_format/proto/chunk_meta.proto | 1 | 
8 files changed, 47 insertions, 6 deletions
diff --git a/yt/yt/client/chunk_client/data_statistics.cpp b/yt/yt/client/chunk_client/data_statistics.cpp index 1dbc2ad5201..ce4ca1cd846 100644 --- a/yt/yt/client/chunk_client/data_statistics.cpp +++ b/yt/yt/client/chunk_client/data_statistics.cpp @@ -188,11 +188,19 @@ TCodecStatistics& TCodecStatistics::Append(const std::pair<ECodec, TDuration>& c      return *this;  } +TCodecStatistics& TCodecStatistics::AppendToValueDictionaryCompression(TDuration duration) +{ +    ValueDictionaryCompressionDuration_ += duration; +    TotalDuration_ += duration; +    return *this; +} +  TCodecStatistics& TCodecStatistics::operator+=(const TCodecStatistics& other)  {      for (const auto& pair : other.CodecToDuration_) {          Append(pair);      } +    AppendToValueDictionaryCompression(other.ValueDictionaryCompressionDuration_);      return *this;  } @@ -204,6 +212,11 @@ TDuration TCodecStatistics::GetTotalDuration() const  void FormatValue(TStringBuilderBase* builder, const TCodecStatistics& statistics, TStringBuf /* spec */)  {      FormatKeyValueRange(builder, statistics.CodecToDuration(), TDefaultFormatter()); +    if (statistics.ValueDictionaryCompressionDuration()) { +        builder->AppendString(", "); +        builder->AppendFormat("ValueDictionaryCompressionDuration: %v", +            statistics.ValueDictionaryCompressionDuration()); +    }  }  TString ToString(const TCodecStatistics& statistics) diff --git a/yt/yt/client/chunk_client/data_statistics.h b/yt/yt/client/chunk_client/data_statistics.h index 880a510c598..0a8b9186279 100644 --- a/yt/yt/client/chunk_client/data_statistics.h +++ b/yt/yt/client/chunk_client/data_statistics.h @@ -52,8 +52,11 @@ public:          1>;      DEFINE_BYREF_RO_PROPERTY(TCodecToDuration, CodecToDuration); +    DEFINE_BYREF_RO_PROPERTY(TDuration, ValueDictionaryCompressionDuration); +  public:      TCodecStatistics& Append(const TCodecDuration& codecTime); +    TCodecStatistics& AppendToValueDictionaryCompression(TDuration duration);  
    TCodecStatistics& operator+=(const TCodecStatistics& other); diff --git a/yt/yt/client/table_client/config.cpp b/yt/yt/client/table_client/config.cpp index a4de89b4f14..f11756c5962 100644 --- a/yt/yt/client/table_client/config.cpp +++ b/yt/yt/client/table_client/config.cpp @@ -276,6 +276,16 @@ void TDictionaryCompressionConfig::Register(TRegistrar registrar)              EDictionaryCompressionPolicy::FreshChunkFirst,          }); +    registrar.Parameter("policy_probation_samples_size", &TThis::PolicyProbationSamplesSize) +        .GreaterThan(0) +        .Default(12_MB); +    registrar.Parameter("max_acceptable_compression_ratio", &TThis::MaxAcceptableCompressionRatio) +        .Default(0.7) +        .InRange(0, 1); +    registrar.Parameter("max_decompression_blob_size", &TThis::MaxDecompressionBlobSize) +        .GreaterThan(0) +        .Default(64_MB); +      registrar.Postprocessor([] (TThis* config) {          if (config->DesiredSampleCount > config->MaxProcessedSampleCount) {              THROW_ERROR_EXCEPTION("\"desired_sample_count\" cannot be greater than \"max_processed_sample_count\""); diff --git a/yt/yt/client/table_client/config.h b/yt/yt/client/table_client/config.h index 952dd2a94e8..cca367f23b5 100644 --- a/yt/yt/client/table_client/config.h +++ b/yt/yt/client/table_client/config.h @@ -249,6 +249,15 @@ public:      //! Upon each chunk compression will independently decide which dictionary fits best.      THashSet<EDictionaryCompressionPolicy> AppliedPolicies; +    //! Upon each chunk compression will first accumulate samples of that weight +    //! before deciding dictionary of which policy fits the best. +    i64 PolicyProbationSamplesSize; +    //! Upper limit on acceptable compression ratio. No chunk compression is performed if this limit is exceeded. +    double MaxAcceptableCompressionRatio; + +    //! Upper limit on content size of a batch that can be decompressed within a single iteration. 
+    i64 MaxDecompressionBlobSize; +      REGISTER_YSON_STRUCT(TDictionaryCompressionConfig);      static void Register(TRegistrar registrar); diff --git a/yt/yt/client/table_client/public.h b/yt/yt/client/table_client/public.h index eeb18fd5067..592cbcacd36 100644 --- a/yt/yt/client/table_client/public.h +++ b/yt/yt/client/table_client/public.h @@ -130,9 +130,10 @@ constexpr int TypicalHunkColumnCount = 8;  ////////////////////////////////////////////////////////////////////////////////  DEFINE_ENUM_WITH_UNDERLYING_TYPE(EHunkValueTag, ui8, -    ((Inline)   (0)) -    ((LocalRef) (1)) -    ((GlobalRef)(2)) +    ((Inline)            (0)) +    ((LocalRef)          (1)) +    ((GlobalRef)         (2)) +    ((CompressedInline)  (3))  );  // Do not change these values since they are stored in the master snapshot. diff --git a/yt/yt/client/table_client/row_base.h b/yt/yt/client/table_client/row_base.h index b353edd1cfa..126806aaadd 100644 --- a/yt/yt/client/table_client/row_base.h +++ b/yt/yt/client/table_client/row_base.h @@ -33,9 +33,9 @@ static_assert(      "Incorrect type order.");  DEFINE_BIT_ENUM_WITH_UNDERLYING_TYPE(EValueFlags, ui8, -    ((None)       (0x0000)) -    ((Aggregate)  (0x0001)) -    ((Hunk)       (0x0002)) +    ((None)        (0x00)) +    ((Aggregate)   (0x01)) +    ((Hunk)        (0x02))  );  DEFINE_ENUM_WITH_UNDERLYING_TYPE(ESimpleLogicalValueType, ui32, diff --git a/yt/yt_proto/yt/client/chunk_client/proto/chunk_meta.proto b/yt/yt_proto/yt/client/chunk_client/proto/chunk_meta.proto index d7e724b4ccc..874ec93151d 100644 --- a/yt/yt_proto/yt/client/chunk_client/proto/chunk_meta.proto +++ b/yt/yt_proto/yt/client/chunk_client/proto/chunk_meta.proto @@ -1,5 +1,6 @@  package NYT.NChunkClient.NProto; +import "yt_proto/yt/core/misc/proto/guid.proto";  import "yt_proto/yt/core/misc/proto/protobuf_helpers.proto";  option go_package = "a.yandex-team.ru/yt/go/proto/client/chunk_client"; @@ -91,6 +92,9 @@ message TMiscExt      // If present, chunk is a 
compression dictionary of respective policy.      optional int32 dictionary_compression_policy = 24; // EDictionaryCompressionPolicy +    // If present, chunk values are compressed with dictionary of respective id. +    optional NYT.NProto.TGuid compression_dictionary_id = 25; +      reserved 20;  } diff --git a/yt/yt_proto/yt/client/table_chunk_format/proto/chunk_meta.proto b/yt/yt_proto/yt/client/table_chunk_format/proto/chunk_meta.proto index 875f93c8194..6518287f9d3 100644 --- a/yt/yt_proto/yt/client/table_chunk_format/proto/chunk_meta.proto +++ b/yt/yt_proto/yt/client/table_chunk_format/proto/chunk_meta.proto @@ -294,6 +294,7 @@ message THunkChunkRef      optional int64 hunk_count = 2;      optional int64 total_hunk_length = 3;      optional int32 erasure_codec = 4; // NErasure::ECodec +    optional NYT.NProto.TGuid compression_dictionary_id = 5;  }  message THunkChunkRefsExt  | 
