diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs/solar_codec.h | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs/solar_codec.h')
-rw-r--r-- | library/cpp/codecs/solar_codec.h | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h new file mode 100644 index 0000000000..7158ae7926 --- /dev/null +++ b/library/cpp/codecs/solar_codec.h @@ -0,0 +1,244 @@ +#pragma once + +#include "codecs.h" +#include <library/cpp/containers/comptrie/comptrie_trie.h> +#include <library/cpp/codecs/greedy_dict/gd_builder.h> + +#include <util/string/cast.h> +#include <util/string/escape.h> + +namespace NCodecs { + // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. + // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. + + struct TVarIntTraits { + static const size_t MAX_VARINT32_BYTES = 5; + + static void Write(ui32 value, TBuffer& b) { + while (value > 0x7F) { + b.Append(static_cast<ui8>(value) | 0x80); + value >>= 7; + } + b.Append(static_cast<ui8>(value) & 0x7F); + } + + static void Read(TStringBuf& r, ui32& value) { + ui32 result = 0; + for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) { + const ui32 b = static_cast<ui8>(r[0]); + r.Skip(1); + result |= static_cast<ui32>(b & 0x7F) << (7 * count); + if (!(b & 0x80)) { + value = result; + return; + } else if (Y_UNLIKELY(r.empty())) { + break; + } + } + Y_ENSURE_EX(false, TCodecException() << "Bad data"); + } + }; + + struct TShortIntTraits { + static const size_t SHORTINT_SIZE_LIMIT = 0x8000; + + Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) { + Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method"); + if (value >= 0x80) { + b.Append(static_cast<ui8>(value >> 8) | 0x80); + } + b.Append(static_cast<ui8>(value)); + } + + Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { + ui32 result = static_cast<ui8>(r[0]); + r.Skip(1); + if (result >= 0x80) { + Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data"); + result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); + r.Skip(1); + } + value = result; + } + }; + + class TSolarCodec: public ICodec { + public: + static TStringBuf MyName8k() { + return TStringBuf("solar-8k"); + } + static TStringBuf MyName16k() { + return TStringBuf("solar-16k"); + } + static TStringBuf MyName32k() { + return TStringBuf("solar-32k"); + } + static TStringBuf MyName64k() { + return TStringBuf("solar-64k"); + } + static TStringBuf MyName256k() { + return TStringBuf("solar-256k"); + } + static TStringBuf MyName() { + return TStringBuf("solar"); + } + static TStringBuf MyName8kAdapt() { + return TStringBuf("solar-8k-a"); + } + static TStringBuf MyName16kAdapt() { + return TStringBuf("solar-16k-a"); + } + static TStringBuf MyName32kAdapt() { + return TStringBuf("solar-32k-a"); + } + static TStringBuf MyName64kAdapt() { + return TStringBuf("solar-64k-a"); + } + static TStringBuf MyName256kAdapt() { + return TStringBuf("solar-256k-a"); + } + static TStringBuf MyNameShortInt() { + return TStringBuf("solar-si"); + } + + explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : Settings(s) + , MaxEntries(maxentries) + , MaxIterations(maxiter) + { + MyTraits.NeedsTraining = true; + MyTraits.SizeOnDecodeMultiplier = 2; + MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + EncodeImpl<TVarIntTraits>(r, b); + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + DecodeImpl<TVarIntTraits>(r, b); + } + + TString GetName() const override { + return ToString(MyName()); + } + + protected: + void DoLearn(ISequenceReader&) override; + void Save(IOutputStream*) const override; + void Load(IInputStream*) override; + + Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const { + return TStringBuf(Pool.Data() + begoff, endoff - begoff); + } + + Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const { + return SubStr(Decoder[num - 1], Decoder[num]); + } + + template <class TTraits> + Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const { + b.Clear(); + b.Reserve(r.size()); + while (!r.empty()) { + size_t sz = 0; + ui32 val = (ui32)-1; + Encoder.FindLongestPrefix(r, &sz, &val); + TTraits::Write(val + 1, b); + r.Skip(Max<size_t>(sz, 1)); + } + } + + template <class TTraits> + Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { + b.Clear(); + b.Reserve(r.size()); + ui32 v = 0; + while (!r.empty()) { + TTraits::Read(r, v); + TStringBuf s = DoDecode(v); + b.Append(s.data(), s.size()); + } + } + + inline bool CanUseShortInt() const { + return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; + } + + private: + typedef TCompactTrie<char, ui32> TEncoder; + typedef TVector<ui32> TDecoder; + + TBuffer Pool; + TEncoder Encoder; + TDecoder Decoder; + + NGreedyDict::TBuildSettings Settings; + ui32 MaxEntries; + ui32 MaxIterations; + }; + + // Uses varints or shortints depending on the decoder size + class TAdaptiveSolarCodec: public TSolarCodec { + public: + explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : TSolarCodec(maxentries, maxiter, s) + { + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + if (CanUseShortInt()) { + EncodeImpl<TShortIntTraits>(r, b); + } else { + EncodeImpl<TVarIntTraits>(r, b); + } + + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + if (CanUseShortInt()) { + DecodeImpl<TShortIntTraits>(r, b); + } else { + DecodeImpl<TVarIntTraits>(r, b); + } + } + + TString GetName() const override { + if (CanUseShortInt()) { + return ToString(MyNameShortInt()); + } else { + return ToString(MyName()); + } + } + }; + + class TSolarCodecShortInt: public TSolarCodec { + public: + explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : TSolarCodec(maxentries, maxiter, s) + { + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + EncodeImpl<TShortIntTraits>(r, b); + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + DecodeImpl<TShortIntTraits>(r, b); + } + + TString GetName() const override { + return ToString(MyNameShortInt()); + } + + protected: + void Load(IInputStream* in) override { + TSolarCodec::Load(in); + Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data"); + } + }; + +} |