diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/codecs/solar_codec.h | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs/solar_codec.h')
-rw-r--r-- | library/cpp/codecs/solar_codec.h | 370 |
1 files changed, 185 insertions, 185 deletions
diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h index 7158ae7926..08fdf9d123 100644 --- a/library/cpp/codecs/solar_codec.h +++ b/library/cpp/codecs/solar_codec.h @@ -11,234 +11,234 @@ namespace NCodecs { // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. - struct TVarIntTraits { - static const size_t MAX_VARINT32_BYTES = 5; - - static void Write(ui32 value, TBuffer& b) { - while (value > 0x7F) { - b.Append(static_cast<ui8>(value) | 0x80); - value >>= 7; - } - b.Append(static_cast<ui8>(value) & 0x7F); - } - - static void Read(TStringBuf& r, ui32& value) { - ui32 result = 0; - for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) { - const ui32 b = static_cast<ui8>(r[0]); - r.Skip(1); - result |= static_cast<ui32>(b & 0x7F) << (7 * count); - if (!(b & 0x80)) { - value = result; - return; + struct TVarIntTraits { + static const size_t MAX_VARINT32_BYTES = 5; + + static void Write(ui32 value, TBuffer& b) { + while (value > 0x7F) { + b.Append(static_cast<ui8>(value) | 0x80); + value >>= 7; + } + b.Append(static_cast<ui8>(value) & 0x7F); + } + + static void Read(TStringBuf& r, ui32& value) { + ui32 result = 0; + for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) { + const ui32 b = static_cast<ui8>(r[0]); + r.Skip(1); + result |= static_cast<ui32>(b & 0x7F) << (7 * count); + if (!(b & 0x80)) { + value = result; + return; } else if (Y_UNLIKELY(r.empty())) { - break; - } + break; + } } - Y_ENSURE_EX(false, TCodecException() << "Bad data"); + Y_ENSURE_EX(false, TCodecException() << "Bad data"); } - }; + }; - struct TShortIntTraits { - static const size_t SHORTINT_SIZE_LIMIT = 0x8000; + struct TShortIntTraits { + static const size_t SHORTINT_SIZE_LIMIT = 0x8000; - Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) { - Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method"); - if (value >= 0x80) { - b.Append(static_cast<ui8>(value >> 8) | 0x80); - } - b.Append(static_cast<ui8>(value)); + Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) { + Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method"); + if (value >= 0x80) { + b.Append(static_cast<ui8>(value >> 8) | 0x80); + } + b.Append(static_cast<ui8>(value)); } - Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { - ui32 result = static_cast<ui8>(r[0]); + Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { + ui32 result = static_cast<ui8>(r[0]); r.Skip(1); - if (result >= 0x80) { + if (result >= 0x80) { Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data"); - result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); - r.Skip(1); - } - value = result; + result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); + r.Skip(1); + } + value = result; } - }; + }; - class TSolarCodec: public ICodec { - public: - static TStringBuf MyName8k() { + class TSolarCodec: public ICodec { + public: + static TStringBuf MyName8k() { return TStringBuf("solar-8k"); - } - static TStringBuf MyName16k() { + } + static TStringBuf MyName16k() { return TStringBuf("solar-16k"); - } - static TStringBuf MyName32k() { + } + static TStringBuf MyName32k() { return TStringBuf("solar-32k"); - } - static TStringBuf MyName64k() { + } + static TStringBuf MyName64k() { return TStringBuf("solar-64k"); - } - static TStringBuf MyName256k() { + } + static TStringBuf MyName256k() { return TStringBuf("solar-256k"); - } - static TStringBuf MyName() { + } + static TStringBuf MyName() { return TStringBuf("solar"); - } - static TStringBuf MyName8kAdapt() { + } + static TStringBuf MyName8kAdapt() { return TStringBuf("solar-8k-a"); - } - static TStringBuf MyName16kAdapt() { + } + static TStringBuf MyName16kAdapt() { return TStringBuf("solar-16k-a"); - } - static TStringBuf MyName32kAdapt() { + } + static TStringBuf MyName32kAdapt() { return TStringBuf("solar-32k-a"); - } - static TStringBuf MyName64kAdapt() { + } + static TStringBuf MyName64kAdapt() { return TStringBuf("solar-64k-a"); - } - static TStringBuf MyName256kAdapt() { + } + static TStringBuf MyName256kAdapt() { return TStringBuf("solar-256k-a"); - } - static TStringBuf MyNameShortInt() { + } + static TStringBuf MyNameShortInt() { return TStringBuf("solar-si"); - } - - explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) - : Settings(s) - , MaxEntries(maxentries) - , MaxIterations(maxiter) - { - MyTraits.NeedsTraining = true; - MyTraits.SizeOnDecodeMultiplier = 2; - MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; - } - - ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { - EncodeImpl<TVarIntTraits>(r, b); - return 0; - } - - void Decode(TStringBuf r, TBuffer& b) const override { - DecodeImpl<TVarIntTraits>(r, b); - } - - TString GetName() const override { + } + + explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : Settings(s) + , MaxEntries(maxentries) + , MaxIterations(maxiter) + { + MyTraits.NeedsTraining = true; + MyTraits.SizeOnDecodeMultiplier = 2; + MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + EncodeImpl<TVarIntTraits>(r, b); + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + DecodeImpl<TVarIntTraits>(r, b); + } + + TString GetName() const override { return ToString(MyName()); - } + } - protected: - void DoLearn(ISequenceReader&) override; - void Save(IOutputStream*) const override; - void Load(IInputStream*) override; + protected: + void DoLearn(ISequenceReader&) override; + void Save(IOutputStream*) const override; + void Load(IInputStream*) override; - Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const { - return TStringBuf(Pool.Data() + begoff, endoff - begoff); - } + Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const { + return TStringBuf(Pool.Data() + begoff, endoff - begoff); + } - Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const { - return SubStr(Decoder[num - 1], Decoder[num]); - } + Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const { + return SubStr(Decoder[num - 1], Decoder[num]); + } - template <class TTraits> - Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const { - b.Clear(); + template <class TTraits> + Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const { + b.Clear(); b.Reserve(r.size()); while (!r.empty()) { - size_t sz = 0; - ui32 val = (ui32)-1; - Encoder.FindLongestPrefix(r, &sz, &val); - TTraits::Write(val + 1, b); - r.Skip(Max<size_t>(sz, 1)); - } + size_t sz = 0; + ui32 val = (ui32)-1; + Encoder.FindLongestPrefix(r, &sz, &val); + TTraits::Write(val + 1, b); + r.Skip(Max<size_t>(sz, 1)); + } } - template <class TTraits> - Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { - b.Clear(); + template <class TTraits> + Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { + b.Clear(); b.Reserve(r.size()); - ui32 v = 0; + ui32 v = 0; while (!r.empty()) { - TTraits::Read(r, v); - TStringBuf s = DoDecode(v); + TTraits::Read(r, v); + TStringBuf s = DoDecode(v); b.Append(s.data(), s.size()); - } - } - - inline bool CanUseShortInt() const { - return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; - } - - private: - typedef TCompactTrie<char, ui32> TEncoder; - typedef TVector<ui32> TDecoder; - - TBuffer Pool; - TEncoder Encoder; - TDecoder Decoder; - - NGreedyDict::TBuildSettings Settings; - ui32 MaxEntries; - ui32 MaxIterations; - }; - - // Uses varints or shortints depending on the decoder size - class TAdaptiveSolarCodec: public TSolarCodec { - public: - explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) - : TSolarCodec(maxentries, maxiter, s) - { - } - - ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { - if (CanUseShortInt()) { - EncodeImpl<TShortIntTraits>(r, b); - } else { - EncodeImpl<TVarIntTraits>(r, b); - } - - return 0; - } - - void Decode(TStringBuf r, TBuffer& b) const override { - if (CanUseShortInt()) { - DecodeImpl<TShortIntTraits>(r, b); - } else { - DecodeImpl<TVarIntTraits>(r, b); - } - } - - TString GetName() const override { - if (CanUseShortInt()) { + } + } + + inline bool CanUseShortInt() const { + return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; + } + + private: + typedef TCompactTrie<char, ui32> TEncoder; + typedef TVector<ui32> TDecoder; + + TBuffer Pool; + TEncoder Encoder; + TDecoder Decoder; + + NGreedyDict::TBuildSettings Settings; + ui32 MaxEntries; + ui32 MaxIterations; + }; + + // Uses varints or shortints depending on the decoder size + class TAdaptiveSolarCodec: public TSolarCodec { + public: + explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : TSolarCodec(maxentries, maxiter, s) + { + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + if (CanUseShortInt()) { + EncodeImpl<TShortIntTraits>(r, b); + } else { + EncodeImpl<TVarIntTraits>(r, b); + } + + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + if (CanUseShortInt()) { + DecodeImpl<TShortIntTraits>(r, b); + } else { + DecodeImpl<TVarIntTraits>(r, b); + } + } + + TString GetName() const override { + if (CanUseShortInt()) { return ToString(MyNameShortInt()); - } else { + } else { return ToString(MyName()); - } + } } - }; + }; - class TSolarCodecShortInt: public TSolarCodec { - public: - explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) - : TSolarCodec(maxentries, maxiter, s) - { + class TSolarCodecShortInt: public TSolarCodec { + public: + explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : TSolarCodec(maxentries, maxiter, s) + { } - ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { - EncodeImpl<TShortIntTraits>(r, b); - return 0; - } + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + EncodeImpl<TShortIntTraits>(r, b); + return 0; + } - void Decode(TStringBuf r, TBuffer& b) const override { - DecodeImpl<TShortIntTraits>(r, b); - } + void Decode(TStringBuf r, TBuffer& b) const override { + DecodeImpl<TShortIntTraits>(r, b); + } - TString GetName() const override { + TString GetName() const override { return ToString(MyNameShortInt()); - } - - protected: - void Load(IInputStream* in) override { - TSolarCodec::Load(in); - Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data"); - } - }; + } + + protected: + void Load(IInputStream* in) override { + TSolarCodec::Load(in); + Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data"); + } + }; } |