#pragma once #include "codecs.h" #include <library/cpp/containers/comptrie/comptrie_trie.h> #include <library/cpp/codecs/greedy_dict/gd_builder.h> #include <util/string/cast.h> #include <util/string/escape.h> namespace NCodecs { // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. struct TVarIntTraits { static const size_t MAX_VARINT32_BYTES = 5; static void Write(ui32 value, TBuffer& b) { while (value > 0x7F) { b.Append(static_cast<ui8>(value) | 0x80); value >>= 7; } b.Append(static_cast<ui8>(value) & 0x7F); } static void Read(TStringBuf& r, ui32& value) { ui32 result = 0; for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) { const ui32 b = static_cast<ui8>(r[0]); r.Skip(1); result |= static_cast<ui32>(b & 0x7F) << (7 * count); if (!(b & 0x80)) { value = result; return; } else if (Y_UNLIKELY(r.empty())) { break; } } Y_ENSURE_EX(false, TCodecException() << "Bad data"); } }; struct TShortIntTraits { static const size_t SHORTINT_SIZE_LIMIT = 0x8000; Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) { Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method"); if (value >= 0x80) { b.Append(static_cast<ui8>(value >> 8) | 0x80); } b.Append(static_cast<ui8>(value)); } Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { ui32 result = static_cast<ui8>(r[0]); r.Skip(1); if (result >= 0x80) { Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data"); result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); r.Skip(1); } value = result; } }; class TSolarCodec: public ICodec { public: static TStringBuf MyName8k() { return TStringBuf("solar-8k"); } static TStringBuf MyName16k() { return TStringBuf("solar-16k"); } static TStringBuf MyName32k() { return TStringBuf("solar-32k"); } static TStringBuf MyName64k() { return TStringBuf("solar-64k"); } static TStringBuf MyName256k() { return TStringBuf("solar-256k"); } static TStringBuf MyName() { return TStringBuf("solar"); } static TStringBuf MyName8kAdapt() { return TStringBuf("solar-8k-a"); } static TStringBuf MyName16kAdapt() { return TStringBuf("solar-16k-a"); } static TStringBuf MyName32kAdapt() { return TStringBuf("solar-32k-a"); } static TStringBuf MyName64kAdapt() { return TStringBuf("solar-64k-a"); } static TStringBuf MyName256kAdapt() { return TStringBuf("solar-256k-a"); } static TStringBuf MyNameShortInt() { return TStringBuf("solar-si"); } explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) : Settings(s) , MaxEntries(maxentries) , MaxIterations(maxiter) { MyTraits.NeedsTraining = true; MyTraits.SizeOnDecodeMultiplier = 2; MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; } ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { EncodeImpl<TVarIntTraits>(r, b); return 0; } void Decode(TStringBuf r, TBuffer& b) const override { DecodeImpl<TVarIntTraits>(r, b); } TString GetName() const override { return ToString(MyName()); } protected: void DoLearn(ISequenceReader&) override; void Save(IOutputStream*) const override; void Load(IInputStream*) override; Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const { return TStringBuf(Pool.Data() + begoff, endoff - begoff); } Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const { return SubStr(Decoder[num - 1], Decoder[num]); } template <class TTraits> Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const { b.Clear(); b.Reserve(r.size()); while (!r.empty()) { size_t sz = 0; ui32 val = (ui32)-1; Encoder.FindLongestPrefix(r, &sz, &val); TTraits::Write(val + 1, b); r.Skip(Max<size_t>(sz, 1)); } } template <class TTraits> Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { b.Clear(); b.Reserve(r.size()); ui32 v = 0; while (!r.empty()) { TTraits::Read(r, v); TStringBuf s = DoDecode(v); b.Append(s.data(), s.size()); } } inline bool CanUseShortInt() const { return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; } private: typedef TCompactTrie<char, ui32> TEncoder; typedef TVector<ui32> TDecoder; TBuffer Pool; TEncoder Encoder; TDecoder Decoder; NGreedyDict::TBuildSettings Settings; ui32 MaxEntries; ui32 MaxIterations; }; // Uses varints or shortints depending on the decoder size class TAdaptiveSolarCodec: public TSolarCodec { public: explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) : TSolarCodec(maxentries, maxiter, s) { } ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { if (CanUseShortInt()) { EncodeImpl<TShortIntTraits>(r, b); } else { EncodeImpl<TVarIntTraits>(r, b); } return 0; } void Decode(TStringBuf r, TBuffer& b) const override { if (CanUseShortInt()) { DecodeImpl<TShortIntTraits>(r, b); } else { DecodeImpl<TVarIntTraits>(r, b); } } TString GetName() const override { if (CanUseShortInt()) { return ToString(MyNameShortInt()); } else { return ToString(MyName()); } } }; class TSolarCodecShortInt: public TSolarCodec { public: explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) : TSolarCodec(maxentries, maxiter, s) { } ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { EncodeImpl<TShortIntTraits>(r, b); return 0; } void Decode(TStringBuf r, TBuffer& b) const override { DecodeImpl<TShortIntTraits>(r, b); } TString GetName() const override { return ToString(MyNameShortInt()); } protected: void Load(IInputStream* in) override { TSolarCodec::Load(in); Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data"); } }; }