aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/solar_codec.h
blob: e6c0b891ad7cc863b4bafc2a93f27c4e2fc606d9 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#pragma once 
 
#include "codecs.h" 
#include <library/cpp/containers/comptrie/comptrie_trie.h>
#include <library/cpp/codecs/greedy_dict/gd_builder.h>
 
#include <util/string/cast.h>
#include <util/string/escape.h> 
 
namespace NCodecs { 
    // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. 
    // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. 
 
    struct TVarIntTraits {
        static const size_t MAX_VARINT32_BYTES = 5;

        static void Write(ui32 value, TBuffer& b) {
            while (value > 0x7F) {
                b.Append(static_cast<ui8>(value) | 0x80);
                value >>= 7;
            }
            b.Append(static_cast<ui8>(value) & 0x7F);
        }

        static void Read(TStringBuf& r, ui32& value) {
            ui32 result = 0;
            for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) {
                const ui32 b = static_cast<ui8>(r[0]);
                r.Skip(1);
                result |= static_cast<ui32>(b & 0x7F) << (7 * count);
                if (!(b & 0x80)) {
                    value = result;
                    return;
                } else if (Y_UNLIKELY(r.empty())) {
                    break;
                }
            }
            Y_ENSURE_EX(false, TCodecException() << "Bad data");
        }
    };

    struct TShortIntTraits {
        static const size_t SHORTINT_SIZE_LIMIT = 0x8000;

        Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) {
            Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method");
            if (value >= 0x80) {
                b.Append(static_cast<ui8>(value >> 8) | 0x80);
            }
            b.Append(static_cast<ui8>(value));
        }

        Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {
            ui32 result = static_cast<ui8>(r[0]);
            r.Skip(1); 
            if (result >= 0x80) {
                Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");
                result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]);
                r.Skip(1);
            }
            value = result;
        }
    };

    class TSolarCodec: public ICodec {
    public:
        static TStringBuf MyName8k() {
            return TStringBuf("solar-8k");
        }
        static TStringBuf MyName16k() {
            return TStringBuf("solar-16k");
        }
        static TStringBuf MyName32k() {
            return TStringBuf("solar-32k");
        }
        static TStringBuf MyName64k() {
            return TStringBuf("solar-64k");
        }
        static TStringBuf MyName256k() {
            return TStringBuf("solar-256k");
        }
        static TStringBuf MyName() {
            return TStringBuf("solar");
        }
        static TStringBuf MyName8kAdapt() {
            return TStringBuf("solar-8k-a");
        }
        static TStringBuf MyName16kAdapt() {
            return TStringBuf("solar-16k-a");
        }
        static TStringBuf MyName32kAdapt() {
            return TStringBuf("solar-32k-a");
        }
        static TStringBuf MyName64kAdapt() {
            return TStringBuf("solar-64k-a");
        }
        static TStringBuf MyName256kAdapt() {
            return TStringBuf("solar-256k-a");
        }
        static TStringBuf MyNameShortInt() {
            return TStringBuf("solar-si");
        }
 
        explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
            : Settings(s)
            , MaxEntries(maxentries)
            , MaxIterations(maxiter)
        {
            MyTraits.NeedsTraining = true;
            MyTraits.SizeOnDecodeMultiplier = 2;
            MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;
        }
 
        ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
            EncodeImpl<TVarIntTraits>(r, b);
            return 0;
        }

        void Decode(TStringBuf r, TBuffer& b) const override {
            DecodeImpl<TVarIntTraits>(r, b);
        }

        TString GetName() const override {
            return ToString(MyName());
        }

    protected:
        void DoLearn(ISequenceReader&) override;
        void Save(IOutputStream*) const override;
        void Load(IInputStream*) override;

        Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const {
            return TStringBuf(Pool.Data() + begoff, endoff - begoff);
        }

        Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const {
            return SubStr(Decoder[num - 1], Decoder[num]);
        }

        template <class TTraits>
        Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const {
            b.Clear();
            b.Reserve(r.size());
            while (!r.empty()) {
                size_t sz = 0;
                ui32 val = (ui32)-1;
                Encoder.FindLongestPrefix(r, &sz, &val);
                TTraits::Write(val + 1, b);
                r.Skip(Max<size_t>(sz, 1));
            }
        } 
 
        template <class TTraits>
        Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {
            b.Clear();
            b.Reserve(r.size());
            ui32 v = 0;
            while (!r.empty()) {
                TTraits::Read(r, v);
                TStringBuf s = DoDecode(v);
                b.Append(s.data(), s.size());
            }
        } 
 
        inline bool CanUseShortInt() const {
            return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;
        }
 
    private:
        typedef TCompactTrie<char, ui32> TEncoder;
        typedef TVector<ui32> TDecoder;
 
        TBuffer Pool;
        TEncoder Encoder;
        TDecoder Decoder;
 
        NGreedyDict::TBuildSettings Settings;
        ui32 MaxEntries;
        ui32 MaxIterations;
    };
 
    // Uses varints or shortints depending on the decoder size
    class TAdaptiveSolarCodec: public TSolarCodec {
    public:
        explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
            : TSolarCodec(maxentries, maxiter, s)
        {
        }
 
        ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
            if (CanUseShortInt()) {
                EncodeImpl<TShortIntTraits>(r, b);
            } else {
                EncodeImpl<TVarIntTraits>(r, b);
            }

            return 0;
        }

        void Decode(TStringBuf r, TBuffer& b) const override {
            if (CanUseShortInt()) {
                DecodeImpl<TShortIntTraits>(r, b);
            } else {
                DecodeImpl<TVarIntTraits>(r, b);
            }
        }

        TString GetName() const override {
            if (CanUseShortInt()) {
                return ToString(MyNameShortInt());
            } else {
                return ToString(MyName());
            }
        }
    };

    class TSolarCodecShortInt: public TSolarCodec {
    public:
        explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
            : TSolarCodec(maxentries, maxiter, s)
        {
        }

        ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
            EncodeImpl<TShortIntTraits>(r, b);
            return 0;
        }
 
        void Decode(TStringBuf r, TBuffer& b) const override {
            DecodeImpl<TShortIntTraits>(r, b);
        }

        TString GetName() const override {
            return ToString(MyNameShortInt());
        }

    protected:
        void Load(IInputStream* in) override {
            TSolarCodec::Load(in);
            Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data");
        }
    };

}