aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/solar_codec.h
diff options
context:
space:
mode:
author Devtools Arcadia <arcadia-devtools@yandex-team.ru> 2022-02-07 18:08:42 +0300
committer Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> 2022-02-07 18:08:42 +0300
commit 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
tree e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs/solar_codec.h
download ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs/solar_codec.h')
-rw-r--r-- library/cpp/codecs/solar_codec.h 244
1 files changed, 244 insertions, 0 deletions
diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h
new file mode 100644
index 0000000000..7158ae7926
--- /dev/null
+++ b/library/cpp/codecs/solar_codec.h
@@ -0,0 +1,244 @@
+#pragma once
+
+#include "codecs.h"
+#include <library/cpp/containers/comptrie/comptrie_trie.h>
+#include <library/cpp/codecs/greedy_dict/gd_builder.h>
+
+#include <util/string/cast.h>
+#include <util/string/escape.h>
+
+namespace NCodecs {
+ // TODO: Try adding the suffixes of the mined words to the dictionary together with the words themselves.
+ // TODO: This might buy robustness to small changes in the corpus without losing too much compression.
+
+ // Serializes ui32 values as variable-length integers: 7 payload bits per byte,
+ // least significant group first, with bit 0x80 set on every byte except the last.
+ struct TVarIntTraits {
+     // Upper bound on the number of bytes Read() consumes for a single value.
+     static const size_t MAX_VARINT32_BYTES = 5;
+
+     // Appends the variable-length encoding of `value` to `b`.
+     static void Write(ui32 value, TBuffer& b) {
+         while (value > 0x7F) {
+             b.Append(static_cast<ui8>(value) | 0x80);
+             value >>= 7;
+         }
+         b.Append(static_cast<ui8>(value) & 0x7F);
+     }
+
+     // Reads one value from the front of `r` into `value` and advances `r` past
+     // the bytes consumed.
+     //
+     // Throws TCodecException ("Bad data") when `r` is empty on entry, when the
+     // input ends right after a byte with the continuation bit set, or when
+     // MAX_VARINT32_BYTES bytes in a row all carry the continuation bit.
+     // `value` is not modified when the call throws; `r` may already have been
+     // advanced.
+     static void Read(TStringBuf& r, ui32& value) {
+         ui32 result = 0;
+         // The emptiness test sits in the loop condition so that it also covers
+         // the very first byte: an empty `r` is reported as bad data instead of
+         // being indexed out of bounds.
+         for (ui32 count = 0; count < MAX_VARINT32_BYTES && !r.empty(); ++count) {
+             const ui32 b = static_cast<ui8>(r[0]);
+             r.Skip(1);
+             // Payload bits of the fifth byte that do not fit into 32 bits are
+             // dropped by the shift.
+             result |= static_cast<ui32>(b & 0x7F) << (7 * count);
+             if (!(b & 0x80)) {
+                 value = result;
+                 return;
+             }
+         }
+         Y_ENSURE_EX(false, TCodecException() << "Bad data");
+     }
+ };
+
+ // Serializes values below SHORTINT_SIZE_LIMIT in one or two bytes: values below
+ // 0x80 take a single byte; larger values take two bytes, high byte first with
+ // bit 0x80 set as the "two-byte" marker.
+ struct TShortIntTraits {
+     // Exclusive upper bound of the values Write() accepts.
+     static const size_t SHORTINT_SIZE_LIMIT = 0x8000;
+
+     // Appends the encoding of `value` to `b`.
+     // Throws TCodecException ("Bad write method") if `value` is not below
+     // SHORTINT_SIZE_LIMIT.
+     Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) {
+         Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method");
+         if (value >= 0x80) {
+             b.Append(static_cast<ui8>(value >> 8) | 0x80);
+         }
+         b.Append(static_cast<ui8>(value));
+     }
+
+     // Reads one value from the front of `r` into `value` and advances `r` past
+     // the bytes consumed.
+     // Throws TCodecException ("Bad data") when `r` is empty on entry or ends
+     // after the first byte of a two-byte value; `value` is not modified then.
+     Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {
+         // Guard the first byte the same way the second one is guarded below,
+         // so that an empty buffer is never indexed.
+         Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");
+         ui32 result = static_cast<ui8>(r[0]);
+         r.Skip(1);
+         if (result >= 0x80) {
+             Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");
+             // The mask strips the marker bit from the high byte.
+             result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]);
+             r.Skip(1);
+         }
+         value = result;
+     }
+ };
+
+ // Dictionary-based codec. Encoding repeatedly looks up the longest prefix of the
+ // remaining input in the `Encoder` trie and writes the code found there plus one
+ // as a varint. Decoding reads the codes back and appends, for each code `num`,
+ // the `Pool` bytes in [Decoder[num - 1], Decoder[num]).
+ // The codec needs training (MyTraits.NeedsTraining is set in the constructor).
+ class TSolarCodec: public ICodec {
+ public:
+     // Codec name constants. Only MyName() and MyNameShortInt() are used within
+     // this header.
+     static TStringBuf MyName8k() {
+         return TStringBuf("solar-8k");
+     }
+     static TStringBuf MyName16k() {
+         return TStringBuf("solar-16k");
+     }
+     static TStringBuf MyName32k() {
+         return TStringBuf("solar-32k");
+     }
+     static TStringBuf MyName64k() {
+         return TStringBuf("solar-64k");
+     }
+     static TStringBuf MyName256k() {
+         return TStringBuf("solar-256k");
+     }
+     static TStringBuf MyName() {
+         return TStringBuf("solar");
+     }
+     static TStringBuf MyName8kAdapt() {
+         return TStringBuf("solar-8k-a");
+     }
+     static TStringBuf MyName16kAdapt() {
+         return TStringBuf("solar-16k-a");
+     }
+     static TStringBuf MyName32kAdapt() {
+         return TStringBuf("solar-32k-a");
+     }
+     static TStringBuf MyName64kAdapt() {
+         return TStringBuf("solar-64k-a");
+     }
+     static TStringBuf MyName256kAdapt() {
+         return TStringBuf("solar-256k-a");
+     }
+     static TStringBuf MyNameShortInt() {
+         return TStringBuf("solar-si");
+     }
+
+     // Stores the build parameters and fills in the codec traits; the recommended
+     // sample size is derived from `maxentries`, `maxiter` and `s.GrowLimit`.
+     explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+         : Settings(s)
+         , MaxEntries(maxentries)
+         , MaxIterations(maxiter)
+     {
+         MyTraits.NeedsTraining = true;
+         MyTraits.SizeOnDecodeMultiplier = 2;
+         MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;
+     }
+
+     // Encodes `r` into `b` (previous contents of `b` are discarded) using varints.
+     // Always returns 0.
+     ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+         EncodeImpl<TVarIntTraits>(r, b);
+         return 0;
+     }
+
+     // Decodes varint-encoded `r` into `b` (previous contents of `b` are discarded).
+     // Throws TCodecException ("Bad data") on malformed input.
+     void Decode(TStringBuf r, TBuffer& b) const override {
+         DecodeImpl<TVarIntTraits>(r, b);
+     }
+
+     TString GetName() const override {
+         return ToString(MyName());
+     }
+
+ protected:
+     void DoLearn(ISequenceReader&) override;
+     void Save(IOutputStream*) const override;
+     void Load(IInputStream*) override;
+
+     // Returns the view of `Pool` between byte offsets `begoff` and `endoff`.
+     // The offsets themselves are not validated here.
+     Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const {
+         return TStringBuf(Pool.Data() + begoff, endoff - begoff);
+     }
+
+     // Returns the dictionary entry for code `num`.
+     // Entry `num` is delimited by Decoder[num - 1] and Decoder[num], so the only
+     // valid codes are 1 .. Decoder.size() - 1. Codes come straight from the
+     // encoded input; anything outside that range (including 0, which EncodeImpl
+     // writes when nothing matched) is rejected with TCodecException ("Bad data")
+     // instead of indexing `Decoder` out of bounds.
+     Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const {
+         Y_ENSURE_EX(num > 0 && num < Decoder.size(), TCodecException() << "Bad data");
+         return SubStr(Decoder[num - 1], Decoder[num]);
+     }
+
+     // Encodes `r` into `b` with integers serialized by TTraits::Write.
+     // `b` is cleared first.
+     template <class TTraits>
+     Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const {
+         b.Clear();
+         b.Reserve(r.size());
+         while (!r.empty()) {
+             size_t sz = 0;
+             // Sentinel: if the lookup leaves `val` untouched (presumably what
+             // happens when no prefix is found - TODO confirm against
+             // TCompactTrie), `val + 1` wraps to 0 and code 0 is written.
+             ui32 val = static_cast<ui32>(-1);
+             Encoder.FindLongestPrefix(r, &sz, &val);
+             TTraits::Write(val + 1, b);
+             // Always consume at least one byte so the loop terminates even
+             // when the matched length is 0.
+             r.Skip(Max<size_t>(sz, 1));
+         }
+     }
+
+     // Decodes `r` into `b` with integers parsed by TTraits::Read.
+     // `b` is cleared first. Throws TCodecException ("Bad data") if a code cannot
+     // be parsed or does not refer to a dictionary entry.
+     template <class TTraits>
+     Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {
+         b.Clear();
+         b.Reserve(r.size());
+         ui32 v = 0;
+         while (!r.empty()) {
+             TTraits::Read(r, v);
+             const TStringBuf s = DoDecode(v);
+             b.Append(s.data(), s.size());
+         }
+     }
+
+     // True when every valid code (at most Decoder.size() - 1) is below
+     // TShortIntTraits::SHORTINT_SIZE_LIMIT and thus fits the short-int format.
+     inline bool CanUseShortInt() const {
+         return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;
+     }
+
+ private:
+     typedef TCompactTrie<char, ui32> TEncoder;
+     typedef TVector<ui32> TDecoder;
+
+     TBuffer Pool;     // bytes of the dictionary entries, addressed by Decoder offsets
+     TEncoder Encoder; // maps an input prefix to (code - 1)
+     TDecoder Decoder; // Pool offsets; entry `num` spans [Decoder[num - 1], Decoder[num])
+
+     NGreedyDict::TBuildSettings Settings;
+     ui32 MaxEntries;
+     ui32 MaxIterations;
+ };
+
+ // Solar codec that chooses the integer format from the decoder size: the
+ // short-int format whenever CanUseShortInt() holds, varints otherwise.
+ class TAdaptiveSolarCodec: public TSolarCodec {
+ public:
+     explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+         : TSolarCodec(maxentries, maxiter, s)
+     {
+     }
+
+     // Encodes `r` into `b` in the format selected by CanUseShortInt().
+     // Always returns 0.
+     ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+         if (!CanUseShortInt()) {
+             EncodeImpl<TVarIntTraits>(r, b);
+             return 0;
+         }
+
+         EncodeImpl<TShortIntTraits>(r, b);
+         return 0;
+     }
+
+     // Decodes `r` into `b`, expecting the format selected by CanUseShortInt().
+     void Decode(TStringBuf r, TBuffer& b) const override {
+         if (!CanUseShortInt()) {
+             DecodeImpl<TVarIntTraits>(r, b);
+             return;
+         }
+
+         DecodeImpl<TShortIntTraits>(r, b);
+     }
+
+     // Reports the name matching the format currently in use.
+     TString GetName() const override {
+         return ToString(CanUseShortInt() ? MyNameShortInt() : MyName());
+     }
+ };
+
+ // Solar codec that unconditionally uses the short-int integer format.
+ class TSolarCodecShortInt: public TSolarCodec {
+ public:
+     explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+         : TSolarCodec(maxentries, maxiter, s)
+     {
+     }
+
+     TString GetName() const override {
+         return ToString(MyNameShortInt());
+     }
+
+     // Decodes short-int encoded `r` into `b`.
+     void Decode(TStringBuf r, TBuffer& b) const override {
+         DecodeImpl<TShortIntTraits>(r, b);
+     }
+
+     // Encodes `r` into `b` with short ints. Always returns 0.
+     ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+         EncodeImpl<TShortIntTraits>(r, b);
+         return 0;
+     }
+
+ protected:
+     // Loads the codec through the base class, then rejects the loaded state with
+     // TCodecException ("Bad data") if it does not satisfy CanUseShortInt().
+     void Load(IInputStream* in) override {
+         TSolarCodec::Load(in);
+         Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data");
+     }
+ };
+
+}