aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs')
-rw-r--r--library/cpp/codecs/README.md46
-rw-r--r--library/cpp/codecs/codecs.cpp190
-rw-r--r--library/cpp/codecs/codecs.h259
-rw-r--r--library/cpp/codecs/codecs_registry.cpp226
-rw-r--r--library/cpp/codecs/codecs_registry.h60
-rw-r--r--library/cpp/codecs/comptable_codec.cpp108
-rw-r--r--library/cpp/codecs/comptable_codec.h40
-rw-r--r--library/cpp/codecs/delta_codec.cpp21
-rw-r--r--library/cpp/codecs/delta_codec.h143
-rw-r--r--library/cpp/codecs/float_huffman.cpp333
-rw-r--r--library/cpp/codecs/float_huffman.h50
-rw-r--r--library/cpp/codecs/float_huffman_bench/main.cpp145
-rw-r--r--library/cpp/codecs/float_huffman_bench/ya.make13
-rw-r--r--library/cpp/codecs/greedy_dict/gd_builder.cpp142
-rw-r--r--library/cpp/codecs/greedy_dict/gd_builder.h94
-rw-r--r--library/cpp/codecs/greedy_dict/gd_entry.cpp98
-rw-r--r--library/cpp/codecs/greedy_dict/gd_entry.h103
-rw-r--r--library/cpp/codecs/greedy_dict/gd_stats.h79
-rw-r--r--library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp282
-rw-r--r--library/cpp/codecs/greedy_dict/ut/ya.make9
-rw-r--r--library/cpp/codecs/greedy_dict/ya.make15
-rw-r--r--library/cpp/codecs/huffman_codec.cpp592
-rw-r--r--library/cpp/codecs/huffman_codec.h39
-rw-r--r--library/cpp/codecs/pfor_codec.cpp22
-rw-r--r--library/cpp/codecs/pfor_codec.h211
-rw-r--r--library/cpp/codecs/sample.h89
-rw-r--r--library/cpp/codecs/solar_codec.cpp133
-rw-r--r--library/cpp/codecs/solar_codec.h244
-rw-r--r--library/cpp/codecs/static/README1
-rw-r--r--library/cpp/codecs/static/builder.cpp39
-rw-r--r--library/cpp/codecs/static/builder.h29
-rw-r--r--library/cpp/codecs/static/common.h32
-rw-r--r--library/cpp/codecs/static/example/example.cpp43
-rw-r--r--library/cpp/codecs/static/example/example.h17
-rw-r--r--library/cpp/codecs/static/example/huffman.1467494385.codec_infobin0 -> 385 bytes
-rw-r--r--library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_infobin0 -> 3425 bytes
-rw-r--r--library/cpp/codecs/static/example/ya.make24
-rw-r--r--library/cpp/codecs/static/static.cpp98
-rw-r--r--library/cpp/codecs/static/static.h34
-rw-r--r--library/cpp/codecs/static/static_codec_info.proto17
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.cpp74
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.h75
-rw-r--r--library/cpp/codecs/static/tools/common/ya.make19
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp73
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/ya.make16
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp82
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/ya.make17
-rw-r--r--library/cpp/codecs/static/tools/tests/canondata/result.json6
-rw-r--r--library/cpp/codecs/static/tools/tests/static_codec_tools.py18
-rw-r--r--library/cpp/codecs/static/tools/tests/ya.make20
-rw-r--r--library/cpp/codecs/static/tools/ya.make5
-rw-r--r--library/cpp/codecs/static/ut/builder_ut.cpp57
-rw-r--r--library/cpp/codecs/static/ut/static_ut.cpp27
-rw-r--r--library/cpp/codecs/static/ut/ya.make14
-rw-r--r--library/cpp/codecs/static/ya.make18
-rw-r--r--library/cpp/codecs/tls_cache.cpp4
-rw-r--r--library/cpp/codecs/tls_cache.h100
-rw-r--r--library/cpp/codecs/ut/codecs_ut.cpp1360
-rw-r--r--library/cpp/codecs/ut/float_huffman_ut.cpp237
-rw-r--r--library/cpp/codecs/ut/tls_cache_ut.cpp36
-rw-r--r--library/cpp/codecs/ut/ya.make20
-rw-r--r--library/cpp/codecs/ya.make33
-rw-r--r--library/cpp/codecs/zstd_dict_codec.cpp281
-rw-r--r--library/cpp/codecs/zstd_dict_codec.h38
66 files changed, 6758 insertions, 0 deletions
diff --git a/library/cpp/codecs/README.md b/library/cpp/codecs/README.md
new file mode 100644
index 0000000000..42646ccd97
--- /dev/null
+++ b/library/cpp/codecs/README.md
@@ -0,0 +1,46 @@
+This is a library of compression algorithms with a unified interface and serialization.
+See also library/cpp/codecs/static, where a support for statically compiled dictionaries is implemented.
+
+All algorithms have a common `ICodec` interface (described in codecs.h).
+
+The `ICodec` interface has the following methods:\
+    `virtual ui8 ICodec::Encode (TMemoryRegion, TBuffer&) const;`\
+            - Input - memory region. Output - filled buffer and the rest of the last byte, if it was not filled to the end.\
+    `virtual void ICodec::Decode (TMemoryRegion, TBuffer&) const;`\
+            - Input - memory region. Output - filled buffer.\
+    `virtual void Save (TOutputStream*) const;`\
+            - Serialization.\
+    `virtual void Load (TInputStream*);`\
+            - Deserialization.\
+    `virtual bool NeedsTraining() const;`\
+            - Returns if it is necessary or not to teach this codec.\
+    `virtual bool PaddingBit() const;`\
+            - Returns which values should fill the free bits of the last byte.\
+    `virtual size_t ApproximateSizeOnEncode(size_t sz) const;`\
+            - Returns an approximate estimate of the size of the result after encoding.\
+                    For example could be used for a more accurate preallocation of a buffer.\
+    `virtual size_t ApproximateSizeOnDecode(size_t sz) const;`\
+            - Returns an approximate estimate of the size of the result after decoding.\
+                    For example could be used for a more accurate preallocation of a buffer.\
+    `virtual TString GetName() const;`\
+            - The name of the codec. It is required for registration of the codec in the system of serialization/deserialization.\
+                    For example, it allows you to save information about which combination of codecs was in use (see below).\
+    `virtual void Learn(ISequenceReader*);`\
+            - The interface for teaching codecs that use information about the distribution of data.
+
+In addition, the library has a number of utilities that allow a more flexible use of it.
+
+In the `ICodec` class the following methods are available:\
+    `static TCodecPtr GetInstance(const TString& name);`\
+            - Creation of a codec instance by a symbolic name\
+                    (See `GetName()` above)\
+    `static void Store(TOutputStream*, TCodecPtr p);`\
+            - Serialization of the codec along with its name\
+                    Allows you to save information about which particular combination of codecs was used for encoding,\
+                    and then reassemble the same combination for decoding\
+    `static TCodecPtr Restore(TInputStream* in);`\
+            - Serialization of the codec along with its name\
+    `static TCodecPtr RestoreFromString(TStringBuf data);`\
+            - Loads the codec instance from the string\
+    `static TVector<TString> GetCodecsList();`\
+            - The list of registered codecs
diff --git a/library/cpp/codecs/codecs.cpp b/library/cpp/codecs/codecs.cpp
new file mode 100644
index 0000000000..b17a3156d2
--- /dev/null
+++ b/library/cpp/codecs/codecs.cpp
@@ -0,0 +1,190 @@
+#include "codecs.h"
+#include "tls_cache.h"
+
+#include <util/stream/mem.h>
+
+namespace NCodecs {
+ void ICodec::Store(IOutputStream* out, TCodecPtr p) {
+ if (!p.Get()) {
+ ::Save(out, (ui16)0);
+ return;
+ }
+
+ Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName());
+ const TString& n = p->GetName();
+ Y_VERIFY(n.size() <= Max<ui16>());
+ ::Save(out, (ui16)n.size());
+ out->Write(n.data(), n.size());
+ p->Save(out);
+ }
+
+ TCodecPtr ICodec::Restore(IInputStream* in) {
+ ui16 l = 0;
+ ::Load(in, l);
+
+ if (!l) {
+ return nullptr;
+ }
+
+ TString n;
+ n.resize(l);
+
+ Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException());
+
+ TCodecPtr p = ICodec::GetInstance(n);
+ p->Load(in);
+ p->Trained = true;
+ return p;
+ }
+
+ TCodecPtr ICodec::RestoreFromString(TStringBuf s) {
+ TMemoryInput minp{s.data(), s.size()};
+ return Restore(&minp);
+ }
+
+ TString ICodec::GetNameSafe(TCodecPtr p) {
+ return !p ? TString("none") : p->GetName();
+ }
+
+ ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const {
+ size_t res = Traits().ApproximateSizeOnEncode(in.size());
+ out.Reserve(res);
+ out.Clear();
+
+ if (Pipeline.empty()) {
+ out.Append(in.data(), in.size());
+ return 0;
+ } else if (Pipeline.size() == 1) {
+ return Pipeline.front()->Encode(in, out);
+ }
+
+ ui8 freelastbits = 0;
+
+ auto buffer = TBufferTlsCache::TlsInstance().Item();
+ TBuffer& tmp = buffer.Get();
+ tmp.Reserve(res);
+
+ for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) {
+ if (it != Pipeline.begin()) {
+ tmp.Clear();
+ tmp.Swap(out);
+ in = TStringBuf{tmp.data(), tmp.size()};
+ }
+ freelastbits = (*it)->Encode(in, out);
+ }
+
+ return freelastbits;
+ }
+
+ void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const {
+ size_t res = Traits().ApproximateSizeOnDecode(in.size());
+ out.Reserve(res);
+ out.Clear();
+
+ if (Pipeline.empty()) {
+ out.Append(in.data(), in.size());
+ return;
+ } else if (Pipeline.size() == 1) {
+ Pipeline.front()->Decode(in, out);
+ return;
+ }
+
+ auto buffer = TBufferTlsCache::TlsInstance().Item();
+
+ TBuffer& tmp = buffer.Get();
+ tmp.Reserve(res);
+
+ for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) {
+ if (it != Pipeline.rbegin()) {
+ tmp.Clear();
+ tmp.Swap(out);
+ in = TStringBuf{tmp.data(), tmp.size()};
+ }
+ (*it)->Decode(in, out);
+ }
+ }
+
+ void TPipelineCodec::Save(IOutputStream* out) const {
+ for (const auto& it : Pipeline)
+ it->Save(out);
+ }
+
+ void TPipelineCodec::Load(IInputStream* in) {
+ for (const auto& it : Pipeline) {
+ it->Load(in);
+ it->SetTrained(true);
+ }
+ }
+
+ void TPipelineCodec::SetTrained(bool t) {
+ for (const auto& it : Pipeline) {
+ it->SetTrained(t);
+ }
+ }
+
+ TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) {
+ if (!codec)
+ return *this;
+
+ TCodecTraits tr = codec->Traits();
+
+ if (!MyName) {
+ MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput;
+ MyTraits.SizeOfInputElement = tr.SizeOfInputElement;
+ } else {
+ MyName.append(':');
+ }
+
+ MyName.append(codec->GetName());
+ MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping;
+ MyTraits.PaddingBit = tr.PaddingBit;
+ MyTraits.NeedsTraining |= tr.NeedsTraining;
+ MyTraits.Irreversible |= tr.Irreversible;
+ MyTraits.SizeOnEncodeAddition = MyTraits.SizeOnEncodeAddition * tr.SizeOnEncodeMultiplier + tr.SizeOnEncodeAddition;
+ MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier;
+ MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier;
+ MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize);
+
+ Pipeline.push_back(codec);
+ return *this;
+ }
+
+ void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) {
+ if (!Traits().NeedsTraining) {
+ return;
+ }
+
+ if (Pipeline.size() == 1) {
+ Pipeline.back()->Learn(in);
+ return;
+ }
+
+ TVector<TBuffer> trainingInput;
+
+ TStringBuf r;
+ while (in.NextRegion(r)) {
+ trainingInput.emplace_back(r.data(), r.size());
+ }
+
+ TBuffer buff;
+ for (const auto& it : Pipeline) {
+ it->LearnX(trainingInput.begin(), trainingInput.end(), sampleSizeMult);
+
+ for (auto& bit : trainingInput) {
+ buff.Clear();
+ it->Encode(TStringBuf{bit.data(), bit.size()}, buff);
+ buff.Swap(bit);
+ }
+ }
+ }
+
+ bool TPipelineCodec::AlreadyTrained() const {
+ for (const auto& it : Pipeline) {
+ if (!it->AlreadyTrained())
+ return false;
+ }
+
+ return true;
+ }
+
+}
diff --git a/library/cpp/codecs/codecs.h b/library/cpp/codecs/codecs.h
new file mode 100644
index 0000000000..cc5e72b285
--- /dev/null
+++ b/library/cpp/codecs/codecs.h
@@ -0,0 +1,259 @@
+#pragma once
+
+#include "sample.h"
+
+#include <util/generic/bt_exception.h>
+#include <util/generic/hash.h>
+#include <util/generic/ptr.h>
+#include <util/generic/singleton.h>
+
+#include <util/stream/input.h>
+#include <util/stream/output.h>
+
+#include <util/string/cast.h>
+#include <util/string/vector.h>
+#include <util/system/tls.h>
+#include <util/ysaveload.h>
+
+namespace NCodecs {
+ class TCodecException: public TWithBackTrace<yexception> {};
+
+ class ICodec;
+
+ using TCodecPtr = TIntrusivePtr<ICodec>;
+ using TCodecConstPtr = TIntrusiveConstPtr<ICodec>;
+
+ struct TCodecTraits {
+ ui32 RecommendedSampleSize = 0;
+ ui16 SizeOfInputElement = 1;
+ ui8 SizeOnEncodeMultiplier = 1;
+ ui8 SizeOnEncodeAddition = 0;
+ ui8 SizeOnDecodeMultiplier = 1;
+
+ bool NeedsTraining = false;
+ bool PreservesPrefixGrouping = false;
+ bool Irreversible = false;
+ bool PaddingBit = 0;
+ bool AssumesStructuredInput = false;
+
+ size_t ApproximateSizeOnEncode(size_t sz) const {
+ return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition;
+ }
+
+ size_t ApproximateSizeOnDecode(size_t sz) const {
+ return sz * SizeOnDecodeMultiplier;
+ }
+ };
+
+ class ICodec: public TAtomicRefCount<ICodec> {
+ protected:
+ bool Trained = false;
+ TCodecTraits MyTraits;
+
+ public:
+ TCodecTraits Traits() const {
+ return MyTraits;
+ }
+
+ // the name of the codec (or its variant) to be used in the codec registry
+ virtual TString GetName() const = 0;
+
+ virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0;
+ virtual ui8 Encode(const TBuffer& input, TBuffer& output) const {
+ return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output);
+ }
+ virtual void Decode(TStringBuf, TBuffer&) const = 0;
+ virtual void Decode(const TBuffer& input, TBuffer& output) const {
+ Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output);
+ }
+
+ virtual ~ICodec() = default;
+
+ virtual bool AlreadyTrained() const {
+ return !Traits().NeedsTraining || Trained;
+ }
+ virtual void SetTrained(bool t) {
+ Trained = t;
+ }
+
+ bool TryToLearn(ISequenceReader& r) {
+ Trained = DoTryToLearn(r);
+ return Trained;
+ }
+
+ void Learn(ISequenceReader& r) {
+ LearnX(r, 1);
+ }
+
+ template <class TIter>
+ void Learn(TIter beg, TIter end) {
+ Learn(beg, end, IterToStringBuf<TIter>);
+ }
+
+ template <class TIter, class TGetter>
+ void Learn(TIter beg, TIter end, TGetter getter) {
+ auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter);
+ TSimpleSequenceReader<TBuffer> reader{sample};
+ Learn(reader);
+ }
+
+ static TCodecPtr GetInstance(TStringBuf name);
+
+ static TVector<TString> GetCodecsList();
+
+ static TString GetNameSafe(TCodecPtr p);
+
+ static void Store(IOutputStream* out, TCodecPtr p);
+ static TCodecPtr Restore(IInputStream* in);
+ static TCodecPtr RestoreFromString(TStringBuf);
+
+ protected:
+ virtual void DoLearn(ISequenceReader&) = 0;
+
+ virtual bool DoTryToLearn(ISequenceReader& r) {
+ DoLearn(r);
+ return true;
+ }
+
+ // so the pipeline codec will know to adjust the sample for the subcodecs
+ virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) {
+ DoLearn(r);
+ }
+
+ virtual void Save(IOutputStream*) const {
+ }
+ virtual void Load(IInputStream*) {
+ }
+ friend class TPipelineCodec;
+
+ public:
+ // so the pipeline codec will know to adjust the sample for the subcodecs
+ void LearnX(ISequenceReader& r, double sampleSizeMult) {
+ DoLearnX(r, sampleSizeMult);
+ Trained = true;
+ }
+
+ template <class TIter>
+ void LearnX(TIter beg, TIter end, double sampleSizeMult) {
+ auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult);
+ TSimpleSequenceReader<TBuffer> reader{sample};
+ LearnX(reader, sampleSizeMult);
+ }
+ };
+
+ class TBasicTrivialCodec: public ICodec {
+ public:
+ ui8 Encode(TStringBuf in, TBuffer& out) const override {
+ out.Assign(in.data(), in.size());
+ return 0;
+ }
+
+ void Decode(TStringBuf in, TBuffer& out) const override {
+ Encode(in, out);
+ }
+
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
+
+ class TTrivialCodec: public TBasicTrivialCodec {
+ public:
+ TTrivialCodec() {
+ MyTraits.PreservesPrefixGrouping = true;
+ }
+
+ static TStringBuf MyName() {
+ return "trivial";
+ }
+
+ TString GetName() const override {
+ return ToString(MyName());
+ }
+ };
+
+ class TTrivialTrainableCodec: public TBasicTrivialCodec {
+ public:
+ TTrivialTrainableCodec() {
+ MyTraits.PreservesPrefixGrouping = true;
+ MyTraits.NeedsTraining = true;
+ }
+
+ static TStringBuf MyName() {
+ return "trivial-trainable";
+ }
+
+ TString GetName() const override {
+ return ToString(MyName());
+ }
+ };
+
+ class TNullCodec: public ICodec {
+ public:
+ TNullCodec() {
+ MyTraits.Irreversible = true;
+ MyTraits.SizeOnDecodeMultiplier = 0;
+ MyTraits.SizeOnEncodeMultiplier = 0;
+ }
+
+ TString GetName() const override {
+ return "null";
+ }
+
+ ui8 Encode(TStringBuf, TBuffer& out) const override {
+ out.Clear();
+ return 0;
+ }
+
+ void Decode(TStringBuf, TBuffer& out) const override {
+ out.Clear();
+ }
+
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
+
+ class TPipelineCodec: public ICodec {
+ typedef TVector<TCodecPtr> TPipeline;
+
+ TPipeline Pipeline;
+ TString MyName;
+
+ public:
+ explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) {
+ MyTraits.PreservesPrefixGrouping = true;
+ AddCodec(c0);
+ AddCodec(c1);
+ AddCodec(c2);
+ AddCodec(c3);
+ }
+
+ TString GetName() const override {
+ return MyName;
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& out) const override;
+ void Decode(TStringBuf in, TBuffer& out) const override;
+
+ public:
+ /*
+ * Add codecs in the following order:
+ * uncompressed -> codec0 | codec1 | ... | codecN -> compressed
+ */
+ TPipelineCodec& AddCodec(TCodecPtr codec);
+
+ bool AlreadyTrained() const override;
+ void SetTrained(bool t) override;
+
+ protected:
+ void DoLearn(ISequenceReader& in) override {
+ DoLearnX(in, 1);
+ }
+
+ void DoLearnX(ISequenceReader& in, double sampleSizeMult) override;
+ void Save(IOutputStream* out) const override;
+ void Load(IInputStream* in) override;
+ };
+
+}
diff --git a/library/cpp/codecs/codecs_registry.cpp b/library/cpp/codecs/codecs_registry.cpp
new file mode 100644
index 0000000000..6fd7ff67b0
--- /dev/null
+++ b/library/cpp/codecs/codecs_registry.cpp
@@ -0,0 +1,226 @@
+#include "codecs_registry.h"
+#include "delta_codec.h"
+#include "huffman_codec.h"
+#include "pfor_codec.h"
+#include "solar_codec.h"
+#include "comptable_codec.h"
+#include "zstd_dict_codec.h"
+
+#include <library/cpp/blockcodecs/codecs.h>
+
+#include <util/string/builder.h>
+#include <util/string/cast.h>
+
+namespace NCodecs {
+ TCodecPtr ICodec::GetInstance(TStringBuf name) {
+ return Default<NPrivate::TCodecRegistry>().GetCodec(name);
+ }
+
+ TVector<TString> ICodec::GetCodecsList() {
+ return Default<NPrivate::TCodecRegistry>().GetCodecsList();
+ }
+
+ namespace NPrivate {
+ void TCodecRegistry::RegisterFactory(TFactoryPtr fac) {
+ TVector<TString> names = fac->ListNames();
+ for (const auto& name : names) {
+ Y_VERIFY(!Registry.contains(name), "already has %s", name.data());
+ Registry[name] = fac;
+ }
+ }
+
+ TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const {
+ using namespace NPrivate;
+
+ if (!name || "none" == name) {
+ return nullptr;
+ }
+
+ if (TStringBuf::npos == name.find(':')) {
+ Y_ENSURE_EX(Registry.contains(name), TNoCodecException(name));
+ return Registry.find(name)->second->MakeCodec(name);
+ } else {
+ TPipelineCodec* pipe = new TPipelineCodec;
+
+ do {
+ TStringBuf v = name.NextTok(':');
+ pipe->AddCodec(GetCodec(v));
+ } while (name);
+
+ return pipe;
+ }
+ }
+
+ TVector<TString> TCodecRegistry::GetCodecsList() const {
+ using namespace NPrivate;
+ TVector<TString> vs;
+ vs.push_back("none");
+
+ for (const auto& it : Registry) {
+ vs.push_back(it.first);
+ }
+
+ Sort(vs.begin(), vs.end());
+ return vs;
+ }
+
+ struct TSolarCodecFactory : ICodecFactory {
+ TCodecPtr MakeCodec(TStringBuf name) const override {
+ if (TSolarCodec::MyNameShortInt() == name) {
+ return new TSolarCodecShortInt();
+ }
+ if (TSolarCodec::MyName() == name) {
+ return new TSolarCodec();
+ }
+ if (name.EndsWith(TStringBuf("-a"))) {
+ return MakeCodecImpl<TAdaptiveSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size()).Chop(2));
+ } else {
+ return MakeCodecImpl<TSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size()));
+ }
+ }
+
+ template <class TCodecCls>
+ TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const {
+ if (TStringBuf("-8k") == type) {
+ return new TCodecCls(1 << 13);
+ }
+ if (TStringBuf("-16k") == type) {
+ return new TCodecCls(1 << 14);
+ }
+ if (TStringBuf("-32k") == type) {
+ return new TCodecCls(1 << 15);
+ }
+ if (TStringBuf("-64k") == type) {
+ return new TCodecCls(1 << 16);
+ }
+ if (TStringBuf("-256k") == type) {
+ return new TCodecCls(1 << 18);
+ }
+ ythrow TNoCodecException(name);
+ }
+
+ TVector<TString> ListNames() const override {
+ TVector<TString> vs;
+ vs.push_back(ToString(TSolarCodec::MyName()));
+ vs.push_back(ToString(TSolarCodec::MyName8k()));
+ vs.push_back(ToString(TSolarCodec::MyName16k()));
+ vs.push_back(ToString(TSolarCodec::MyName32k()));
+ vs.push_back(ToString(TSolarCodec::MyName64k()));
+ vs.push_back(ToString(TSolarCodec::MyName256k()));
+ vs.push_back(ToString(TSolarCodec::MyName8kAdapt()));
+ vs.push_back(ToString(TSolarCodec::MyName16kAdapt()));
+ vs.push_back(ToString(TSolarCodec::MyName32kAdapt()));
+ vs.push_back(ToString(TSolarCodec::MyName64kAdapt()));
+ vs.push_back(ToString(TSolarCodec::MyName256kAdapt()));
+ vs.push_back(ToString(TSolarCodec::MyNameShortInt()));
+ return vs;
+ }
+ };
+
+ struct TZStdDictCodecFactory : ICodecFactory {
+ TCodecPtr MakeCodec(TStringBuf name) const override {
+ return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name));
+ }
+
+ TVector<TString> ListNames() const override {
+ return TZStdDictCodec::ListCompressionNames();
+ }
+ };
+
+ struct TCompTableCodecFactory : ICodecFactory {
+ TCodecPtr MakeCodec(TStringBuf name) const override {
+ if (TCompTableCodec::MyNameHQ() == name) {
+ return new TCompTableCodec(TCompTableCodec::Q_HIGH);
+ } else if (TCompTableCodec::MyNameLQ() == name) {
+ return new TCompTableCodec(TCompTableCodec::Q_LOW);
+ } else {
+ Y_ENSURE_EX(false, TNoCodecException(name));
+ return nullptr;
+ }
+ }
+
+ TVector<TString> ListNames() const override {
+ TVector<TString> vs;
+ vs.push_back(ToString(TCompTableCodec::MyNameHQ()));
+ vs.push_back(ToString(TCompTableCodec::MyNameLQ()));
+ return vs;
+ }
+ };
+
+ struct TBlockCodec : ICodec {
+ const NBlockCodecs::ICodec* Codec;
+
+ TBlockCodec(TStringBuf name)
+ : Codec(NBlockCodecs::Codec(name))
+ {
+ }
+
+ TString GetName() const override {
+ return ToString(Codec->Name());
+ }
+
+ ui8 Encode(TStringBuf r, TBuffer& b) const override {
+ Codec->Encode(r, b);
+ return 0;
+ }
+
+ void Decode(TStringBuf r, TBuffer& b) const override {
+ // TODO: throws exception that is not TCodecException
+ Codec->Decode(r, b);
+ }
+
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
+
+ struct TBlockCodecsFactory : ICodecFactory {
+ using TRegistry = THashMap<TString, TCodecPtr>;
+ TRegistry Registry;
+
+ TBlockCodecsFactory() {
+ for (TStringBuf codec : NBlockCodecs::ListAllCodecs()) {
+ Register(codec);
+ }
+ }
+
+ void Register(TStringBuf name) {
+ TCodecPtr p = Registry[name] = new TBlockCodec(name);
+ Registry[p->GetName()] = p;
+ }
+
+ TCodecPtr MakeCodec(TStringBuf name) const override {
+ if (!Registry.contains(name)) {
+ ythrow TNoCodecException(name);
+ }
+ return Registry.find(name)->second;
+ }
+
+ TVector<TString> ListNames() const override {
+ TVector<TString> res;
+ for (const auto& it : Registry) {
+ res.push_back(it.first);
+ }
+ return res;
+ }
+ };
+
+ TCodecRegistry::TCodecRegistry() {
+ RegisterFactory(new TInstanceFactory<TTrivialCodec>);
+ RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>);
+ RegisterFactory(new TInstanceFactory<THuffmanCodec>);
+ RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>);
+ RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>);
+ RegisterFactory(new TSolarCodecFactory);
+ RegisterFactory(new TZStdDictCodecFactory);
+ RegisterFactory(new TCompTableCodecFactory);
+ RegisterFactory(new TBlockCodecsFactory);
+ }
+
+ }
+
+ void RegisterCodecFactory(TCodecFactoryPtr fact) {
+ Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact);
+ }
+
+}
diff --git a/library/cpp/codecs/codecs_registry.h b/library/cpp/codecs/codecs_registry.h
new file mode 100644
index 0000000000..53710310d5
--- /dev/null
+++ b/library/cpp/codecs/codecs_registry.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include "codecs.h"
+#include <util/string/cast.h>
+
+namespace NCodecs {
+ struct TNoCodecException : TCodecException {
+ TNoCodecException(TStringBuf name) {
+ (*this) << "unknown codec: " << name;
+ }
+ };
+
+ struct ICodecFactory : TAtomicRefCount<ICodecFactory> {
+ virtual ~ICodecFactory() = default;
+ virtual TCodecPtr MakeCodec(TStringBuf name) const = 0;
+ virtual TVector<TString> ListNames() const = 0;
+ };
+
+ typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr;
+
+ namespace NPrivate {
+ template <typename TCodec>
+ struct TInstanceFactory : ICodecFactory {
+ TCodecPtr MakeCodec(TStringBuf) const override {
+ return new TCodec;
+ }
+
+ TVector<TString> ListNames() const override {
+ TVector<TString> vs;
+ vs.push_back(ToString(TCodec::MyName()));
+ return vs;
+ }
+ };
+
+ class TCodecRegistry {
+ using TRegistry = THashMap<TString, TIntrusivePtr<ICodecFactory>>;
+ TRegistry Registry;
+
+ public:
+ using TFactoryPtr = TIntrusivePtr<ICodecFactory>;
+
+ TCodecRegistry();
+
+ void RegisterFactory(TFactoryPtr fac);
+
+ TCodecPtr GetCodec(TStringBuf name) const;
+
+ TVector<TString> GetCodecsList() const;
+ };
+
+ }
+
+ void RegisterCodecFactory(TCodecFactoryPtr fact);
+
+ template <typename TCodec>
+ void RegisterCodec() {
+ RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>());
+ }
+
+}
diff --git a/library/cpp/codecs/comptable_codec.cpp b/library/cpp/codecs/comptable_codec.cpp
new file mode 100644
index 0000000000..476b8ada80
--- /dev/null
+++ b/library/cpp/codecs/comptable_codec.cpp
@@ -0,0 +1,108 @@
+#include "comptable_codec.h"
+
+#include <library/cpp/comptable/comptable.h>
+#include <util/string/cast.h>
+
+namespace NCodecs {
+ class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> {
+ public:
+ TImpl(EQuality q)
+ : Quality(q)
+ {
+ }
+
+ void Init() {
+ Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table});
+ Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table});
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& out) const {
+ out.Clear();
+ if (!in) {
+ return 0;
+ }
+
+ TVector<char> result;
+ Compressor->Compress(in, &result);
+ out.Assign(&result[0], result.size());
+ return 0;
+ }
+
+ void Decode(TStringBuf in, TBuffer& out) const {
+ out.Clear();
+ if (!in) {
+ return;
+ }
+
+ TVector<char> result;
+ Decompressor->Decompress(in, &result);
+ out.Assign(&result[0], result.size());
+ }
+
+ void DoLearn(ISequenceReader& in) {
+ NCompTable::TDataSampler sampler;
+ TStringBuf region;
+ while (in.NextRegion(region)) {
+ if (!region) {
+ continue;
+ }
+
+ sampler.AddStat(region);
+ }
+
+ sampler.BuildTable(Table);
+ Init();
+ }
+
+ void Save(IOutputStream* out) const {
+ ::Save(out, Table);
+ }
+
+ void Load(IInputStream* in) {
+ ::Load(in, Table);
+ Init();
+ }
+
+ NCompTable::TCompressorTable Table;
+ THolder<NCompTable::TChunkCompressor> Compressor;
+ THolder<NCompTable::TChunkDecompressor> Decompressor;
+ const EQuality Quality;
+ static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5);
+ };
+
+ TCompTableCodec::TCompTableCodec(EQuality q)
+ : Impl(new TImpl{q})
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.SizeOnEncodeMultiplier = 2;
+ MyTraits.SizeOnDecodeMultiplier = 10;
+ MyTraits.RecommendedSampleSize = TImpl::SampleSize;
+ }
+
+ TCompTableCodec::~TCompTableCodec() = default;
+
+ TString TCompTableCodec::GetName() const {
+ return ToString(Impl->Quality ? MyNameHQ() : MyNameLQ());
+ }
+
+ ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const {
+ return Impl->Encode(in, out);
+ }
+
+ void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const {
+ Impl->Decode(in, out);
+ }
+
+ void TCompTableCodec::DoLearn(ISequenceReader& in) {
+ Impl->DoLearn(in);
+ }
+
+ void TCompTableCodec::Save(IOutputStream* out) const {
+ Impl->Save(out);
+ }
+
+ void TCompTableCodec::Load(IInputStream* in) {
+ Impl->Load(in);
+ }
+
+}
diff --git a/library/cpp/codecs/comptable_codec.h b/library/cpp/codecs/comptable_codec.h
new file mode 100644
index 0000000000..7ba4f4c543
--- /dev/null
+++ b/library/cpp/codecs/comptable_codec.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include "codecs.h"
+
+#include <util/generic/ptr.h>
+
+namespace NCodecs {
+ class TCompTableCodec: public ICodec {
+ class TImpl;
+ TIntrusivePtr<TImpl> Impl;
+
+ public:
+ enum EQuality {
+ Q_LOW = 0,
+ Q_HIGH = 1
+ };
+
+ explicit TCompTableCodec(EQuality q = Q_HIGH);
+ ~TCompTableCodec() override;
+
+ static TStringBuf MyNameHQ() {
+ return "comptable-hq";
+ }
+ static TStringBuf MyNameLQ() {
+ return "comptable-lq";
+ }
+
+ TString GetName() const override;
+
+ ui8 Encode(TStringBuf in, TBuffer& out) const override;
+
+ void Decode(TStringBuf in, TBuffer& out) const override;
+
+ protected:
+ void DoLearn(ISequenceReader& in) override;
+ void Save(IOutputStream* out) const override;
+ void Load(IInputStream* in) override;
+ };
+
+}
diff --git a/library/cpp/codecs/delta_codec.cpp b/library/cpp/codecs/delta_codec.cpp
new file mode 100644
index 0000000000..61606d6f6f
--- /dev/null
+++ b/library/cpp/codecs/delta_codec.cpp
@@ -0,0 +1,21 @@
+#include "delta_codec.h"
+
+namespace NCodecs {
+ template <>
+ TStringBuf TDeltaCodec<ui64, true>::MyName() {
+ return "delta64-unsigned";
+ }
+ template <>
+ TStringBuf TDeltaCodec<ui32, true>::MyName() {
+ return "delta32-unsigned";
+ }
+ template <>
+ TStringBuf TDeltaCodec<ui64, false>::MyName() {
+ return "delta64-signed";
+ }
+ template <>
+ TStringBuf TDeltaCodec<ui32, false>::MyName() {
+ return "delta32-signed";
+ }
+
+}
diff --git a/library/cpp/codecs/delta_codec.h b/library/cpp/codecs/delta_codec.h
new file mode 100644
index 0000000000..21325825e6
--- /dev/null
+++ b/library/cpp/codecs/delta_codec.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "codecs.h"
+
+#include <util/generic/array_ref.h>
+#include <util/generic/typetraits.h>
+#include <util/generic/bitops.h>
+#include <util/string/cast.h>
+
+namespace NCodecs {
+ template <typename T = ui64, bool UnsignedDelta = true>
+ class TDeltaCodec: public ICodec {
+ static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value");
+
+ public:
+ using TUnsigned = std::make_unsigned_t<T>;
+ using TSigned = std::make_signed_t<T>;
+ using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>;
+
+ private:
+ const TDelta MinDelta{Min<TDelta>()};
+ const TDelta MaxDelta{Max<TDelta>() - 1};
+ const TDelta InvalidDelta{MaxDelta + 1};
+
+ Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) {
+ return a + b;
+ }
+
+ Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) {
+ return a - b;
+ }
+
+ public:
+ struct TDecoder {
+ const TDelta InvalidDelta{Max<TDelta>()};
+
+ T Last = 0;
+ T Result = 0;
+
+ bool First = true;
+ bool Invalid = false;
+
+ Y_FORCE_INLINE bool Decode(TDelta t) {
+ if (Y_UNLIKELY(First)) {
+ First = false;
+ Result = Last = t;
+ return true;
+ }
+
+ if (Y_UNLIKELY(Invalid)) {
+ Invalid = false;
+ Last = 0;
+ Result = t;
+ return true;
+ }
+
+ Result = (Last += t);
+ Invalid = t == InvalidDelta;
+
+ return !Invalid;
+ }
+ };
+
+ public:
+ static TStringBuf MyName();
+
+ TDeltaCodec() {
+ MyTraits.SizeOfInputElement = sizeof(T);
+ MyTraits.AssumesStructuredInput = true;
+ }
+
+ TString GetName() const override {
+ return ToString(MyName());
+ }
+
+ template <class TItem>
+ static void AppendTo(TBuffer& b, TItem t) {
+ b.Append((char*)&t, sizeof(t));
+ }
+
+ ui8 Encode(TStringBuf s, TBuffer& b) const override {
+ b.Clear();
+ if (s.empty()) {
+ return 0;
+ }
+
+ b.Reserve(s.size());
+ TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)};
+
+ const T* it = tin.begin();
+ TDelta last = *(it++);
+ AppendTo(b, last);
+
+ TDelta maxt = SubSafe(MaxDelta, last);
+ TDelta mint = AddSafe(MinDelta, last);
+
+ for (; it != tin.end(); ++it) {
+ TDelta t = *it;
+
+ if (Y_LIKELY((t >= mint) & (t <= maxt))) {
+ AppendTo(b, t - last);
+ last = t;
+ maxt = SubSafe(MaxDelta, last);
+ mint = AddSafe(MinDelta, last);
+ } else {
+ // delta overflow
+ AppendTo(b, InvalidDelta);
+ AppendTo(b, t);
+ last = 0;
+ maxt = MaxDelta;
+ mint = MinDelta;
+ }
+ }
+
+ return 0;
+ }
+
+ void Decode(TStringBuf s, TBuffer& b) const override {
+ b.Clear();
+ if (s.empty()) {
+ return;
+ }
+
+ b.Reserve(s.size());
+ TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)};
+
+ TDecoder dec;
+
+ for (const T* it = tin.begin(); it != tin.end(); ++it) {
+ T tmp;
+ memcpy(&tmp, it, sizeof(tmp));
+ if (dec.Decode(tmp)) {
+ AppendTo(b, dec.Result);
+ }
+ }
+ }
+
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
+
+}
diff --git a/library/cpp/codecs/float_huffman.cpp b/library/cpp/codecs/float_huffman.cpp
new file mode 100644
index 0000000000..c4a8bd228f
--- /dev/null
+++ b/library/cpp/codecs/float_huffman.cpp
@@ -0,0 +1,333 @@
+#include "float_huffman.h"
+
+#include <util/generic/array_ref.h>
+#include <util/generic/bitops.h>
+#include <util/generic/cast.h>
+#include <util/generic/yexception.h>
+#include <util/system/unaligned_mem.h>
+#include <util/system/yassert.h>
+#include <util/stream/format.h>
+
+namespace NCodecs::NFloatHuff {
+ namespace {
+ struct THuffEntry {
+ ui32 CodeBase;
+ ui16 Prefix;
+ int PrefLength;
+ int CodeLength;
+ int TotalLength;
+ ui32 Mask;
+ ui64 Offset;
+
+ THuffEntry() = default;
+
+ constexpr THuffEntry(ui32 codeBase, ui16 prefix, int prefLength, int codeLength)
+ : CodeBase(codeBase)
+ , Prefix(prefix)
+ , PrefLength(prefLength)
+ , CodeLength(codeLength)
+ , TotalLength(prefLength + codeLength)
+ , Mask(Mask64(codeLength))
+ , Offset(-(ui64(codeBase) << prefLength) + prefix)
+ {}
+
+ bool Fit(ui32 code) const {
+ return code >= CodeBase && code < CodeBase + (1ULL << CodeLength);
+ }
+ };
+
+ // NB. There is a typo in the penultimate line (34 instead of 24). It was there from the very
+ // first commit and we cannot fix it without breaking all the clients.
+ constexpr THuffEntry entries[16] = {
+ {0x00000000, 0x01, 1, 0}, // Only +0.0f, 1 bit, prefix [1]
+ {0x3f800000, 0x0e, 4, 0}, // Only +1.0f, 4 bits, prefix [0111]
+ {0x3f700000, 0x08, 5, 20}, // [0.9375, 1.0), 25 bits, prefix [00010]
+ {0x3f000000, 0x00, 5, 20}, // [0.5, 0.5625), 25 bits, prefx [00000]
+ {0x3f400000, 0x06, 6, 20}, // [0.75, 0.8125), 26 bits, prefix [011000]
+ {0x3f500000, 0x22, 6, 20}, // [0.8125, 0.875), 26 bits, prefix [010001]
+ {0x3f200000, 0x02, 6, 20}, // [0.625, 0.6875), 26 bits, prefix [010000]
+ {0x3f100000, 0x38, 6, 20}, // [0.5625, 0.625), 26 bits, prefix [000111]
+ {0x3f600000, 0x18, 6, 20}, // [0.875, 0.9375), 26 bits, prefix [000110]
+ {0x3f300000, 0x30, 6, 20}, // [0.6875, 0.75), 26 bits, prefix [000011]
+ {0x3e800000, 0x10, 6, 20}, // [0.25, 0.28125), 26 bits, prefix [000010]
+ {0x3e000000, 0x04, 3, 24}, // [0.125, 0.5), 27 bits, prefix [001]
+ {0x3d000000, 0x0a, 4, 24}, // [0.03125, 0.125), 28 bits, prefix [0101]
+ {0x3c000000, 0x12, 5, 24}, // [0.0078125, 0.03125), 29 bits, prefix [01001]
+ {0x3b000000, 0x26, 6, 34}, // [0.001953125, end of range), 40 bits, prefix [011001]
+ {0x00000000, 0x16, 5, 32}, // whole range, 37 bits, prefix [01101]
+ };
+
+ [[noreturn]] Y_NO_INLINE void ThrowInvalidOffset(size_t size, size_t byteOffset) {
+ ythrow yexception() <<
+ "Decompression error: requested decoding 8 bytes past end of input buffer of " << size << " bytes size at position " << byteOffset << ". ";
+ }
+
+ struct THuffInfo {
+ constexpr THuffInfo() {
+ for (size_t i = 0; i < 64; ++i) {
+ bool init = false;
+ for (size_t j = 0; j != 16; ++j) {
+ ui16 prefix = i & Mask64(entries[j].PrefLength);
+ if (entries[j].Prefix == prefix) {
+ init = true;
+ DecodeLookup[i] = entries[j];
+ break;
+ }
+ }
+ Y_ASSERT(init);
+ }
+
+ for (ui32 i = 0; i < (1 << 12); ++i) {
+ // First two entries (+0.0f and +1.0f) are not present in the lookup, they are handled separately
+ for (int value = 2; value < 16; ++value) {
+ if (entries[value].Fit(i << 20)) {
+ EncodeLookup[i] = value;
+ break;
+ }
+ }
+ }
+ }
+
+ std::pair<ui64, int> GetCode(ui32 value) const {
+ // Zeros are handled separately in the main loop
+ Y_ASSERT(value != 0);
+
+ if (value == 0x3f800000) {
+ return {0x0e, 4};
+ }
+
+ const auto& entry = entries[EncodeLookup[value >> 20]];
+
+ return {
+ (ui64(value) << entry.PrefLength) + entry.Offset,
+ entry.TotalLength
+ };
+ }
+
+ THuffEntry DecodeLookup[64];
+ ui8 EncodeLookup[1 << 12];
+ };
+
+ const THuffInfo huffInfo;
+ /// End Of Stream
+ const ui32 EOS = ui32(-1);
+ }
+
+ TString Encode(TArrayRef<const float> factors) {
+ TString result;
+ result.resize((factors.size() + 1) * 40 / 8 + 8, 0); // Max code length is 40 bits
+ int usedBits = 0;
+ ui64 buffer = 0;
+ char* writePtr = result.begin();
+
+ auto writeBits = [&](ui64 code, int size) {
+ const auto bitsToTransfer = Min(size, 64 - usedBits);
+ buffer |= (code << usedBits);
+ usedBits += bitsToTransfer;
+ if (usedBits == 64) {
+ memcpy(writePtr, &buffer, 8);
+ usedBits = size - bitsToTransfer;
+ if (bitsToTransfer != 64) {
+ buffer = code >> bitsToTransfer;
+ } else {
+ buffer = 0;
+ }
+ writePtr += 8;
+ }
+ };
+
+ for (size_t i = 0; i != factors.size();) {
+ if (BitCast<ui32>(factors[i]) == 0) {
+ int zeroCount = 1;
+ for (;;) {
+ ++i;
+ if (i == factors.size() || BitCast<ui32>(factors[i]) != 0) {
+ break;
+ }
+ ++zeroCount;
+ }
+ for (; zeroCount >= 64; zeroCount -= 64) {
+ writeBits(ui64(-1), 64);
+ }
+ writeBits(Mask64(zeroCount), zeroCount);
+ } else {
+ const auto [code, codeSize] = huffInfo.GetCode(BitCast<ui32>(factors[i]));
+ writeBits(code, codeSize);
+ ++i;
+ }
+ }
+ // Write EOS.
+ // We use precomputed constants instead of the following:
+ // auto [code, codeSize] = huffInfo.GetCode(EOS);
+ // writeBits(code, codeSize);
+ writeBits(211527139302, 40);
+ memcpy(writePtr, &buffer, 8);
+ result.resize(writePtr - result.begin() + usedBits / 8 + 8);
+
+ return result;
+ }
+
+ TDecoder::TDecoder(TStringBuf data)
+ : State{
+ .Workspace = data.size() < 8 ? ThrowInvalidOffset(data.size(), 0), 0 : ReadUnaligned<ui64>(data.data()),
+ .WorkspaceSize = 64,
+ .Position = 8,
+ .Data = data
+ }
+ {
+ FillDecodeBuffer();
+ }
+
+ TVector<float> TDecoder::DecodeAll(size_t sizeHint) {
+ TVector<float> result;
+ result.reserve(sizeHint);
+
+ while (Begin != End) {
+ result.insert(result.end(), Begin, End);
+ FillDecodeBuffer();
+ }
+ return result;
+ }
+
+ size_t TDecoder::Decode(TArrayRef<float> dest) {
+ size_t count = 0;
+ while (count < dest.size()) {
+ if (dest.size() - count < size_t(End - Begin)) {
+ const auto size = dest.size() - count;
+ std::copy(Begin, Begin + size, dest.data() + count);
+ Begin += size;
+ return dest.size();
+ } else {
+ std::copy(Begin, End, dest.data() + count);
+ count += End - Begin;
+ FillDecodeBuffer();
+ if (Begin == End) {
+ break;
+ }
+ }
+ }
+ return count;
+ }
+
+ size_t TDecoder::Skip(size_t count) {
+ size_t skippedCount = 0;
+ while (skippedCount < count) {
+ if (count - skippedCount < size_t(End - Begin)) {
+ const auto size = count - skippedCount;
+ Begin += size;
+ return count;
+ } else {
+ skippedCount += End - Begin;
+ FillDecodeBuffer();
+ if (Begin == End) {
+ break;
+ }
+ }
+ }
+ return skippedCount;
+ }
+
+ bool TDecoder::HasMore() const {
+ return Begin != End;
+ }
+
+ void TDecoder::FillDecodeBuffer() {
+ Begin = DecodeBuffer.data();
+ End = DecodeBuffer.data();
+
+ if (HitEos) {
+ return;
+ }
+
+ // This helps to keep most of the variables in the registers.
+ float* end = End;
+ TState state = State;
+
+ // It is faster to just zero all the memory here in one go
+ // and then avoid inner loop when writing zeros. There we
+ // can just increment end pointer.
+ std::fill(DecodeBuffer.begin(), DecodeBuffer.end(), 0.0f);
+
+ // Make sure that inside the loop we always have space to put 64 zeros and one other
+ // value.
+ float* cap = DecodeBuffer.data() + DecodeBuffer.size() - 64 - 1;
+
+ while (end < cap) {
+ if (state.Workspace % 2 == 1) {
+ // Decode zeros
+ // There we can just scan whole state.Workspace for ones because it contains
+ // zeros outside of the WorkspaceSize bits.
+ const auto negWorkspace = ~state.Workspace;
+ const int zeroCount = negWorkspace ? CountTrailingZeroBits(negWorkspace) : 64;
+ end += zeroCount;
+ state.SkipBits(zeroCount);
+ continue;
+ }
+ if (state.PeekBits(4) == 0x0e) {
+ *end++ = 1.0f;
+ state.SkipBits(4);
+ continue;
+ }
+ const auto& entry = huffInfo.DecodeLookup[state.PeekBits(6)];
+ const auto code = ui32((state.NextBitsUnmasked(entry.TotalLength) >> entry.PrefLength) & entry.Mask) + entry.CodeBase;
+ if (Y_UNLIKELY(code == EOS)) {
+ HitEos = true;
+ break;
+ }
+ *end++ = BitCast<float>(code);
+ }
+
+ End = end;
+ State = state;
+ }
+
+ ui64 TDecoder::TState::PeekBits(int count) {
+ if (WorkspaceSize > count) {
+ return Workspace & Mask64(count);
+ } else {
+ if (Y_UNLIKELY(Position + 8 > Data.size())) {
+ ThrowInvalidOffset(Data.size(), Position);
+ }
+ return (Workspace | (ReadUnaligned<ui64>(Data.data() + Position) << WorkspaceSize)) & Mask64(count);
+ }
+ }
+
+ ui64 TDecoder::TState::NextBitsUnmasked(int count) {
+ if (WorkspaceSize > count) {
+ const auto result = Workspace;
+ Workspace >>= count;
+ WorkspaceSize -= count;
+ return result;
+ } else {
+ if (Y_UNLIKELY(Position + 8 > Data.size())) {
+ ThrowInvalidOffset(Data.size(), Position);
+ }
+ ui64 result = Workspace;
+ Workspace = ReadUnaligned<ui64>(Data.data() + Position);
+ Position += 8;
+ result |= Workspace << WorkspaceSize;
+ Workspace >>= count - WorkspaceSize;
+ WorkspaceSize += 64 - count;
+ return result;
+ }
+ }
+
+ void TDecoder::TState::SkipBits(int count) {
+ if (WorkspaceSize > count) {
+ Workspace >>= count;
+ WorkspaceSize -= count;
+ } else {
+ if (Y_UNLIKELY(Position + 8 > Data.size())) {
+ ThrowInvalidOffset(Data.size(), Position);
+ }
+ Workspace = ReadUnaligned<ui64>(Data.data() + Position);
+ Position += 8;
+ Workspace >>= count - WorkspaceSize;
+ WorkspaceSize += 64 - count;
+ }
+ }
+
+ TVector<float> Decode(TStringBuf data, size_t sizeHint) {
+ return TDecoder(data).DecodeAll(sizeHint);
+ }
+} // namespace NCodecs::NFloatHuff
diff --git a/library/cpp/codecs/float_huffman.h b/library/cpp/codecs/float_huffman.h
new file mode 100644
index 0000000000..786a8eae1d
--- /dev/null
+++ b/library/cpp/codecs/float_huffman.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <util/generic/array_ref.h>
+#include <util/generic/vector.h>
+#include <util/generic/strbuf.h>
+
+#include <array>
+
+namespace NCodecs::NFloatHuff {
+ TString Encode(TArrayRef<const float> factors);
+
+ class TDecoder {
+ public:
+ explicit TDecoder(TStringBuf data);
+
+ TVector<float> DecodeAll(size_t sizeHint = 0);
+
+ // Returns number of decoded floats. May be fewer than requested if the EOS is found.
+ size_t Decode(TArrayRef<float> dest);
+
+ // Returns the number of skipped values.
+ size_t Skip(size_t count);
+
+ bool HasMore() const;
+
+ private:
+ struct TState {
+ ui64 Workspace = 0;
+ int WorkspaceSize = 0;
+ ui64 Position = 0;
+ TStringBuf Data;
+
+ ui64 NextBitsUnmasked(int count); // The upper 64 - count bits may be arbitrary
+ ui64 PeekBits(int count);
+ void SkipBits(int count);
+ };
+
+ void FillDecodeBuffer();
+
+ TState State;
+ std::array<float, 128> DecodeBuffer;
+ // The range of already decompressed numbers inside the DecodeBuffer.
+ // Always kept non-empty until the EOS is encountered.
+ float* Begin;
+ float* End;
+ bool HitEos = false;
+ };
+
+ TVector<float> Decode(TStringBuf data, size_t sizeHint = 0);
+}
diff --git a/library/cpp/codecs/float_huffman_bench/main.cpp b/library/cpp/codecs/float_huffman_bench/main.cpp
new file mode 100644
index 0000000000..1018c17834
--- /dev/null
+++ b/library/cpp/codecs/float_huffman_bench/main.cpp
@@ -0,0 +1,145 @@
+#include <library/cpp/codecs/float_huffman.h>
+
+#include <benchmark/benchmark.h>
+
+#include <util/generic/vector.h>
+
+const float Factors[] = {
+ 0.340582, 0.000974026, 0.487168, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0.411765, 0.921569,
+ 0.00390625, 0.109371, 0, 1, 0, 0, 0, 0, 0.523322, 0, 1, 0, 0, 0, 0, 0.285714, 1,
+ 0.008253, 1, 0, 0, 0.00993935, 0.450213, 0.000974026, 1, 1, 1, 1, 0, 0, 0.20564,
+ 0.97561, 0.913896, 1, 1, 0, 1, 0, 0, 0.5, 0, 0, 0, 0.1, 1, 0, 0, 0, 0, 0, 0.450923,
+ 0, 0.5, 0, 0, 0.20564, 0, 0.5, 0, 0, 0.20564, 0, 0, 0.0313726, 0, 1, 1, 1, 0.363636,
+ 0.5, 0.686073, 0.45121, 0.00574382, 0.366166, 0.413295, 1, 1, 1, 0, 0, 0, 0, 0.160784,
+ 0, 0.937255, 0.537255, 0.133333, 0, 0, 0, 0, 0.00392157, 0, 0.333333, 0.027451, 0.0156863,
+ 1, 0.105882, 1, 0.00220908, 0.000112501, 0.0111262, 0.102384, 0.00140808, 0.123581,
+ 0.29308, 6.57282e-06, 0.00489498, 2.10209e-05, 0.00140559, 5.907e-06, 0, 0.559322,
+ 0.559322, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.794765, 0,
+ 0.352648, 0.225904, 1, 0.047619, 0.0107276, 0.399461, 0.0304838, 0.292932, 0.00969929,
+ 0, 0, 0.886904, 0.714693, 0, 0.00223213, 0.000544069, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0.00507403, 0, 0, 0, 0, 0, 0.875, 0, 0, 1, 1, 1, 0, 0.20564, 0, 0.00176048, 0,
+ 0.000440121, 0, 0, 0, 0.000974026, 0.487168, 0, 0, 0.533333, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 1, 0, 0, 0.723187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 0.206882, 0.00483367, 0.792983, 0.00126106, 1, 0.0313726, 0.470588,
+ 0.254902, 0.188235, 0.188235, 0.388235, 0.164706, 0, 0.870588, 0.843137, 0.635294,
+ 0.384314, 0.384314, 0.643137, 0, 0, 0, 0, 0, 0, 0, 0, 0.541176, 0, 0.541176, 0, 0,
+ 0.0532634, 1, 0, 0, 0, 0.015044, 1, 0, 1, 1, 1, 0.47451, 0.329412, 0.964706, 0, 0,
+ 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0.0941176, 0.970588, 0.970588, 0, 0.970588, 0.97561,
+ 0, 0.0431373, 0.47451, 0.329412, 0.964706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0.231373, 0.00392157, 0, 0, 0, 0.054902, 0, 0,
+ 1, 0, 0, 0.0235294, 0, 1, 0, 0, 0, 0, 0.34902, 0.0352941, 0.925379, 0.623681, 0,
+ 0.954543, 0, 0, 0.00102756, 0.709804, 0.498039, 0.0901961, 0.631373, 0.847059, 0.270588,
+ 0.0156863, 0.133333, 0.980392, 1e-12, 1e-12, 1e-12, 1e-12, 0.497159, 0, 0.407487,
+ 0, 0, 0, 0.00392157, 0.00202156, 0.046875, 0.187159, 0.046875, 0.15625, 0.434232,
+ 0.15625, 0, 2.95083e-07, 0.20564, 0.20564, 0.97561, 0.913896, 0, 0, 0, 0, 0, 0, 0.00784314,
+ 0, 0.695525, 1, 0.07205, 0, 0, 0.176471, 0, 0, 0, 1, 1, 0.98, 0.01, 0.01, 0, 0.00690702,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.29078, 0.29078, 1, 0, 0, 0, 0, 0.192157, 0.188235,
+ 0.0941176, 0, 0.0313726, 0, 0.141176, 0.207843, 0.0901961, 0.00784314, 0.0784314,
+ 0, 0, 0, 0, 0, 0.203922, 0.0196078, 0.34902, 0.0235294, 0.0980392, 0.164706, 0.133333,
+ 0.368627, 0, 0.0941176, 0, 1, 0.313726, 0, 0, 0.433582, 0.384508, 0.0532186, 0.0833333,
+ 0.01609, 0, 1, 0, 0, 0, 0.0666667, 0, 0, 0, 0, 1, 0, 0.564706, 0.501961, 0, 0, 0,
+ 0, 0, 0.0516447, 0.000173065, 0, 0, 0, 0, 0, 0, 0, 0.996309, 0, 0, 0.00392157, 1,
+ 0, 0.01, 0, 0, 0, 0, 0, 0.439505, 0.206882, 0.206882, 0.260891, 0, 0.875, 0, 0, 0,
+ 0, 0, 0.185657, 1, 1, 0, 0, 0, 0.0332647, 0.206106, 0.0688878, 0.239216, 0, 0, 0,
+ 0, 0.054902, 0, 0.101961, 0.160784, 0.180392, 0, 0.737828, 0, 0, 0.875, 0.0142566,
+ 0, 0.662745, 1, 0, 0, 0, 0.225806, 0.99992, 0.631373, 0.00392157, 1, 0, 0.143647,
+ 0.00270085, 1, 0.231482, 0.246735, 0.0428062, 0, 0, 1, 0, 0.186441, 0.0115358, 0,
+ 0.221762, 0, 0.2, 0, 0.0156863, 0, 0, 0, 0.976471, 0, 0.231373, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0.00392157, 0.00392157, 0.0666667, 0, 0, 0, 0, 0.0117647, 0.580392, 0.98737,
+ 1, 1, 1, 0, 0, 0, 0.153, 0.847, 0.931373, 0.94697, 0.94697, 0, 0.946294, 0.408118,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0.99992, 0.97561, 0, 0, 0, 0, 0, 0,
+ 0.274677, 0.153017, 0, 0.642356, 0, 0, 0.1, 0, 0, 0, 0, 0.327944, 0.327944, 0, 0,
+ 0.815686, 0, 0, 0, 0, 0.206106, 0.439126, 0, 0, 0, 0, 0, 1, 1, 1, 0.00392157, 0.232788,
+ 0.232465, 0.999899, 0.00309296, 0.0636097, 0.445954, 0.156863, 0, 0, 0, 0, 0, 0,
+ 0.3796, 0.0784, 0.0651664, 0, 0, 0.254902, 0.266667, 1, 0, 0, 0, 0, 0, 0.596073,
+ 0.517876, 0.145833, 0.372549, 0, 0.991667, 0.602125, 0.161979, 0, 0, 0, 0, 0.0255146,
+ 0.947855, 0, 0, 0, 0, 0, 0, 0, 0, 0.847059, 0.679841, 0, 0.156863, 0, 0, 1, 0, 0,
+ 0, 0, 0.969697, 0, 0, 0.564706, 0, 0, 0, 0, 0, 1, 0.0367282, 0.0395228, 0, 0, 0,
+ 0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0};
+
+const ui8 CodedFactors[] = {
+ 0x24, 0x06, 0x73, 0xB5, 0xC7, 0x55, 0x7F, 0x3A, 0xB4, 0x70, 0xCB, 0xEF, 0xEE, 0xFE, 0xB3, 0x5B,
+ 0x5A, 0x1A, 0x93, 0x5F, 0x5F, 0x13, 0x00, 0x00, 0x10, 0x00, 0x3D, 0xEF, 0xFF, 0xEE, 0x0F, 0xDC,
+ 0xF0, 0xAB, 0x3F, 0x37, 0x92, 0x24, 0x5D, 0x5E, 0xDE, 0x1C, 0xF8, 0x12, 0x15, 0x5B, 0x84, 0x51,
+ 0x82, 0xE6, 0xF6, 0xB8, 0xEA, 0x4F, 0xC7, 0xDD, 0x7D, 0x2E, 0x4D, 0x4A, 0x21, 0xCA, 0xE0, 0xC4,
+ 0x2E, 0xEA, 0xD3, 0xBD, 0x0F, 0x00, 0x00, 0xE0, 0xDA, 0xCC, 0xCC, 0xEC, 0x9F, 0x61, 0xDF, 0xE6,
+ 0x01, 0x00, 0x00, 0xCC, 0xA5, 0x49, 0xA9, 0x00, 0x00, 0x00, 0xE6, 0xD2, 0xA4, 0xD4, 0xEA, 0x08,
+ 0x08, 0xD0, 0xDD, 0xF9, 0xE7, 0xA2, 0x0B, 0x00, 0x00, 0x40, 0xD8, 0x13, 0x7D, 0xFE, 0x13, 0x9C,
+ 0x9B, 0xA8, 0x36, 0xBC, 0x00, 0x90, 0x43, 0x6F, 0x97, 0x67, 0x9B, 0xD3, 0xEE, 0xFE, 0x84, 0x24,
+ 0x25, 0x89, 0xC9, 0xBF, 0x3F, 0x58, 0x4C, 0x4C, 0xCA, 0x21, 0x22, 0xBC, 0x39, 0x08, 0x08, 0x08,
+ 0x40, 0x7E, 0xAA, 0xAA, 0xCA, 0x75, 0x70, 0x70, 0xE9, 0x08, 0x08, 0xE8, 0x9A, 0x8A, 0x8D, 0xED,
+ 0xA6, 0x8D, 0x31, 0x04, 0x00, 0x96, 0xD0, 0x7D, 0x1D, 0x47, 0xAA, 0x2A, 0xD9, 0x28, 0xAD, 0x6B,
+ 0xB4, 0x9D, 0x7A, 0xC4, 0xD5, 0xD1, 0x04, 0x8C, 0x7E, 0x56, 0x3A, 0x58, 0x5A, 0x0C, 0x46, 0x6E,
+ 0x1B, 0x53, 0xC2, 0x0C, 0x14, 0x00, 0xAB, 0x60, 0x05, 0x7B, 0x63, 0x8D, 0x77, 0x70, 0x75, 0xAC,
+ 0x2F, 0x8D, 0xB1, 0x4D, 0xA0, 0xFB, 0xF2, 0x40, 0xF7, 0xE5, 0x7F, 0xDF, 0xDD, 0xFD, 0xBB, 0x1B,
+ 0xB8, 0x75, 0x9B, 0x47, 0x8E, 0xB4, 0x0C, 0x9B, 0x3A, 0x73, 0x25, 0x61, 0x18, 0x92, 0xD1, 0xC2,
+ 0x2F, 0x3C, 0x31, 0x64, 0x96, 0x2A, 0xB9, 0xF9, 0x7C, 0xD9, 0xAF, 0x94, 0xC5, 0xE9, 0x1E, 0x63,
+ 0x24, 0x0C, 0x03, 0x7F, 0xD8, 0x5B, 0xB3, 0x1D, 0x49, 0x02, 0x00, 0xAB, 0xFD, 0xE9, 0xA0, 0xF3,
+ 0xBF, 0xC9, 0x40, 0x64, 0x0A, 0xC0, 0xC7, 0x00, 0x00, 0x60, 0x77, 0xCF, 0xA5, 0x49, 0xA9, 0x16,
+ 0xFD, 0xD7, 0x5C, 0xA7, 0x55, 0x00, 0x36, 0xCF, 0xB9, 0x3D, 0xAE, 0xFA, 0xD3, 0xA1, 0x85, 0x5B,
+ 0xFE, 0x60, 0x10, 0x11, 0xFF, 0xF7, 0x7D, 0x38, 0x59, 0x24, 0xFF, 0xFF, 0xDF, 0x13, 0x1C, 0x7B,
+ 0xCA, 0x1C, 0x1E, 0xF3, 0x04, 0xC0, 0x78, 0x07, 0x58, 0x7B, 0xA2, 0x54, 0xAA, 0xE3, 0xEA, 0x08,
+ 0x08, 0xC0, 0x74, 0x78, 0x78, 0x88, 0x50, 0x50, 0xD8, 0x0A, 0x0C, 0xC4, 0x56, 0x60, 0x20, 0xF6,
+ 0x1A, 0x1B, 0x33, 0x16, 0x15, 0xA5, 0xB8, 0xED, 0xED, 0x22, 0xF5, 0xF5, 0x09, 0xA1, 0xA2, 0x42,
+ 0x67, 0x62, 0x62, 0x3A, 0x13, 0x13, 0x0B, 0xA0, 0xA4, 0xF4, 0x0F, 0x06, 0x15, 0x35, 0x18, 0x54,
+ 0xD4, 0x35, 0x57, 0x45, 0xCB, 0x2F, 0x39, 0xF6, 0xEC, 0xBC, 0xBB, 0x53, 0x5F, 0x5E, 0x9E, 0xB1,
+ 0xA8, 0xA8, 0x28, 0xDF, 0xDE, 0x3E, 0x00, 0x00, 0x80, 0x5F, 0x75, 0x81, 0x81, 0x51, 0x1D, 0x1E,
+ 0xA2, 0x3A, 0x3C, 0x8C, 0xEA, 0xF0, 0x10, 0x51, 0x06, 0x67, 0xED, 0x85, 0x85, 0xA1, 0xBE, 0xBC,
+ 0x3C, 0x63, 0x51, 0x51, 0x51, 0xBE, 0xBD, 0xFD, 0xFF, 0xFD, 0xFE, 0xCE, 0x85, 0x76, 0x36, 0x73,
+ 0x10, 0x10, 0x10, 0x80, 0xEB, 0x3A, 0x38, 0xD8, 0xBE, 0xD4, 0x05, 0x06, 0xEE, 0x4F, 0x60, 0x59,
+ 0x59, 0x65, 0x84, 0x84, 0xC0, 0x46, 0xCB, 0x19, 0x7F, 0x4C, 0xFD, 0xC8, 0x9D, 0x8B, 0xB6, 0x31,
+ 0xAF, 0x86, 0x3A, 0xF0, 0x6D, 0x6D, 0x11, 0xDF, 0xDF, 0x5F, 0x79, 0x71, 0x71, 0x85, 0xD4, 0xD0,
+ 0x10, 0xB9, 0xB1, 0x11, 0x1A, 0x54, 0x54, 0xE9, 0x08, 0x08, 0x48, 0x39, 0x44, 0x04, 0x84, 0xAF,
+ 0xAF, 0x96, 0x99, 0x97, 0x71, 0xC5, 0x32, 0xF3, 0x32, 0xAE, 0x58, 0x66, 0x5E, 0xC6, 0x15, 0xCB,
+ 0xCC, 0xCB, 0xB8, 0x42, 0xD0, 0x45, 0xFF, 0x1C, 0x11, 0x85, 0xBE, 0x39, 0x08, 0x08, 0x08, 0x80,
+ 0x69, 0xC2, 0x47, 0x00, 0x80, 0x02, 0x00, 0x00, 0x91, 0xD3, 0xF4, 0x47, 0x01, 0x00, 0x80, 0x08,
+ 0x00, 0x00, 0x42, 0xD4, 0x29, 0x6F, 0x02, 0x00, 0x80, 0xB4, 0xE6, 0x6B, 0x9E, 0x34, 0x5C, 0x9A,
+ 0x94, 0xE2, 0xD2, 0xA4, 0x14, 0xA2, 0x0C, 0x4E, 0xEC, 0xA2, 0x3E, 0x7F, 0x39, 0x08, 0x08, 0x10,
+ 0x6E, 0x6F, 0x10, 0xD7, 0x79, 0xC7, 0xC9, 0x09, 0x4D, 0x4B, 0x73, 0x77, 0x84, 0x14, 0xAE, 0x52,
+ 0xE1, 0x7A, 0x44, 0x2A, 0x5C, 0x8F, 0x34, 0x93, 0xA8, 0xC4, 0x01, 0xF8, 0x3F, 0x3D, 0xC2, 0x29,
+ 0xE9, 0x11, 0x4E, 0xE9, 0x4F, 0x67, 0x62, 0x22, 0xB6, 0x02, 0x03, 0xA9, 0x2E, 0x30, 0x70, 0x75,
+ 0x04, 0x04, 0xC8, 0x38, 0x48, 0x08, 0x32, 0x53, 0x53, 0x29, 0x2F, 0x2E, 0xAE, 0x1C, 0x04, 0x04,
+ 0x50, 0x52, 0x50, 0xD0, 0x4F, 0x77, 0x68, 0x28, 0x99, 0x08, 0x0A, 0x4A, 0x60, 0x59, 0x59, 0xA9,
+ 0x0B, 0x0C, 0xAC, 0xC7, 0xC8, 0xC8, 0x8C, 0x45, 0x45, 0xA1, 0x1C, 0x22, 0x02, 0x5D, 0x79, 0x79,
+ 0xAB, 0x2E, 0x30, 0x70, 0xA7, 0x2C, 0x28, 0xE8, 0xB4, 0xF3, 0xEF, 0x26, 0x8F, 0x37, 0xB1, 0xFE,
+ 0xEE, 0x67, 0xA9, 0xA9, 0xAA, 0xAA, 0x6C, 0x79, 0x1E, 0xEC, 0xD7, 0x46, 0x44, 0xC4, 0xF7, 0xF8,
+ 0x24, 0x24, 0x00, 0x42, 0x40, 0xF8, 0x5A, 0x96, 0x38, 0x65, 0x91, 0xF1, 0x6A, 0x72, 0xFE, 0x68,
+ 0xC3, 0xE1, 0x37, 0x07, 0x01, 0x01, 0x01, 0xF0, 0x52, 0xE1, 0x7A, 0xE4, 0xB3, 0xD9, 0x20, 0x9C,
+ 0xE0, 0xD8, 0x53, 0x04, 0xC7, 0x9E, 0x82, 0x02, 0x27, 0x2B, 0x06, 0x00, 0x00, 0x9F, 0xDE, 0x1C,
+ 0x3E, 0xEE, 0xD7, 0x48, 0x20, 0x04, 0xD2, 0x35, 0x4C, 0x29, 0x43, 0x45, 0x23, 0x15, 0xEA, 0xE9,
+ 0x5E, 0xD7, 0xC1, 0xC1, 0xAA, 0x3B, 0x34, 0x34, 0x21, 0x49, 0x49, 0xE8, 0x8A, 0x8B, 0x13, 0x66,
+ 0x12, 0xE7, 0x31, 0x00, 0x00, 0x90, 0x84, 0x94, 0x69, 0x05, 0xD4, 0xD4, 0xF4, 0x13, 0x36, 0xE7,
+ 0x0C, 0x09, 0xEB, 0xBF, 0x90, 0x1A, 0x1A, 0xE6, 0x20, 0x20, 0x20, 0x00, 0x9E, 0x33, 0x18, 0x13,
+ 0xA6, 0x2F, 0x40, 0x0C, 0x00, 0x4E, 0xCF, 0x84, 0x36, 0x6A, 0xA0, 0xF2, 0xA9, 0x63, 0xD5, 0xCB,
+ 0x9E, 0x64, 0xEA, 0x3E, 0xF2, 0x14, 0xA0, 0x27, 0x29, 0x2B, 0xC6, 0xB2, 0x99, 0x99, 0xA9, 0x74,
+ 0x04, 0x04, 0x3C, 0x0A, 0xD0, 0xCF, 0x5C, 0x68, 0x67, 0xFB, 0xDF, 0x1C, 0x04, 0x04, 0x04, 0xC0,
+ 0x1C, 0x04, 0x04, 0x04, 0x40, 0x1B, 0x11, 0x11, 0x5F, 0xEA, 0x02, 0x03, 0xE1, 0x92, 0x94, 0x84,
+ 0x90, 0x88, 0xD9, 0xDD, 0x4F, 0x04, 0x56, 0x0E, 0xD1, 0x9F, 0x1A, 0x31, 0x3B, 0x37, 0x47, 0xA0,
+ 0x6C, 0x82, 0x40, 0xD9, 0x24, 0x9A, 0x02, 0x12, 0x62, 0xD3, 0x43, 0xFF, 0xBF, 0x8F, 0x84, 0xF5,
+ 0x1F, 0x51, 0x06, 0xE7, 0x0F, 0xDD, 0x89, 0x32, 0xFB, 0x60, 0x39, 0x0A, 0x71, 0x71, 0xB4, 0x36,
+ 0x33, 0x33, 0x3F, 0x8F, 0xD0, 0x4F, 0x79, 0x84, 0x7E, 0xBA, 0xC8, 0x0C, 0x0D, 0x4F, 0xBA, 0x86,
+ 0x29, 0x82, 0x54, 0x83, 0x7F, 0x77, 0x37, 0x07, 0x01, 0x01, 0x01, 0xA0, 0xFE, 0x97, 0x1B, 0x9D,
+ 0x16, 0xDC, 0x90, 0x58, 0xFE, 0x9B, 0x42, 0xB3, 0x4A, 0x00, 0x68, 0x73, 0x91, 0x20, 0x2B, 0xA8,
+ 0xC8, 0x29, 0x0B, 0x0A, 0xF2, 0xD3, 0x5D, 0x4B, 0x58, 0x5D, 0x20, 0x41, 0xD5, 0xBE, 0xAE, 0x70,
+ 0x88, 0x50, 0x50, 0x20, 0x4A, 0x44, 0xF4, 0x8F, 0xF7, 0x60, 0x22, 0x30, 0x9C, 0x24, 0xFE, 0x54,
+ 0x55, 0xD0, 0xD7, 0xD7, 0x37, 0x1A, 0xEF, 0x6E, 0xBC, 0x9B, 0x44, 0x39, 0xDD, 0x5D, 0xF2, 0xF2,
+ 0x7F, 0x20, 0x1A, 0x81, 0x9A, 0xCA, 0xBF, 0xC8, 0x8D, 0x8D, 0xC2, 0x83, 0x82, 0xA7, 0x2C, 0x28,
+ 0xC8, 0xFE, 0x08, 0xC2, 0x07, 0xC7, 0x27, 0x21, 0xE1, 0xBB, 0x3E, 0xC1, 0x59, 0x68, 0xAA, 0x78,
+ 0xC8, 0x57, 0x5D, 0x60, 0x20, 0xC6, 0x41, 0x42, 0xE8, 0x3A, 0x38, 0xD8, 0x9B, 0xFF, 0xFF, 0xFF,
+ 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static const TStringBuf CodedFactorsBuf(reinterpret_cast<const char*>(CodedFactors), Y_ARRAY_SIZE(CodedFactors));
+
+void BM_Encode(benchmark::State& state) {
+ for (const auto _ : state) {
+ NCodecs::NFloatHuff::Encode(Factors);
+ }
+}
+
+void BM_Decode(benchmark::State& state) {
+ for (const auto _ : state) {
+ NCodecs::NFloatHuff::Decode(CodedFactorsBuf, Y_ARRAY_SIZE(Factors));
+ }
+}
+
+BENCHMARK(BM_Encode);
+BENCHMARK(BM_Decode);
diff --git a/library/cpp/codecs/float_huffman_bench/ya.make b/library/cpp/codecs/float_huffman_bench/ya.make
new file mode 100644
index 0000000000..c8fae6873a
--- /dev/null
+++ b/library/cpp/codecs/float_huffman_bench/ya.make
@@ -0,0 +1,13 @@
+OWNER(eeight)
+
+G_BENCHMARK()
+
+SRCS(
+ main.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+)
+
+END()
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp
new file mode 100644
index 0000000000..561bfbca01
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp
@@ -0,0 +1,142 @@
+#include "gd_builder.h"
+
+#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
+#include <util/generic/algorithm.h>
+
+#include <util/random/shuffle.h>
+#include <util/stream/output.h>
+#include <util/string/printf.h>
+#include <util/system/rusage.h>
+
+namespace NGreedyDict {
+ void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) {
+ if (!Current) {
+ Current = MakeHolder<TEntrySet>();
+ Current->InitWithAlpha();
+ }
+
+ TEntrySet& set = *Current;
+
+ for (auto& it : set)
+ it.Count = 0;
+
+ CompoundCounts = nullptr;
+ CompoundCountsPool.Clear();
+
+ if (!final) {
+ CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool);
+ CompoundCounts->reserve(maxcand);
+ }
+
+ Shuffle(Input.begin(), Input.end(), Rng);
+
+ for (auto str : Input) {
+ if (!final && CompoundCounts->size() > maxcand)
+ break;
+
+ i32 prev = -1;
+
+ while (!!str) {
+ TEntry* e = set.FindPrefix(str);
+ ui32 num = e->Number;
+
+ e->Count += 1;
+ if (!final && prev >= 0) {
+ (*CompoundCounts)[Compose(prev, num)] += 1;
+ }
+
+ prev = num;
+ ++set.TotalCount;
+ }
+ }
+
+ Current->SetModelP();
+ }
+
+ ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
+ TAutoPtr<TEntrySet> newset = new TEntrySet;
+ newset->InitWithAlpha();
+ maxent -= newset->size();
+
+ ui32 additions = 0;
+ ui32 deletions = 0;
+
+ {
+ const TEntrySet& set = *Current;
+
+ Candidates.clear();
+ const ui32 total = set.TotalCount;
+ const float minpval = Settings.MinPValue;
+ const EEntryStatTest test = Settings.StatTest;
+ const EEntryScore score = Settings.Score;
+ const ui32 mincnt = Settings.MinAbsCount;
+
+ for (const auto& it : set) {
+ const TEntry& e = it;
+ float modelp = e.ModelP;
+ ui32 cnt = e.Count;
+
+ if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+ Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number));
+ }
+
+ if (!!CompoundCounts) {
+ for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) {
+ const TEntry& prev = set.Get(Prev(it->first));
+ const TEntry& next = set.Get(Next(it->first));
+ float modelp = ModelP(prev.Count, next.Count, total);
+ ui32 cnt = it->second;
+ if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+ Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
+ }
+ }
+
+ Sort(Candidates.begin(), Candidates.end());
+
+ if (Candidates.size() > maxent)
+ Candidates.resize(maxent);
+
+ for (const auto& candidate : Candidates) {
+ if (IsCompound(candidate.second)) {
+ additions++;
+ newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str);
+ } else {
+ newset->Add(set.Get(candidate.second).Str);
+ }
+ }
+
+ deletions = set.size() - (newset->size() - additions);
+ }
+
+ Current = newset;
+ Current->BuildHierarchy();
+ return deletions + additions;
+ }
+
+ ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
+ size_t totalsz = 0;
+ for (auto it : Input)
+ totalsz += it.size();
+
+ while (maxiters) {
+ maxiters--;
+
+ RebuildCounts(maxentries * Settings.GrowLimit, false);
+
+ if (Settings.Verbose) {
+ TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size());
+ Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;
+ }
+
+ ui32 diff = BuildNextGeneration(maxentries);
+
+ if (Current->size() == maxentries && diff < mindiff)
+ break;
+ }
+
+ RebuildCounts(0, true);
+ Current->SetScores(Settings.Score);
+ return maxiters;
+ }
+
+}
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h
new file mode 100644
index 0000000000..b8e9a5e37b
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/gd_builder.h
@@ -0,0 +1,94 @@
+#pragma once
+
+#include "gd_entry.h"
+
+#include <util/generic/hash.h>
+#include <util/random/fast.h>
+
+namespace NGreedyDict {
+ struct TBuildSettings {
+ EEntryStatTest StatTest = EST_SIMPLE_NORM;
+ EEntryScore Score = ES_LEN_SIMPLE;
+
+ float MinPValue = 0.75;
+ ui32 MinAbsCount = 10;
+ ui32 GrowLimit = 10; // times of maxentries
+ bool Verbose = false;
+ };
+
+ class TDictBuilder {
+ using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>;
+ using TCandidate = std::pair<float, ui64>;
+ using TCandidates = TVector<TCandidate>;
+
+ private:
+ TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb};
+ TStringBufs Input;
+
+ THolder<TEntrySet> Current;
+
+ TMemoryPool CompoundCountsPool;
+ THolder<TCompoundCounts> CompoundCounts;
+
+ TCandidates Candidates;
+
+ TBuildSettings Settings;
+
+ public:
+ TDictBuilder(const TBuildSettings& s = TBuildSettings())
+ : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance())
+ , Settings(s)
+ {
+ }
+
+ void SetInput(const TStringBufs& in) {
+ Input = in;
+ }
+
+ const TBuildSettings& GetSettings() const {
+ return Settings;
+ }
+
+ TBuildSettings& GetSettings() {
+ return Settings;
+ }
+
+ void SetSettings(const TBuildSettings& s) {
+ Settings = s;
+ }
+
+ TEntrySet& EntrySet() {
+ return *Current;
+ }
+
+ const TEntrySet& EntrySet() const {
+ return *Current;
+ }
+
+ THolder<TEntrySet> ReleaseEntrySet() {
+ return std::move(Current);
+ }
+
+ ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10);
+
+ public:
+ void RebuildCounts(ui32 maxcand, bool final);
+ ui32 /*diff size*/ BuildNextGeneration(ui32 maxent);
+
+ static bool IsCompound(ui64 ent) {
+ return ent & 0xFFFFFFFF00000000ULL;
+ }
+
+ static ui32 Next(ui64 ent) {
+ return ent;
+ }
+ static ui32 Prev(ui64 ent) {
+ return (ent >> 32) - 1;
+ }
+
+ static ui64 Compose(ui32 prev, ui32 next) {
+ return ((prev + 1ULL) << 32) | next;
+ }
+ };
+
+}
diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp
new file mode 100644
index 0000000000..2c315c7f7c
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp
@@ -0,0 +1,98 @@
+#include "gd_entry.h"
+#include "gd_stats.h"
+
+#include <util/generic/algorithm.h>
+#include <util/generic/singleton.h>
+
+namespace NGreedyDict {
+ class TAlphas {
+ char Memory[512];
+
+ public:
+ TStringBufs Alphas;
+
+ TAlphas() {
+ for (ui32 i = 0; i < 256; ++i) {
+ Memory[2 * i] = (char)i;
+ Memory[2 * i + 1] = 0;
+
+ Alphas.push_back(TStringBuf(&Memory[2 * i], 1));
+ }
+ }
+ };
+
+ void TEntrySet::InitWithAlpha() {
+ Pool.ClearKeepFirstChunk();
+ const TStringBufs& a = Singleton<TAlphas>()->Alphas;
+ for (auto it : a) {
+ Add(it);
+ }
+ BuildHierarchy();
+ }
+
+ void TEntrySet::BuildHierarchy() {
+ Sort(begin(), end(), TEntry::StrLess);
+
+ TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED);
+
+ for (iterator it = begin(); it != end(); ++it) {
+ it->Number = (it - begin());
+ TStringBuf suff = it->Str;
+ size_t len = 0;
+ ui32 val = 0;
+
+ if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) {
+ it->NearestPrefix = val;
+ }
+
+ builder.Add(suff.data(), suff.size(), it->Number);
+ }
+
+ TBufferOutput bout;
+ builder.Save(bout);
+ Trie.Init(TBlob::FromBuffer(bout.Buffer()));
+ }
+
+ TEntry* TEntrySet::FindPrefix(TStringBuf& str) {
+ size_t len = 0;
+ ui32 off = 0;
+
+ if (!Trie.FindLongestPrefix(str, &len, &off)) {
+ return nullptr;
+ }
+
+ str.Skip(len);
+ return &Get(off);
+ }
+
+ void TEntrySet::SetModelP() {
+ for (iterator it = begin(); it != end(); ++it) {
+ TEntry& e = *it;
+
+ if (!e.HasPrefix()) {
+ e.ModelP = 0;
+ continue;
+ }
+
+ TStringBuf suff = e.Str;
+ const TEntry& p = Get(e.NearestPrefix);
+ suff.Skip(p.Len());
+
+ float modelp = float(p.Count + e.Count) / TotalCount;
+
+ while (!!suff) {
+ TEntry* pp = FindPrefix(suff);
+ modelp *= float(pp->Count + e.Count) / TotalCount;
+ }
+
+ e.ModelP = modelp;
+ }
+ }
+
+ void TEntrySet::SetScores(EEntryScore s) {
+ for (auto& it : *this) {
+ it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount);
+ }
+ }
+
+}
diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h
new file mode 100644
index 0000000000..18b5be0e15
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/gd_entry.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include "gd_stats.h"
+
+#include <library/cpp/containers/comptrie/comptrie.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+
+#include <util/memory/pool.h>
+
+namespace NGreedyDict {
+ using TStringBufs = TVector<TStringBuf>;
+
+ struct TEntry {
+ static const i32 NoPrefix = -1;
+
+ TStringBuf Str;
+
+ i32 NearestPrefix = NoPrefix;
+ ui32 Count = 0;
+ ui32 Number = 0;
+ float ModelP = 0;
+ float Score = 0;
+
+ TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0)
+ : Str(b)
+ , Count(cnt)
+ {
+ }
+
+ bool HasPrefix() const {
+ return NearestPrefix != NoPrefix;
+ }
+ ui32 Len() const {
+ return Str.size();
+ }
+
+ static bool StrLess(const TEntry& a, const TEntry& b) {
+ return a.Str < b.Str;
+ }
+ static bool NumberLess(const TEntry& a, const TEntry& b) {
+ return a.Number < b.Number;
+ }
+ static bool ScoreMore(const TEntry& a, const TEntry& b) {
+ return a.Score > b.Score;
+ }
+ };
+
+ class TEntrySet: public TVector<TEntry>, TNonCopyable {
+ TMemoryPool Pool{8112};
+ TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie;
+
+ public:
+ ui32 TotalCount = 0;
+
+ void InitWithAlpha();
+
+ void Add(TStringBuf a) {
+ push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size()));
+ }
+
+ void Add(TStringBuf a, TStringBuf b) {
+ size_t sz = a.size() + b.size();
+ char* p = (char*)Pool.Allocate(sz);
+ memcpy(p, a.data(), a.size());
+ memcpy(p + a.size(), b.data(), b.size());
+ push_back(TStringBuf(p, sz));
+ }
+
+ TEntry& Get(ui32 idx) {
+ return (*this)[idx];
+ }
+
+ const TEntry& Get(ui32 idx) const {
+ return (*this)[idx];
+ }
+
+ void BuildHierarchy();
+
+ // longest prefix
+ TEntry* FindPrefix(TStringBuf& str);
+
+ const TEntry* FindPrefix(TStringBuf& str) const {
+ return ((TEntrySet*)this)->FindPrefix(str);
+ }
+
+ const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) {
+ if (!e.HasPrefix())
+ return nullptr;
+
+ const TEntry& p = Get(e.NearestPrefix);
+ suff = e.Str;
+ suff.Skip(p.Str.size());
+ return &p;
+ }
+
+ void SetModelP();
+ void SetScores(EEntryScore);
+ };
+
+}
diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h
new file mode 100644
index 0000000000..b63c4c38d2
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/gd_stats.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <util/generic/ymath.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/yexception.h>
+
+namespace NGreedyDict {
+ enum EEntryScore {
+ ES_COUNT,
+ ES_LEN_COUNT,
+ ES_SIMPLE,
+ ES_LEN_SIMPLE,
+ ES_SOLAR
+ };
+
+ enum EEntryStatTest {
+ EST_NONE = 0,
+ EST_SIMPLE_NORM = 2
+ };
+
+ inline float ModelP(ui32 countA, ui32 countB, ui32 total) {
+ return float(countA) * countB / total / total;
+ }
+
+ // P (ab | dependent)
+ inline float SimpleTest(float modelp, ui32 countAB, ui32 total) {
+ float realp = float(countAB) / total;
+ return modelp >= realp ? 0 : (realp - modelp);
+ }
+
+ inline float SolarTest(float modelp, ui32 countAB, ui32 total) {
+ float realp = float(countAB) / total;
+ return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1));
+ }
+
+ // P (ab | dependent) / P (ab)
+ inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) {
+ float realp = float(countAB) / total;
+ return modelp >= realp ? 0 : (realp - modelp) / realp;
+ }
+
+ inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) {
+ if (!total) {
+ return 0;
+ }
+ switch (test) {
+ case EST_NONE:
+ return 1;
+ case EST_SIMPLE_NORM:
+ return SimpleTestNorm(modelp, countAB, total);
+ }
+ Y_FAIL("no way!");
+ return 0;
+ }
+
+ inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) {
+ if (!total) {
+ return 0;
+ }
+ ui32 m = 1;
+ switch (score) {
+ case ES_LEN_COUNT:
+ m = len;
+ [[fallthrough]];
+ case ES_COUNT:
+ return m * count;
+ case ES_LEN_SIMPLE:
+ m = len;
+ [[fallthrough]];
+ case ES_SIMPLE:
+ return m * SimpleTest(modelp, count, total);
+ case ES_SOLAR:
+ return SolarTest(modelp, count, total);
+ }
+ Y_FAIL("no way!");
+ return 0;
+ }
+
+}
diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp
new file mode 100644
index 0000000000..679089a11b
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp
@@ -0,0 +1,282 @@
+#include "gd_builder.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
+#include <util/string/printf.h>
+#include <util/generic/ymath.h>
+
+class TGreedyDictTest: public TTestBase {
+ UNIT_TEST_SUITE(TGreedyDictTest);
+ UNIT_TEST(TestEntrySet)
+ UNIT_TEST(TestBuilder0)
+ UNIT_TEST(TestBuilder)
+ UNIT_TEST_SUITE_END();
+
+ void TestEntrySet() {
+ using namespace NGreedyDict;
+
+ {
+ TEntrySet d;
+
+ d.InitWithAlpha();
+
+ for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) {
+ UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix));
+ UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin()));
+ }
+
+ UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u);
+ TStringBuf s = "aaabbb";
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
+ UNIT_ASSERT_VALUES_EQUAL(s, "aabbb");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
+ UNIT_ASSERT_VALUES_EQUAL(s, "abbb");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
+ UNIT_ASSERT_VALUES_EQUAL(s, "bbb");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
+ UNIT_ASSERT_VALUES_EQUAL(s, "bb");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
+ UNIT_ASSERT_VALUES_EQUAL(s, "b");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
+ UNIT_ASSERT_VALUES_EQUAL(s, "");
+ s = TStringBuf("", 1);
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1));
+ UNIT_ASSERT_VALUES_EQUAL(s, "");
+ s = "\xFF";
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF");
+ UNIT_ASSERT_VALUES_EQUAL(s, "");
+ }
+ {
+ TEntrySet d;
+ d.Add("a");
+ d.Add("b");
+ d.Add("b", "a");
+ d.BuildHierarchy();
+
+ UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u);
+
+ TStringBuf s = "bab";
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba");
+ UNIT_ASSERT_VALUES_EQUAL(s, "b");
+ }
+ {
+ TEntrySet d;
+
+ d.Add("a");
+ d.Add("aa");
+ d.Add("aaa");
+ d.Add("aab");
+ d.Add("b");
+ d.Add("ba");
+
+ d.BuildHierarchy();
+
+ UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u);
+ {
+ TStringBuf s = "aaaaa";
+ const TEntry* e = d.FindPrefix(s);
+ UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa");
+ UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u);
+ UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1);
+ UNIT_ASSERT_VALUES_EQUAL(s, "aa");
+ }
+
+ {
+ TStringBuf s = "a";
+ const TEntry* e = d.FindPrefix(s);
+ UNIT_ASSERT_VALUES_EQUAL(e->Str, "a");
+ UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u);
+ UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1);
+ UNIT_ASSERT_VALUES_EQUAL(s, "");
+ }
+
+ {
+ TStringBuf s = "bab";
+ const TEntry* e = d.FindPrefix(s);
+ UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba");
+ UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u);
+ UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4);
+ UNIT_ASSERT_VALUES_EQUAL(s, "b");
+ }
+
+ {
+ TStringBuf s = "bba";
+ const TEntry* e = d.FindPrefix(s);
+ UNIT_ASSERT_VALUES_EQUAL(e->Str, "b");
+ UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u);
+ UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1);
+ UNIT_ASSERT_VALUES_EQUAL(s, "ba");
+ }
+ }
+ }
+
+ void TestBuilder0() {
+ using namespace NGreedyDict;
+ ui32 a = 1, b = 11;
+ ui64 ab = TDictBuilder::Compose(a, b);
+ UNIT_ASSERT(TDictBuilder::IsCompound(ab));
+ UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a);
+ UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b);
+ }
+
+ void FillData(NGreedyDict::TStringBufs& data) {
+ static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"};
+ data.clear();
+ data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls));
+ }
+
+ typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict;
+
+ TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s,
+ TDict& res) {
+ using namespace NGreedyDict;
+
+ TStringBufs data;
+ FillData(data);
+
+ TDictBuilder b(s);
+ b.SetInput(data);
+ b.Build(256 + 128);
+
+ TEntrySet& set = b.EntrySet();
+
+ for (const auto& it : set) {
+ if (it.Score) {
+ res[it.Str] = it;
+ }
+ }
+
+ return b.ReleaseEntrySet();
+ }
+
+ void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) {
+ TDict::const_iterator it = dict.find(entry);
+ UNIT_ASSERT_C(it != dict.end(), entry);
+ UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry);
+ UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry);
+ UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry);
+ }
+
+ void TestBuilder() {
+ TAutoPtr<NGreedyDict::TEntrySet> set;
+ THashMap<TStringBuf, NGreedyDict::TEntry> res;
+ NGreedyDict::TBuildSettings s;
+ set = DoTestBuilder(s, res);
+
+ UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u);
+ UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u);
+
+ DoAssertEntry("%", 37, -1, 0.00375193, res);
+ DoAssertEntry("%7", 38, 37, 0.00513299, res);
+ DoAssertEntry("&", 39, -1, 0.00794527, res);
+ DoAssertEntry("+", 44, -1, 0.000441404, res);
+ DoAssertEntry(",", 45, -1, 0.000441404, res);
+ DoAssertEntry("-", 46, -1, 0.0417126, res);
+ DoAssertEntry(".", 47, -1, 0.0196425, res);
+ DoAssertEntry(".com/", 48, 47, 0.0374482, res);
+ DoAssertEntry(".html", 49, 47, 0.0496577, res);
+ DoAssertEntry(".html?", 50, 49, 0.0153908, res);
+ DoAssertEntry(".php", 51, 47, 0.0123585, res);
+ DoAssertEntry(".ru/", 52, 47, 0.0150027, res);
+ DoAssertEntry("/", 53, -1, 0.0452439, res);
+ DoAssertEntry("/index", 54, 53, 0.0158905, res);
+ DoAssertEntry("0", 55, -1, 0.00816597, res);
+ DoAssertEntry("1", 56, -1, 0.0167733, res);
+ DoAssertEntry("10", 57, 56, 0.00530474, res);
+ DoAssertEntry("2", 58, -1, 0.0101523, res);
+ DoAssertEntry("20", 59, 58, 0.00674234, res);
+ DoAssertEntry("3", 60, -1, 0.01258, res);
+ DoAssertEntry("32", 61, 60, 0.00490697, res);
+ DoAssertEntry("4", 62, -1, 0.00993158, res);
+ DoAssertEntry("5", 63, -1, 0.00617965, res);
+ DoAssertEntry("6", 64, -1, 0.00971088, res);
+ DoAssertEntry("7", 65, -1, 0.0101523, res);
+ DoAssertEntry("8", 66, -1, 0.00728316, res);
+ DoAssertEntry("9", 67, -1, 0.00728316, res);
+ DoAssertEntry(":", 68, -1, 0.000662106, res);
+ DoAssertEntry(";", 69, -1, 0.000882807, res);
+ DoAssertEntry("=", 71, -1, 0.01258, res);
+ DoAssertEntry("?", 73, -1, 0.00397263, res);
+ DoAssertEntry("A", 75, -1, 0.00264842, res);
+ DoAssertEntry("B", 76, -1, 0.00220702, res);
+ DoAssertEntry("C", 77, -1, 0.00353123, res);
+ DoAssertEntry("D", 78, -1, 0.00375193, res);
+ DoAssertEntry("E", 79, -1, 0.00286912, res);
+ DoAssertEntry("F", 80, -1, 0.00110351, res);
+ DoAssertEntry("G", 81, -1, 0.00110351, res);
+ DoAssertEntry("H", 82, -1, 0.000220702, res);
+ DoAssertEntry("I", 83, -1, 0.00198632, res);
+ DoAssertEntry("K", 85, -1, 0.000441404, res);
+ DoAssertEntry("L", 86, -1, 0.00198632, res);
+ DoAssertEntry("M", 87, -1, 0.00154491, res);
+ DoAssertEntry("N", 88, -1, 0.00154491, res);
+ DoAssertEntry("O", 89, -1, 0.00132421, res);
+ DoAssertEntry("P", 90, -1, 0.00308983, res);
+ DoAssertEntry("R", 92, -1, 0.000662106, res);
+ DoAssertEntry("S", 93, -1, 0.00264842, res);
+ DoAssertEntry("T", 94, -1, 0.00110351, res);
+ DoAssertEntry("U", 95, -1, 0.000220702, res);
+ DoAssertEntry("V", 96, -1, 0.000441404, res);
+ DoAssertEntry("W", 97, -1, 0.000441404, res);
+ DoAssertEntry("X", 98, -1, 0.000220702, res);
+ DoAssertEntry("Y", 99, -1, 0.000220702, res);
+ DoAssertEntry("_", 105, -1, 0.00904877, res);
+ DoAssertEntry("a", 107, -1, 0.0505407, res);
+ DoAssertEntry("an", 108, 107, 0.018273, res);
+ DoAssertEntry("ar", 109, 107, 0.0169385, res);
+ DoAssertEntry("b", 110, -1, 0.0156698, res);
+ DoAssertEntry("c", 111, -1, 0.018539, res);
+ DoAssertEntry("cat", 112, 111, 0.00846732, res);
+ DoAssertEntry("ch", 113, 111, 0.00644872, res);
+ DoAssertEntry("com", 114, 111, 0.00724235, res);
+ DoAssertEntry("ct", 115, 111, 0.00605729, res);
+ DoAssertEntry("d", 116, -1, 0.020746, res);
+ DoAssertEntry("di", 117, 116, 0.00730659, res);
+ DoAssertEntry("e", 118, -1, 0.0624586, res);
+ DoAssertEntry("en", 119, 118, 0.0108999, res);
+ DoAssertEntry("ent", 120, 119, 0.00616002, res);
+ DoAssertEntry("f", 121, -1, 0.00860737, res);
+ DoAssertEntry("fi", 122, 121, 0.00423196, res);
+ DoAssertEntry("g", 123, -1, 0.0180975, res);
+ DoAssertEntry("go", 124, 123, 0.00601862, res);
+ DoAssertEntry("h", 125, -1, 0.010373, res);
+ DoAssertEntry("ho", 126, 125, 0.00570298, res);
+ DoAssertEntry("http://", 127, 125, 0.0494372, res);
+ DoAssertEntry("http://www.", 128, 127, 0.0849702, res);
+ DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res);
+ DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res);
+ DoAssertEntry("i", 131, -1, 0.0258221, res);
+ DoAssertEntry("id=", 132, 131, 0.00725369, res);
+ DoAssertEntry("im", 133, 131, 0.00373318, res);
+ DoAssertEntry("in", 134, 131, 0.013625, res);
+ DoAssertEntry("ing", 135, 134, 0.00795491, res);
+ DoAssertEntry("ion", 136, 131, 0.00796149, res);
+ DoAssertEntry("it", 137, 131, 0.00953416, res);
+ DoAssertEntry("j", 138, -1, 0.00132421, res);
+ DoAssertEntry("k", 139, -1, 0.0134628, res);
+ DoAssertEntry("l", 140, -1, 0.0381814, res);
+ DoAssertEntry("m", 141, -1, 0.0174354, res);
+ DoAssertEntry("mer", 142, 141, 0.00711846, res);
+ DoAssertEntry("n", 143, -1, 0.0132421, res);
+ DoAssertEntry("o", 144, -1, 0.0302362, res);
+ DoAssertEntry("on", 145, 144, 0.00802271, res);
+ DoAssertEntry("ou", 146, 144, 0.00414545, res);
+ DoAssertEntry("p", 147, -1, 0.0225116, res);
+ DoAssertEntry("port", 148, 147, 0.0123532, res);
+ DoAssertEntry("q", 149, -1, 0.00176561, res);
+ DoAssertEntry("r", 150, -1, 0.0401677, res);
+ DoAssertEntry("ran", 151, 150, 0.00686918, res);
+ DoAssertEntry("s", 152, -1, 0.0487751, res);
+ DoAssertEntry("sho", 153, 152, 0.0113876, res);
+ DoAssertEntry("t", 154, -1, 0.0379607, res);
+ DoAssertEntry("u", 155, -1, 0.0211874, res);
+ DoAssertEntry("v", 156, -1, 0.00595895, res);
+ DoAssertEntry("vi", 157, 156, 0.00480673, res);
+ DoAssertEntry("w", 158, -1, 0.00816597, res);
+ DoAssertEntry("x", 159, -1, 0.00375193, res);
+ DoAssertEntry("y", 160, -1, 0.0130214, res);
+ DoAssertEntry("z", 161, -1, 0.00353123, res);
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest);
diff --git a/library/cpp/codecs/greedy_dict/ut/ya.make b/library/cpp/codecs/greedy_dict/ut/ya.make
new file mode 100644
index 0000000000..bd67d1a452
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/ut/ya.make
@@ -0,0 +1,9 @@
+UNITTEST_FOR(library/cpp/codecs/greedy_dict)
+
+OWNER(velavokr)
+
+SRCS(
+ greedy_dict_ut.cpp
+)
+
+END()
diff --git a/library/cpp/codecs/greedy_dict/ya.make b/library/cpp/codecs/greedy_dict/ya.make
new file mode 100644
index 0000000000..2a57224f7e
--- /dev/null
+++ b/library/cpp/codecs/greedy_dict/ya.make
@@ -0,0 +1,15 @@
+OWNER(velavokr)
+
+LIBRARY()
+
+SRCS(
+ gd_builder.cpp
+ gd_entry.cpp
+)
+
+PEERDIR(
+ library/cpp/containers/comptrie
+ library/cpp/string_utils/relaxed_escaper
+)
+
+END()
diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp
new file mode 100644
index 0000000000..650fe7cdfd
--- /dev/null
+++ b/library/cpp/codecs/huffman_codec.cpp
@@ -0,0 +1,592 @@
+#include "huffman_codec.h"
+#include <library/cpp/bit_io/bitinput.h>
+#include <library/cpp/bit_io/bitoutput.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/bitops.h>
+#include <util/stream/buffer.h>
+#include <util/stream/length.h>
+#include <util/string/printf.h>
+
+namespace NCodecs {
+ template <typename T>
+ struct TCanonicalCmp {
+ bool operator()(const T& a, const T& b) const {
+ if (a.CodeLength == b.CodeLength) {
+ return a.Char < b.Char;
+ } else {
+ return a.CodeLength < b.CodeLength;
+ }
+ }
+ };
+
+ template <typename T>
+ struct TByCharCmp {
+ bool operator()(const T& a, const T& b) const {
+ return a.Char < b.Char;
+ }
+ };
+
+ struct TTreeEntry {
+ static const ui32 InvalidBranch = (ui32)-1;
+
+ ui64 Freq = 0;
+ ui32 Branches[2]{InvalidBranch, InvalidBranch};
+
+ ui32 CodeLength = 0;
+ ui8 Char = 0;
+ bool Invalid = false;
+
+ TTreeEntry() = default;
+
+ static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) {
+ return a.Freq < b.Freq;
+ }
+
+ static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) {
+ return a.Freq > b.Freq;
+ }
+ };
+
+ using TCodeTree = TVector<TTreeEntry>;
+
+ void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) {
+ tree.reserve(255 * 256 / 2); // worst case - balanced tree
+
+ for (ui32 i = 0; i < 256; ++i) {
+ tree.emplace_back();
+ tree.back().Char = i;
+ tree.back().Freq = freqs[i];
+ }
+
+ StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq);
+ }
+
+ void InitTree(TCodeTree& tree, ISequenceReader* in) {
+ using namespace NPrivate;
+ ui64 freqs[256];
+ Zero(freqs);
+
+ TStringBuf r;
+ while (in->NextRegion(r)) {
+ for (ui64 i = 0; i < r.size(); ++i)
+ ++freqs[(ui8)r[i]];
+ }
+
+ InitTreeByFreqs(tree, freqs);
+ }
+
+ void CalculateCodeLengths(TCodeTree& tree) {
+ Y_ENSURE(tree.size() == 256, " ");
+ const ui32 firstbranch = tree.size();
+
+ ui32 curleaf = 0;
+ ui32 curbranch = firstbranch;
+
+ // building code tree. two priority queues are combined in one.
+ while (firstbranch - curleaf + tree.size() - curbranch >= 2) {
+ TTreeEntry e;
+
+ for (auto& branche : e.Branches) {
+ ui32 br;
+
+ if (curleaf >= firstbranch)
+ br = curbranch++;
+ else if (curbranch >= tree.size())
+ br = curleaf++;
+ else if (tree[curleaf].Freq < tree[curbranch].Freq)
+ br = curleaf++;
+ else
+ br = curbranch++;
+
+ Y_ENSURE(br < tree.size(), " ");
+ branche = br;
+ e.Freq += tree[br].Freq;
+ }
+
+ tree.push_back(e);
+ PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev);
+ }
+
+ // computing code lengths
+ for (ui64 i = tree.size() - 1; i >= firstbranch; --i) {
+ TTreeEntry e = tree[i];
+
+ for (auto branche : e.Branches)
+ tree[branche].CodeLength = e.CodeLength + 1;
+ }
+
+ // chopping off the branches
+ tree.resize(firstbranch);
+
+ Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>());
+
+ // simplification: we are stripping codes longer than 64 bits
+ while (!tree.empty() && tree.back().CodeLength > 64)
+ tree.pop_back();
+
+ // will not compress
+ if (tree.empty())
+ return;
+
+ // special invalid code word
+ tree.back().Invalid = true;
+ }
+
+ struct TEncoderEntry {
+ ui64 Code = 0;
+
+ ui8 CodeLength = 0;
+ ui8 Char = 0;
+ ui8 Invalid = true;
+
+ explicit TEncoderEntry(TTreeEntry e)
+ : CodeLength(e.CodeLength)
+ , Char(e.Char)
+ , Invalid(e.Invalid)
+ {
+ }
+
+ TEncoderEntry() = default;
+ };
+
+ struct TEncoderTable {
+ TEncoderEntry Entries[256];
+
+ void Save(IOutputStream* out) const {
+ ui16 nval = 0;
+
+ for (auto entrie : Entries)
+ nval += !entrie.Invalid;
+
+ ::Save(out, nval);
+
+ for (auto entrie : Entries) {
+ if (!entrie.Invalid) {
+ ::Save(out, entrie.Char);
+ ::Save(out, entrie.CodeLength);
+ }
+ }
+ }
+
+ void Load(IInputStream* in) {
+ ui16 nval = 0;
+ ::Load(in, nval);
+
+ for (ui32 i = 0; i < 256; ++i)
+ Entries[i].Char = i;
+
+ for (ui32 i = 0; i < nval; ++i) {
+ ui8 ch = 0;
+ ui8 len = 0;
+ ::Load(in, ch);
+ ::Load(in, len);
+ Entries[ch].CodeLength = len;
+ Entries[ch].Invalid = false;
+ }
+ }
+ };
+
+ struct TDecoderEntry {
+ ui32 NextTable : 10;
+ ui32 Char : 8;
+ ui32 Invalid : 1;
+ ui32 Bad : 1;
+
+ TDecoderEntry()
+ : NextTable()
+ , Char()
+ , Invalid()
+ , Bad()
+ {
+ }
+ };
+
+ struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> {
+ ui64 Length = 0;
+ ui64 BaseCode = 0;
+
+ TDecoderEntry Entries[256];
+
+ TDecoderTable() {
+ Zero(Entries);
+ }
+ };
+
+ const int CACHE_BITS_COUNT = 16;
+ class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> {
+ TEncoderTable Encoder;
+ TDecoderTable Decoder[256];
+
+ TEncoderEntry Invalid;
+
+ ui32 SubTablesNum;
+
+ class THuffmanCache {
+ struct TCacheEntry {
+ int EndOffset : 24;
+ int BitsLeft : 8;
+ };
+ TVector<char> DecodeCache;
+ TVector<TCacheEntry> CacheEntries;
+ const TImpl& Original;
+
+ public:
+ THuffmanCache(const THuffmanCodec::TImpl& encoder);
+
+ void Decode(NBitIO::TBitInput& in, TBuffer& out) const;
+ };
+
+ THolder<THuffmanCache> Cache;
+
+ public:
+ TImpl()
+ : SubTablesNum(1)
+ {
+ Invalid.CodeLength = 255;
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& out) const {
+ out.Clear();
+
+ if (in.empty()) {
+ return 0;
+ }
+
+ out.Reserve(in.size() * 2);
+
+ {
+ NBitIO::TBitOutputVector<TBuffer> bout(&out);
+ TStringBuf tin = in;
+
+ // data is under compression
+ bout.Write(1, 1);
+
+ for (auto t : tin) {
+ const TEncoderEntry& ce = Encoder.Entries[(ui8)t];
+
+ bout.Write(ce.Code, ce.CodeLength);
+
+ if (ce.Invalid) {
+ bout.Write(t, 8);
+ }
+ }
+
+ // in canonical huffman coding there cannot be a code having no 0 in the suffix
+ // and shorter than 8 bits.
+ bout.Write((ui64)-1, bout.GetByteReminder());
+ return bout.GetByteReminder();
+ }
+ }
+
+ void Decode(TStringBuf in, TBuffer& out) const {
+ out.Clear();
+
+ if (in.empty()) {
+ return;
+ }
+
+ NBitIO::TBitInput bin(in);
+ ui64 f = 0;
+ bin.ReadK<1>(f);
+
+ // if data is uncompressed
+ if (!f) {
+ in.Skip(1);
+ out.Append(in.data(), in.size());
+ } else {
+ out.Reserve(in.size() * 8);
+
+ if (Cache.Get()) {
+ Cache->Decode(bin, out);
+ } else {
+ while (ReadNextChar(bin, out)) {
+ }
+ }
+ }
+ }
+
+ Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const {
+ const TDecoderTable* table = Decoder;
+ TDecoderEntry e;
+
+ int bitsRead = 0;
+ while (true) {
+ ui64 code = 0;
+
+ if (Y_UNLIKELY(!bin.Read(code, table->Length)))
+ return 0;
+ bitsRead += table->Length;
+
+ if (Y_UNLIKELY(code < table->BaseCode))
+ return 0;
+
+ code -= table->BaseCode;
+
+ if (Y_UNLIKELY(code > 255))
+ return 0;
+
+ e = table->Entries[code];
+
+ if (Y_UNLIKELY(e.Bad))
+ return 0;
+
+ if (e.NextTable) {
+ table = Decoder + e.NextTable;
+ } else {
+ if (e.Invalid) {
+ code = 0;
+ bin.ReadK<8>(code);
+ bitsRead += 8;
+ out.Append((ui8)code);
+ } else {
+ out.Append((ui8)e.Char);
+ }
+
+ return bitsRead;
+ }
+ }
+
+ Y_ENSURE(false, " could not decode input");
+ return 0;
+ }
+
+ void GenerateEncoder(TCodeTree& tree) {
+ const ui64 sz = tree.size();
+
+ TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]);
+
+ for (ui32 i = 1; i < sz; ++i) {
+ const TTreeEntry& te = tree[i];
+ TEncoderEntry& e = Encoder.Entries[te.Char];
+ e = TEncoderEntry(te);
+
+ e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
+ lastcode = e;
+
+ e.Code = ReverseBits(e.Code, e.CodeLength);
+
+ if (e.Invalid)
+ Invalid = e;
+ }
+
+ for (auto& e : Encoder.Entries) {
+ if (e.Invalid)
+ e = Invalid;
+
+ Y_ENSURE(e.CodeLength, " ");
+ }
+ }
+
+ void RegenerateEncoder() {
+ for (auto& entrie : Encoder.Entries) {
+ if (entrie.Invalid)
+ entrie.CodeLength = Invalid.CodeLength;
+ }
+
+ Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>());
+
+ TEncoderEntry lastcode = Encoder.Entries[0];
+
+ for (ui32 i = 1; i < 256; ++i) {
+ TEncoderEntry& e = Encoder.Entries[i];
+ e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
+ lastcode = e;
+
+ e.Code = ReverseBits(e.Code, e.CodeLength);
+ }
+
+ for (auto& entrie : Encoder.Entries) {
+ if (entrie.Invalid) {
+ Invalid = entrie;
+ break;
+ }
+ }
+
+ Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>());
+
+ for (auto& entrie : Encoder.Entries) {
+ if (entrie.Invalid)
+ entrie = Invalid;
+ }
+ }
+
+ void BuildDecoder() {
+ TEncoderTable enc = Encoder;
+ Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>());
+
+ TEncoderEntry& e1 = enc.Entries[0];
+ Decoder[0].BaseCode = e1.Code;
+ Decoder[0].Length = e1.CodeLength;
+
+ for (auto e2 : enc.Entries) {
+ SetEntry(Decoder, e2.Code, e2.CodeLength, e2);
+ }
+ Cache.Reset(new THuffmanCache(*this));
+ }
+
+ void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) {
+ Y_ENSURE(len >= t->Length, len << " < " << t->Length);
+
+ ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode;
+ TDecoderEntry& d = t->Entries[idx];
+
+ if (len == t->Length) {
+ Y_ENSURE(!d.NextTable, " ");
+
+ d.Char = e.Char;
+ d.Invalid = e.Invalid;
+ return;
+ }
+
+ if (!d.NextTable) {
+ Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " ");
+ d.NextTable = SubTablesNum++;
+ TDecoderTable* nt = Decoder + d.NextTable;
+ nt->Length = Min<ui64>(8, len - t->Length);
+ nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length);
+ }
+
+ SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e);
+ }
+
+ void Learn(ISequenceReader* in) {
+ {
+ TCodeTree tree;
+ InitTree(tree, in);
+ CalculateCodeLengths(tree);
+ Y_ENSURE(!tree.empty(), " ");
+ GenerateEncoder(tree);
+ }
+ BuildDecoder();
+ }
+
+ void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {
+ TCodeTree tree;
+
+ ui64 freqsArray[256];
+ Zero(freqsArray);
+
+ for (const auto& freq : freqs)
+ freqsArray[static_cast<ui8>(freq.first)] += freq.second;
+
+ InitTreeByFreqs(tree, freqsArray);
+ CalculateCodeLengths(tree);
+
+ Y_ENSURE(!tree.empty(), " ");
+
+ GenerateEncoder(tree);
+ BuildDecoder();
+ }
+
+ void Save(IOutputStream* out) {
+ ::Save(out, Invalid.CodeLength);
+ Encoder.Save(out);
+ }
+
+ void Load(IInputStream* in) {
+ ::Load(in, Invalid.CodeLength);
+ Encoder.Load(in);
+ RegenerateEncoder();
+ BuildDecoder();
+ }
+ };
+
+ THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec)
+ : Original(codec)
+ {
+ CacheEntries.resize(1 << CACHE_BITS_COUNT);
+ DecodeCache.reserve(CacheEntries.size() * 2);
+ char buffer[2];
+ TBuffer decoded;
+ for (size_t i = 0; i < CacheEntries.size(); i++) {
+ buffer[1] = i >> 8;
+ buffer[0] = i;
+ NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer));
+ int totalBits = 0;
+ while (true) {
+ decoded.Resize(0);
+ int bits = codec.ReadNextChar(bin, decoded);
+ if (totalBits + bits > 16 || !bits) {
+ TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits};
+ CacheEntries[i] = e;
+ break;
+ }
+
+ for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) {
+ DecodeCache.push_back(*it);
+ }
+ totalBits += bits;
+ }
+ }
+ DecodeCache.push_back(0);
+ CacheEntries.shrink_to_fit();
+ DecodeCache.shrink_to_fit();
+ }
+
+ void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const {
+ int bits = 0;
+ ui64 code = 0;
+ while (!bin.Eof()) {
+ ui64 f = 0;
+ const int toRead = 16 - bits;
+ if (toRead > 0 && bin.Read(f, toRead)) {
+ code = (code >> (16 - bits)) | (f << bits);
+ code &= 0xFFFF;
+ TCacheEntry entry = CacheEntries[code];
+ int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0;
+ out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]);
+ bits = entry.BitsLeft;
+ } else { // should never happen until there are exceptions or unaligned input
+ bin.Back(bits);
+ if (!Original.ReadNextChar(bin, out))
+ break;
+
+ code = 0;
+ bits = 0;
+ }
+ }
+ }
+
+ THuffmanCodec::THuffmanCodec()
+ : Impl(new TImpl)
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.PreservesPrefixGrouping = true;
+ MyTraits.PaddingBit = 1;
+ MyTraits.SizeOnEncodeMultiplier = 2;
+ MyTraits.SizeOnDecodeMultiplier = 8;
+ MyTraits.RecommendedSampleSize = 1 << 21;
+ }
+
+ THuffmanCodec::~THuffmanCodec() = default;
+
+ ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const {
+ if (Y_UNLIKELY(!Trained))
+ ythrow TCodecException() << " not trained";
+
+ return Impl->Encode(in, bbb);
+ }
+
+ void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const {
+ Impl->Decode(in, bbb);
+ }
+
+ void THuffmanCodec::Save(IOutputStream* out) const {
+ Impl->Save(out);
+ }
+
+ void THuffmanCodec::Load(IInputStream* in) {
+ Impl->Load(in);
+ }
+
+ void THuffmanCodec::DoLearn(ISequenceReader& in) {
+ Impl->Learn(&in);
+ }
+
+ void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {
+ Impl->LearnByFreqs(freqs);
+ Trained = true;
+ }
+
+}
diff --git a/library/cpp/codecs/huffman_codec.h b/library/cpp/codecs/huffman_codec.h
new file mode 100644
index 0000000000..559545b90d
--- /dev/null
+++ b/library/cpp/codecs/huffman_codec.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "codecs.h"
+
+#include <util/generic/ptr.h>
+#include <util/string/cast.h>
+
+namespace NCodecs {
+ // for types greater than char, pipeline with TFreqCodec.
+
+ class THuffmanCodec: public ICodec {
+ class TImpl;
+ TIntrusivePtr<TImpl> Impl;
+
+ public:
+ THuffmanCodec();
+ ~THuffmanCodec() override;
+
+ static TStringBuf MyName() {
+ return "huffman";
+ }
+
+ TString GetName() const override {
+ return ToString(MyName());
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& bbb) const override;
+
+ void Decode(TStringBuf in, TBuffer& bbb) const override;
+
+ void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs);
+
+ protected:
+ void DoLearn(ISequenceReader& in) override;
+ void Save(IOutputStream* out) const override;
+ void Load(IInputStream* in) override;
+ };
+
+}
diff --git a/library/cpp/codecs/pfor_codec.cpp b/library/cpp/codecs/pfor_codec.cpp
new file mode 100644
index 0000000000..f6b3b0920b
--- /dev/null
+++ b/library/cpp/codecs/pfor_codec.cpp
@@ -0,0 +1,22 @@
+#include "pfor_codec.h"
+
+namespace NCodecs {
+ template <>
+ TStringBuf TPForCodec<ui64, true>::MyName() {
+ return "pfor-delta64-sorted";
+ }
+ template <>
+ TStringBuf TPForCodec<ui32, true>::MyName() {
+ return "pfor-delta32-sorted";
+ }
+
+ template <>
+ TStringBuf TPForCodec<ui64, false>::MyName() {
+ return "pfor-ui64";
+ }
+ template <>
+ TStringBuf TPForCodec<ui32, false>::MyName() {
+ return "pfor-ui32";
+ }
+
+}
diff --git a/library/cpp/codecs/pfor_codec.h b/library/cpp/codecs/pfor_codec.h
new file mode 100644
index 0000000000..d7d4bb8bf4
--- /dev/null
+++ b/library/cpp/codecs/pfor_codec.h
@@ -0,0 +1,211 @@
+#pragma once
+
+#include "codecs.h"
+
+#include "delta_codec.h"
+#include "tls_cache.h"
+
+#include <library/cpp/bit_io/bitinput.h>
+#include <library/cpp/bit_io/bitoutput.h>
+#include <util/string/cast.h>
+
+namespace NCodecs {
+ template <typename T, bool WithDelta = false>
+ class TPForCodec: public ICodec {
+ using TUnsigned = std::make_unsigned_t<T>;
+ typedef TDeltaCodec<TUnsigned> TDCodec;
+
+ typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue;
+ static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value");
+
+ static const ui64 BitsInT = sizeof(TUnsigned) * 8;
+
+ TDCodec DeltaCodec;
+
+ public:
+ static TStringBuf MyName();
+
+ TPForCodec() {
+ MyTraits.AssumesStructuredInput = true;
+ MyTraits.SizeOfInputElement = sizeof(T);
+ MyTraits.SizeOnDecodeMultiplier = sizeof(T);
+ }
+
+ TString GetName() const override {
+ return ToString(MyName());
+ }
+
+ ui8 Encode(TStringBuf s, TBuffer& b) const override {
+ b.Clear();
+ if (s.empty()) {
+ return 0;
+ }
+
+ b.Reserve(2 * s.size() + b.Size());
+
+ if (WithDelta) {
+ auto buffer = TBufferTlsCache::TlsInstance().Item();
+ TBuffer& db = buffer.Get();
+ db.Clear();
+ db.Reserve(2 * s.size());
+ DeltaCodec.Encode(s, db);
+ s = TStringBuf{db.data(), db.size()};
+ }
+
+ TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)};
+
+ const ui64 sz = tin.size();
+ ui64 bitcounts[BitsInT + 1];
+ Zero(bitcounts);
+
+ ui32 zeros = 0;
+
+ for (const TValue* it = tin.begin(); it != tin.end(); ++it) {
+ TUnsigned v = 1 + (TUnsigned)*it;
+ ui64 l = MostSignificantBit(v) + 1;
+ ++bitcounts[l];
+
+ if (!v) {
+ ++zeros;
+ }
+ }
+
+ // cumulative bit counts
+ for (ui64 i = 0; i < BitsInT; ++i) {
+ bitcounts[i + 1] += bitcounts[i];
+ }
+
+ bool hasexceptions = zeros;
+ ui64 optimalbits = BitsInT;
+
+ {
+ ui64 excsize = 0;
+ ui64 minsize = sz * BitsInT;
+
+ for (ui64 current = BitsInT; current; --current) {
+ ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6);
+
+ excsize += current * bitcounts[current];
+
+ if (size < minsize) {
+ minsize = size;
+ optimalbits = current;
+ hasexceptions = zeros || sz - bitcounts[current];
+ }
+ }
+ }
+
+ if (!optimalbits || BitsInT == optimalbits) {
+ b.Append((ui8)-1);
+ b.Append(s.data(), s.size());
+ return 0;
+ } else {
+ NBitIO::TBitOutputVector<TBuffer> bout(&b);
+ bout.Write(0, 1);
+ bout.Write(hasexceptions, 1);
+ bout.Write(optimalbits, 6);
+
+ for (const TValue* it = tin.begin(); it != tin.end(); ++it) {
+ TUnsigned word = 1 + (TUnsigned)*it;
+ ui64 len = MostSignificantBit(word) + 1;
+ if (len > optimalbits || !word) {
+ Y_ENSURE(hasexceptions, " ");
+ bout.Write(0, optimalbits);
+ bout.Write(len, 6);
+ bout.Write(word, len);
+ } else {
+ bout.Write(word, optimalbits);
+ }
+ }
+
+ return bout.GetByteReminder();
+ } // the rest of the last byte is zero padded. BitsInT is always > 7.
+ }
+
+ void Decode(TStringBuf s, TBuffer& b) const override {
+ b.Clear();
+ if (s.empty()) {
+ return;
+ }
+
+ b.Reserve(s.size() * sizeof(T) + b.Size());
+
+ ui64 isplain = 0;
+ ui64 hasexceptions = 0;
+ ui64 bits = 0;
+
+ NBitIO::TBitInput bin(s);
+ bin.ReadK<1>(isplain);
+ bin.ReadK<1>(hasexceptions);
+ bin.ReadK<6>(bits);
+
+ if (Y_UNLIKELY(isplain)) {
+ s.Skip(1);
+
+ if (WithDelta) {
+ DeltaCodec.Decode(s, b);
+ } else {
+ b.Append(s.data(), s.size());
+ }
+ } else {
+ typename TDCodec::TDecoder decoder;
+
+ if (hasexceptions) {
+ ui64 word = 0;
+ while (bin.Read(word, bits)) {
+ if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) {
+ --word;
+
+ TValue t = word;
+
+ if (WithDelta) {
+ if (decoder.Decode(t)) {
+ TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)};
+ b.Append(r.data(), r.size());
+ }
+ } else {
+ TStringBuf r{(char*)&t, sizeof(t)};
+ b.Append(r.data(), r.size());
+ }
+ }
+ }
+ } else {
+ ui64 word = 0;
+ T outarr[256 / sizeof(T)];
+ ui32 cnt = 0;
+ while (true) {
+ ui64 v = bin.Read(word, bits);
+
+ if ((!v) | (!word))
+ break;
+
+ --word;
+ TValue t = word;
+
+ if (WithDelta) {
+ if (decoder.Decode(t)) {
+ outarr[cnt++] = decoder.Result;
+ }
+ } else {
+ outarr[cnt++] = t;
+ }
+
+ if (cnt == Y_ARRAY_SIZE(outarr)) {
+ b.Append((const char*)outarr, sizeof(outarr));
+ cnt = 0;
+ }
+ }
+
+ if (cnt) {
+ b.Append((const char*)outarr, cnt * sizeof(T));
+ }
+ }
+ }
+ }
+
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
+
+}
diff --git a/library/cpp/codecs/sample.h b/library/cpp/codecs/sample.h
new file mode 100644
index 0000000000..15f03afcc5
--- /dev/null
+++ b/library/cpp/codecs/sample.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <library/cpp/deprecated/accessors/accessors.h>
+
+#include <util/generic/buffer.h>
+#include <util/generic/vector.h>
+#include <util/random/fast.h>
+#include <util/random/shuffle.h>
+
+#include <functional>
+#include <type_traits>
+
+namespace NCodecs {
+ class ISequenceReader {
+ public:
+ virtual bool NextRegion(TStringBuf& s) = 0;
+
+ virtual ~ISequenceReader() = default;
+ };
+
+ template <class TValue>
+ TStringBuf ValueToStringBuf(TValue&& t) {
+ return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)};
+ }
+
+ template <class TIter>
+ TStringBuf IterToStringBuf(TIter iter) {
+ return ValueToStringBuf(*iter);
+ }
+
+ template <class TItem>
+ class TSimpleSequenceReader: public ISequenceReader {
+ const TVector<TItem>& Items;
+ size_t Idx = 0;
+
+ public:
+ TSimpleSequenceReader(const TVector<TItem>& items)
+ : Items(items)
+ {
+ }
+
+ bool NextRegion(TStringBuf& s) override {
+ if (Idx >= Items.size()) {
+ return false;
+ }
+
+ s = ValueToStringBuf(Items[Idx++]);
+ return true;
+ }
+ };
+
+ template <class TIter, class TGetter>
+ size_t GetInputSize(TIter begin, TIter end, TGetter getter) {
+ size_t totalBytes = 0;
+ for (TIter iter = begin; iter != end; ++iter) {
+ totalBytes += getter(iter).size();
+ }
+ return totalBytes;
+ }
+
+ template <class TIter>
+ size_t GetInputSize(TIter begin, TIter end) {
+ return GetInputSize(begin, end, IterToStringBuf<TIter>);
+ }
+
+ template <class TIter, class TGetter>
+ TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes, TGetter getter) {
+ TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce};
+
+ size_t totalBytes = GetInputSize(begin, end, getter);
+ double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes);
+
+ TVector<TBuffer> result;
+ for (TIter iter = begin; iter != end; ++iter) {
+ if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) {
+ TStringBuf reg = getter(iter);
+ result.emplace_back(reg.data(), reg.size());
+ }
+ }
+ Shuffle(result.begin(), result.end(), rng);
+ return result;
+ }
+
+ template <class TIter>
+ TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes) {
+ return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>);
+ }
+
+}
diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp
new file mode 100644
index 0000000000..d0692fe2a4
--- /dev/null
+++ b/library/cpp/codecs/solar_codec.cpp
@@ -0,0 +1,133 @@
+#include "solar_codec.h"
+
+#include <library/cpp/codecs/greedy_dict/gd_builder.h>
+
+#include <library/cpp/containers/comptrie/comptrie_builder.h>
+#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
+#include <util/stream/length.h>
+#include <util/string/printf.h>
+#include <util/ysaveload.h>
+
+namespace NCodecs {
+ static inline ui32 Append(TBuffer& pool, TStringBuf data) {
+ pool.Append(data.data(), data.size());
+ return pool.Size();
+ }
+
+ void TSolarCodec::DoLearn(ISequenceReader& r) {
+ using namespace NGreedyDict;
+
+ Decoder.clear();
+ Pool.Clear();
+
+ THolder<TEntrySet> set;
+
+ {
+ TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance());
+ TStringBufs bufs;
+
+ TStringBuf m;
+ while (r.NextRegion(m)) {
+ bufs.push_back(pool.AppendString(m));
+ }
+
+ {
+ TDictBuilder b(Settings);
+ b.SetInput(bufs);
+ b.Build(MaxEntries, MaxIterations);
+
+ set = b.ReleaseEntrySet();
+ }
+ }
+
+ set->SetScores(ES_LEN_COUNT);
+
+ {
+ TVector<std::pair<float, TStringBuf>> tmp;
+ tmp.reserve(set->size());
+
+ for (const auto& it : *set) {
+ tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));
+ }
+
+ Sort(tmp.begin(), tmp.end());
+
+ Decoder.reserve(tmp.size() + 1);
+ Decoder.push_back(0);
+
+ for (const auto& it : tmp) {
+ Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed");
+ ui32 endoff = Append(Pool, it.second);
+ Decoder.push_back(endoff);
+ }
+ }
+
+ Pool.ShrinkToFit();
+ Decoder.shrink_to_fit();
+
+ TBufferOutput bout;
+
+ {
+ TVector<std::pair<TStringBuf, ui32>> tmp2;
+ tmp2.reserve(Decoder.size());
+
+ for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) {
+ TStringBuf s = DoDecode(i);
+ tmp2.push_back(std::make_pair(s, i - 1));
+ Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed");
+ }
+
+ Sort(tmp2.begin(), tmp2.end());
+
+ {
+ TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED);
+ for (const auto& it : tmp2) {
+ builder.Add(it.first.data(), it.first.size(), it.second);
+ }
+
+ builder.Save(bout);
+ }
+ }
+
+ Encoder.Init(TBlob::FromBuffer(bout.Buffer()));
+ }
+
+ void TSolarCodec::Save(IOutputStream* out) const {
+ TBlob b = Encoder.Data();
+ ::Save(out, (ui32)b.Size());
+ out->Write(b.Data(), b.Size());
+ }
+
+ void TSolarCodec::Load(IInputStream* in) {
+ ui32 sz;
+ ::Load(in, sz);
+ TLengthLimitedInput lin(in, sz);
+ Encoder.Init(TBlob::FromStream(lin));
+ Pool.Clear();
+ Decoder.clear();
+
+ TVector<std::pair<ui32, TString>> tmp;
+
+ ui32 poolsz = 0;
+ for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) {
+ const TString& s = it.GetKey();
+ tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s));
+ poolsz += Max<ui32>(s.size(), 1);
+ }
+
+ Sort(tmp.begin(), tmp.end());
+
+ Pool.Reserve(poolsz);
+ Decoder.reserve(tmp.size() + 1);
+ Decoder.push_back(0);
+
+ for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) {
+ Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first);
+ Decoder.push_back(Append(Pool, tmp[i].second));
+ }
+
+ Pool.ShrinkToFit();
+ Decoder.shrink_to_fit();
+ }
+
+}
diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h
new file mode 100644
index 0000000000..7158ae7926
--- /dev/null
+++ b/library/cpp/codecs/solar_codec.h
@@ -0,0 +1,244 @@
+#pragma once
+
+#include "codecs.h"
+#include <library/cpp/containers/comptrie/comptrie_trie.h>
+#include <library/cpp/codecs/greedy_dict/gd_builder.h>
+
+#include <util/string/cast.h>
+#include <util/string/escape.h>
+
+namespace NCodecs {
+ // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы.
+ // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе.
+
+ struct TVarIntTraits {
+ static const size_t MAX_VARINT32_BYTES = 5;
+
+ static void Write(ui32 value, TBuffer& b) {
+ while (value > 0x7F) {
+ b.Append(static_cast<ui8>(value) | 0x80);
+ value >>= 7;
+ }
+ b.Append(static_cast<ui8>(value) & 0x7F);
+ }
+
+ static void Read(TStringBuf& r, ui32& value) {
+ ui32 result = 0;
+ for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) {
+ const ui32 b = static_cast<ui8>(r[0]);
+ r.Skip(1);
+ result |= static_cast<ui32>(b & 0x7F) << (7 * count);
+ if (!(b & 0x80)) {
+ value = result;
+ return;
+ } else if (Y_UNLIKELY(r.empty())) {
+ break;
+ }
+ }
+ Y_ENSURE_EX(false, TCodecException() << "Bad data");
+ }
+ };
+
+ struct TShortIntTraits {
+ static const size_t SHORTINT_SIZE_LIMIT = 0x8000;
+
+ Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) {
+ Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method");
+ if (value >= 0x80) {
+ b.Append(static_cast<ui8>(value >> 8) | 0x80);
+ }
+ b.Append(static_cast<ui8>(value));
+ }
+
+ Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {
+ ui32 result = static_cast<ui8>(r[0]);
+ r.Skip(1);
+ if (result >= 0x80) {
+ Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");
+ result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]);
+ r.Skip(1);
+ }
+ value = result;
+ }
+ };
+
+ class TSolarCodec: public ICodec {
+ public:
+ static TStringBuf MyName8k() {
+ return TStringBuf("solar-8k");
+ }
+ static TStringBuf MyName16k() {
+ return TStringBuf("solar-16k");
+ }
+ static TStringBuf MyName32k() {
+ return TStringBuf("solar-32k");
+ }
+ static TStringBuf MyName64k() {
+ return TStringBuf("solar-64k");
+ }
+ static TStringBuf MyName256k() {
+ return TStringBuf("solar-256k");
+ }
+ static TStringBuf MyName() {
+ return TStringBuf("solar");
+ }
+ static TStringBuf MyName8kAdapt() {
+ return TStringBuf("solar-8k-a");
+ }
+ static TStringBuf MyName16kAdapt() {
+ return TStringBuf("solar-16k-a");
+ }
+ static TStringBuf MyName32kAdapt() {
+ return TStringBuf("solar-32k-a");
+ }
+ static TStringBuf MyName64kAdapt() {
+ return TStringBuf("solar-64k-a");
+ }
+ static TStringBuf MyName256kAdapt() {
+ return TStringBuf("solar-256k-a");
+ }
+ static TStringBuf MyNameShortInt() {
+ return TStringBuf("solar-si");
+ }
+
+ explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+ : Settings(s)
+ , MaxEntries(maxentries)
+ , MaxIterations(maxiter)
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.SizeOnDecodeMultiplier = 2;
+ MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;
+ }
+
+ ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+ EncodeImpl<TVarIntTraits>(r, b);
+ return 0;
+ }
+
+ void Decode(TStringBuf r, TBuffer& b) const override {
+ DecodeImpl<TVarIntTraits>(r, b);
+ }
+
+ TString GetName() const override {
+ return ToString(MyName());
+ }
+
+ protected:
+ void DoLearn(ISequenceReader&) override;
+ void Save(IOutputStream*) const override;
+ void Load(IInputStream*) override;
+
+ Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const {
+ return TStringBuf(Pool.Data() + begoff, endoff - begoff);
+ }
+
+ Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const {
+ return SubStr(Decoder[num - 1], Decoder[num]);
+ }
+
+ template <class TTraits>
+ Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const {
+ b.Clear();
+ b.Reserve(r.size());
+ while (!r.empty()) {
+ size_t sz = 0;
+ ui32 val = (ui32)-1;
+ Encoder.FindLongestPrefix(r, &sz, &val);
+ TTraits::Write(val + 1, b);
+ r.Skip(Max<size_t>(sz, 1));
+ }
+ }
+
+ template <class TTraits>
+ Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {
+ b.Clear();
+ b.Reserve(r.size());
+ ui32 v = 0;
+ while (!r.empty()) {
+ TTraits::Read(r, v);
+ TStringBuf s = DoDecode(v);
+ b.Append(s.data(), s.size());
+ }
+ }
+
+ inline bool CanUseShortInt() const {
+ return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;
+ }
+
+ private:
+ typedef TCompactTrie<char, ui32> TEncoder;
+ typedef TVector<ui32> TDecoder;
+
+ TBuffer Pool;
+ TEncoder Encoder;
+ TDecoder Decoder;
+
+ NGreedyDict::TBuildSettings Settings;
+ ui32 MaxEntries;
+ ui32 MaxIterations;
+ };
+
+ // Uses varints or shortints depending on the decoder size
+ class TAdaptiveSolarCodec: public TSolarCodec {
+ public:
+ explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+ : TSolarCodec(maxentries, maxiter, s)
+ {
+ }
+
+ ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+ if (CanUseShortInt()) {
+ EncodeImpl<TShortIntTraits>(r, b);
+ } else {
+ EncodeImpl<TVarIntTraits>(r, b);
+ }
+
+ return 0;
+ }
+
+ void Decode(TStringBuf r, TBuffer& b) const override {
+ if (CanUseShortInt()) {
+ DecodeImpl<TShortIntTraits>(r, b);
+ } else {
+ DecodeImpl<TVarIntTraits>(r, b);
+ }
+ }
+
+ TString GetName() const override {
+ if (CanUseShortInt()) {
+ return ToString(MyNameShortInt());
+ } else {
+ return ToString(MyName());
+ }
+ }
+ };
+
+ class TSolarCodecShortInt: public TSolarCodec {
+ public:
+ explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+ : TSolarCodec(maxentries, maxiter, s)
+ {
+ }
+
+ ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+ EncodeImpl<TShortIntTraits>(r, b);
+ return 0;
+ }
+
+ void Decode(TStringBuf r, TBuffer& b) const override {
+ DecodeImpl<TShortIntTraits>(r, b);
+ }
+
+ TString GetName() const override {
+ return ToString(MyNameShortInt());
+ }
+
+ protected:
+ void Load(IInputStream* in) override {
+ TSolarCodec::Load(in);
+ Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data");
+ }
+ };
+
+}
diff --git a/library/cpp/codecs/static/README b/library/cpp/codecs/static/README
new file mode 100644
index 0000000000..1b07f02433
--- /dev/null
+++ b/library/cpp/codecs/static/README
@@ -0,0 +1 @@
+Support of static libraries in library/cpp/codecs. See library/cpp/codecs/static/example.
diff --git a/library/cpp/codecs/static/builder.cpp b/library/cpp/codecs/static/builder.cpp
new file mode 100644
index 0000000000..93e34a3edb
--- /dev/null
+++ b/library/cpp/codecs/static/builder.cpp
@@ -0,0 +1,39 @@
+#include "builder.h"
+#include "common.h"
+
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+
+#include <library/cpp/codecs/codecs.h>
+
+#include <util/generic/yexception.h>
+#include <util/string/subst.h>
+
+namespace NCodecs {
+ TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo& info) {
+ TStaticCodecInfo result;
+ TCodecPtr codec = ICodec::GetInstance(info.CodecName);
+ Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed");
+
+ codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier);
+ {
+ TStringOutput sout{*result.MutableStoredCodec()};
+ ICodec::Store(&sout, codec);
+ }
+
+ auto& debugInfo = *result.MutableDebugInfo();
+ debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec()));
+ debugInfo.SetCodecName(info.CodecName);
+ debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier);
+ debugInfo.SetTimestamp(info.Timestamp);
+ debugInfo.SetRevisionInfo(info.RevisionInfo);
+ debugInfo.SetTrainingSetComment(info.TrainingSetComment);
+ debugInfo.SetTrainingSetResId(info.TrainingSetResId);
+ return result;
+ }
+
+ TString GetStandardFileName(const TStaticCodecInfo& info) {
+ TString cName = info.GetDebugInfo().GetCodecName();
+ SubstGlobal(cName, ':', '.');
+ return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info";
+ }
+}
diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h
new file mode 100644
index 0000000000..d7533be4d5
--- /dev/null
+++ b/library/cpp/codecs/static/builder.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "static.h"
+
+#include <library/cpp/svnversion/svnversion.h>
+
+#include <util/datetime/base.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/string/builder.h>
+
+namespace NCodecs {
+ struct TCodecBuildInfo {
+ // optimal values from SEARCH-1655
+ TString CodecName = "solar-8k-a:zstd08d-1";
+ float SampleSizeMultiplier = 1;
+
+ // debug info:
+ time_t Timestamp = TInstant::Now().TimeT();
+ TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision()));
+ TString TrainingSetComment; // a human comment on the training data
+ TString TrainingSetResId; // sandbox resid of the training set
+ };
+
+ TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&);
+
+ TString GetStandardFileName(const TStaticCodecInfo&);
+
+}
diff --git a/library/cpp/codecs/static/common.h b/library/cpp/codecs/static/common.h
new file mode 100644
index 0000000000..211de2a27d
--- /dev/null
+++ b/library/cpp/codecs/static/common.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <util/string/hex.h>
+#include <util/digest/city.h>
+#include <util/system/byteorder.h>
+
+namespace NCodecs {
+ template <class T>
+ ui64 DataSignature(const T& t) {
+ static_assert(!std::is_scalar<T>::value, "no scalars");
+ return CityHash64(t.data(), t.size());
+ }
+
+ template <class T>
+ TString HexWriteScalar(T t) {
+ static_assert(std::is_scalar<T>::value, "scalars only");
+ t = LittleToBig(t);
+ TString res = HexEncode(&t, sizeof(t));
+ res.to_lower();
+ return res;
+ }
+
+ template <class T>
+ T HexReadScalar(TStringBuf s) {
+ static_assert(std::is_scalar<T>::value, "scalars only");
+ T t = 0;
+ HexDecode(s.data(), Min(s.size(), sizeof(T)), &t);
+ t = BigToLittle(t);
+ return t;
+ }
+
+}
diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp
new file mode 100644
index 0000000000..5b750b717e
--- /dev/null
+++ b/library/cpp/codecs/static/example/example.cpp
@@ -0,0 +1,43 @@
+#include "example.h"
+
+#include <library/cpp/codecs/static/static.h>
+
+#include <util/generic/yexception.h>
+
+extern "C" {
+extern const ui8 codec_info_huff_20160707[];
+extern const ui32 codec_info_huff_20160707Size;
+extern const ui8 codec_info_sa_huff_20160707[];
+extern const ui32 codec_info_sa_huff_20160707Size;
+};
+
+namespace NStaticCodecExample {
+ static const NCodecs::TCodecConstPtr CODECS[] = {
+ nullptr,
+ NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size),
+ NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size),
+ };
+
+ static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size");
+
+ void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) {
+ Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv);
+ out.Clear();
+ if (!in) {
+ return;
+ }
+ CODECS[dv]->Encode(in, out);
+ out.Append((char)dv);
+ }
+
+ void Decode(TBuffer& out, TStringBuf in) {
+ out.Clear();
+ if (!in) {
+ return;
+ }
+ EDictVersion dv = (EDictVersion)in.back();
+ Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv);
+ in.Chop(1);
+ CODECS[dv]->Decode(in, out);
+ }
+}
diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h
new file mode 100644
index 0000000000..f9b3a7324b
--- /dev/null
+++ b/library/cpp/codecs/static/example/example.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+#include <util/generic/buffer.h>
+
+namespace NStaticCodecExample {
+ enum EDictVersion : ui8 {
+ DV_NULL = 0,
+ DV_HUFF_20160707,
+ DV_SA_HUFF_20160707,
+ DV_COUNT
+ };
+
+ void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707);
+
+ void Decode(TBuffer&, TStringBuf);
+}
diff --git a/library/cpp/codecs/static/example/huffman.1467494385.codec_info b/library/cpp/codecs/static/example/huffman.1467494385.codec_info
new file mode 100644
index 0000000000..5fc18270a6
--- /dev/null
+++ b/library/cpp/codecs/static/example/huffman.1467494385.codec_info
Binary files differ
diff --git a/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info
new file mode 100644
index 0000000000..d36d8e24ec
--- /dev/null
+++ b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info
Binary files differ
diff --git a/library/cpp/codecs/static/example/ya.make b/library/cpp/codecs/static/example/ya.make
new file mode 100644
index 0000000000..ca6c5fd900
--- /dev/null
+++ b/library/cpp/codecs/static/example/ya.make
@@ -0,0 +1,24 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ GLOBAL example.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+)
+
+ARCHIVE_ASM(
+ "solar-8k-a.huffman.1467494385.codec_info"
+ NAME codec_info_sa_huff_20160707
+)
+
+ARCHIVE_ASM(
+ "huffman.1467494385.codec_info"
+ NAME codec_info_huff_20160707
+)
+
+END()
diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp
new file mode 100644
index 0000000000..44a07dd73a
--- /dev/null
+++ b/library/cpp/codecs/static/static.cpp
@@ -0,0 +1,98 @@
+#include "static.h"
+#include "common.h"
+
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/archive/yarchive.h>
+
+#include <util/draft/datetime.h>
+
+#include <util/string/builder.h>
+#include <util/stream/buffer.h>
+#include <util/stream/mem.h>
+#include <util/string/hex.h>
+#include <util/ysaveload.h>
+
+namespace NCodecs {
+ static constexpr TStringBuf STATIC_CODEC_INFO_MAGIC = "CodecInf";
+
+ static TStringBuf GetStaticCodecInfoMagic() {
+ return STATIC_CODEC_INFO_MAGIC;
+ }
+
+ void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo& info) {
+ TBufferOutput bout;
+ info.SerializeToArcadiaStream(&bout);
+ ui64 hash = DataSignature(bout.Buffer());
+ out.Write(GetStaticCodecInfoMagic());
+ ::Save(&out, hash);
+ ::Save(&out, bout.Buffer());
+ }
+
+ TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in) {
+ {
+ TBuffer magic;
+ magic.Resize(GetStaticCodecInfoMagic().size());
+ Y_ENSURE_EX(in.Read(magic.Data(), GetStaticCodecInfoMagic().size()) == GetStaticCodecInfoMagic().size(),
+ TCodecException() << "bad codec info");
+ Y_ENSURE_EX(TStringBuf(magic.data(), magic.size()) == GetStaticCodecInfoMagic(),
+ TCodecException() << "bad codec info");
+ }
+
+ ui64 hash;
+ ::Load(&in, hash);
+ TBuffer info;
+ ::Load(&in, info);
+ Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info");
+
+ TStaticCodecInfo result;
+ Y_ENSURE_EX(result.ParseFromArray(info.data(), info.size()), TCodecException() << "bad codec info");
+
+ return result;
+ }
+
+ TString SaveCodecInfoToString(const TStaticCodecInfo& info) {
+ TStringStream s;
+ SaveCodecInfoToStream(s, info);
+ return s.Str();
+ }
+
+ TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) {
+ TMemoryInput m{data.data(), data.size()};
+ return LoadCodecInfoFromStream(m);
+ }
+
+ TString FormatCodecInfo(const TStaticCodecInfo& ci) {
+ TStringBuilder s;
+ s << "codec name: " << ci.GetDebugInfo().GetCodecName() << Endl;
+ s << "codec hash: " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl;
+ s << "dict size: " << ci.GetStoredCodec().Size() << Endl;
+ s << "sample mult: " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl;
+ s << "orig.compress: " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl;
+ s << "timestamp: " << ci.GetDebugInfo().GetTimestamp() << " ("
+ << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString()
+ << ")" << Endl;
+ s << "revision: " << ci.GetDebugInfo().GetRevisionInfo() << Endl;
+ s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl;
+ s << "training set resId: " << ci.GetDebugInfo().GetTrainingSetResId() << Endl;
+ return s;
+ }
+
+ TString LoadStringFromArchive(const ui8* begin, size_t size) {
+ TArchiveReader ar(TBlob::NoCopy(begin, size));
+ Y_VERIFY(ar.Count() == 1, "invalid number of entries");
+ auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0));
+ return TString{blob.AsCharPtr(), blob.Size()};
+ }
+
+ TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) {
+ return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec());
+ }
+
+ TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) {
+ const auto& data = LoadStringFromArchive(begin, size);
+ const auto& info = LoadCodecInfoFromString(data);
+ const auto& codec = RestoreCodecFromCodecInfo(info);
+ Y_ENSURE_EX(codec, TCodecException() << "null codec");
+ return codec;
+ }
+}
diff --git a/library/cpp/codecs/static/static.h b/library/cpp/codecs/static/static.h
new file mode 100644
index 0000000000..c1eaed2a74
--- /dev/null
+++ b/library/cpp/codecs/static/static.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <library/cpp/codecs/codecs.h>
+
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/stream/output.h>
+
+namespace NCodecs {
+ class TStaticCodecInfo;
+
+ // load
+
+ TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&);
+
+ TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data);
+
+ TString LoadStringFromArchive(const ui8* begin, size_t size);
+
+ TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size);
+
+ // save
+
+ TString SaveCodecInfoToString(const TStaticCodecInfo&);
+
+ void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo&);
+
+ // misc
+
+ TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in);
+
+ TString FormatCodecInfo(const TStaticCodecInfo&);
+
+}
diff --git a/library/cpp/codecs/static/static_codec_info.proto b/library/cpp/codecs/static/static_codec_info.proto
new file mode 100644
index 0000000000..362abb4dad
--- /dev/null
+++ b/library/cpp/codecs/static/static_codec_info.proto
@@ -0,0 +1,17 @@
+package NCodecs;
+
+message TStaticCodecInfo {
+ message TDebugInfo {
+ optional string CodecName = 1; // the exact codec variant name
+ optional uint64 Timestamp = 2; // when the codec was built
+ optional string RevisionInfo = 3; // the arcadia revision info
+ optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression
+ optional float Compression = 5; // the compression on the training set ((raw_size - coded_size) / raw_size)
+ optional string TrainingSetComment = 6; // a human readable description of the training set
+ optional string TrainingSetResId = 7; // the training set sandbox resource id
+ optional uint64 StoredCodecHash = 8; // cityhash64(data)
+ }
+
+ optional bytes StoredCodec = 1; // the data of the codec
+ optional TDebugInfo DebugInfo = 2; // misc debug info which could be useful in finding whereabouts later
+}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp
new file mode 100644
index 0000000000..fe77691280
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ct_common.cpp
@@ -0,0 +1,74 @@
+#include "ct_common.h"
+
+#include <library/cpp/codecs/codecs.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/string_utils/base64/base64.h>
+
+#include <util/stream/output.h>
+#include <util/string/builder.h>
+#include <util/system/hp_timer.h>
+
+namespace NCodecs {
+ TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const {
+ TStringBuilder s;
+ s << "raw size/item: " << RawSizePerRecord() << Endl;
+ s << "enc.size/item: " << EncSizePerRecord() << Endl;
+ if (checkMode) {
+ s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl;
+ }
+ s << "enc time us/item: " << EncTimePerRecordUS() << Endl;
+ s << "dec time us/item: " << DecTimePerRecordUS() << Endl;
+ s << "dict size: " << info.GetStoredCodec().Size() << Endl;
+ s << "compression: " << AsPercent(Compression()) << " %" << Endl;
+ if (checkMode) {
+ s << "orig.compression: " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl;
+ }
+ return s;
+ }
+
+ TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) {
+ TComprStats stats;
+
+ TBuffer encodeBuffer;
+ TBuffer decodeBuffer;
+ for (const auto& data : input) {
+ encodeBuffer.Clear();
+ decodeBuffer.Clear();
+
+ stats.Records += 1;
+ stats.RawSize += data.size();
+
+ THPTimer timer;
+ c.Encode(data, encodeBuffer);
+ stats.EncSize += encodeBuffer.size();
+ stats.EncSeconds += timer.PassedReset();
+
+ c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer);
+ stats.DecSeconds += timer.PassedReset();
+ Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records);
+ }
+
+ return stats;
+ }
+
+ void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) {
+ TStringBuf bin(blob.AsCharPtr(), blob.Size());
+ TStringBuf line;
+ TString buffer;
+ while (bin.ReadLine(line)) {
+ if (DSF_BASE64_LF == fmt) {
+ Base64Decode(line, buffer);
+ line = buffer;
+ }
+ if (!line) {
+ continue;
+ }
+ result.emplace_back(line.data(), line.size());
+ }
+ }
+
+ TBlob GetInputBlob(const TString& dataFile) {
+ return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin);
+ }
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h
new file mode 100644
index 0000000000..9d3dcbda93
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ct_common.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/memory/blob.h>
+#include <cmath>
+
+namespace NCodecs {
+ class TStaticCodecInfo;
+ class ICodec;
+
+ struct TComprStats {
+ double EncSeconds = 0;
+ double DecSeconds = 0;
+ size_t Records = 0;
+ size_t RawSize = 0;
+ size_t EncSize = 0;
+
+ static double Round(double n, size_t decPlaces = 2) {
+ double p = pow(10, decPlaces);
+ return round(n * p) / p;
+ }
+
+ static double AsPercent(double n) {
+ return Round(n * 100);
+ }
+
+ static double AsMicroSecond(double s) {
+ return s * 1000000;
+ }
+
+ double PerRecord(double n) const {
+ return Round((double)(Records ? n / Records : 0));
+ }
+
+ double Compression() const {
+ return ((double)RawSize - (double)EncSize) / RawSize;
+ }
+
+ double EncTimePerRecordUS() const {
+ return PerRecord(AsMicroSecond(EncSeconds));
+ }
+
+ double DecTimePerRecordUS() const {
+ return PerRecord(AsMicroSecond(DecSeconds));
+ }
+
+ double RawSizePerRecord() const {
+ return PerRecord(RawSize);
+ }
+
+ double EncSizePerRecord() const {
+ return PerRecord(EncSize);
+ }
+
+ double OldEncSizePerRecord(double compr) const {
+ return PerRecord((1 - compr) * RawSize);
+ }
+
+ TString Format(const TStaticCodecInfo&, bool checkMode) const;
+ };
+
+ TComprStats TestCodec(const ICodec&, const TVector<TString>& data);
+
+ enum EDataStreamFormat {
+ DSF_NONE,
+ DSF_PLAIN_LF /* "plain" */,
+ DSF_BASE64_LF /* "base64" */,
+ };
+
+ void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&);
+
+ TBlob GetInputBlob(const TString& dataFile);
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make
new file mode 100644
index 0000000000..d624222dad
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ya.make
@@ -0,0 +1,19 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ ct_common.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/getopt/small
+ library/cpp/string_utils/base64
+ util/draft
+)
+
+GENERATE_ENUM_SERIALIZATION(ct_common.h)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README
new file mode 100644
index 0000000000..723a68300b
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/README
@@ -0,0 +1,4 @@
+This is a viewer for generated codec and utility for verification of the compression quality on a new data.
+
+Usage:
+static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
new file mode 100644
index 0000000000..9c8d568d82
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
@@ -0,0 +1,73 @@
+#include <library/cpp/codecs/static/tools/common/ct_common.h>
+#include <library/cpp/codecs/static/static.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/codecs/codecs.h>
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/digest/city.h>
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/stream/buffer.h>
+#include <util/stream/format.h>
+#include <util/string/builder.h>
+
+int main(int argc, char** argv) {
+ NCodecs::TCodecPtr codecPtr;
+ NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
+ TString codecFile;
+ bool testCompression = false;
+
+ auto opts = NLastGetopt::TOpts::Default();
+ opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator.");
+ opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");
+ NCodecs::TStaticCodecInfo codec;
+
+ opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) {
+ codecFile = name;
+ codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll()));
+ codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
+ })
+ .Required()
+ .Help(".codec_info file with serialized static data for codec");
+
+ opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance");
+
+ opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format");
+
+ opts.SetFreeArgsMin(0);
+ opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");
+
+ NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+
+ Cout << codecFile << Endl;
+ Cout << NCodecs::FormatCodecInfo(codec) << Endl;
+
+ if (testCompression) {
+ if (NCodecs::DSF_NONE == fmt) {
+ Cerr << "Specify format (-f|--format) for testing set input" << Endl;
+ exit(1);
+ }
+
+ Cout << "Reading testing set data ... " << Flush;
+
+ TVector<TString> allData;
+ for (const auto& freeArg : res.GetFreeArgs()) {
+ NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
+ }
+
+ if (!res.GetFreeArgs()) {
+ NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
+ }
+
+ Cout << "Done" << Endl << Endl;
+
+ Cout << "records: " << allData.size() << Endl;
+ Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
+
+ Cout << "Testing compression ... " << Flush;
+ auto stats = NCodecs::TestCodec(*codecPtr, allData);
+ Cout << "Done" << Endl << Endl;
+
+ Cout << stats.Format(codec, true) << Endl;
+ }
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
new file mode 100644
index 0000000000..90e06ca448
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
@@ -0,0 +1,16 @@
+PROGRAM()
+
+OWNER(velavokr)
+
+SRCS(
+ static_codec_checker.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/codecs/static/tools/common
+ library/cpp/getopt/small
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README
new file mode 100644
index 0000000000..e6bb52b959
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/README
@@ -0,0 +1,4 @@
+This is a utility for reproducible teaching of a codec. And also for saving it into a file with a unique name for a static compilation as a resource.
+
+Usage:
+static_codec_generator -t -m 'the training data description' -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
new file mode 100644
index 0000000000..45fdb5c5fe
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
@@ -0,0 +1,82 @@
+#include <library/cpp/codecs/static/tools/common/ct_common.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/codecs/static/builder.h>
+#include <library/cpp/codecs/codecs.h>
+
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/string/builder.h>
+
+int main(int argc, char** argv) {
+ NCodecs::TCodecBuildInfo info;
+ NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
+
+ auto opts = NLastGetopt::TOpts::Default();
+ opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");
+ opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin");
+
+ opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set");
+
+ opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set");
+
+ opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName);
+
+ opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size");
+
+ opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format");
+
+ opts.AddLongOption("list-codecs").NoArgument().Handler0([]() {
+ Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;
+ exit(0);
+ })
+ .Optional()
+ .Help("list available codecs");
+
+ opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info
+
+ opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info
+
+ opts.SetFreeArgsMin(0);
+ opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files");
+
+ NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+
+ Cout << "Reading training set data ... " << Flush;
+ TVector<TString> allData;
+ for (const auto& freeArg : res.GetFreeArgs()) {
+ NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
+ }
+
+ if (!res.GetFreeArgs()) {
+ NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
+ }
+ Cout << "Done" << Endl << Endl;
+
+ Cout << "records: " << allData.size() << Endl;
+ Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
+
+ Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush;
+ auto codec = NCodecs::BuildStaticCodec(allData, info);
+ Cout << "Done" << Endl;
+
+ TString codecName = NCodecs::GetStandardFileName(codec);
+ NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
+
+ Cout << "Testing compression ... " << Flush;
+ auto stats = NCodecs::TestCodec(*codecPtr, allData);
+ Cout << "Done" << Endl << Endl;
+
+ codec.MutableDebugInfo()->SetCompression(stats.Compression());
+
+ Cout << stats.Format(codec, false) << Endl;
+
+ Cout << "Saving as " << codecName << " ... " << Flush;
+ {
+ TUnbufferedFileOutput fout{codecName};
+ NCodecs::SaveCodecInfoToStream(fout, codec);
+ fout.Finish();
+ }
+ Cout << "Done" << Endl << Endl;
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
new file mode 100644
index 0000000000..efbc440dd1
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
@@ -0,0 +1,17 @@
+PROGRAM()
+
+OWNER(velavokr)
+
+SRCS(
+ static_codec_generator.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/codecs/static/tools/common
+ library/cpp/digest/md5
+ library/cpp/getopt/small
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/tests/canondata/result.json b/library/cpp/codecs/static/tools/tests/canondata/result.json
new file mode 100644
index 0000000000..7a637c6763
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/canondata/result.json
@@ -0,0 +1,6 @@
+{
+ "static_codec_tools.test_static_codec_tools": {
+ "checksum": "960e3c8c57fb846ab53ccbd07e287233",
+ "uri": "sbr://144512644/static_codec_tools.test_static_codec_tools/solar-8k-a.huffman.1467494385.codec_info"
+ }
+} \ No newline at end of file
diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
new file mode 100644
index 0000000000..db4140e370
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import yatest.common as tt
+import os.path as op
+
+def test_static_codec_tools():
+ tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")]
+ + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1",
+ "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"],
+ timeout=60)
+ assert(op.exists("solar-8k-a.huffman.1467494385.codec_info"))
+ tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"),
+ args=["-c", "solar-8k-a.huffman.1467494385.codec_info"],
+ timeout=60)
+ tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")]
+ + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"],
+ timeout=60)
+ return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info")
diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make
new file mode 100644
index 0000000000..c5324eaf53
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/ya.make
@@ -0,0 +1,20 @@
+PY2TEST()
+
+OWNER(velavokr)
+
+TEST_SRCS(static_codec_tools.py)
+
+DATA(sbr://143310406)
+
+TIMEOUT(4200)
+
+TAG(ya:not_autocheck)
+
+DEPENDS(
+ library/cpp/codecs/static/tools/static_codec_checker
+ library/cpp/codecs/static/tools/static_codec_generator
+)
+
+
+
+END()
diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make
new file mode 100644
index 0000000000..dd3e8437aa
--- /dev/null
+++ b/library/cpp/codecs/static/tools/ya.make
@@ -0,0 +1,5 @@
+RECURSE(
+ common
+ static_codec_generator
+ static_codec_checker
+)
diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp
new file mode 100644
index 0000000000..b47c279ed1
--- /dev/null
+++ b/library/cpp/codecs/static/ut/builder_ut.cpp
@@ -0,0 +1,57 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/codecs/static/builder.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <util/string/vector.h>
+
+class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase {
+ UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest)
+ UNIT_TEST(TestBuild)
+ UNIT_TEST_SUITE_END();
+
+private:
+ TVector<TString> PrepareData() {
+ TVector<TString> data;
+ for (ui32 i = 'a'; i <= 'z'; ++i) {
+ data.push_back(TString(1, (char)i));
+ }
+ return data;
+ }
+
+ void TestBuild() {
+ TVector<TString> data;
+ NCodecs::TCodecBuildInfo info;
+ info.CodecName = "huffman";
+ info.SampleSizeMultiplier = 2;
+ info.Timestamp = 1467494385;
+ info.RevisionInfo = "r2385905";
+ info.TrainingSetComment = "some dummy data";
+ info.TrainingSetResId = "sbr://1234";
+ auto res = NCodecs::BuildStaticCodec(PrepareData(), info);
+ UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(),
+ "StoredCodec: \"\\007\\000huffman@S\\000a"
+ "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o"
+ "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>"
+ "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8."
+ "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7"
+ "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0"
+ "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9"
+ "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" "
+ "DebugInfo { "
+ "CodecName: \"huffman\" "
+ "Timestamp: 1467494385 "
+ "RevisionInfo: \"r2385905\" "
+ "SampleSizeMultiplier: 2 "
+ "TrainingSetComment: \"some dummy data\" "
+ "TrainingSetResId: \"sbr://1234\" "
+ "StoredCodecHash: 2509195835471488613 "
+ "}");
+
+ UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info");
+ UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL);
+
+ auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res));
+ UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString());
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest);
diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp
new file mode 100644
index 0000000000..57e1e62887
--- /dev/null
+++ b/library/cpp/codecs/static/ut/static_ut.cpp
@@ -0,0 +1,27 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/codecs/static/example/example.h>
+
+class TStaticCodecUsageTest: public NUnitTest::TTestBase {
+ UNIT_TEST_SUITE(TStaticCodecUsageTest)
+ UNIT_TEST(TestUsage)
+ UNIT_TEST_SUITE_END();
+
+private:
+ void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) {
+ const TStringBuf letov = "Всё идёт по плану";
+
+ TBuffer outEnc, outDec;
+ NStaticCodecExample::Encode(outEnc, letov, dv);
+ NStaticCodecExample::Decode(outDec, TStringBuf{outEnc.data(), outEnc.size()});
+
+ UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize);
+ UNIT_ASSERT_EQUAL(TStringBuf(outDec.data(), outDec.size()), letov);
+ }
+
+ void TestUsage() {
+ DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u);
+ DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u);
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest)
diff --git a/library/cpp/codecs/static/ut/ya.make b/library/cpp/codecs/static/ut/ya.make
new file mode 100644
index 0000000000..b9116097d8
--- /dev/null
+++ b/library/cpp/codecs/static/ut/ya.make
@@ -0,0 +1,14 @@
+UNITTEST_FOR(library/cpp/codecs/static)
+
+OWNER(velavokr)
+
+SRCS(
+ builder_ut.cpp
+ static_ut.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs/static/example
+)
+
+END()
diff --git a/library/cpp/codecs/static/ya.make b/library/cpp/codecs/static/ya.make
new file mode 100644
index 0000000000..00e00fd8d4
--- /dev/null
+++ b/library/cpp/codecs/static/ya.make
@@ -0,0 +1,18 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ builder.cpp
+ static_codec_info.proto
+ static.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/archive
+ library/cpp/svnversion
+ util/draft
+)
+
+END()
diff --git a/library/cpp/codecs/tls_cache.cpp b/library/cpp/codecs/tls_cache.cpp
new file mode 100644
index 0000000000..0a1b32bda1
--- /dev/null
+++ b/library/cpp/codecs/tls_cache.cpp
@@ -0,0 +1,4 @@
+#include "tls_cache.h"
+
+namespace NCodecs {
+}
diff --git a/library/cpp/codecs/tls_cache.h b/library/cpp/codecs/tls_cache.h
new file mode 100644
index 0000000000..0184e4bb6c
--- /dev/null
+++ b/library/cpp/codecs/tls_cache.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <util/generic/buffer.h>
+#include <util/generic/deque.h>
+#include <util/generic/noncopyable.h>
+#include <util/generic/strbuf.h>
+#include <util/system/tls.h>
+#include <util/thread/singleton.h>
+
+namespace NCodecs {
+ template <class TItem>
+ struct TClear {
+ void operator()(TItem& item) const {
+ item.Clear();
+ }
+ };
+
+ template <class TItem, class TCleaner = TClear<TItem>>
+ class TTlsCache {
+ using TSelf = TTlsCache<TItem, TCleaner>;
+
+ struct TItemHolder: public TIntrusiveListItem<TItemHolder> {
+ TItemHolder(TSelf& factory)
+ : Factory(factory)
+ {
+ }
+
+ void Release() {
+ Factory.Release(*this);
+ }
+
+ TSelf& Factory;
+ TItem Item;
+ };
+
+ class TItemGuard {
+ public:
+ explicit TItemGuard(TSelf& fact)
+ : Holder(fact.Acquire())
+ {
+ }
+
+ TItemGuard(TItemGuard&& other) noexcept {
+ *this = std::move(other);
+ }
+
+ TItemGuard& operator=(TItemGuard&& other) noexcept {
+ if (&other != this) {
+ std::swap(Holder, other.Holder);
+ }
+ return *this;
+ }
+
+ ~TItemGuard() {
+ if (Holder) {
+ Holder->Release();
+ }
+ }
+
+ TItem& Get() & {
+ Y_ASSERT(Holder);
+ return Holder->Item;
+ }
+
+ TItem& Get() && = delete;
+
+ private:
+ TItemHolder* Holder = nullptr;
+ };
+
+ public:
+ TItemGuard Item() {
+ return TItemGuard(*this);
+ }
+
+ static TSelf& TlsInstance() {
+ return *FastTlsSingleton<TSelf>();
+ }
+
+ private:
+ TItemHolder* Acquire() {
+ if (Free.Empty()) {
+ return new TItemHolder(*this);
+ } else {
+ return Free.PopBack();
+ }
+ }
+
+ void Release(TItemHolder& item) {
+ Cleaner(item.Item);
+ Free.PushBack(&item);
+ }
+
+ private:
+ TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free;
+ TCleaner Cleaner;
+ };
+
+ using TBufferTlsCache = TTlsCache<TBuffer>;
+}
diff --git a/library/cpp/codecs/ut/codecs_ut.cpp b/library/cpp/codecs/ut/codecs_ut.cpp
new file mode 100644
index 0000000000..caf6089aef
--- /dev/null
+++ b/library/cpp/codecs/ut/codecs_ut.cpp
@@ -0,0 +1,1360 @@
+#include <library/cpp/codecs/delta_codec.h>
+#include <library/cpp/codecs/huffman_codec.h>
+#include <library/cpp/codecs/pfor_codec.h>
+#include <library/cpp/codecs/solar_codec.h>
+#include <library/cpp/codecs/zstd_dict_codec.h>
+#include <library/cpp/codecs/comptable_codec.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/generic/buffer.h>
+#include <util/string/util.h>
+#include <util/string/hex.h>
+#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
+
+namespace {
+ const char* TextValues[] = {
+ "! сентября газета",
+ "!(возмездие это)!",
+ "!(материнский капитал)",
+ "!(пермь березники)",
+ "!биография | !жизнь / + розинг | зворыгин & изобретение | телевидение | электронно лучевая трубка",
+ "!овсиенко николай павлович",
+ "!путин",
+ "\"i'm on you\" p. diddy тимати клип",
+ "\"билайн\" представит собственный планшет",
+ "\"в особо крупном размере\"",
+ "\"викиликс\" джулиан ассанж",
+ "\"вимм билль данн",
+ "\"газэнергосеть астрахань",
+ "\"газэнергосеть астрахань\"",
+ "\"домодедово\" ту-154",
+ "\"жилина\" \"спартак\" видео",
+ "\"зелёнsq шершнm\"",
+ "\"зелёного шершня\"",
+ "\"золотой граммофон\" марины яблоковой",
+ "\"золотой граммофон-2010\"",
+ "\"калинниковы\"",
+ "\"манчестер юнайтед\" (англия) \"валенсия\" (испания) 1:1 (0:1)",
+ "\"маркер\"",
+ "\"моника\" засыпает москву снегом",
+ "\"моника\" снегопад",
+ "\"о безопасности\",",
+ "\"памятку\" для пассажиров воздушных международных рейсов",
+ "\"петровский парк\" и \"ходынское поле\"",
+ "\"путинская\" трава",
+ "\"пятерочка\"купила \"копейку\"",
+ "\"пятёрочка\" и \"копейка\" объединились",
+ "\"реал\" \"осер\" 4:0",
+ "\"речь мутко\"",
+ "\"российский лес 2010\"",
+ "\"ростехинвентаризация федеральное бти\" рубцов",
+ "\"саня останется с нами\",",
+ "\"следопыт\" реалити шоу",
+ "\"слышишь\" молодые авторы",
+ "\"стадион\"",
+ "\"ходынское поле\" метро",
+ "\"хроники нарнии\"",
+ "\"чистая вода\"",
+ "\"школа деда мороза\"",
+ "# asus -1394",
+ "# сторонники wikileaks",
+ "#106#",
+ "#11",
+ "#8 какой цвет",
+ "#если клиент",
+ "$ 13,79",
+ "$ xnj ,s dct ,skb ljdjkmys !!!",
+ "$ в день",
+ "$ диск компьютера",
+ "$.ajax",
+ "$125 000",
+ "$курс",
+ "% в си",
+ "% влады",
+ "% годовых",
+ "% женщин и % мужчин в россии",
+ "% занятости персонала",
+ "% инфляции 2010",
+ "% инфляции в 2010 г.",
+ "% налога",
+ "% налогов в 2010г.",
+ "% общего количества",
+ "% от числа",
+ "% по налогу на прибыль организации",
+ "%24",
+ "%академия%",
+ "%комарова%татьяна",
+ "& в 1с",
+ "&& (+не существует | !такой проблемы)",
+ "&gt;&gt;&gt;скачать | download c cs strikez.clan.su&lt;&lt;&lt;",
+ "&gt;hbq nbityrjd",
+ "&lt; какой знак",
+ "&lt; лицей | &lt; техническая школа# &lt; история#&lt; лицей сегодня#&lt; перечень профессий#&lt; руководство лицея#&lt; прием учащихся#&lt; контакты#&lt; схема проезда#&lt; фотогалереяистория создания лицея и основные этапы путиулица купчинская дом 28",
+ "&lt;&lt;link&gt;&gt;",
+ "&lt;/storage&gt;",
+ "&lt;bfnkjy",
+ "&lt;bktntd",
+ "&lt;cr",
+ "&lt;ddr3&gt;",
+ "&lt;e[ufknthcrbq abyfycjdsq",
+ "&lt;fcctqys",
+ "&lt;fhcf",
+ "&lt;fhctkjyf he,by",
+ "&lt;firbhbz",
+ "&lt;fyr djphj;ltybt",
+ "&lt;fyr vjcrds",
+ "&lt;fyr резерв",
+ "&lt;fyufkjh",
+ "&lt;index&gt;",
+ "&lt;jkmifz jrhe;yfz rbtd",
+ "&lt;kbpytws",
+ "&lt;megafon&gt; интернет",
+ "&lt;thtpybrb gthvcrbq rhfq",
+ "&lt;tkjxrf",
+ "&lt;беларусь это мы",
+ "&lt;бокс, версия ibf",
+ "designer tree svc",
+ "seriesg810",
+ "doll makers",
+ "rotten.com",
+ "evening gowns",
+ "discover",
+ "south carolina escorts",
+ "forkliftjobsinhousron",
+ "mailbox",
+ "alexis",
+ "espn.com mlb",
+ "gypsy.chat.2k",
+ "the man in the mirror",
+ "azteca",
+ "sebastian telfair - jamel thomas",
+ "kirby",
+ "java",
+ "trike motorcycles",
+ "piasecki helicopter",
+ "wicca binding spells",
+ "pier park panama city beach .com",
+ "continente europeo",
+ "asswatchers.com",
+ "asswatchers.com",
+ "easton stealth stiff flex cnt adult baseball bat - 3",
+ "facesofdeath",
+ "video of 9 11",
+ "profileedit.myspace.com",
+ "georgia snakes",
+ "yahoo.com",
+ "google",
+ "http wwwclassicindustries .corvettes-roadsters.com",
+ "arington training stable",
+ "find bred of dog",
+ "southpark contact tables for myspace",
+ "symptoms of laryngitis",
+ "suzuki stickers",
+ "avianca",
+ "radio shack",
+ "dominican republic pictures",
+ "recent",
+ "mapquest",
+ "http myspace .com",
+ "research chemicals supplies",
+ "winn dixie.com",
+ "drivers 20guide.com",
+ "dylan whitley north carolina",
+ "google com",
+ "order wild horses cigarettes",
+ "yahoocom",
+ "fl runners",
+ "aol companion install",
+ "nbc.comdond 59595 6",
+ "directv.com",
+ "motorsports insurance",
+ "cartoonnetwork",
+ "pop warner-victorville",
+ "black iorn spars",
+ "goog",
+ "the suns",
+ "ebay",
+ "pop warner",
+ "philadelphia cream cheese",
+ "oklahoma",
+ "doudleday books.com",
+ "javascript download",
+ "city of nacogdoches",
+ "sfyl",
+ "myspace.com",
+ "baptism pictures",
+ "games",
+ "depredadores sexuales",
+ "mycl.cravelyrics.com",
+ "become a bone marrow donner",
+ "vintage copies",
+ "ford dealership",
+ "candystand",
+ "smarthairypussyom",
+ "yahoo.com",
+ "vanderbilt.edu",
+ "ebay",
+ "grouper",
+ "mys",
+ "myrsa and birth defects",
+ "hatteras rentals",
+ "female escorts",
+ "ja rule",
+ "meat bluesheet",
+ "yahoo",
+ "american disability act court cases",
+ "clearview cinemas",
+ "hard69.com",
+ "make a living will for free",
+ "fat asses",
+ "flashback concert in atlanta ga",
+ "fucking",
+ "flat abdomen exercises",
+ "big brother facial",
+ "german dictionary",
+ "black dick",
+ "ebonymovies",
+ "airsoft rifles",
+ "best fishing days calander",
+ "tattoo",
+ "impressions",
+ "cs.com",
+ "northwest airlines reservations",
+ "halo 3",
+ "wallbaums",
+ "chat room listings",
+ "waterbury ct warrants",
+ "pictures of chad michael murry",
+ "yahoo",
+ "install wallpaper",
+ "halo 3",
+ "clits and tits",
+ "prothsmouth general circuit courts",
+ "old hawthorne columbia",
+ "jess lee photos",
+ "no deposit casino bonus",
+ "bbc gladiator dressed to kill",
+ "anemagazine.com",
+ "lyrics unfaithful",
+ "gold bars found",
+ "art.comhttp",
+ "free unlock key",
+ "man o war lost a race",
+ "blue cross and blue shield",
+ "phenergan",
+ "myspace.com",
+ "http www.constitutional court.com",
+ "monster trucks",
+ "the breeze fort myers fla.newspaper",
+ "davis origin name",
+ "upper deck.com",
+ "arizona",
+ "akira lane",
+ "ebaumsworld",
+ "union pacific jobs",
+ "google.cm",
+ "free bigt girls nudes",
+ "abcnews.com",
+ "tootse.com",
+ "az lyrics",
+ "freddy",
+ "georgia.com",
+ "johncombest.com",
+ "nelly",
+ "gussi mane",
+ "university of illinois",
+ "oregan valcano's",
+ "mythbusters",
+ "sailormoon hentai",
+ "international cub tractor",
+ "desert sky movie green valley az",
+ "evite",
+ "nelly nud epics",
+ "penndot.com",
+ "first banks",
+ "psp manual",
+ "google",
+ "jackieaudet hotmail.com",
+ "internet",
+ "shootinggames",
+ "shootinggames",
+ "montana western rendezvous of art",
+ "hello kitty layouts",
+ "yahoo",
+ "translation",
+ "glenn scott attorney",
+ "hallofshame",
+ "capitolone.com",
+ "recipe for popovers",
+ "pictures of demons",
+ "barnes and nobles.com",
+ "rbd",
+ "hart and hunnington tattoo shop",
+ "janepowellmovies.com",
+ "ged schools in the military",
+ "kelis",
+ "hvacagent",
+ "neat home organizer television show",
+ "2719 24-2-crime and courts",
+ "fsu",
+ "torpedo bomber games",
+ "love poems",
+ "polly pocket'toys",
+ "yweatherahoo.com",
+ "jungle gin",
+ "flemington new jersey real estate",
+ "milf hunter stories",
+ "budget.com",
+ "chopperstyle",
+ "keno player",
+ "up skirt",
+ "dogs",
+ "beerballers",
+ "phat white butt",
+ "phat white butt",
+ "va licensing for interpeters for the deaf",
+ "white page phone book maiden north carolina",
+ "controlled 20solutions 20corp.com",
+ "friedman jewelery",
+ "kelis",
+ "curtains",
+ "curtains",
+ "fuck me harder",
+ "naked girls",
+ "southwest airlines boarding pass",
+ "mailbox",
+ "1976 mavrick",
+ "adult diapers",
+ "horse nasal discharge",
+ "charles ludlam",
+ "google",
+ "himnos en espanol",
+ "quarter horses for sale in nebraska",
+ "cosmo",
+ "hi",
+ "mattel",
+ "aouto 20trader.com",
+ "sunsetter awnings",
+ "bl.cfm",
+ "at",
+ "tattoo designs",
+ "bubs",
+ "yahoo",
+ "free live gay cam chats",
+ "antibiotics",
+ "upgrade",
+ "aessuccess.org",
+ "yahoo",
+ "boobdex",
+ "the jackle",
+ "plus size lingerie magazines for home",
+ "lehigh valley little league",
+ "ancient trade coins",
+ "pillsbury",
+ "colorado springs",
+ "canada aviation jobs",
+ "free guitar tablature",
+ "kids aol",
+ "capitol community colage",
+ "kevin thomas bermuda",
+ "missouri lotto",
+ "homedepotsportscomplex.com",
+ "dr. franklin schneier",
+ "williamsburg va. hotels",
+ "aim",
+ "morningbuzz",
+ "probusines.com",
+ "wwwalbasoul.com",
+ "w.runehints.com",
+ "yahoo.com",
+ "yahoo.com",
+ "yahoo.com",
+ "fantasy 5",
+ "xxx rape",
+ "hawaiian gift baskets",
+ "madonna.com",
+ "myspace contact tables",
+ "white cock",
+ "safe space",
+ "drinks",
+ "o rly",
+ "dsl",
+ "wwww.uncc.edu",
+ "wwww.uncc.edu",
+ "wwww.uncc.edu",
+ "online overseas checkt.westernunion.com",
+ "angina",
+ "heba technologies",
+ "hebrew ancient coins",
+ "games",
+ "recent",
+ "international male.com",
+ "sex pics",
+ "paul wall layouts for myspace",
+ "health",
+ "wire lamp shade frames",
+ "windows",
+ "top business colleges",
+ "mary jo eustace",
+ "attored",
+ "oklahoma indian legal services",
+ "6arab",
+ "santo nino",
+ "10.1.0.199",
+ "http www.myspace.com daffydonn07",
+ "marine electrical",
+ "sandy creek cabins weekend new york",
+ "onionbutts",
+ "tucson classifieds",
+ "new york times",
+ "recently deleted screen names",
+ "goldeneagle.net",
+ "fta support forums",
+ "low protein in bloos",
+ "datring",
+ "lilwayne",
+ "free billiards games",
+ "yahoo",
+ "ako",
+ "a.tribalfusion.c script language",
+ "dustin davis",
+ "cooking",
+ "yahoo.com",
+ "universal studios",
+ "adult chat",
+ "santa monica flea market",
+ "carpevino.us",
+ "wine vinyard in stewertstown pa",
+ "y",
+ "craigslist",
+ "ups.com",
+ "1-866-347-3292",
+ "renegade boats",
+ "renegade boats",
+ "sunset state beach caping",
+ "artofstanlync.org",
+ "heart-i want make love to you video",
+ "triangles around the world",
+ "mycl.cravelyrics.com",
+ "in the bible what type of persons were forced to walk around in public and say unclean unclean",
+ "providence water fire",
+ "googlecom",
+ "yahoo.com",
+ "b.g",
+ "website de rebelde",
+ "stoplinks",
+ "allison 2000 transmission",
+ "thepriceanduseofgasoline.com",
+ "chamillinaire",
+ "veryspecialhomescom",
+ "crashbandicoot",
+ "a short sex story",
+ "yahoo.com",
+ "music now",
+ "east carolina university",
+ "vandalism in new york",
+ "the bainde soleil company",
+ "dicaprio movies",
+ "xxx dvds",
+ "visual basic scripting support",
+ "english bulldogs",
+ "travelocity.com",
+ "website for asstr.org",
+ "hypnotic slave training",
+ "pogo",
+ "university at buffalo addmissions",
+ "screen name services",
+ "superdrol",
+ "art institute",
+ "online business cards",
+ "aolfinancial",
+ "upgrade shop",
+ "anderson abrasive",
+ "weatherchannel.com",
+ "recent",
+ "ebay",
+ "diagram and xray of a normal shouldercheck out surgicalpoker.comfor more sports medicine and orthopedic information and images check out emedx.com by dr. allan mishranormal diagram normal x-ray",
+ "95 mustang gt chips",
+ "gold grills",
+ "hap housing in portland or",
+ "car sales",
+ "swimming with dolphins",
+ "jennifer lopez nude",
+ "wwwdubcnn.com",
+ "dominicks pizza",
+ "fl studio",
+ "http blackplanet .com",
+ "http blackplanet .com",
+ "http blackplanet .com",
+ "A$AP Rocky",
+ "benie mac",
+ "fujifilm.com",
+ "aol dialup setup",
+ "metal fabrication tools",
+ "internet",
+ "buy my painting",
+ "pulaski va classifieds",
+ "w.coj.net",
+ "postopia.com",
+ "no medical records hydrocodone",
+ "auto completes for deal or no deal contest",
+ "http www. big monster dicks .com",
+ "invacare wheelchairs",
+ "musicdownload.com",
+ "president bush",
+ "heavy equipment",
+ "inmate information",
+ "allina.com",
+ "megan law.gov",
+ "wwwl.eharmony.com",
+ "jobs in colombiaoqx0nq",
+ "beastsex",
+ "ferguisson",
+ "heart-i wanna make love to you vedio",
+ "west georgia university",
+ "west georgia university",
+ "hsn",
+ "bb&t",
+ "midas realty",
+ "yahoo",
+ "mytrip.com",
+ "donna texas mcdonalds",
+ "free picture of our lady",
+ "bubs",
+ "taken chemo for 5 month's cancer can still be seen on ct scan",
+ "porn 20video 20clips",
+ "lake monsters",
+ "freedj mix vibes",
+ "myspace.coim",
+ "la joya school district tx",
+ "colorado bungee jumping",
+ "yahoo",
+ "google.com",
+ "lafayette co vampire grave",
+ "ice cube",
+ "internet",
+ "tccd.edu",
+ "google",
+ "people",
+ "instructions on putting together a filing cabinet",
+ "click.babycenter.com",
+ "90minut",
+ "ramien noodles",
+ "lilwayne",
+ "danni virgin",
+ "nice sexy girls.com",
+ "guttural pouch",
+ "free male masturbating",
+ "good",
+ "rotton 20dot.com",
+ "fox sports",
+ "seth rogen",
+ "desb.mspaceads.com",
+ "betjc.com",
+ "pictures of quebec",
+ "gold in quartz",
+ "evergreen college",
+ "runescape",
+ "gastons white river resort",
+ "sunset beach santa cruz",
+ "auto parts",
+ "travelocity",
+ "myspace.com",
+ "laptops",
+ "beyaonce and j",
+ "free gay ebony knights webcams",
+ "google",
+ "derek watson",
+ "alice in wonderland tshirts",
+ "hippa p rivacy act",
+ "down payment mortgage",
+ "believe it or not",
+ "mys",
+ "datatreca",
+ "onesuite",
+ "names",
+ "lil john",
+ "scales of justice cuff links",
+ "localsales.com",
+ "alametris denise lipsey",
+ "adam for adam",
+ "flip flops crochet",
+ "arbors",
+ "heb hospital",
+ "myspae.com",
+ "midevil breast torture",
+ "askjeeves",
+ "assparade",
+ ".comhttp",
+ "weekly hotels reston virginia",
+ "noiceinparadise.com",
+ "pre diabetic diet",
+ "h.i.m.com",
+ "myspace",
+ "myspace",
+ "wwww.sex.lp.cpm",
+ "mcso mugshots",
+ "roush",
+ "wellfargo",
+ "lilwayne",
+ "hopecherie",
+ "frontgate.com",
+ "barbados registration department",
+ "american pitbull",
+ "free pc full flight simulation game downloads",
+ "google",
+ "vaginal secretion grey stuff",
+ "myspace layouts",
+ "kanye west",
+ "walmart",
+ "pain in hip and leg",
+ "tenneesseeaquarium.com",
+ "suncom.com",
+ "alysseandrachelwerehere",
+ "pimiclo",
+ "starmagazine.com",
+ "classifieds",
+ "mount rushmore in dakota",
+ "sams",
+ "disney com",
+ "beastyality",
+ "chief joseph paintings",
+ "henry scott",
+ "paris hilton",
+ "kb903235",
+ "autotrader",
+ "irish traveller",
+ "ajcobs.com",
+ "art of stanlync.org",
+ "fox news",
+ "freeporn",
+ "depo provera",
+ "air france",
+ "talk city active chats",
+ "codes for the gamecube game resident evil 4",
+ "good food to eat for sugar diabetes",
+ "warpmymind",
+ "arc jacksonville fl",
+ "7fwww.sendspace.com",
+ "j blackfoot",
+ "mcso madison street jail inmate",
+ "macys",
+ "eduscapes",
+ "free picture of our lady",
+ "http www.eastman.org",
+ "minneapolisstartribune localnews",
+ "minneapolisstartribune localnews",
+ "tennessee",
+ "foodtown",
+ "anti virous download",
+ "http www.mdland rec.net",
+ "ed edd eddy",
+ "maryjbilge",
+ "shipping services",
+ "baseball videogames",
+ "egyption ancient coins",
+ "internet",
+ "what is sodomy",
+ "international cub lowboy",
+ "mary j. bilge",
+ "scenic backgrounds",
+ "google.com",
+ "rosettalangueges.com",
+ "titanpoker.net",
+ "titie show",
+ "edelen realtor",
+ "lil cim",
+ "china.com",
+ "boost mobile",
+ "nc eipa",
+ "people's 20pharmacy 20guide 20to",
+ "costco",
+ "charles schultz drawings",
+ "nicisterling",
+ "a picture of author stephen crane",
+ "yahoo.com",
+ "sponge bob myspace layouts",
+ "g",
+ "calendar creator",
+ "careerbuilder.com",
+ "cool tex for web pages",
+ "yahoo.com",
+ "mcdougal littel",
+ "sign on",
+ "superman",
+ "radio",
+ "lajollaindians.com",
+ "mike tyson died",
+ "pink panther",
+ "lolita newgroups",
+ "nude girls",
+ "galveston 20texas",
+ "gerlach meat co.",
+ "thetakeover2006.com",
+ "yahoo",
+ "simpsons movie",
+ "saxy",
+ "yahoo",
+ "21st century realty",
+ "new zealand",
+ "dogs",
+ "weather",
+ "free porn sex",
+ "bugs bunny parties",
+ "mortal kombat 2 fatalities",
+ "sea life park hawaii",
+ "songs for middle school choir",
+ "rocky mountain jeep",
+ "householdbank.com",
+ "birdville isd",
+ "brutal dildo",
+ "brutal dildo",
+ "free live gay cam chats",
+ "wonder woman",
+ "ebay com",
+ "myspace.com",
+ "boost mobile",
+ "desktop themes sex",
+ "myspace.com",
+ "myspace.com",
+ "maroon chevy auto dealership",
+ "beyonce",
+ "cleopatra vii",
+ "accountcentralonline.com",
+ "juvenile",
+ "the game cock",
+ "pics of ashland city tennessee",
+ "coherent deos",
+ "microwsoft wireless connection",
+ "best buy",
+ "southwest airlines",
+ "southwest airlines",
+ "pogo games",
+ "family court record room in brooklyn newyork",
+ "60.ufc.net",
+ "us mint",
+ "people",
+ "firstcitycreditunion",
+ "washington mutual careers",
+ "beyonce",
+ "tab energy drink",
+ "http vemmabuilder.com",
+ "new york state lottery",
+ "yahoo",
+ "tmobile",
+ "yellow pages.com",
+ "az.central.com",
+ "pasco auto salvage",
+ "im help",
+ "home based businesses",
+ "studyisland",
+ "bible study from king james on 1 corinthians chapter 6 verses 18- 20",
+ "bellevue-ne",
+ "msn.com",
+ "aolsignupfree",
+ "the simsons",
+ "nevada",
+ "forsyth central high school",
+ "road state college",
+ "does my child have adhd",
+ "les tucanesde tijuana",
+ "yahoo.com",
+ "mexican pharmacy hyrocodone",
+ "ford motor co year end sales",
+ "google.com",
+ "google.com",
+ "person.com",
+ "marylyn monroe",
+ "nfl",
+ "the hun.net",
+ "nkena anderson",
+ "free netscape download",
+ "top fifty colleges",
+ "wil.",
+ "memphis tennessee",
+ "yahoo mail",
+ "corrections officer of juveniles",
+ "jada pinkett smith",
+ "mapquest.com",
+ "apartments",
+ "msn.com",
+ "msn.com",
+ "wasco state prison",
+ "solitaire",
+ "http",
+ "freeport seaman center",
+ "futbol soccer",
+ "screen names",
+ "kmov.com",
+ "survey.otxresearch.com",
+ "facial shaves",
+ "gle",
+ "flw.com",
+ "seasportboats.com",
+ "toysrus.com",
+ "animated sexy graphics",
+ "colombia",
+ "unitarian univeralist association",
+ "fr",
+ "google video.com",
+ "660-342-1072",
+ "suzan-lori parks",
+ "male facial",
+ "william bouguereau first kiss how much it is worth",
+ "streetfighter",
+ "nick.com",
+ "wonder woman",
+ "pentagram",
+ "mcafee virus protection",
+ "diary",
+ "037f34742140a5f761ad51d95180b4f8",
+ "free porn",
+ "no deposit casino bonus",
+ "spongebob the movie myspace layouts",
+ "on line banking",
+ "equestrian properties for sale",
+ "kazaa free muisc download",
+ "gay truckers",
+ "24",
+ "pay-pal",
+ "www yahoo.com",
+ "phatazz.white hoes",
+ "planets of the universe",
+ "free movies",
+ "budget rentals special",
+ "yahoogames",
+ "talaat pasha",
+ "mariah carey song lyrics don't forget about us",
+ "futbol soccer",
+ "msn groups",
+ "martha steward",
+ "martha steward",
+ "soap opera scoops cbs",
+ "cingular",
+ "stuwie",
+ "womengiving blowjobs",
+ "hear dancing queen by abba",
+ "love song",
+ "fhsaa.org",
+ "any dvd",
+ "any dvd",
+ "gallery.brookeskye.com",
+ "gibson ranch",
+ "wachovia com",
+ "kzg golf information",
+ "skylight curtains",
+ "c",
+ "123freeweblayouts.com",
+ "yahoo.com",
+ "allie.com",
+ "ghosts of bingham cemetery",
+ "resume maker",
+ "resume maker",
+ "resume maker",
+ "lymphomatoid papulosis",
+ "sez.com",
+ };
+}
+
+class TCodecsTest: public TTestBase {
+ UNIT_TEST_SUITE(TCodecsTest);
+ UNIT_TEST(TestPipeline)
+ UNIT_TEST(TestDelta)
+ UNIT_TEST(TestHuffman)
+ UNIT_TEST(TestZStdDict)
+ UNIT_TEST(TestCompTable)
+ UNIT_TEST(TestHuffmanLearnByFreqs)
+ UNIT_TEST(TestSolar)
+ UNIT_TEST(TestPFor)
+ UNIT_TEST(TestRegistry)
+
+ UNIT_TEST_SUITE_END();
+
+private:
+ TString PrintError(TStringBuf learn, TStringBuf test, TStringBuf codec, ui32 i) {
+ TString s;
+ TStringOutput sout(s);
+ sout << codec << ": " << i << ", "
+ << "\n";
+ sout << HexEncode(learn.data(), learn.size()); //NEscJ::EscapeJ<true>(learn, sout);
+ sout << " != \n";
+ sout << HexEncode(test.data(), test.size()); //NEscJ::EscapeJ<true>(test, sout);
+
+ if (s.Size() > 1536) {
+ TString res = s.substr(0, 512);
+ res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">...");
+ res.append(s.substr(s.size() - 512));
+ }
+
+ return s;
+ }
+
+ TStringBuf AsStrBuf(const TBuffer& b) {
+ return TStringBuf(b.data(), b.size());
+ }
+
+ template <typename TCodec, bool testsaveload>
+ void TestCodec(const TVector<TBuffer>& inlearn = TVector<TBuffer>(), const TVector<TBuffer>& in = TVector<TBuffer>(), NCodecs::TCodecPtr c = new TCodec) {
+ using namespace NCodecs;
+
+ TBuffer buff;
+
+ {
+ TVector<TBuffer> out;
+
+ c->Learn(inlearn.begin(), inlearn.end());
+
+ if (testsaveload) {
+ {
+ TBufferOutput bout(buff);
+ ICodec::Store(&bout, c);
+ }
+
+ {
+ TBufferInput bin(buff);
+ c = ICodec::Restore(&bin);
+ UNIT_ASSERT(c->AlreadyTrained());
+ }
+ }
+
+ {
+ size_t insz = 0;
+ size_t outsz = buff.Size();
+
+ for (ui32 i = 0; i < inlearn.size(); ++i) {
+ out.emplace_back();
+ c->Encode(AsStrBuf(inlearn[i]), out[i]);
+
+ insz += inlearn[i].Size();
+ outsz += out[i].Size();
+ }
+
+ TBuffer vecl;
+ for (ui32 i = 0; i < out.size(); ++i) {
+ vecl.Clear();
+ c->Decode(AsStrBuf(out[i]), vecl);
+
+ UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl),
+ PrintError(TStringBuf(inlearn[i].data(), inlearn[i].size()),
+ TStringBuf(vecl.data(), vecl.size()), c->GetName(), i));
+ }
+ }
+ }
+
+ {
+ if (testsaveload) {
+ TBufferInput bin(buff);
+ c = ICodec::Restore(&bin);
+ }
+
+ size_t insz = 0;
+ size_t outsz = buff.Size();
+
+ TBuffer out, in1;
+ for (ui32 i = 0; i < in.size(); ++i) {
+ out.Clear();
+ in1.Clear();
+ c->Encode(AsStrBuf(in[i]), out);
+ insz += in[i].Size();
+ outsz += out.Size();
+ c->Decode(AsStrBuf(out), in1);
+ UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1),
+ PrintError(TStringBuf(in[i].data(), in[i].size()),
+ TStringBuf(in1.data(), in1.size()), c->GetName(), i));
+ }
+ }
+ }
+
+ template <class T>
+ void AppendTo(TBuffer& b, T t) {
+ b.Append((char*)&t, sizeof(t));
+ }
+
+ void TestDelta() {
+ using namespace NCodecs;
+ TVector<TBuffer> d;
+
+ // 1. common case
+ d.emplace_back();
+ AppendTo(d.back(), 1ULL);
+ AppendTo(d.back(), 10ULL);
+ AppendTo(d.back(), 100ULL);
+ AppendTo(d.back(), 1000ULL);
+ AppendTo(d.back(), 10000ULL);
+ AppendTo(d.back(), 100000ULL);
+
+ // 2. delta overflow
+ d.emplace_back();
+ AppendTo(d.back(), 1ULL);
+ AppendTo(d.back(), 10ULL);
+ AppendTo(d.back(), 100ULL);
+ AppendTo(d.back(), 1000ULL);
+ AppendTo(d.back(), (ui64)-100LL);
+ AppendTo(d.back(), (ui64)-10ULL);
+
+ // 3. bad sorting
+ d.emplace_back();
+ AppendTo(d.back(), 1ULL);
+ AppendTo(d.back(), 10ULL);
+ AppendTo(d.back(), 1000ULL);
+ AppendTo(d.back(), 100ULL);
+ AppendTo(d.back(), 10000ULL);
+ AppendTo(d.back(), 100000ULL);
+
+ // all bad
+ d.emplace_back();
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+
+ TestCodec<TDeltaCodec<ui64, true>, false>(d);
+ TestCodec<TDeltaCodec<ui64, false>, false>(d);
+ }
+
+ void TestPFor() {
+ using namespace NCodecs;
+ {
+ TVector<TBuffer> d;
+ d.emplace_back();
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ d.emplace_back();
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 2LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 2LL);
+ d.emplace_back();
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 2LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 2LL);
+ d.emplace_back();
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -2LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -2LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -2LL);
+
+ TestCodec<TPForCodec<ui64>, false>(d);
+ TestCodec<TPForCodec<ui64, true>, true>(d);
+ }
+ {
+ TVector<TBuffer> d;
+ d.emplace_back();
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -1);
+ d.emplace_back();
+ AppendTo(d.back(), 0);
+ AppendTo(d.back(), 1);
+ AppendTo(d.back(), 2);
+ AppendTo(d.back(), 1);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), 0);
+ AppendTo(d.back(), 1);
+ AppendTo(d.back(), 2);
+ d.emplace_back();
+ AppendTo(d.back(), 0);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -2);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -2);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), 0);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -2);
+
+ TestCodec<TPForCodec<ui32>, false>(d);
+ TestCodec<TPForCodec<ui32, true>, false>(d);
+ }
+ {
+ TVector<TBuffer> d;
+ d.emplace_back();
+ for (auto& textValue : TextValues) {
+ AppendTo(d.back(), (ui32)strlen(textValue));
+ }
+
+ TestCodec<TPForCodec<ui32>, false>(d);
+ TestCodec<TPForCodec<ui32, true>, false>(d);
+ }
+ {
+ TVector<TBuffer> d;
+ d.emplace_back();
+ for (auto& textValue : TextValues) {
+ AppendTo(d.back(), (ui64)strlen(textValue));
+ }
+
+ TestCodec<TPForCodec<ui64>, false>(d);
+ TestCodec<TPForCodec<ui64, true>, false>(d);
+ }
+ }
+
+ template <class TCodec>
+ void DoTestSimpleCodec() {
+ using namespace NCodecs;
+ {
+ TVector<TBuffer> learn;
+
+ for (auto& textValue : TextValues) {
+ learn.emplace_back(textValue, strlen(textValue));
+ }
+
+ TestCodec<TCodec, true>(learn);
+ }
+ {
+ TestCodec<TCodec, true>();
+ }
+
+ {
+ TVector<TBuffer> learn;
+ learn.emplace_back();
+ learn.back().Append('a');
+
+ TVector<TBuffer> test;
+ test.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TCodec, true>(learn, test);
+ }
+
+ {
+ TVector<TBuffer> learn;
+ learn.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ for (ui32 j = 0; j < i; ++j) {
+ learn.back().Append((ui8)i);
+ }
+ }
+
+ TVector<TBuffer> test;
+ test.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TCodec, true>(learn, test);
+ }
+
+ {
+ TVector<TBuffer> learn;
+ learn.emplace_back();
+ for (ui32 i = 0; i < 128; ++i) {
+ for (ui32 j = 0; j < i; ++j) {
+ learn.back().Append((ui8)i);
+ }
+ }
+
+ TVector<TBuffer> test;
+ test.emplace_back();
+ for (ui32 i = 128; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TCodec, true>(learn, test);
+ }
+ }
+
+ void TestHuffman() {
+ DoTestSimpleCodec<NCodecs::THuffmanCodec>();
+ }
+
+ void TestZStdDict() {
+ using namespace NCodecs;
+ {
+ TVector<TBuffer> learn;
+
+ for (auto& textValue : TextValues) {
+ learn.emplace_back(textValue, strlen(textValue));
+ }
+
+ TestCodec<TZStdDictCodec, true>(learn);
+ }
+
+ }
+
+ void TestCompTable() {
+ DoTestSimpleCodec<NCodecs::TCompTableCodec>();
+ }
+
+ void TestHuffmanLearnByFreqs() {
+ using namespace NCodecs;
+
+ TVector<TBuffer> data;
+
+ for (auto& textValue : TextValues) {
+ data.emplace_back(textValue, strlen(textValue));
+ }
+
+ TVector<TBuffer> outLearn;
+
+ {
+ THuffmanCodec codec;
+ static_cast<ICodec&>(codec).Learn(data.begin(), data.end());
+
+ for (ui32 i = 0; i < data.size(); ++i) {
+ outLearn.emplace_back();
+ codec.Encode(AsStrBuf(data[i]), outLearn[i]);
+ }
+ }
+
+ TVector<TBuffer> outLearnByFreqs;
+
+ {
+ THuffmanCodec codec;
+ std::pair<char, ui64> freqs[256];
+
+ for (size_t i = 0; i < Y_ARRAY_SIZE(freqs); ++i) {
+ freqs[i].first = (char)i;
+ freqs[i].second = 0;
+ }
+
+ for (auto& textValue : TextValues) {
+ size_t len = strlen(textValue);
+ for (size_t j = 0; j < len; ++j) {
+ ++freqs[(ui32)(0xFF & textValue[j])].second;
+ }
+ }
+
+ codec.LearnByFreqs(TArrayRef<std::pair<char, ui64>>(freqs, Y_ARRAY_SIZE(freqs)));
+
+ for (ui32 i = 0; i < data.size(); ++i) {
+ outLearnByFreqs.emplace_back();
+ codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]);
+ }
+ }
+
+ UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size());
+ const size_t sz = outLearn.size();
+ for (size_t n = 0; n < sz; ++n) {
+ UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n]));
+ }
+ }
+
+ void TestSolar() {
+ using namespace NCodecs;
+ {
+ TVector<TBuffer> learn;
+
+ for (auto& textValue : TextValues) {
+ learn.emplace_back(textValue, strlen(textValue));
+ }
+
+ TestCodec<TSolarCodec, true>(learn, TVector<TBuffer>(), new TSolarCodec(512, 8));
+ TestCodec<TAdaptiveSolarCodec, false>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));
+ TestCodec<TAdaptiveSolarCodec, true>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));
+ TestCodec<TSolarCodecShortInt, true>(learn, TVector<TBuffer>(), new TSolarCodecShortInt(512, 8));
+ }
+ {
+ TestCodec<TSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodec(512, 8));
+ TestCodec<TAdaptiveSolarCodec, false>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));
+ TestCodec<TAdaptiveSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));
+ TestCodec<TSolarCodecShortInt, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodecShortInt(512, 8));
+ }
+
+ {
+ TVector<TBuffer> learn;
+ learn.emplace_back();
+ learn.back().Append('a');
+
+ TVector<TBuffer> test;
+ test.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8));
+ TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8));
+ TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8));
+ TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8));
+ }
+
+ {
+ TVector<TBuffer> learn;
+ learn.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ for (ui32 j = 0; j < i; ++j) {
+ learn.back().Append((ui8)i);
+ }
+ }
+
+ TVector<TBuffer> test;
+ test.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8));
+ TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8));
+ TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8));
+ TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8));
+ }
+ }
+
+ void TestPipeline() {
+ using namespace NCodecs;
+ {
+ TVector<TBuffer> learn;
+ learn.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ for (i32 j = i; j >= 0; --j) {
+ learn.back().Append((ui8)j);
+ }
+ }
+
+ TVector<TBuffer> test;
+ test.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TPipelineCodec, true>(learn, test,
+ new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec));
+ }
+ {
+ TVector<TBuffer> d;
+ d.emplace_back();
+ for (ui32 i = 0; i < 256; ++i) {
+ for (i32 j = i; j >= 0; --j) {
+ d.back().Append(i * i);
+ }
+ }
+
+ TestCodec<TPipelineCodec, false>(d, TVector<TBuffer>(),
+ new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>));
+ }
+ }
+
+ void TestRegistry() {
+ using namespace NCodecs;
+ TVector<TString> vs = ICodec::GetCodecsList();
+ for (const auto& v : vs) {
+ TCodecPtr p = ICodec::GetInstance(v);
+ if (v == "none") {
+ UNIT_ASSERT(!p);
+ continue;
+ }
+ UNIT_ASSERT_C(!!p, v);
+ UNIT_ASSERT_C(TStringBuf(v).Head(3) == TStringBuf(p->GetName()).Head(3), v + " " + p->GetName());
+ }
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TCodecsTest)
diff --git a/library/cpp/codecs/ut/float_huffman_ut.cpp b/library/cpp/codecs/ut/float_huffman_ut.cpp
new file mode 100644
index 0000000000..3156fb1f46
--- /dev/null
+++ b/library/cpp/codecs/ut/float_huffman_ut.cpp
@@ -0,0 +1,237 @@
+#include <library/cpp/codecs/float_huffman.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+#include <util/stream/format.h>
+#include <util/stream/output.h>
+#include <library/cpp/string_utils/base64/base64.h>
+
+namespace fh = NCodecs::NFloatHuff;
+
+Y_UNIT_TEST_SUITE(FloatHuffmanTest) {
+ static const float Factors[] = {
+ 0.340582, 0.000974026, 0.487168, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0.411765, 0.921569,
+ 0.00390625, 0.109371, 0, 1, 0, 0, 0, 0, 0.523322, 0, 1, 0, 0, 0, 0, 0.285714, 1,
+ 0.008253, 1, 0, 0, 0.00993935, 0.450213, 0.000974026, 1, 1, 1, 1, 0, 0, 0.20564,
+ 0.97561, 0.913896, 1, 1, 0, 1, 0, 0, 0.5, 0, 0, 0, 0.1, 1, 0, 0, 0, 0, 0, 0.450923,
+ 0, 0.5, 0, 0, 0.20564, 0, 0.5, 0, 0, 0.20564, 0, 0, 0.0313726, 0, 1, 1, 1, 0.363636,
+ 0.5, 0.686073, 0.45121, 0.00574382, 0.366166, 0.413295, 1, 1, 1, 0, 0, 0, 0, 0.160784,
+ 0, 0.937255, 0.537255, 0.133333, 0, 0, 0, 0, 0.00392157, 0, 0.333333, 0.027451, 0.0156863,
+ 1, 0.105882, 1, 0.00220908, 0.000112501, 0.0111262, 0.102384, 0.00140808, 0.123581,
+ 0.29308, 6.57282e-06, 0.00489498, 2.10209e-05, 0.00140559, 5.907e-06, 0, 0.559322,
+ 0.559322, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.794765, 0,
+ 0.352648, 0.225904, 1, 0.047619, 0.0107276, 0.399461, 0.0304838, 0.292932, 0.00969929,
+ 0, 0, 0.886904, 0.714693, 0, 0.00223213, 0.000544069, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0.00507403, 0, 0, 0, 0, 0, 0.875, 0, 0, 1, 1, 1, 0, 0.20564, 0, 0.00176048, 0,
+ 0.000440121, 0, 0, 0, 0.000974026, 0.487168, 0, 0, 0.533333, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 1, 0, 0, 0.723187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 0.206882, 0.00483367, 0.792983, 0.00126106, 1, 0.0313726, 0.470588,
+ 0.254902, 0.188235, 0.188235, 0.388235, 0.164706, 0, 0.870588, 0.843137, 0.635294,
+ 0.384314, 0.384314, 0.643137, 0, 0, 0, 0, 0, 0, 0, 0, 0.541176, 0, 0.541176, 0, 0,
+ 0.0532634, 1, 0, 0, 0, 0.015044, 1, 0, 1, 1, 1, 0.47451, 0.329412, 0.964706, 0, 0,
+ 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0.0941176, 0.970588, 0.970588, 0, 0.970588, 0.97561,
+ 0, 0.0431373, 0.47451, 0.329412, 0.964706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0.231373, 0.00392157, 0, 0, 0, 0.054902, 0, 0,
+ 1, 0, 0, 0.0235294, 0, 1, 0, 0, 0, 0, 0.34902, 0.0352941, 0.925379, 0.623681, 0,
+ 0.954543, 0, 0, 0.00102756, 0.709804, 0.498039, 0.0901961, 0.631373, 0.847059, 0.270588,
+ 0.0156863, 0.133333, 0.980392, 1e-12, 1e-12, 1e-12, 1e-12, 0.497159, 0, 0.407487,
+ 0, 0, 0, 0.00392157, 0.00202156, 0.046875, 0.187159, 0.046875, 0.15625, 0.434232,
+ 0.15625, 0, 2.95083e-07, 0.20564, 0.20564, 0.97561, 0.913896, 0, 0, 0, 0, 0, 0, 0.00784314,
+ 0, 0.695525, 1, 0.07205, 0, 0, 0.176471, 0, 0, 0, 1, 1, 0.98, 0.01, 0.01, 0, 0.00690702,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.29078, 0.29078, 1, 0, 0, 0, 0, 0.192157, 0.188235,
+ 0.0941176, 0, 0.0313726, 0, 0.141176, 0.207843, 0.0901961, 0.00784314, 0.0784314,
+ 0, 0, 0, 0, 0, 0.203922, 0.0196078, 0.34902, 0.0235294, 0.0980392, 0.164706, 0.133333,
+ 0.368627, 0, 0.0941176, 0, 1, 0.313726, 0, 0, 0.433582, 0.384508, 0.0532186, 0.0833333,
+ 0.01609, 0, 1, 0, 0, 0, 0.0666667, 0, 0, 0, 0, 1, 0, 0.564706, 0.501961, 0, 0, 0,
+ 0, 0, 0.0516447, 0.000173065, 0, 0, 0, 0, 0, 0, 0, 0.996309, 0, 0, 0.00392157, 1,
+ 0, 0.01, 0, 0, 0, 0, 0, 0.439505, 0.206882, 0.206882, 0.260891, 0, 0.875, 0, 0, 0,
+ 0, 0, 0.185657, 1, 1, 0, 0, 0, 0.0332647, 0.206106, 0.0688878, 0.239216, 0, 0, 0,
+ 0, 0.054902, 0, 0.101961, 0.160784, 0.180392, 0, 0.737828, 0, 0, 0.875, 0.0142566,
+ 0, 0.662745, 1, 0, 0, 0, 0.225806, 0.99992, 0.631373, 0.00392157, 1, 0, 0.143647,
+ 0.00270085, 1, 0.231482, 0.246735, 0.0428062, 0, 0, 1, 0, 0.186441, 0.0115358, 0,
+ 0.221762, 0, 0.2, 0, 0.0156863, 0, 0, 0, 0.976471, 0, 0.231373, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0.00392157, 0.00392157, 0.0666667, 0, 0, 0, 0, 0.0117647, 0.580392, 0.98737,
+ 1, 1, 1, 0, 0, 0, 0.153, 0.847, 0.931373, 0.94697, 0.94697, 0, 0.946294, 0.408118,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0.99992, 0.97561, 0, 0, 0, 0, 0, 0,
+ 0.274677, 0.153017, 0, 0.642356, 0, 0, 0.1, 0, 0, 0, 0, 0.327944, 0.327944, 0, 0,
+ 0.815686, 0, 0, 0, 0, 0.206106, 0.439126, 0, 0, 0, 0, 0, 1, 1, 1, 0.00392157, 0.232788,
+ 0.232465, 0.999899, 0.00309296, 0.0636097, 0.445954, 0.156863, 0, 0, 0, 0, 0, 0,
+ 0.3796, 0.0784, 0.0651664, 0, 0, 0.254902, 0.266667, 1, 0, 0, 0, 0, 0, 0.596073,
+ 0.517876, 0.145833, 0.372549, 0, 0.991667, 0.602125, 0.161979, 0, 0, 0, 0, 0.0255146,
+ 0.947855, 0, 0, 0, 0, 0, 0, 0, 0, 0.847059, 0.679841, 0, 0.156863, 0, 0, 1, 0, 0,
+ 0, 0, 0.969697, 0, 0, 0.564706, 0, 0, 0, 0, 0, 1, 0.0367282, 0.0395228, 0, 0, 0,
+ 0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0};
+ static const size_t FactorCount = Y_ARRAY_SIZE(Factors);
+
+ static const ui8 CodedFactors[] = {
+ 0x24, 0x06, 0x73, 0xB5, 0xC7, 0x55, 0x7F, 0x3A, 0xB4, 0x70, 0xCB, 0xEF, 0xEE, 0xFE, 0xB3, 0x5B,
+ 0x5A, 0x1A, 0x93, 0x5F, 0x5F, 0x13, 0x00, 0x00, 0x10, 0x00, 0x3D, 0xEF, 0xFF, 0xEE, 0x0F, 0xDC,
+ 0xF0, 0xAB, 0x3F, 0x37, 0x92, 0x24, 0x5D, 0x5E, 0xDE, 0x1C, 0xF8, 0x12, 0x15, 0x5B, 0x84, 0x51,
+ 0x82, 0xE6, 0xF6, 0xB8, 0xEA, 0x4F, 0xC7, 0xDD, 0x7D, 0x2E, 0x4D, 0x4A, 0x21, 0xCA, 0xE0, 0xC4,
+ 0x2E, 0xEA, 0xD3, 0xBD, 0x0F, 0x00, 0x00, 0xE0, 0xDA, 0xCC, 0xCC, 0xEC, 0x9F, 0x61, 0xDF, 0xE6,
+ 0x01, 0x00, 0x00, 0xCC, 0xA5, 0x49, 0xA9, 0x00, 0x00, 0x00, 0xE6, 0xD2, 0xA4, 0xD4, 0xEA, 0x08,
+ 0x08, 0xD0, 0xDD, 0xF9, 0xE7, 0xA2, 0x0B, 0x00, 0x00, 0x40, 0xD8, 0x13, 0x7D, 0xFE, 0x13, 0x9C,
+ 0x9B, 0xA8, 0x36, 0xBC, 0x00, 0x90, 0x43, 0x6F, 0x97, 0x67, 0x9B, 0xD3, 0xEE, 0xFE, 0x84, 0x24,
+ 0x25, 0x89, 0xC9, 0xBF, 0x3F, 0x58, 0x4C, 0x4C, 0xCA, 0x21, 0x22, 0xBC, 0x39, 0x08, 0x08, 0x08,
+ 0x40, 0x7E, 0xAA, 0xAA, 0xCA, 0x75, 0x70, 0x70, 0xE9, 0x08, 0x08, 0xE8, 0x9A, 0x8A, 0x8D, 0xED,
+ 0xA6, 0x8D, 0x31, 0x04, 0x00, 0x96, 0xD0, 0x7D, 0x1D, 0x47, 0xAA, 0x2A, 0xD9, 0x28, 0xAD, 0x6B,
+ 0xB4, 0x9D, 0x7A, 0xC4, 0xD5, 0xD1, 0x04, 0x8C, 0x7E, 0x56, 0x3A, 0x58, 0x5A, 0x0C, 0x46, 0x6E,
+ 0x1B, 0x53, 0xC2, 0x0C, 0x14, 0x00, 0xAB, 0x60, 0x05, 0x7B, 0x63, 0x8D, 0x77, 0x70, 0x75, 0xAC,
+ 0x2F, 0x8D, 0xB1, 0x4D, 0xA0, 0xFB, 0xF2, 0x40, 0xF7, 0xE5, 0x7F, 0xDF, 0xDD, 0xFD, 0xBB, 0x1B,
+ 0xB8, 0x75, 0x9B, 0x47, 0x8E, 0xB4, 0x0C, 0x9B, 0x3A, 0x73, 0x25, 0x61, 0x18, 0x92, 0xD1, 0xC2,
+ 0x2F, 0x3C, 0x31, 0x64, 0x96, 0x2A, 0xB9, 0xF9, 0x7C, 0xD9, 0xAF, 0x94, 0xC5, 0xE9, 0x1E, 0x63,
+ 0x24, 0x0C, 0x03, 0x7F, 0xD8, 0x5B, 0xB3, 0x1D, 0x49, 0x02, 0x00, 0xAB, 0xFD, 0xE9, 0xA0, 0xF3,
+ 0xBF, 0xC9, 0x40, 0x64, 0x0A, 0xC0, 0xC7, 0x00, 0x00, 0x60, 0x77, 0xCF, 0xA5, 0x49, 0xA9, 0x16,
+ 0xFD, 0xD7, 0x5C, 0xA7, 0x55, 0x00, 0x36, 0xCF, 0xB9, 0x3D, 0xAE, 0xFA, 0xD3, 0xA1, 0x85, 0x5B,
+ 0xFE, 0x60, 0x10, 0x11, 0xFF, 0xF7, 0x7D, 0x38, 0x59, 0x24, 0xFF, 0xFF, 0xDF, 0x13, 0x1C, 0x7B,
+ 0xCA, 0x1C, 0x1E, 0xF3, 0x04, 0xC0, 0x78, 0x07, 0x58, 0x7B, 0xA2, 0x54, 0xAA, 0xE3, 0xEA, 0x08,
+ 0x08, 0xC0, 0x74, 0x78, 0x78, 0x88, 0x50, 0x50, 0xD8, 0x0A, 0x0C, 0xC4, 0x56, 0x60, 0x20, 0xF6,
+ 0x1A, 0x1B, 0x33, 0x16, 0x15, 0xA5, 0xB8, 0xED, 0xED, 0x22, 0xF5, 0xF5, 0x09, 0xA1, 0xA2, 0x42,
+ 0x67, 0x62, 0x62, 0x3A, 0x13, 0x13, 0x0B, 0xA0, 0xA4, 0xF4, 0x0F, 0x06, 0x15, 0x35, 0x18, 0x54,
+ 0xD4, 0x35, 0x57, 0x45, 0xCB, 0x2F, 0x39, 0xF6, 0xEC, 0xBC, 0xBB, 0x53, 0x5F, 0x5E, 0x9E, 0xB1,
+ 0xA8, 0xA8, 0x28, 0xDF, 0xDE, 0x3E, 0x00, 0x00, 0x80, 0x5F, 0x75, 0x81, 0x81, 0x51, 0x1D, 0x1E,
+ 0xA2, 0x3A, 0x3C, 0x8C, 0xEA, 0xF0, 0x10, 0x51, 0x06, 0x67, 0xED, 0x85, 0x85, 0xA1, 0xBE, 0xBC,
+ 0x3C, 0x63, 0x51, 0x51, 0x51, 0xBE, 0xBD, 0xFD, 0xFF, 0xFD, 0xFE, 0xCE, 0x85, 0x76, 0x36, 0x73,
+ 0x10, 0x10, 0x10, 0x80, 0xEB, 0x3A, 0x38, 0xD8, 0xBE, 0xD4, 0x05, 0x06, 0xEE, 0x4F, 0x60, 0x59,
+ 0x59, 0x65, 0x84, 0x84, 0xC0, 0x46, 0xCB, 0x19, 0x7F, 0x4C, 0xFD, 0xC8, 0x9D, 0x8B, 0xB6, 0x31,
+ 0xAF, 0x86, 0x3A, 0xF0, 0x6D, 0x6D, 0x11, 0xDF, 0xDF, 0x5F, 0x79, 0x71, 0x71, 0x85, 0xD4, 0xD0,
+ 0x10, 0xB9, 0xB1, 0x11, 0x1A, 0x54, 0x54, 0xE9, 0x08, 0x08, 0x48, 0x39, 0x44, 0x04, 0x84, 0xAF,
+ 0xAF, 0x96, 0x99, 0x97, 0x71, 0xC5, 0x32, 0xF3, 0x32, 0xAE, 0x58, 0x66, 0x5E, 0xC6, 0x15, 0xCB,
+ 0xCC, 0xCB, 0xB8, 0x42, 0xD0, 0x45, 0xFF, 0x1C, 0x11, 0x85, 0xBE, 0x39, 0x08, 0x08, 0x08, 0x80,
+ 0x69, 0xC2, 0x47, 0x00, 0x80, 0x02, 0x00, 0x00, 0x91, 0xD3, 0xF4, 0x47, 0x01, 0x00, 0x80, 0x08,
+ 0x00, 0x00, 0x42, 0xD4, 0x29, 0x6F, 0x02, 0x00, 0x80, 0xB4, 0xE6, 0x6B, 0x9E, 0x34, 0x5C, 0x9A,
+ 0x94, 0xE2, 0xD2, 0xA4, 0x14, 0xA2, 0x0C, 0x4E, 0xEC, 0xA2, 0x3E, 0x7F, 0x39, 0x08, 0x08, 0x10,
+ 0x6E, 0x6F, 0x10, 0xD7, 0x79, 0xC7, 0xC9, 0x09, 0x4D, 0x4B, 0x73, 0x77, 0x84, 0x14, 0xAE, 0x52,
+ 0xE1, 0x7A, 0x44, 0x2A, 0x5C, 0x8F, 0x34, 0x93, 0xA8, 0xC4, 0x01, 0xF8, 0x3F, 0x3D, 0xC2, 0x29,
+ 0xE9, 0x11, 0x4E, 0xE9, 0x4F, 0x67, 0x62, 0x22, 0xB6, 0x02, 0x03, 0xA9, 0x2E, 0x30, 0x70, 0x75,
+ 0x04, 0x04, 0xC8, 0x38, 0x48, 0x08, 0x32, 0x53, 0x53, 0x29, 0x2F, 0x2E, 0xAE, 0x1C, 0x04, 0x04,
+ 0x50, 0x52, 0x50, 0xD0, 0x4F, 0x77, 0x68, 0x28, 0x99, 0x08, 0x0A, 0x4A, 0x60, 0x59, 0x59, 0xA9,
+ 0x0B, 0x0C, 0xAC, 0xC7, 0xC8, 0xC8, 0x8C, 0x45, 0x45, 0xA1, 0x1C, 0x22, 0x02, 0x5D, 0x79, 0x79,
+ 0xAB, 0x2E, 0x30, 0x70, 0xA7, 0x2C, 0x28, 0xE8, 0xB4, 0xF3, 0xEF, 0x26, 0x8F, 0x37, 0xB1, 0xFE,
+ 0xEE, 0x67, 0xA9, 0xA9, 0xAA, 0xAA, 0x6C, 0x79, 0x1E, 0xEC, 0xD7, 0x46, 0x44, 0xC4, 0xF7, 0xF8,
+ 0x24, 0x24, 0x00, 0x42, 0x40, 0xF8, 0x5A, 0x96, 0x38, 0x65, 0x91, 0xF1, 0x6A, 0x72, 0xFE, 0x68,
+ 0xC3, 0xE1, 0x37, 0x07, 0x01, 0x01, 0x01, 0xF0, 0x52, 0xE1, 0x7A, 0xE4, 0xB3, 0xD9, 0x20, 0x9C,
+ 0xE0, 0xD8, 0x53, 0x04, 0xC7, 0x9E, 0x82, 0x02, 0x27, 0x2B, 0x06, 0x00, 0x00, 0x9F, 0xDE, 0x1C,
+ 0x3E, 0xEE, 0xD7, 0x48, 0x20, 0x04, 0xD2, 0x35, 0x4C, 0x29, 0x43, 0x45, 0x23, 0x15, 0xEA, 0xE9,
+ 0x5E, 0xD7, 0xC1, 0xC1, 0xAA, 0x3B, 0x34, 0x34, 0x21, 0x49, 0x49, 0xE8, 0x8A, 0x8B, 0x13, 0x66,
+ 0x12, 0xE7, 0x31, 0x00, 0x00, 0x90, 0x84, 0x94, 0x69, 0x05, 0xD4, 0xD4, 0xF4, 0x13, 0x36, 0xE7,
+ 0x0C, 0x09, 0xEB, 0xBF, 0x90, 0x1A, 0x1A, 0xE6, 0x20, 0x20, 0x20, 0x00, 0x9E, 0x33, 0x18, 0x13,
+ 0xA6, 0x2F, 0x40, 0x0C, 0x00, 0x4E, 0xCF, 0x84, 0x36, 0x6A, 0xA0, 0xF2, 0xA9, 0x63, 0xD5, 0xCB,
+ 0x9E, 0x64, 0xEA, 0x3E, 0xF2, 0x14, 0xA0, 0x27, 0x29, 0x2B, 0xC6, 0xB2, 0x99, 0x99, 0xA9, 0x74,
+ 0x04, 0x04, 0x3C, 0x0A, 0xD0, 0xCF, 0x5C, 0x68, 0x67, 0xFB, 0xDF, 0x1C, 0x04, 0x04, 0x04, 0xC0,
+ 0x1C, 0x04, 0x04, 0x04, 0x40, 0x1B, 0x11, 0x11, 0x5F, 0xEA, 0x02, 0x03, 0xE1, 0x92, 0x94, 0x84,
+ 0x90, 0x88, 0xD9, 0xDD, 0x4F, 0x04, 0x56, 0x0E, 0xD1, 0x9F, 0x1A, 0x31, 0x3B, 0x37, 0x47, 0xA0,
+ 0x6C, 0x82, 0x40, 0xD9, 0x24, 0x9A, 0x02, 0x12, 0x62, 0xD3, 0x43, 0xFF, 0xBF, 0x8F, 0x84, 0xF5,
+ 0x1F, 0x51, 0x06, 0xE7, 0x0F, 0xDD, 0x89, 0x32, 0xFB, 0x60, 0x39, 0x0A, 0x71, 0x71, 0xB4, 0x36,
+ 0x33, 0x33, 0x3F, 0x8F, 0xD0, 0x4F, 0x79, 0x84, 0x7E, 0xBA, 0xC8, 0x0C, 0x0D, 0x4F, 0xBA, 0x86,
+ 0x29, 0x82, 0x54, 0x83, 0x7F, 0x77, 0x37, 0x07, 0x01, 0x01, 0x01, 0xA0, 0xFE, 0x97, 0x1B, 0x9D,
+ 0x16, 0xDC, 0x90, 0x58, 0xFE, 0x9B, 0x42, 0xB3, 0x4A, 0x00, 0x68, 0x73, 0x91, 0x20, 0x2B, 0xA8,
+ 0xC8, 0x29, 0x0B, 0x0A, 0xF2, 0xD3, 0x5D, 0x4B, 0x58, 0x5D, 0x20, 0x41, 0xD5, 0xBE, 0xAE, 0x70,
+ 0x88, 0x50, 0x50, 0x20, 0x4A, 0x44, 0xF4, 0x8F, 0xF7, 0x60, 0x22, 0x30, 0x9C, 0x24, 0xFE, 0x54,
+ 0x55, 0xD0, 0xD7, 0xD7, 0x37, 0x1A, 0xEF, 0x6E, 0xBC, 0x9B, 0x44, 0x39, 0xDD, 0x5D, 0xF2, 0xF2,
+ 0x7F, 0x20, 0x1A, 0x81, 0x9A, 0xCA, 0xBF, 0xC8, 0x8D, 0x8D, 0xC2, 0x83, 0x82, 0xA7, 0x2C, 0x28,
+ 0xC8, 0xFE, 0x08, 0xC2, 0x07, 0xC7, 0x27, 0x21, 0xE1, 0xBB, 0x3E, 0xC1, 0x59, 0x68, 0xAA, 0x78,
+ 0xC8, 0x57, 0x5D, 0x60, 0x20, 0xC6, 0x41, 0x42, 0xE8, 0x3A, 0x38, 0xD8, 0x9B, 0xFF, 0xFF, 0xFF,
+ 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+ static const size_t CodedSize = Y_ARRAY_SIZE(CodedFactors);
+ static const TStringBuf CodedFactorsBuf(reinterpret_cast<const char*>(CodedFactors), CodedSize);
+
+ void FillWithGarbage(float* factors, size_t count) {
+ void* data = static_cast<void*>(factors);
+ memset(data, 0xAA, sizeof(float) * count);
+ }
+
+ // Helper for dumping compressed values
+ void PrintCompressed(const TVector<ui8>& codedFactors) {
+ for (size_t i = 0; i < codedFactors.size(); ++i) {
+ if (i % 0x10 == 0)
+ Cerr << Endl;
+ Cerr << Hex(codedFactors[i]) << ", ";
+ }
+ Cerr << Endl;
+ }
+
+ // Helper for dumping decompressed values
+ void PrintDecompressed(const TVector<float>& factors) {
+ TStringStream result;
+ TStringStream line;
+
+ for (size_t i = 0; i < factors.size(); ++i) {
+ line << factors[i] << ", ";
+ if (line.Str().size() > 80) {
+ result << line.Str() << Endl;
+ line.Clear();
+ }
+ }
+ Cerr << result.Str() << Endl;
+ }
+
+ Y_UNIT_TEST(TestCompress) {
+ const auto codedFactors = fh::Encode(Factors);
+ UNIT_ASSERT_VALUES_EQUAL(codedFactors.size(), CodedSize);
+ for (size_t i = 0; i < Min(codedFactors.size(), CodedSize); ++i)
+ UNIT_ASSERT_VALUES_EQUAL((ui8)codedFactors[i], CodedFactors[i]);
+ //PrintCompressed(codedFactors);
+ }
+
+ Y_UNIT_TEST(TestSimpleDecompress) {
+ TVector<float> factors = fh::Decode(CodedFactorsBuf);
+ UNIT_ASSERT_VALUES_EQUAL(factors.size(), FactorCount);
+ for (size_t i = 0; i < Min(factors.size(), FactorCount); ++i)
+ UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[i]);
+ //PrintDecompressed(factors);
+ }
+
+ Y_UNIT_TEST(TestDecompressInParts) {
+ float factors[FactorCount];
+ FillWithGarbage(factors, FactorCount);
+ fh::TDecoder decoder(CodedFactorsBuf);
+ const size_t firstPack = 100;
+ // unpack first pack
+ UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors, firstPack}), firstPack);
+ // unpack all the rest
+ UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors + firstPack, FactorCount - firstPack}), FactorCount - firstPack);
+
+ for (size_t i = 0; i < FactorCount; ++i)
+ UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[i]);
+ //PrintDecompressed(factors);
+ }
+
+ Y_UNIT_TEST(TestSkip) {
+ float factors[FactorCount];
+ FillWithGarbage(factors, FactorCount);
+ fh::TDecoder decoder(CodedFactorsBuf);
+ const size_t firstPack = 100;
+ // unpack first pack
+ UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors, firstPack}), firstPack);
+ // skip some factors
+ const size_t skipCount = 60;
+ UNIT_ASSERT_VALUES_EQUAL(decoder.Skip(skipCount / 2), skipCount / 2);
+ // unpack all, except some factors in the end
+ const auto toDecode = FactorCount - firstPack - skipCount;
+ UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors + firstPack, toDecode}), toDecode);
+ UNIT_ASSERT_VALUES_EQUAL(decoder.Skip(skipCount / 2), skipCount / 2);
+ for (size_t i = 0; i < FactorCount - skipCount; ++i) {
+ size_t correctedI = i < firstPack ? i : i + skipCount / 2;
+ UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[correctedI]);
+ }
+ //PrintDecompressed(factors);
+ }
+
+ Y_UNIT_TEST(TestDecompressForgedData) {
+ // this coredumps without end-of-coded-stream check, see SEARCH-1156 for details
+ TString brokenBase64Encoded =
+ "NLjYltUWs5pqnd3d3f05Li4OAwCAEqrP6mv06jDt7PiAUVu7Y+PiMpuZmdzeM"
+ "ArqOLxS2q4FKCII52dktcVs7y0zL+OKgeO9SOzEkFj7uPfFqqoCAAAAAADAtZ"
+ "mZ2fdmICAgANQXhi1WVRUAAAAAAAAGjvcWq6oKAAAAAAAAA8d7qe4rV3Nxcd3"
+ "d4ZfQZrETm3B+OxxB8bbnTPM5+qtbQ92mJ3fHPGj+iH5+8tzcnJuamry1tWUw"
+ "MBD693f07+9+DQQEkIGAgIgPetzN5yEbAGxWpbCNxXK/0JGTKRz2KkIoR7aM";
+ UNIT_ASSERT_EXCEPTION(
+ fh::Decode(Base64Decode(brokenBase64Encoded)),
+ yexception);
+ }
+
+ Y_UNIT_TEST(TestDecompressEmpty) {
+ UNIT_ASSERT_EXCEPTION(fh::Decode({}), yexception);
+ }
+};
diff --git a/library/cpp/codecs/ut/tls_cache_ut.cpp b/library/cpp/codecs/ut/tls_cache_ut.cpp
new file mode 100644
index 0000000000..8101af761f
--- /dev/null
+++ b/library/cpp/codecs/ut/tls_cache_ut.cpp
@@ -0,0 +1,36 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/codecs/tls_cache.h>
+
+Y_UNIT_TEST_SUITE(CodecsBufferFactoryTest){
+ void AssignToBuffer(TBuffer & buf, TStringBuf val){
+ buf.Assign(val.data(), val.size());
+}
+
+TStringBuf AsStringBuf(const TBuffer& b) {
+ return TStringBuf(b.Data(), b.Size());
+}
+
+Y_UNIT_TEST(TestAcquireReleaseReuse) {
+ NCodecs::TBufferTlsCache factory;
+ // acquiring the first buffer
+ auto buf1 = factory.Item();
+ AssignToBuffer(buf1.Get(), "Buffer_01");
+ {
+ // acquiring the second buffer
+ auto buf2 = factory.Item();
+ AssignToBuffer(buf2.Get(), "Buffer_02");
+ }
+ // the first buffer should stay intact
+ UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01");
+ {
+ // reacquiring the last released buffer
+ // expecting it zero sized but having the same memory
+ auto buf2 = factory.Item();
+ UNIT_ASSERT_VALUES_EQUAL(buf2.Get().Size(), 0u);
+ buf2.Get().Resize(TStringBuf("Buffer_02").Size());
+ UNIT_ASSERT_EQUAL(AsStringBuf(buf2.Get()), "Buffer_02");
+ }
+ // when the factory dies we should see no leaks
+}
+}
+;
diff --git a/library/cpp/codecs/ut/ya.make b/library/cpp/codecs/ut/ya.make
new file mode 100644
index 0000000000..90841b05ef
--- /dev/null
+++ b/library/cpp/codecs/ut/ya.make
@@ -0,0 +1,20 @@
+UNITTEST()
+
+OWNER(
+ g:base
+ velavokr
+)
+
+PEERDIR(
+ library/cpp/string_utils/base64
+ library/cpp/codecs
+ library/cpp/string_utils/relaxed_escaper
+)
+
+SRCS(
+ tls_cache_ut.cpp
+ codecs_ut.cpp
+ float_huffman_ut.cpp
+)
+
+END()
diff --git a/library/cpp/codecs/ya.make b/library/cpp/codecs/ya.make
new file mode 100644
index 0000000000..7e76fb0c9a
--- /dev/null
+++ b/library/cpp/codecs/ya.make
@@ -0,0 +1,33 @@
+LIBRARY()
+
+OWNER(
+ g:base
+ velavokr
+)
+
+SRCS(
+ tls_cache.cpp
+ codecs.cpp
+ codecs_registry.cpp
+ comptable_codec.cpp
+ delta_codec.cpp
+ float_huffman.cpp
+ huffman_codec.cpp
+ pfor_codec.cpp
+ solar_codec.cpp
+ zstd_dict_codec.cpp
+)
+
+PEERDIR(
+ contrib/libs/zstd
+ library/cpp/bit_io
+ library/cpp/blockcodecs
+ library/cpp/codecs/greedy_dict
+ library/cpp/comptable
+ library/cpp/containers/comptrie
+ library/cpp/deprecated/accessors
+ library/cpp/packers
+ library/cpp/string_utils/relaxed_escaper
+)
+
+END()
diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp
new file mode 100644
index 0000000000..c42a2879e6
--- /dev/null
+++ b/library/cpp/codecs/zstd_dict_codec.cpp
@@ -0,0 +1,281 @@
+#include "zstd_dict_codec.h"
+
+#include <library/cpp/packers/packers.h>
+
+#include <util/generic/ptr.h>
+#include <util/generic/refcount.h>
+#include <util/generic/noncopyable.h>
+#include <util/string/builder.h>
+#include <util/system/src_location.h>
+#include <util/ysaveload.h>
+
+#define ZDICT_STATIC_LINKING_ONLY
+
+#include <contrib/libs/zstd/include/zdict.h>
+#include <contrib/libs/zstd/include/zstd.h>
+#include <contrib/libs/zstd/include/zstd_errors.h>
+
+// See IGNIETFERRO-320 for possible bugs
+
+namespace NCodecs {
+ class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> {
+ template <class T, size_t Deleter(T*)>
+ class TPtrHolder : TMoveOnly {
+ T* Ptr = nullptr;
+
+ public:
+ TPtrHolder() = default;
+
+ TPtrHolder(T* dict)
+ : Ptr(dict)
+ {
+ }
+
+ T* Get() {
+ return Ptr;
+ }
+
+ const T* Get() const {
+ return Ptr;
+ }
+
+ void Reset(T* dict) {
+ Dispose();
+ Ptr = dict;
+ }
+
+ void Dispose() {
+ if (Ptr) {
+ Deleter(Ptr);
+ Ptr = nullptr;
+ }
+ }
+
+ ~TPtrHolder() {
+ Dispose();
+ }
+ };
+
+ using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>;
+ using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>;
+ using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>;
+ using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>;
+
+ using TSizePacker = NPackers::TPacker<ui64>;
+
+ public:
+ static const ui32 SampleSize = (1 << 22) * 5;
+
+ explicit TImpl(ui32 comprLevel)
+ : CompressionLevel(comprLevel)
+ {
+ const size_t zeroSz = TSizePacker().MeasureLeaf(0);
+ Zero.Resize(zeroSz);
+ TSizePacker().PackLeaf(Zero.data(), 0, zeroSz);
+ }
+
+ ui32 GetCompressionLevel() const {
+ return CompressionLevel;
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& outbuf) const {
+ outbuf.Clear();
+
+ if (in.empty()) {
+ return 0;
+ }
+
+ TSizePacker packer;
+
+ const char* rawBeg = in.data();
+ const size_t rawSz = in.size();
+
+ const size_t szSz = packer.MeasureLeaf(rawSz);
+ const size_t maxDatSz = ZSTD_compressBound(rawSz);
+
+ outbuf.Resize(szSz + maxDatSz);
+ packer.PackLeaf(outbuf.data(), rawSz, szSz);
+
+ TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)};
+ const size_t resSz = CheckSize(ZSTD_compress_usingCDict(
+ ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()),
+ __LOCATION__);
+
+ if (resSz < rawSz) {
+ outbuf.Resize(resSz + szSz);
+ } else {
+ outbuf.Resize(Zero.size() + rawSz);
+ memcpy(outbuf.data(), Zero.data(), Zero.size());
+ memcpy(outbuf.data() + Zero.size(), rawBeg, rawSz);
+ }
+ return 0;
+ }
+
+ void Decode(TStringBuf in, TBuffer& outbuf) const {
+ outbuf.Clear();
+
+ if (in.empty()) {
+ return;
+ }
+
+ TSizePacker packer;
+
+ const char* rawBeg = in.data();
+ size_t rawSz = in.size();
+
+ const size_t szSz = packer.SkipLeaf(rawBeg);
+ ui64 datSz = 0;
+ packer.UnpackLeaf(rawBeg, datSz);
+
+ rawBeg += szSz;
+ rawSz -= szSz;
+
+ if (!datSz) {
+ outbuf.Resize(rawSz);
+ memcpy(outbuf.data(), rawBeg, rawSz);
+ } else {
+ // size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz);
+ // Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz);
+ outbuf.Resize(datSz);
+ TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)};
+ CheckSize(ZSTD_decompress_usingDDict(
+ ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()),
+ __LOCATION__);
+ outbuf.Resize(datSz);
+ }
+ }
+
+ bool Learn(ISequenceReader& in, bool throwOnError) {
+ TBuffer data;
+ TVector<size_t> lens;
+
+ data.Reserve(2 * SampleSize);
+ TStringBuf r;
+ while (in.NextRegion(r)) {
+ if (!r) {
+ continue;
+ }
+ data.Append(r.data(), r.size());
+ lens.push_back(r.size());
+ }
+
+ ZDICT_legacy_params_t params;
+ memset(&params, 0, sizeof(params));
+ params.zParams.compressionLevel = 1;
+ params.zParams.notificationLevel = 1;
+ Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size()));
+
+ if (!lens) {
+ Dict.Reset();
+ } else {
+ size_t trainResult = ZDICT_trainFromBuffer_legacy(
+ Dict.data(), Dict.size(), data.Data(), const_cast<const size_t*>(&lens[0]), lens.size(), params);
+ if (ZSTD_isError(trainResult)) {
+ if (!throwOnError) {
+ return false;
+ }
+ CheckSize(trainResult, __LOCATION__);
+ }
+ Dict.Resize(trainResult);
+ Dict.ShrinkToFit();
+ }
+ InitContexts();
+ return true;
+ }
+
+ void Save(IOutputStream* out) const {
+ ::Save(out, Dict);
+ }
+
+ void Load(IInputStream* in) {
+ ::Load(in, Dict);
+ InitContexts();
+ }
+
+ void InitContexts() {
+ CDict.Reset(CheckPtr(ZSTD_createCDict(Dict.data(), Dict.size(), CompressionLevel), __LOCATION__));
+ DDict.Reset(CheckPtr(ZSTD_createDDict(Dict.data(), Dict.size()), __LOCATION__));
+ }
+
+ static size_t CheckSize(size_t sz, TSourceLocation loc) {
+ if (ZSTD_isError(sz)) {
+ ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")";
+ }
+ return sz;
+ }
+
+ template <class T>
+ static T* CheckPtr(T* t, TSourceLocation loc) {
+ Y_ENSURE_EX(t, TCodecException() << loc << " "
+ << "unexpected nullptr");
+ return t;
+ }
+
+ private:
+ ui32 CompressionLevel = 1;
+
+ TBuffer Zero;
+ TBuffer Dict;
+
+ TCDict CDict;
+ TDDict DDict;
+ };
+
+ TZStdDictCodec::TZStdDictCodec(ui32 comprLevel)
+ : Impl(new TImpl(comprLevel))
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.SizeOnEncodeMultiplier = 2;
+ MyTraits.SizeOnDecodeMultiplier = 10;
+ MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar
+ }
+
+ TZStdDictCodec::~TZStdDictCodec() {
+ }
+
+ TString TZStdDictCodec::GetName() const {
+ return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel();
+ }
+
+ ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const {
+ return Impl->Encode(in, out);
+ }
+
+ void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const {
+ Impl->Decode(in, out);
+ }
+
+ void TZStdDictCodec::DoLearn(ISequenceReader& in) {
+ Impl = new TImpl(Impl->GetCompressionLevel());
+ Impl->Learn(in, true/*throwOnError*/);
+ }
+
+ bool TZStdDictCodec::DoTryToLearn(ISequenceReader& in) {
+ Impl = new TImpl(Impl->GetCompressionLevel());
+ return Impl->Learn(in, false/*throwOnError*/);
+ }
+
+ void TZStdDictCodec::Save(IOutputStream* out) const {
+ Impl->Save(out);
+ }
+
+ void TZStdDictCodec::Load(IInputStream* in) {
+ Impl->Load(in);
+ }
+
+ TVector<TString> TZStdDictCodec::ListCompressionNames() {
+ TVector<TString> res;
+ for (int i = 1; i <= ZSTD_maxCLevel(); ++i) {
+ res.emplace_back(TStringBuilder() << MyName() << "-" << i);
+ }
+ return res;
+ }
+
+ int TZStdDictCodec::ParseCompressionName(TStringBuf name) {
+ int c = 0;
+ TryFromString(name.After('-'), c);
+ Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name);
+ return c;
+ }
+
+}
diff --git a/library/cpp/codecs/zstd_dict_codec.h b/library/cpp/codecs/zstd_dict_codec.h
new file mode 100644
index 0000000000..59c1ad6c60
--- /dev/null
+++ b/library/cpp/codecs/zstd_dict_codec.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "codecs.h"
+
+#include <util/generic/ptr.h>
+
+namespace NCodecs {
+ // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655
+
+ class TZStdDictCodec: public ICodec {
+ class TImpl;
+ TIntrusivePtr<TImpl> Impl;
+
+ public:
+ explicit TZStdDictCodec(ui32 comprLevel = 1);
+ ~TZStdDictCodec() override;
+
+ static TStringBuf MyName() {
+ return "zstd08d";
+ }
+
+ TString GetName() const override;
+
+ ui8 Encode(TStringBuf in, TBuffer& out) const override;
+
+ void Decode(TStringBuf in, TBuffer& out) const override;
+
+ static TVector<TString> ListCompressionNames();
+ static int ParseCompressionName(TStringBuf);
+
+ protected:
+ void DoLearn(ISequenceReader& in) override;
+ bool DoTryToLearn(ISequenceReader& in) final;
+ void Save(IOutputStream* out) const override;
+ void Load(IInputStream* in) override;
+ };
+
+}