diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs')
66 files changed, 6758 insertions, 0 deletions
diff --git a/library/cpp/codecs/README.md b/library/cpp/codecs/README.md new file mode 100644 index 0000000000..42646ccd97 --- /dev/null +++ b/library/cpp/codecs/README.md @@ -0,0 +1,46 @@ +This is a library of compression algorithms with a unified interface and serialization. +See also library/cpp/codecs/static, where a support for statically compiled dictionaries is implemented. + +All algorithms have a common `ICodec` interface (described in codecs.h). + +The `ICodec` interface has the following methods:\ + `virtual ui8 ICodec::Encode (TMemoryRegion, TBuffer&) const;`\ + - Input - memory region. Output - filled buffer and the rest of the last byte, if it was not filled to the end.\ + `virtual void ICodec::Decode (TMemoryRegion, TBuffer&) const;`\ + - Input - memory region. Output - filled buffer.\ + `virtual void Save (TOutputStream*) const;`\ + - Serialization.\ + `virtual void Load (TInputStream*);`\ + - Deserialization.\ + `virtual bool NeedsTraining() const;`\ + - Returns if it is necessary or not to teach this codec.\ + `virtual bool PaddingBit() const;`\ + - Returns which values should fill the free bits of the last byte.\ + `virtual size_t ApproximateSizeOnEncode(size_t sz) const;`\ + - Returns an approximate estimate of the size of the result after encoding.\ + For example could be used for a more accurate preallocation of a buffer.\ + `virtual size_t ApproximateSizeOnDecode(size_t sz) const;`\ + - Returns an approximate estimate of the size of the result after decoding.\ + For example could be used for a more accurate preallocation of a buffer.\ + `virtual TString GetName() const;`\ + - The name of the codec. It is required for registration of the codec in the system of serialization/deserialization.\ + For example, it allows you to save information about which combination of codecs was in use (see below).\ + `virtual void Learn(ISequenceReader*);`\ + - The interface for teaching codecs that use information about the distribution of data. + +In addition, the library has a number of utilities that allow a more flexible use of it. + +In the `ICodec` class the following methods are available:\ + `static TCodecPtr GetInstance(const TString& name);`\ + - Creation of a codec instance by a symbolic name\ + (See `GetName()` above)\ + `static void Store(TOutputStream*, TCodecPtr p);`\ + - Serialization of the codec along with its name\ + Allows you to save information about which particular combination of codecs was used for encoding,\ + and then reassemble the same combination for decoding\ + `static TCodecPtr Restore(TInputStream* in);`\ + - Serialization of the codec along with its name\ + `static TCodecPtr RestoreFromString(TStringBuf data);`\ + - Loads the codec instance from the string\ + `static TVector<TString> GetCodecsList();`\ + - The list of registered codecs diff --git a/library/cpp/codecs/codecs.cpp b/library/cpp/codecs/codecs.cpp new file mode 100644 index 0000000000..b17a3156d2 --- /dev/null +++ b/library/cpp/codecs/codecs.cpp @@ -0,0 +1,190 @@ +#include "codecs.h" +#include "tls_cache.h" + +#include <util/stream/mem.h> + +namespace NCodecs { + void ICodec::Store(IOutputStream* out, TCodecPtr p) { + if (!p.Get()) { + ::Save(out, (ui16)0); + return; + } + + Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName()); + const TString& n = p->GetName(); + Y_VERIFY(n.size() <= Max<ui16>()); + ::Save(out, (ui16)n.size()); + out->Write(n.data(), n.size()); + p->Save(out); + } + + TCodecPtr ICodec::Restore(IInputStream* in) { + ui16 l = 0; + ::Load(in, l); + + if (!l) { + return nullptr; + } + + TString n; + n.resize(l); + + Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException()); + + TCodecPtr p = ICodec::GetInstance(n); + p->Load(in); + p->Trained = true; + return p; + } + + TCodecPtr ICodec::RestoreFromString(TStringBuf s) { + TMemoryInput minp{s.data(), s.size()}; + return Restore(&minp); + } + + TString ICodec::GetNameSafe(TCodecPtr p) { + return !p ? TString("none") : p->GetName(); + } + + ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const { + size_t res = Traits().ApproximateSizeOnEncode(in.size()); + out.Reserve(res); + out.Clear(); + + if (Pipeline.empty()) { + out.Append(in.data(), in.size()); + return 0; + } else if (Pipeline.size() == 1) { + return Pipeline.front()->Encode(in, out); + } + + ui8 freelastbits = 0; + + auto buffer = TBufferTlsCache::TlsInstance().Item(); + TBuffer& tmp = buffer.Get(); + tmp.Reserve(res); + + for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) { + if (it != Pipeline.begin()) { + tmp.Clear(); + tmp.Swap(out); + in = TStringBuf{tmp.data(), tmp.size()}; + } + freelastbits = (*it)->Encode(in, out); + } + + return freelastbits; + } + + void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const { + size_t res = Traits().ApproximateSizeOnDecode(in.size()); + out.Reserve(res); + out.Clear(); + + if (Pipeline.empty()) { + out.Append(in.data(), in.size()); + return; + } else if (Pipeline.size() == 1) { + Pipeline.front()->Decode(in, out); + return; + } + + auto buffer = TBufferTlsCache::TlsInstance().Item(); + + TBuffer& tmp = buffer.Get(); + tmp.Reserve(res); + + for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) { + if (it != Pipeline.rbegin()) { + tmp.Clear(); + tmp.Swap(out); + in = TStringBuf{tmp.data(), tmp.size()}; + } + (*it)->Decode(in, out); + } + } + + void TPipelineCodec::Save(IOutputStream* out) const { + for (const auto& it : Pipeline) + it->Save(out); + } + + void TPipelineCodec::Load(IInputStream* in) { + for (const auto& it : Pipeline) { + it->Load(in); + it->SetTrained(true); + } + } + + void TPipelineCodec::SetTrained(bool t) { + for (const auto& it : Pipeline) { + it->SetTrained(t); + } + } + + TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) { + if (!codec) + return *this; + + TCodecTraits tr = codec->Traits(); + + if (!MyName) { + MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput; + MyTraits.SizeOfInputElement = tr.SizeOfInputElement; + } else { + MyName.append(':'); + } + + MyName.append(codec->GetName()); + MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping; + MyTraits.PaddingBit = tr.PaddingBit; + MyTraits.NeedsTraining |= tr.NeedsTraining; + MyTraits.Irreversible |= tr.Irreversible; + MyTraits.SizeOnEncodeAddition = MyTraits.SizeOnEncodeAddition * tr.SizeOnEncodeMultiplier + tr.SizeOnEncodeAddition; + MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier; + MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier; + MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize); + + Pipeline.push_back(codec); + return *this; + } + + void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) { + if (!Traits().NeedsTraining) { + return; + } + + if (Pipeline.size() == 1) { + Pipeline.back()->Learn(in); + return; + } + + TVector<TBuffer> trainingInput; + + TStringBuf r; + while (in.NextRegion(r)) { + trainingInput.emplace_back(r.data(), r.size()); + } + + TBuffer buff; + for (const auto& it : Pipeline) { + it->LearnX(trainingInput.begin(), trainingInput.end(), sampleSizeMult); + + for (auto& bit : trainingInput) { + buff.Clear(); + it->Encode(TStringBuf{bit.data(), bit.size()}, buff); + buff.Swap(bit); + } + } + } + + bool TPipelineCodec::AlreadyTrained() const { + for (const auto& it : Pipeline) { + if (!it->AlreadyTrained()) + return false; + } + + return true; + } + +} diff --git a/library/cpp/codecs/codecs.h b/library/cpp/codecs/codecs.h new file mode 100644 index 0000000000..cc5e72b285 --- /dev/null +++ b/library/cpp/codecs/codecs.h @@ -0,0 +1,259 @@ +#pragma once + +#include "sample.h" + +#include <util/generic/bt_exception.h> +#include <util/generic/hash.h> +#include <util/generic/ptr.h> +#include <util/generic/singleton.h> + +#include <util/stream/input.h> +#include <util/stream/output.h> + +#include <util/string/cast.h> +#include <util/string/vector.h> +#include <util/system/tls.h> +#include <util/ysaveload.h> + +namespace NCodecs { + class TCodecException: public TWithBackTrace<yexception> {}; + + class ICodec; + + using TCodecPtr = TIntrusivePtr<ICodec>; + using TCodecConstPtr = TIntrusiveConstPtr<ICodec>; + + struct TCodecTraits { + ui32 RecommendedSampleSize = 0; + ui16 SizeOfInputElement = 1; + ui8 SizeOnEncodeMultiplier = 1; + ui8 SizeOnEncodeAddition = 0; + ui8 SizeOnDecodeMultiplier = 1; + + bool NeedsTraining = false; + bool PreservesPrefixGrouping = false; + bool Irreversible = false; + bool PaddingBit = 0; + bool AssumesStructuredInput = false; + + size_t ApproximateSizeOnEncode(size_t sz) const { + return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition; + } + + size_t ApproximateSizeOnDecode(size_t sz) const { + return sz * SizeOnDecodeMultiplier; + } + }; + + class ICodec: public TAtomicRefCount<ICodec> { + protected: + bool Trained = false; + TCodecTraits MyTraits; + + public: + TCodecTraits Traits() const { + return MyTraits; + } + + // the name of the codec (or its variant) to be used in the codec registry + virtual TString GetName() const = 0; + + virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0; + virtual ui8 Encode(const TBuffer& input, TBuffer& output) const { + return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output); + } + virtual void Decode(TStringBuf, TBuffer&) const = 0; + virtual void Decode(const TBuffer& input, TBuffer& output) const { + Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output); + } + + virtual ~ICodec() = default; + + virtual bool AlreadyTrained() const { + return !Traits().NeedsTraining || Trained; + } + virtual void SetTrained(bool t) { + Trained = t; + } + + bool TryToLearn(ISequenceReader& r) { + Trained = DoTryToLearn(r); + return Trained; + } + + void Learn(ISequenceReader& r) { + LearnX(r, 1); + } + + template <class TIter> + void Learn(TIter beg, TIter end) { + Learn(beg, end, IterToStringBuf<TIter>); + } + + template <class TIter, class TGetter> + void Learn(TIter beg, TIter end, TGetter getter) { + auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter); + TSimpleSequenceReader<TBuffer> reader{sample}; + Learn(reader); + } + + static TCodecPtr GetInstance(TStringBuf name); + + static TVector<TString> GetCodecsList(); + + static TString GetNameSafe(TCodecPtr p); + + static void Store(IOutputStream* out, TCodecPtr p); + static TCodecPtr Restore(IInputStream* in); + static TCodecPtr RestoreFromString(TStringBuf); + + protected: + virtual void DoLearn(ISequenceReader&) = 0; + + virtual bool DoTryToLearn(ISequenceReader& r) { + DoLearn(r); + return true; + } + + // so the pipeline codec will know to adjust the sample for the subcodecs + virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) { + DoLearn(r); + } + + virtual void Save(IOutputStream*) const { + } + virtual void Load(IInputStream*) { + } + friend class TPipelineCodec; + + public: + // so the pipeline codec will know to adjust the sample for the subcodecs + void LearnX(ISequenceReader& r, double sampleSizeMult) { + DoLearnX(r, sampleSizeMult); + Trained = true; + } + + template <class TIter> + void LearnX(TIter beg, TIter end, double sampleSizeMult) { + auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult); + TSimpleSequenceReader<TBuffer> reader{sample}; + LearnX(reader, sampleSizeMult); + } + }; + + class TBasicTrivialCodec: public ICodec { + public: + ui8 Encode(TStringBuf in, TBuffer& out) const override { + out.Assign(in.data(), in.size()); + return 0; + } + + void Decode(TStringBuf in, TBuffer& out) const override { + Encode(in, out); + } + + protected: + void DoLearn(ISequenceReader&) override { + } + }; + + class TTrivialCodec: public TBasicTrivialCodec { + public: + TTrivialCodec() { + MyTraits.PreservesPrefixGrouping = true; + } + + static TStringBuf MyName() { + return "trivial"; + } + + TString GetName() const override { + return ToString(MyName()); + } + }; + + class TTrivialTrainableCodec: public TBasicTrivialCodec { + public: + TTrivialTrainableCodec() { + MyTraits.PreservesPrefixGrouping = true; + MyTraits.NeedsTraining = true; + } + + static TStringBuf MyName() { + return "trivial-trainable"; + } + + TString GetName() const override { + return ToString(MyName()); + } + }; + + class TNullCodec: public ICodec { + public: + TNullCodec() { + MyTraits.Irreversible = true; + MyTraits.SizeOnDecodeMultiplier = 0; + MyTraits.SizeOnEncodeMultiplier = 0; + } + + TString GetName() const override { + return "null"; + } + + ui8 Encode(TStringBuf, TBuffer& out) const override { + out.Clear(); + return 0; + } + + void Decode(TStringBuf, TBuffer& out) const override { + out.Clear(); + } + + protected: + void DoLearn(ISequenceReader&) override { + } + }; + + class TPipelineCodec: public ICodec { + typedef TVector<TCodecPtr> TPipeline; + + TPipeline Pipeline; + TString MyName; + + public: + explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) { + MyTraits.PreservesPrefixGrouping = true; + AddCodec(c0); + AddCodec(c1); + AddCodec(c2); + AddCodec(c3); + } + + TString GetName() const override { + return MyName; + } + + ui8 Encode(TStringBuf in, TBuffer& out) const override; + void Decode(TStringBuf in, TBuffer& out) const override; + + public: + /* + * Add codecs in the following order: + * uncompressed -> codec0 | codec1 | ... | codecN -> compressed + */ + TPipelineCodec& AddCodec(TCodecPtr codec); + + bool AlreadyTrained() const override; + void SetTrained(bool t) override; + + protected: + void DoLearn(ISequenceReader& in) override { + DoLearnX(in, 1); + } + + void DoLearnX(ISequenceReader& in, double sampleSizeMult) override; + void Save(IOutputStream* out) const override; + void Load(IInputStream* in) override; + }; + +} diff --git a/library/cpp/codecs/codecs_registry.cpp b/library/cpp/codecs/codecs_registry.cpp new file mode 100644 index 0000000000..6fd7ff67b0 --- /dev/null +++ b/library/cpp/codecs/codecs_registry.cpp @@ -0,0 +1,226 @@ +#include "codecs_registry.h" +#include "delta_codec.h" +#include "huffman_codec.h" +#include "pfor_codec.h" +#include "solar_codec.h" +#include "comptable_codec.h" +#include "zstd_dict_codec.h" + +#include <library/cpp/blockcodecs/codecs.h> + +#include <util/string/builder.h> +#include <util/string/cast.h> + +namespace NCodecs { + TCodecPtr ICodec::GetInstance(TStringBuf name) { + return Default<NPrivate::TCodecRegistry>().GetCodec(name); + } + + TVector<TString> ICodec::GetCodecsList() { + return Default<NPrivate::TCodecRegistry>().GetCodecsList(); + } + + namespace NPrivate { + void TCodecRegistry::RegisterFactory(TFactoryPtr fac) { + TVector<TString> names = fac->ListNames(); + for (const auto& name : names) { + Y_VERIFY(!Registry.contains(name), "already has %s", name.data()); + Registry[name] = fac; + } + } + + TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const { + using namespace NPrivate; + + if (!name || "none" == name) { + return nullptr; + } + + if (TStringBuf::npos == name.find(':')) { + Y_ENSURE_EX(Registry.contains(name), TNoCodecException(name)); + return Registry.find(name)->second->MakeCodec(name); + } else { + TPipelineCodec* pipe = new TPipelineCodec; + + do { + TStringBuf v = name.NextTok(':'); + pipe->AddCodec(GetCodec(v)); + } while (name); + + return pipe; + } + } + + TVector<TString> TCodecRegistry::GetCodecsList() const { + using namespace NPrivate; + TVector<TString> vs; + vs.push_back("none"); + + for (const auto& it : Registry) { + vs.push_back(it.first); + } + + Sort(vs.begin(), vs.end()); + return vs; + } + + struct TSolarCodecFactory : ICodecFactory { + TCodecPtr MakeCodec(TStringBuf name) const override { + if (TSolarCodec::MyNameShortInt() == name) { + return new TSolarCodecShortInt(); + } + if (TSolarCodec::MyName() == name) { + return new TSolarCodec(); + } + if (name.EndsWith(TStringBuf("-a"))) { + return MakeCodecImpl<TAdaptiveSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size()).Chop(2)); + } else { + return MakeCodecImpl<TSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size())); + } + } + + template <class TCodecCls> + TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const { + if (TStringBuf("-8k") == type) { + return new TCodecCls(1 << 13); + } + if (TStringBuf("-16k") == type) { + return new TCodecCls(1 << 14); + } + if (TStringBuf("-32k") == type) { + return new TCodecCls(1 << 15); + } + if (TStringBuf("-64k") == type) { + return new TCodecCls(1 << 16); + } + if (TStringBuf("-256k") == type) { + return new TCodecCls(1 << 18); + } + ythrow TNoCodecException(name); + } + + TVector<TString> ListNames() const override { + TVector<TString> vs; + vs.push_back(ToString(TSolarCodec::MyName())); + vs.push_back(ToString(TSolarCodec::MyName8k())); + vs.push_back(ToString(TSolarCodec::MyName16k())); + vs.push_back(ToString(TSolarCodec::MyName32k())); + vs.push_back(ToString(TSolarCodec::MyName64k())); + vs.push_back(ToString(TSolarCodec::MyName256k())); + vs.push_back(ToString(TSolarCodec::MyName8kAdapt())); + vs.push_back(ToString(TSolarCodec::MyName16kAdapt())); + vs.push_back(ToString(TSolarCodec::MyName32kAdapt())); + vs.push_back(ToString(TSolarCodec::MyName64kAdapt())); + vs.push_back(ToString(TSolarCodec::MyName256kAdapt())); + vs.push_back(ToString(TSolarCodec::MyNameShortInt())); + return vs; + } + }; + + struct TZStdDictCodecFactory : ICodecFactory { + TCodecPtr MakeCodec(TStringBuf name) const override { + return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name)); + } + + TVector<TString> ListNames() const override { + return TZStdDictCodec::ListCompressionNames(); + } + }; + + struct TCompTableCodecFactory : ICodecFactory { + TCodecPtr MakeCodec(TStringBuf name) const override { + if (TCompTableCodec::MyNameHQ() == name) { + return new TCompTableCodec(TCompTableCodec::Q_HIGH); + } else if (TCompTableCodec::MyNameLQ() == name) { + return new TCompTableCodec(TCompTableCodec::Q_LOW); + } else { + Y_ENSURE_EX(false, TNoCodecException(name)); + return nullptr; + } + } + + TVector<TString> ListNames() const override { + TVector<TString> vs; + vs.push_back(ToString(TCompTableCodec::MyNameHQ())); + vs.push_back(ToString(TCompTableCodec::MyNameLQ())); + return vs; + } + }; + + struct TBlockCodec : ICodec { + const NBlockCodecs::ICodec* Codec; + + TBlockCodec(TStringBuf name) + : Codec(NBlockCodecs::Codec(name)) + { + } + + TString GetName() const override { + return ToString(Codec->Name()); + } + + ui8 Encode(TStringBuf r, TBuffer& b) const override { + Codec->Encode(r, b); + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + // TODO: throws exception that is not TCodecException + Codec->Decode(r, b); + } + + protected: + void DoLearn(ISequenceReader&) override { + } + }; + + struct TBlockCodecsFactory : ICodecFactory { + using TRegistry = THashMap<TString, TCodecPtr>; + TRegistry Registry; + + TBlockCodecsFactory() { + for (TStringBuf codec : NBlockCodecs::ListAllCodecs()) { + Register(codec); + } + } + + void Register(TStringBuf name) { + TCodecPtr p = Registry[name] = new TBlockCodec(name); + Registry[p->GetName()] = p; + } + + TCodecPtr MakeCodec(TStringBuf name) const override { + if (!Registry.contains(name)) { + ythrow TNoCodecException(name); + } + return Registry.find(name)->second; + } + + TVector<TString> ListNames() const override { + TVector<TString> res; + for (const auto& it : Registry) { + res.push_back(it.first); + } + return res; + } + }; + + TCodecRegistry::TCodecRegistry() { + RegisterFactory(new TInstanceFactory<TTrivialCodec>); + RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>); + RegisterFactory(new TInstanceFactory<THuffmanCodec>); + RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>); + RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>); + RegisterFactory(new TSolarCodecFactory); + RegisterFactory(new TZStdDictCodecFactory); + RegisterFactory(new TCompTableCodecFactory); + RegisterFactory(new TBlockCodecsFactory); + } + + } + + void RegisterCodecFactory(TCodecFactoryPtr fact) { + Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact); + } + +} diff --git a/library/cpp/codecs/codecs_registry.h b/library/cpp/codecs/codecs_registry.h new file mode 100644 index 0000000000..53710310d5 --- /dev/null +++ b/library/cpp/codecs/codecs_registry.h @@ -0,0 +1,60 @@ +#pragma once + +#include "codecs.h" +#include <util/string/cast.h> + +namespace NCodecs { + struct TNoCodecException : TCodecException { + TNoCodecException(TStringBuf name) { + (*this) << "unknown codec: " << name; + } + }; + + struct ICodecFactory : TAtomicRefCount<ICodecFactory> { + virtual ~ICodecFactory() = default; + virtual TCodecPtr MakeCodec(TStringBuf name) const = 0; + virtual TVector<TString> ListNames() const = 0; + }; + + typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr; + + namespace NPrivate { + template <typename TCodec> + struct TInstanceFactory : ICodecFactory { + TCodecPtr MakeCodec(TStringBuf) const override { + return new TCodec; + } + + TVector<TString> ListNames() const override { + TVector<TString> vs; + vs.push_back(ToString(TCodec::MyName())); + return vs; + } + }; + + class TCodecRegistry { + using TRegistry = THashMap<TString, TIntrusivePtr<ICodecFactory>>; + TRegistry Registry; + + public: + using TFactoryPtr = TIntrusivePtr<ICodecFactory>; + + TCodecRegistry(); + + void RegisterFactory(TFactoryPtr fac); + + TCodecPtr GetCodec(TStringBuf name) const; + + TVector<TString> GetCodecsList() const; + }; + + } + + void RegisterCodecFactory(TCodecFactoryPtr fact); + + template <typename TCodec> + void RegisterCodec() { + RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>()); + } + +} diff --git a/library/cpp/codecs/comptable_codec.cpp b/library/cpp/codecs/comptable_codec.cpp new file mode 100644 index 0000000000..476b8ada80 --- /dev/null +++ b/library/cpp/codecs/comptable_codec.cpp @@ -0,0 +1,108 @@ +#include "comptable_codec.h" + +#include <library/cpp/comptable/comptable.h> +#include <util/string/cast.h> + +namespace NCodecs { + class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> { + public: + TImpl(EQuality q) + : Quality(q) + { + } + + void Init() { + Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table}); + Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table}); + } + + ui8 Encode(TStringBuf in, TBuffer& out) const { + out.Clear(); + if (!in) { + return 0; + } + + TVector<char> result; + Compressor->Compress(in, &result); + out.Assign(&result[0], result.size()); + return 0; + } + + void Decode(TStringBuf in, TBuffer& out) const { + out.Clear(); + if (!in) { + return; + } + + TVector<char> result; + Decompressor->Decompress(in, &result); + out.Assign(&result[0], result.size()); + } + + void DoLearn(ISequenceReader& in) { + NCompTable::TDataSampler sampler; + TStringBuf region; + while (in.NextRegion(region)) { + if (!region) { + continue; + } + + sampler.AddStat(region); + } + + sampler.BuildTable(Table); + Init(); + } + + void Save(IOutputStream* out) const { + ::Save(out, Table); + } + + void Load(IInputStream* in) { + ::Load(in, Table); + Init(); + } + + NCompTable::TCompressorTable Table; + THolder<NCompTable::TChunkCompressor> Compressor; + THolder<NCompTable::TChunkDecompressor> Decompressor; + const EQuality Quality; + static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5); + }; + + TCompTableCodec::TCompTableCodec(EQuality q) + : Impl(new TImpl{q}) + { + MyTraits.NeedsTraining = true; + MyTraits.SizeOnEncodeMultiplier = 2; + MyTraits.SizeOnDecodeMultiplier = 10; + MyTraits.RecommendedSampleSize = TImpl::SampleSize; + } + + TCompTableCodec::~TCompTableCodec() = default; + + TString TCompTableCodec::GetName() const { + return ToString(Impl->Quality ? MyNameHQ() : MyNameLQ()); + } + + ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const { + return Impl->Encode(in, out); + } + + void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const { + Impl->Decode(in, out); + } + + void TCompTableCodec::DoLearn(ISequenceReader& in) { + Impl->DoLearn(in); + } + + void TCompTableCodec::Save(IOutputStream* out) const { + Impl->Save(out); + } + + void TCompTableCodec::Load(IInputStream* in) { + Impl->Load(in); + } + +} diff --git a/library/cpp/codecs/comptable_codec.h b/library/cpp/codecs/comptable_codec.h new file mode 100644 index 0000000000..7ba4f4c543 --- /dev/null +++ b/library/cpp/codecs/comptable_codec.h @@ -0,0 +1,40 @@ +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> + +namespace NCodecs { + class TCompTableCodec: public ICodec { + class TImpl; + TIntrusivePtr<TImpl> Impl; + + public: + enum EQuality { + Q_LOW = 0, + Q_HIGH = 1 + }; + + explicit TCompTableCodec(EQuality q = Q_HIGH); + ~TCompTableCodec() override; + + static TStringBuf MyNameHQ() { + return "comptable-hq"; + } + static TStringBuf MyNameLQ() { + return "comptable-lq"; + } + + TString GetName() const override; + + ui8 Encode(TStringBuf in, TBuffer& out) const override; + + void Decode(TStringBuf in, TBuffer& out) const override; + + protected: + void DoLearn(ISequenceReader& in) override; + void Save(IOutputStream* out) const override; + void Load(IInputStream* in) override; + }; + +} diff --git a/library/cpp/codecs/delta_codec.cpp b/library/cpp/codecs/delta_codec.cpp new file mode 100644 index 0000000000..61606d6f6f --- /dev/null +++ b/library/cpp/codecs/delta_codec.cpp @@ -0,0 +1,21 @@ +#include "delta_codec.h" + +namespace NCodecs { + template <> + TStringBuf TDeltaCodec<ui64, true>::MyName() { + return "delta64-unsigned"; + } + template <> + TStringBuf TDeltaCodec<ui32, true>::MyName() { + return "delta32-unsigned"; + } + template <> + TStringBuf TDeltaCodec<ui64, false>::MyName() { + return "delta64-signed"; + } + template <> + TStringBuf TDeltaCodec<ui32, false>::MyName() { + return "delta32-signed"; + } + +} diff --git a/library/cpp/codecs/delta_codec.h b/library/cpp/codecs/delta_codec.h new file mode 100644 index 0000000000..21325825e6 --- /dev/null +++ b/library/cpp/codecs/delta_codec.h @@ -0,0 +1,143 @@ +#pragma once + +#include "codecs.h" + +#include <util/generic/array_ref.h> +#include <util/generic/typetraits.h> +#include <util/generic/bitops.h> +#include <util/string/cast.h> + +namespace NCodecs { + template <typename T = ui64, bool UnsignedDelta = true> + class TDeltaCodec: public ICodec { + static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value"); + + public: + using TUnsigned = std::make_unsigned_t<T>; + using TSigned = std::make_signed_t<T>; + using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>; + + private: + const TDelta MinDelta{Min<TDelta>()}; + const TDelta MaxDelta{Max<TDelta>() - 1}; + const TDelta InvalidDelta{MaxDelta + 1}; + + Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) { + return a + b; + } + + Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) { + return a - b; + } + + public: + struct TDecoder { + const TDelta InvalidDelta{Max<TDelta>()}; + + T Last = 0; + T Result = 0; + + bool First = true; + bool Invalid = false; + + Y_FORCE_INLINE bool Decode(TDelta t) { + if (Y_UNLIKELY(First)) { + First = false; + Result = Last = t; + return true; + } + + if (Y_UNLIKELY(Invalid)) { + Invalid = false; + Last = 0; + Result = t; + return true; + } + + Result = (Last += t); + Invalid = t == InvalidDelta; + + return !Invalid; + } + }; + + public: + static TStringBuf MyName(); + + TDeltaCodec() { + MyTraits.SizeOfInputElement = sizeof(T); + MyTraits.AssumesStructuredInput = true; + } + + TString GetName() const override { + return ToString(MyName()); + } + + template <class TItem> + static void AppendTo(TBuffer& b, TItem t) { + b.Append((char*)&t, sizeof(t)); + } + + ui8 Encode(TStringBuf s, TBuffer& b) const override { + b.Clear(); + if (s.empty()) { + return 0; + } + + b.Reserve(s.size()); + TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; + + const T* it = tin.begin(); + TDelta last = *(it++); + AppendTo(b, last); + + TDelta maxt = SubSafe(MaxDelta, last); + TDelta mint = AddSafe(MinDelta, last); + + for (; it != tin.end(); ++it) { + TDelta t = *it; + + if (Y_LIKELY((t >= mint) & (t <= maxt))) { + AppendTo(b, t - last); + last = t; + maxt = SubSafe(MaxDelta, last); + mint = AddSafe(MinDelta, last); + } else { + // delta overflow + AppendTo(b, InvalidDelta); + AppendTo(b, t); + last = 0; + maxt = MaxDelta; + mint = MinDelta; + } + } + + return 0; + } + + void Decode(TStringBuf s, TBuffer& b) const override { + b.Clear(); + if (s.empty()) { + return; + } + + b.Reserve(s.size()); + TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; + + TDecoder dec; + + for (const T* it = tin.begin(); it != tin.end(); ++it) { + T tmp; + memcpy(&tmp, it, sizeof(tmp)); + if (dec.Decode(tmp)) { + AppendTo(b, dec.Result); + } + } + } + + protected: + void DoLearn(ISequenceReader&) override { + } + }; + +} diff --git a/library/cpp/codecs/float_huffman.cpp b/library/cpp/codecs/float_huffman.cpp new file mode 100644 index 0000000000..c4a8bd228f --- /dev/null +++ b/library/cpp/codecs/float_huffman.cpp @@ -0,0 +1,333 @@ +#include "float_huffman.h" + +#include <util/generic/array_ref.h> +#include <util/generic/bitops.h> +#include <util/generic/cast.h> +#include <util/generic/yexception.h> +#include <util/system/unaligned_mem.h> +#include <util/system/yassert.h> +#include <util/stream/format.h> + +namespace NCodecs::NFloatHuff { + namespace { + struct THuffEntry { + ui32 CodeBase; + ui16 Prefix; + int PrefLength; + int CodeLength; + int TotalLength; + ui32 Mask; + ui64 Offset; + + THuffEntry() = default; + + constexpr THuffEntry(ui32 codeBase, ui16 prefix, int prefLength, int codeLength) + : CodeBase(codeBase) + , Prefix(prefix) + , PrefLength(prefLength) + , CodeLength(codeLength) + , TotalLength(prefLength + codeLength) + , Mask(Mask64(codeLength)) + , Offset(-(ui64(codeBase) << prefLength) + prefix) + {} + + bool Fit(ui32 code) const { + return code >= CodeBase && code < CodeBase + (1ULL << CodeLength); + } + }; + + // NB. There is a typo in the penultimate line (34 instead of 24). It was there from the very + // first commit and we cannot fix it without breaking all the clients. + constexpr THuffEntry entries[16] = { + {0x00000000, 0x01, 1, 0}, // Only +0.0f, 1 bit, prefix [1] + {0x3f800000, 0x0e, 4, 0}, // Only +1.0f, 4 bits, prefix [0111] + {0x3f700000, 0x08, 5, 20}, // [0.9375, 1.0), 25 bits, prefix [00010] + {0x3f000000, 0x00, 5, 20}, // [0.5, 0.5625), 25 bits, prefx [00000] + {0x3f400000, 0x06, 6, 20}, // [0.75, 0.8125), 26 bits, prefix [011000] + {0x3f500000, 0x22, 6, 20}, // [0.8125, 0.875), 26 bits, prefix [010001] + {0x3f200000, 0x02, 6, 20}, // [0.625, 0.6875), 26 bits, prefix [010000] + {0x3f100000, 0x38, 6, 20}, // [0.5625, 0.625), 26 bits, prefix [000111] + {0x3f600000, 0x18, 6, 20}, // [0.875, 0.9375), 26 bits, prefix [000110] + {0x3f300000, 0x30, 6, 20}, // [0.6875, 0.75), 26 bits, prefix [000011] + {0x3e800000, 0x10, 6, 20}, // [0.25, 0.28125), 26 bits, prefix [000010] + {0x3e000000, 0x04, 3, 24}, // [0.125, 0.5), 27 bits, prefix [001] + {0x3d000000, 0x0a, 4, 24}, // [0.03125, 0.125), 28 bits, prefix [0101] + {0x3c000000, 0x12, 5, 24}, // [0.0078125, 0.03125), 29 bits, prefix [01001] + {0x3b000000, 0x26, 6, 34}, // [0.001953125, end of range), 40 bits, prefix [011001] + {0x00000000, 0x16, 5, 32}, // whole range, 37 bits, prefix [01101] + }; + + [[noreturn]] Y_NO_INLINE void ThrowInvalidOffset(size_t size, size_t byteOffset) { + ythrow yexception() << + "Decompression error: requested decoding 8 bytes past end of input buffer of " << size << " bytes size at position " << byteOffset << ". "; + } + + struct THuffInfo { + constexpr THuffInfo() { + for (size_t i = 0; i < 64; ++i) { + bool init = false; + for (size_t j = 0; j != 16; ++j) { + ui16 prefix = i & Mask64(entries[j].PrefLength); + if (entries[j].Prefix == prefix) { + init = true; + DecodeLookup[i] = entries[j]; + break; + } + } + Y_ASSERT(init); + } + + for (ui32 i = 0; i < (1 << 12); ++i) { + // First two entries (+0.0f and +1.0f) are not present in the lookup, they are handled separately + for (int value = 2; value < 16; ++value) { + if (entries[value].Fit(i << 20)) { + EncodeLookup[i] = value; + break; + } + } + } + } + + std::pair<ui64, int> GetCode(ui32 value) const { + // Zeros are handled separately in the main loop + Y_ASSERT(value != 0); + + if (value == 0x3f800000) { + return {0x0e, 4}; + } + + const auto& entry = entries[EncodeLookup[value >> 20]]; + + return { + (ui64(value) << entry.PrefLength) + entry.Offset, + entry.TotalLength + }; + } + + THuffEntry DecodeLookup[64]; + ui8 EncodeLookup[1 << 12]; + }; + + const THuffInfo huffInfo; + /// End Of Stream + const ui32 EOS = ui32(-1); + } + + TString Encode(TArrayRef<const float> factors) { + TString result; + result.resize((factors.size() + 1) * 40 / 8 + 8, 0); // Max code length is 40 bits + int usedBits = 0; + ui64 buffer = 0; + char* writePtr = result.begin(); + + auto writeBits = [&](ui64 code, int size) { + const auto bitsToTransfer = Min(size, 64 - usedBits); + buffer |= (code << usedBits); + usedBits += bitsToTransfer; + if (usedBits == 64) { + memcpy(writePtr, &buffer, 8); + usedBits = size - bitsToTransfer; + if (bitsToTransfer != 64) { + buffer = code >> bitsToTransfer; + } else { + buffer = 0; + } + writePtr += 8; + } + }; + + for (size_t i = 0; i != factors.size();) { + if (BitCast<ui32>(factors[i]) == 0) { + int zeroCount = 1; + for (;;) { + ++i; + if (i == factors.size() || BitCast<ui32>(factors[i]) != 0) { + break; + } + ++zeroCount; + } + for (; zeroCount >= 64; zeroCount -= 64) { + writeBits(ui64(-1), 64); + } + writeBits(Mask64(zeroCount), zeroCount); + } else { + const auto [code, codeSize] = huffInfo.GetCode(BitCast<ui32>(factors[i])); + writeBits(code, codeSize); + ++i; + } + } + // Write EOS. + // We use precomputed constants instead of the following: + // auto [code, codeSize] = huffInfo.GetCode(EOS); + // writeBits(code, codeSize); + writeBits(211527139302, 40); + memcpy(writePtr, &buffer, 8); + result.resize(writePtr - result.begin() + usedBits / 8 + 8); + + return result; + } + + TDecoder::TDecoder(TStringBuf data) + : State{ + .Workspace = data.size() < 8 ? ThrowInvalidOffset(data.size(), 0), 0 : ReadUnaligned<ui64>(data.data()), + .WorkspaceSize = 64, + .Position = 8, + .Data = data + } + { + FillDecodeBuffer(); + } + + TVector<float> TDecoder::DecodeAll(size_t sizeHint) { + TVector<float> result; + result.reserve(sizeHint); + + while (Begin != End) { + result.insert(result.end(), Begin, End); + FillDecodeBuffer(); + } + return result; + } + + size_t TDecoder::Decode(TArrayRef<float> dest) { + size_t count = 0; + while (count < dest.size()) { + if (dest.size() - count < size_t(End - Begin)) { + const auto size = dest.size() - count; + std::copy(Begin, Begin + size, dest.data() + count); + Begin += size; + return dest.size(); + } else { + std::copy(Begin, End, dest.data() + count); + count += End - Begin; + FillDecodeBuffer(); + if (Begin == End) { + break; + } + } + } + return count; + } + + size_t TDecoder::Skip(size_t count) { + size_t skippedCount = 0; + while (skippedCount < count) { + if (count - skippedCount < size_t(End - Begin)) { + const auto size = count - skippedCount; + Begin += size; + return count; + } else { + skippedCount += End - Begin; + FillDecodeBuffer(); + if (Begin == End) { + break; + } + } + } + return skippedCount; + } + + bool TDecoder::HasMore() const { + return Begin != End; + } + + void TDecoder::FillDecodeBuffer() { + Begin = DecodeBuffer.data(); + End = DecodeBuffer.data(); + + if (HitEos) { + return; + } + + // This helps to keep most of the variables in the registers. + float* end = End; + TState state = State; + + // It is faster to just zero all the memory here in one go + // and then avoid inner loop when writing zeros. There we + // can just increment end pointer. + std::fill(DecodeBuffer.begin(), DecodeBuffer.end(), 0.0f); + + // Make sure that inside the loop we always have space to put 64 zeros and one other + // value. + float* cap = DecodeBuffer.data() + DecodeBuffer.size() - 64 - 1; + + while (end < cap) { + if (state.Workspace % 2 == 1) { + // Decode zeros + // There we can just scan whole state.Workspace for ones because it contains + // zeros outside of the WorkspaceSize bits. + const auto negWorkspace = ~state.Workspace; + const int zeroCount = negWorkspace ? CountTrailingZeroBits(negWorkspace) : 64; + end += zeroCount; + state.SkipBits(zeroCount); + continue; + } + if (state.PeekBits(4) == 0x0e) { + *end++ = 1.0f; + state.SkipBits(4); + continue; + } + const auto& entry = huffInfo.DecodeLookup[state.PeekBits(6)]; + const auto code = ui32((state.NextBitsUnmasked(entry.TotalLength) >> entry.PrefLength) & entry.Mask) + entry.CodeBase; + if (Y_UNLIKELY(code == EOS)) { + HitEos = true; + break; + } + *end++ = BitCast<float>(code); + } + + End = end; + State = state; + } + + ui64 TDecoder::TState::PeekBits(int count) { + if (WorkspaceSize > count) { + return Workspace & Mask64(count); + } else { + if (Y_UNLIKELY(Position + 8 > Data.size())) { + ThrowInvalidOffset(Data.size(), Position); + } + return (Workspace | (ReadUnaligned<ui64>(Data.data() + Position) << WorkspaceSize)) & Mask64(count); + } + } + + ui64 TDecoder::TState::NextBitsUnmasked(int count) { + if (WorkspaceSize > count) { + const auto result = Workspace; + Workspace >>= count; + WorkspaceSize -= count; + return result; + } else { + if (Y_UNLIKELY(Position + 8 > Data.size())) { + ThrowInvalidOffset(Data.size(), Position); + } + ui64 result = Workspace; + Workspace = ReadUnaligned<ui64>(Data.data() + Position); + Position += 8; + result |= Workspace << WorkspaceSize; + Workspace >>= count - WorkspaceSize; + WorkspaceSize += 64 - count; + return result; + } + } + + void TDecoder::TState::SkipBits(int count) { + if (WorkspaceSize > count) { + Workspace >>= count; + WorkspaceSize -= count; + } else { + if (Y_UNLIKELY(Position + 8 > Data.size())) { + ThrowInvalidOffset(Data.size(), Position); + } + Workspace = ReadUnaligned<ui64>(Data.data() + Position); + Position += 8; + Workspace >>= count - WorkspaceSize; + WorkspaceSize += 64 - count; + } + } + + TVector<float> Decode(TStringBuf data, size_t sizeHint) { + return TDecoder(data).DecodeAll(sizeHint); + } +} // namespace NCodecs::NFloatHuff diff --git a/library/cpp/codecs/float_huffman.h b/library/cpp/codecs/float_huffman.h new file mode 100644 index 0000000000..786a8eae1d --- /dev/null +++ b/library/cpp/codecs/float_huffman.h @@ -0,0 +1,50 @@ +#pragma once + +#include <util/generic/array_ref.h> +#include <util/generic/vector.h> +#include <util/generic/strbuf.h> + +#include <array> + +namespace NCodecs::NFloatHuff { + TString Encode(TArrayRef<const float> factors); + + class TDecoder { + public: + explicit TDecoder(TStringBuf data); + + TVector<float> DecodeAll(size_t sizeHint = 0); + + // Returns number of decoded floats. May be fewer than requested if the EOS is found. + size_t Decode(TArrayRef<float> dest); + + // Returns the number of skipped values. + size_t Skip(size_t count); + + bool HasMore() const; + + private: + struct TState { + ui64 Workspace = 0; + int WorkspaceSize = 0; + ui64 Position = 0; + TStringBuf Data; + + ui64 NextBitsUnmasked(int count); // The upper 64 - count bits may be arbitrary + ui64 PeekBits(int count); + void SkipBits(int count); + }; + + void FillDecodeBuffer(); + + TState State; + std::array<float, 128> DecodeBuffer; + // The range of already decompressed numbers inside the DecodeBuffer. + // Always kept non-empty until the EOS is encountered. + float* Begin; + float* End; + bool HitEos = false; + }; + + TVector<float> Decode(TStringBuf data, size_t sizeHint = 0); +} diff --git a/library/cpp/codecs/float_huffman_bench/main.cpp b/library/cpp/codecs/float_huffman_bench/main.cpp new file mode 100644 index 0000000000..1018c17834 --- /dev/null +++ b/library/cpp/codecs/float_huffman_bench/main.cpp @@ -0,0 +1,145 @@ +#include <library/cpp/codecs/float_huffman.h> + +#include <benchmark/benchmark.h> + +#include <util/generic/vector.h> + +const float Factors[] = { + 0.340582, 0.000974026, 0.487168, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0.411765, 0.921569, + 0.00390625, 0.109371, 0, 1, 0, 0, 0, 0, 0.523322, 0, 1, 0, 0, 0, 0, 0.285714, 1, + 0.008253, 1, 0, 0, 0.00993935, 0.450213, 0.000974026, 1, 1, 1, 1, 0, 0, 0.20564, + 0.97561, 0.913896, 1, 1, 0, 1, 0, 0, 0.5, 0, 0, 0, 0.1, 1, 0, 0, 0, 0, 0, 0.450923, + 0, 0.5, 0, 0, 0.20564, 0, 0.5, 0, 0, 0.20564, 0, 0, 0.0313726, 0, 1, 1, 1, 0.363636, + 0.5, 0.686073, 0.45121, 0.00574382, 0.366166, 0.413295, 1, 1, 1, 0, 0, 0, 0, 0.160784, + 0, 0.937255, 0.537255, 0.133333, 0, 0, 0, 0, 0.00392157, 0, 0.333333, 0.027451, 0.0156863, + 1, 0.105882, 1, 0.00220908, 0.000112501, 0.0111262, 0.102384, 0.00140808, 0.123581, + 0.29308, 6.57282e-06, 0.00489498, 2.10209e-05, 0.00140559, 5.907e-06, 0, 0.559322, + 0.559322, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.794765, 0, + 0.352648, 0.225904, 1, 0.047619, 0.0107276, 0.399461, 0.0304838, 0.292932, 0.00969929, + 0, 0, 0.886904, 0.714693, 0, 0.00223213, 0.000544069, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0.00507403, 0, 0, 0, 0, 0, 0.875, 0, 0, 1, 1, 1, 0, 0.20564, 0, 0.00176048, 0, + 0.000440121, 0, 0, 0, 0.000974026, 0.487168, 0, 0, 0.533333, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 1, 0, 0, 0.723187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0.206882, 0.00483367, 0.792983, 0.00126106, 1, 0.0313726, 0.470588, + 0.254902, 0.188235, 0.188235, 0.388235, 0.164706, 0, 0.870588, 0.843137, 0.635294, + 0.384314, 0.384314, 0.643137, 0, 0, 0, 0, 0, 0, 0, 0, 0.541176, 0, 0.541176, 0, 0, + 0.0532634, 1, 0, 0, 0, 0.015044, 1, 0, 1, 1, 1, 0.47451, 0.329412, 0.964706, 0, 0, + 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0.0941176, 0.970588, 0.970588, 0, 0.970588, 0.97561, + 0, 0.0431373, 0.47451, 0.329412, 0.964706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0.231373, 0.00392157, 0, 0, 0, 0.054902, 0, 0, + 1, 0, 0, 0.0235294, 0, 1, 0, 0, 0, 0, 0.34902, 0.0352941, 0.925379, 0.623681, 0, + 0.954543, 0, 0, 0.00102756, 0.709804, 0.498039, 0.0901961, 0.631373, 0.847059, 0.270588, + 0.0156863, 0.133333, 0.980392, 1e-12, 1e-12, 1e-12, 1e-12, 0.497159, 0, 0.407487, + 0, 0, 0, 0.00392157, 0.00202156, 0.046875, 0.187159, 0.046875, 0.15625, 0.434232, + 0.15625, 0, 2.95083e-07, 0.20564, 0.20564, 0.97561, 0.913896, 0, 0, 0, 0, 0, 0, 0.00784314, + 0, 0.695525, 1, 0.07205, 0, 0, 0.176471, 0, 0, 0, 1, 1, 0.98, 0.01, 0.01, 0, 0.00690702, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.29078, 0.29078, 1, 0, 0, 0, 0, 0.192157, 0.188235, + 0.0941176, 0, 0.0313726, 0, 0.141176, 0.207843, 0.0901961, 0.00784314, 0.0784314, + 0, 0, 0, 0, 0, 0.203922, 0.0196078, 0.34902, 0.0235294, 0.0980392, 0.164706, 0.133333, + 0.368627, 0, 0.0941176, 0, 1, 0.313726, 0, 0, 0.433582, 0.384508, 0.0532186, 0.0833333, + 0.01609, 0, 1, 0, 0, 0, 0.0666667, 0, 0, 0, 0, 1, 0, 0.564706, 0.501961, 0, 0, 0, + 0, 0, 0.0516447, 0.000173065, 0, 0, 0, 0, 0, 0, 0, 0.996309, 0, 0, 0.00392157, 1, + 0, 0.01, 0, 0, 0, 0, 0, 0.439505, 0.206882, 0.206882, 0.260891, 0, 0.875, 0, 0, 0, + 0, 0, 0.185657, 1, 1, 0, 0, 0, 0.0332647, 0.206106, 0.0688878, 0.239216, 0, 0, 0, + 0, 0.054902, 0, 0.101961, 0.160784, 0.180392, 0, 0.737828, 0, 0, 0.875, 0.0142566, + 0, 0.662745, 1, 0, 0, 0, 0.225806, 0.99992, 0.631373, 0.00392157, 1, 0, 0.143647, + 0.00270085, 1, 0.231482, 0.246735, 0.0428062, 0, 0, 1, 0, 0.186441, 0.0115358, 0, + 0.221762, 0, 0.2, 0, 0.0156863, 0, 0, 0, 0.976471, 0, 0.231373, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0.00392157, 0.00392157, 0.0666667, 0, 0, 0, 0, 0.0117647, 0.580392, 0.98737, + 1, 1, 1, 0, 0, 0, 0.153, 0.847, 0.931373, 0.94697, 0.94697, 0, 0.946294, 0.408118, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0.99992, 0.97561, 0, 0, 0, 0, 0, 0, + 0.274677, 0.153017, 0, 0.642356, 0, 0, 0.1, 0, 0, 0, 0, 0.327944, 0.327944, 0, 0, + 0.815686, 0, 0, 0, 0, 0.206106, 0.439126, 0, 0, 0, 0, 0, 1, 1, 1, 0.00392157, 0.232788, + 0.232465, 0.999899, 0.00309296, 0.0636097, 0.445954, 0.156863, 0, 0, 0, 0, 0, 0, + 0.3796, 0.0784, 0.0651664, 0, 0, 0.254902, 0.266667, 1, 0, 0, 0, 0, 0, 0.596073, + 0.517876, 0.145833, 0.372549, 0, 0.991667, 0.602125, 0.161979, 0, 0, 0, 0, 0.0255146, + 0.947855, 0, 0, 0, 0, 0, 0, 0, 0, 0.847059, 0.679841, 0, 0.156863, 0, 0, 1, 0, 0, + 0, 0, 0.969697, 0, 0, 0.564706, 0, 0, 0, 0, 0, 1, 0.0367282, 0.0395228, 0, 0, 0, + 0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0}; + +const ui8 CodedFactors[] = { + 0x24, 0x06, 0x73, 0xB5, 0xC7, 0x55, 0x7F, 0x3A, 0xB4, 0x70, 0xCB, 0xEF, 0xEE, 0xFE, 0xB3, 0x5B, + 0x5A, 0x1A, 0x93, 0x5F, 0x5F, 0x13, 0x00, 0x00, 0x10, 0x00, 0x3D, 0xEF, 0xFF, 0xEE, 0x0F, 0xDC, + 0xF0, 0xAB, 0x3F, 0x37, 0x92, 0x24, 0x5D, 0x5E, 0xDE, 0x1C, 0xF8, 0x12, 0x15, 0x5B, 0x84, 0x51, + 0x82, 0xE6, 0xF6, 0xB8, 0xEA, 0x4F, 0xC7, 0xDD, 0x7D, 0x2E, 0x4D, 0x4A, 0x21, 0xCA, 0xE0, 0xC4, + 0x2E, 0xEA, 0xD3, 0xBD, 0x0F, 0x00, 0x00, 0xE0, 0xDA, 0xCC, 0xCC, 0xEC, 0x9F, 0x61, 0xDF, 0xE6, + 0x01, 0x00, 0x00, 0xCC, 0xA5, 0x49, 0xA9, 0x00, 0x00, 0x00, 0xE6, 0xD2, 0xA4, 0xD4, 0xEA, 0x08, + 0x08, 0xD0, 0xDD, 0xF9, 0xE7, 0xA2, 0x0B, 0x00, 0x00, 0x40, 0xD8, 0x13, 0x7D, 0xFE, 0x13, 0x9C, + 0x9B, 0xA8, 0x36, 0xBC, 0x00, 0x90, 0x43, 0x6F, 0x97, 0x67, 0x9B, 0xD3, 0xEE, 0xFE, 0x84, 0x24, + 0x25, 0x89, 0xC9, 0xBF, 0x3F, 0x58, 0x4C, 0x4C, 0xCA, 0x21, 0x22, 0xBC, 0x39, 0x08, 0x08, 0x08, + 0x40, 0x7E, 0xAA, 0xAA, 0xCA, 0x75, 0x70, 0x70, 0xE9, 0x08, 0x08, 0xE8, 0x9A, 0x8A, 0x8D, 0xED, + 0xA6, 0x8D, 0x31, 0x04, 0x00, 0x96, 0xD0, 0x7D, 0x1D, 0x47, 0xAA, 0x2A, 0xD9, 0x28, 0xAD, 0x6B, + 0xB4, 0x9D, 0x7A, 0xC4, 0xD5, 0xD1, 0x04, 0x8C, 0x7E, 0x56, 0x3A, 0x58, 0x5A, 0x0C, 0x46, 0x6E, + 0x1B, 0x53, 0xC2, 0x0C, 0x14, 0x00, 0xAB, 0x60, 0x05, 0x7B, 0x63, 0x8D, 0x77, 0x70, 0x75, 0xAC, + 0x2F, 0x8D, 0xB1, 0x4D, 0xA0, 0xFB, 0xF2, 0x40, 0xF7, 0xE5, 0x7F, 0xDF, 0xDD, 0xFD, 0xBB, 0x1B, + 0xB8, 0x75, 0x9B, 0x47, 0x8E, 0xB4, 0x0C, 0x9B, 0x3A, 0x73, 0x25, 0x61, 0x18, 0x92, 0xD1, 0xC2, + 0x2F, 0x3C, 0x31, 0x64, 0x96, 0x2A, 0xB9, 0xF9, 0x7C, 0xD9, 0xAF, 0x94, 0xC5, 0xE9, 0x1E, 0x63, + 0x24, 0x0C, 0x03, 0x7F, 0xD8, 0x5B, 0xB3, 0x1D, 0x49, 0x02, 0x00, 0xAB, 0xFD, 0xE9, 0xA0, 0xF3, + 0xBF, 0xC9, 0x40, 0x64, 0x0A, 0xC0, 0xC7, 0x00, 0x00, 0x60, 0x77, 0xCF, 0xA5, 0x49, 0xA9, 0x16, + 0xFD, 0xD7, 0x5C, 0xA7, 0x55, 0x00, 0x36, 0xCF, 0xB9, 0x3D, 0xAE, 0xFA, 0xD3, 0xA1, 0x85, 0x5B, + 0xFE, 0x60, 0x10, 0x11, 0xFF, 0xF7, 0x7D, 0x38, 0x59, 0x24, 0xFF, 0xFF, 0xDF, 0x13, 0x1C, 0x7B, + 0xCA, 0x1C, 0x1E, 0xF3, 0x04, 0xC0, 0x78, 0x07, 0x58, 0x7B, 0xA2, 0x54, 0xAA, 0xE3, 0xEA, 0x08, + 0x08, 0xC0, 0x74, 0x78, 0x78, 0x88, 0x50, 0x50, 0xD8, 0x0A, 0x0C, 0xC4, 0x56, 0x60, 0x20, 0xF6, + 0x1A, 0x1B, 0x33, 0x16, 0x15, 0xA5, 0xB8, 0xED, 0xED, 0x22, 0xF5, 0xF5, 0x09, 0xA1, 0xA2, 0x42, + 0x67, 0x62, 0x62, 0x3A, 0x13, 0x13, 0x0B, 0xA0, 0xA4, 0xF4, 0x0F, 0x06, 0x15, 0x35, 0x18, 0x54, + 0xD4, 0x35, 0x57, 0x45, 0xCB, 0x2F, 0x39, 0xF6, 0xEC, 0xBC, 0xBB, 0x53, 0x5F, 0x5E, 0x9E, 0xB1, + 0xA8, 0xA8, 0x28, 0xDF, 0xDE, 0x3E, 0x00, 0x00, 0x80, 0x5F, 0x75, 0x81, 0x81, 0x51, 0x1D, 0x1E, + 0xA2, 0x3A, 0x3C, 0x8C, 0xEA, 0xF0, 0x10, 0x51, 0x06, 0x67, 0xED, 0x85, 0x85, 0xA1, 0xBE, 0xBC, + 0x3C, 0x63, 0x51, 0x51, 0x51, 0xBE, 0xBD, 0xFD, 0xFF, 0xFD, 0xFE, 0xCE, 0x85, 0x76, 0x36, 0x73, + 0x10, 0x10, 0x10, 0x80, 0xEB, 0x3A, 0x38, 0xD8, 0xBE, 0xD4, 0x05, 0x06, 0xEE, 0x4F, 0x60, 0x59, + 0x59, 0x65, 0x84, 0x84, 0xC0, 0x46, 0xCB, 0x19, 0x7F, 0x4C, 0xFD, 0xC8, 0x9D, 0x8B, 0xB6, 0x31, + 0xAF, 0x86, 0x3A, 0xF0, 0x6D, 0x6D, 0x11, 0xDF, 0xDF, 0x5F, 0x79, 0x71, 0x71, 0x85, 0xD4, 0xD0, + 0x10, 0xB9, 0xB1, 0x11, 0x1A, 0x54, 0x54, 0xE9, 0x08, 0x08, 0x48, 0x39, 0x44, 0x04, 0x84, 0xAF, + 0xAF, 0x96, 0x99, 0x97, 0x71, 0xC5, 0x32, 0xF3, 0x32, 0xAE, 0x58, 0x66, 0x5E, 0xC6, 0x15, 0xCB, + 0xCC, 0xCB, 0xB8, 0x42, 0xD0, 0x45, 0xFF, 0x1C, 0x11, 0x85, 0xBE, 0x39, 0x08, 0x08, 0x08, 0x80, + 0x69, 0xC2, 0x47, 0x00, 0x80, 0x02, 0x00, 0x00, 0x91, 0xD3, 0xF4, 0x47, 0x01, 0x00, 0x80, 0x08, + 0x00, 0x00, 0x42, 0xD4, 0x29, 0x6F, 0x02, 0x00, 0x80, 0xB4, 0xE6, 0x6B, 0x9E, 0x34, 0x5C, 0x9A, + 0x94, 0xE2, 0xD2, 0xA4, 0x14, 0xA2, 0x0C, 0x4E, 0xEC, 0xA2, 0x3E, 0x7F, 0x39, 0x08, 0x08, 0x10, + 0x6E, 0x6F, 0x10, 0xD7, 0x79, 0xC7, 0xC9, 0x09, 0x4D, 0x4B, 0x73, 0x77, 0x84, 0x14, 0xAE, 0x52, + 0xE1, 0x7A, 0x44, 0x2A, 0x5C, 0x8F, 0x34, 0x93, 0xA8, 0xC4, 0x01, 0xF8, 0x3F, 0x3D, 0xC2, 0x29, + 0xE9, 0x11, 0x4E, 0xE9, 0x4F, 0x67, 0x62, 0x22, 0xB6, 0x02, 0x03, 0xA9, 0x2E, 0x30, 0x70, 0x75, + 0x04, 0x04, 0xC8, 0x38, 0x48, 0x08, 0x32, 0x53, 0x53, 0x29, 0x2F, 0x2E, 0xAE, 0x1C, 0x04, 0x04, + 0x50, 0x52, 0x50, 0xD0, 0x4F, 0x77, 0x68, 0x28, 0x99, 0x08, 0x0A, 0x4A, 0x60, 0x59, 0x59, 0xA9, + 0x0B, 0x0C, 0xAC, 0xC7, 0xC8, 0xC8, 0x8C, 0x45, 0x45, 0xA1, 0x1C, 0x22, 0x02, 0x5D, 0x79, 0x79, + 0xAB, 0x2E, 0x30, 0x70, 0xA7, 0x2C, 0x28, 0xE8, 0xB4, 0xF3, 0xEF, 0x26, 0x8F, 0x37, 0xB1, 0xFE, + 0xEE, 0x67, 0xA9, 0xA9, 0xAA, 0xAA, 0x6C, 0x79, 0x1E, 0xEC, 0xD7, 0x46, 0x44, 0xC4, 0xF7, 0xF8, + 0x24, 0x24, 0x00, 0x42, 0x40, 0xF8, 0x5A, 0x96, 0x38, 0x65, 0x91, 0xF1, 0x6A, 0x72, 0xFE, 0x68, + 0xC3, 0xE1, 0x37, 0x07, 0x01, 0x01, 0x01, 0xF0, 0x52, 0xE1, 0x7A, 0xE4, 0xB3, 0xD9, 0x20, 0x9C, + 0xE0, 0xD8, 0x53, 0x04, 0xC7, 0x9E, 0x82, 0x02, 0x27, 0x2B, 0x06, 0x00, 0x00, 0x9F, 0xDE, 0x1C, + 0x3E, 0xEE, 0xD7, 0x48, 0x20, 0x04, 0xD2, 0x35, 0x4C, 0x29, 0x43, 0x45, 0x23, 0x15, 0xEA, 0xE9, + 0x5E, 0xD7, 0xC1, 0xC1, 0xAA, 0x3B, 0x34, 0x34, 0x21, 0x49, 0x49, 0xE8, 0x8A, 0x8B, 0x13, 0x66, + 0x12, 0xE7, 0x31, 0x00, 0x00, 0x90, 0x84, 0x94, 0x69, 0x05, 0xD4, 0xD4, 0xF4, 0x13, 0x36, 0xE7, + 0x0C, 0x09, 0xEB, 0xBF, 0x90, 0x1A, 0x1A, 0xE6, 0x20, 0x20, 0x20, 0x00, 0x9E, 0x33, 0x18, 0x13, + 0xA6, 0x2F, 0x40, 0x0C, 0x00, 0x4E, 0xCF, 0x84, 0x36, 0x6A, 0xA0, 0xF2, 0xA9, 0x63, 0xD5, 0xCB, + 0x9E, 0x64, 0xEA, 0x3E, 0xF2, 0x14, 0xA0, 0x27, 0x29, 0x2B, 0xC6, 0xB2, 0x99, 0x99, 0xA9, 0x74, + 0x04, 0x04, 0x3C, 0x0A, 0xD0, 0xCF, 0x5C, 0x68, 0x67, 0xFB, 0xDF, 0x1C, 0x04, 0x04, 0x04, 0xC0, + 0x1C, 0x04, 0x04, 0x04, 0x40, 0x1B, 0x11, 0x11, 0x5F, 0xEA, 0x02, 0x03, 0xE1, 0x92, 0x94, 0x84, + 0x90, 0x88, 0xD9, 0xDD, 0x4F, 0x04, 0x56, 0x0E, 0xD1, 0x9F, 0x1A, 0x31, 0x3B, 0x37, 0x47, 0xA0, + 0x6C, 0x82, 0x40, 0xD9, 0x24, 0x9A, 0x02, 0x12, 0x62, 0xD3, 0x43, 0xFF, 0xBF, 0x8F, 0x84, 0xF5, + 0x1F, 0x51, 0x06, 0xE7, 0x0F, 0xDD, 0x89, 0x32, 0xFB, 0x60, 0x39, 0x0A, 0x71, 0x71, 0xB4, 0x36, + 0x33, 0x33, 0x3F, 0x8F, 0xD0, 0x4F, 0x79, 0x84, 0x7E, 0xBA, 0xC8, 0x0C, 0x0D, 0x4F, 0xBA, 0x86, + 0x29, 0x82, 0x54, 0x83, 0x7F, 0x77, 0x37, 0x07, 0x01, 0x01, 0x01, 0xA0, 0xFE, 0x97, 0x1B, 0x9D, + 0x16, 0xDC, 0x90, 0x58, 0xFE, 0x9B, 0x42, 0xB3, 0x4A, 0x00, 0x68, 0x73, 0x91, 0x20, 0x2B, 0xA8, + 0xC8, 0x29, 0x0B, 0x0A, 0xF2, 0xD3, 0x5D, 0x4B, 0x58, 0x5D, 0x20, 0x41, 0xD5, 0xBE, 0xAE, 0x70, + 0x88, 0x50, 0x50, 0x20, 0x4A, 0x44, 0xF4, 0x8F, 0xF7, 0x60, 0x22, 0x30, 0x9C, 0x24, 0xFE, 0x54, + 0x55, 0xD0, 0xD7, 0xD7, 0x37, 0x1A, 0xEF, 0x6E, 0xBC, 0x9B, 0x44, 0x39, 0xDD, 0x5D, 0xF2, 0xF2, + 0x7F, 0x20, 0x1A, 0x81, 0x9A, 0xCA, 0xBF, 0xC8, 0x8D, 0x8D, 0xC2, 0x83, 0x82, 0xA7, 0x2C, 0x28, + 0xC8, 0xFE, 0x08, 0xC2, 0x07, 0xC7, 0x27, 0x21, 0xE1, 0xBB, 0x3E, 0xC1, 0x59, 0x68, 0xAA, 0x78, + 0xC8, 0x57, 0x5D, 0x60, 0x20, 0xC6, 0x41, 0x42, 0xE8, 0x3A, 0x38, 0xD8, 0x9B, 0xFF, 0xFF, 0xFF, + 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +static const TStringBuf CodedFactorsBuf(reinterpret_cast<const char*>(CodedFactors), Y_ARRAY_SIZE(CodedFactors)); + +void BM_Encode(benchmark::State& state) { + for (const auto _ : state) { + NCodecs::NFloatHuff::Encode(Factors); + } +} + +void BM_Decode(benchmark::State& state) { + for (const auto _ : state) { + NCodecs::NFloatHuff::Decode(CodedFactorsBuf, Y_ARRAY_SIZE(Factors)); + } +} + +BENCHMARK(BM_Encode); +BENCHMARK(BM_Decode); diff --git a/library/cpp/codecs/float_huffman_bench/ya.make b/library/cpp/codecs/float_huffman_bench/ya.make new file mode 100644 index 0000000000..c8fae6873a --- /dev/null +++ b/library/cpp/codecs/float_huffman_bench/ya.make @@ -0,0 +1,13 @@ +OWNER(eeight) + +G_BENCHMARK() + +SRCS( + main.cpp +) + +PEERDIR( + library/cpp/codecs +) + +END() diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp new file mode 100644 index 0000000000..561bfbca01 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp @@ -0,0 +1,142 @@ +#include "gd_builder.h" + +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> +#include <util/generic/algorithm.h> + +#include <util/random/shuffle.h> +#include <util/stream/output.h> +#include <util/string/printf.h> +#include <util/system/rusage.h> + +namespace NGreedyDict { + void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { + if (!Current) { + Current = MakeHolder<TEntrySet>(); + Current->InitWithAlpha(); + } + + TEntrySet& set = *Current; + + for (auto& it : set) + it.Count = 0; + + CompoundCounts = nullptr; + CompoundCountsPool.Clear(); + + if (!final) { + CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool); + CompoundCounts->reserve(maxcand); + } + + Shuffle(Input.begin(), Input.end(), Rng); + + for (auto str : Input) { + if (!final && CompoundCounts->size() > maxcand) + break; + + i32 prev = -1; + + while (!!str) { + TEntry* e = set.FindPrefix(str); + ui32 num = e->Number; + + e->Count += 1; + if (!final && prev >= 0) { + (*CompoundCounts)[Compose(prev, num)] += 1; + } + + prev = num; + ++set.TotalCount; + } + } + + Current->SetModelP(); + } + + ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { + TAutoPtr<TEntrySet> newset = new TEntrySet; + newset->InitWithAlpha(); + maxent -= newset->size(); + + ui32 additions = 0; + ui32 deletions = 0; + + { + const TEntrySet& set = *Current; + + Candidates.clear(); + const ui32 total = set.TotalCount; + const float minpval = Settings.MinPValue; + const EEntryStatTest test = Settings.StatTest; + const EEntryScore score = Settings.Score; + const ui32 mincnt = Settings.MinAbsCount; + + for (const auto& it : set) { + const TEntry& e = it; + float modelp = e.ModelP; + ui32 cnt = e.Count; + + if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) + Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); + } + + if (!!CompoundCounts) { + for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { + const TEntry& prev = set.Get(Prev(it->first)); + const TEntry& next = set.Get(Next(it->first)); + float modelp = ModelP(prev.Count, next.Count, total); + ui32 cnt = it->second; + if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) + Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); + } + } + + Sort(Candidates.begin(), Candidates.end()); + + if (Candidates.size() > maxent) + Candidates.resize(maxent); + + for (const auto& candidate : Candidates) { + if (IsCompound(candidate.second)) { + additions++; + newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str); + } else { + newset->Add(set.Get(candidate.second).Str); + } + } + + deletions = set.size() - (newset->size() - additions); + } + + Current = newset; + Current->BuildHierarchy(); + return deletions + additions; + } + + ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { + size_t totalsz = 0; + for (auto it : Input) + totalsz += it.size(); + + while (maxiters) { + maxiters--; + + RebuildCounts(maxentries * Settings.GrowLimit, false); + + if (Settings.Verbose) { + TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); + Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl; + } + + ui32 diff = BuildNextGeneration(maxentries); + + if (Current->size() == maxentries && diff < mindiff) + break; + } + + RebuildCounts(0, true); + Current->SetScores(Settings.Score); + return maxiters; + } + +} diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h new file mode 100644 index 0000000000..b8e9a5e37b --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_builder.h @@ -0,0 +1,94 @@ +#pragma once + +#include "gd_entry.h" + +#include <util/generic/hash.h> +#include <util/random/fast.h> + +namespace NGreedyDict { + struct TBuildSettings { + EEntryStatTest StatTest = EST_SIMPLE_NORM; + EEntryScore Score = ES_LEN_SIMPLE; + + float MinPValue = 0.75; + ui32 MinAbsCount = 10; + ui32 GrowLimit = 10; // times of maxentries + bool Verbose = false; + }; + + class TDictBuilder { + using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>; + using TCandidate = std::pair<float, ui64>; + using TCandidates = TVector<TCandidate>; + + private: + TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb}; + TStringBufs Input; + + THolder<TEntrySet> Current; + + TMemoryPool CompoundCountsPool; + THolder<TCompoundCounts> CompoundCounts; + + TCandidates Candidates; + + TBuildSettings Settings; + + public: + TDictBuilder(const TBuildSettings& s = TBuildSettings()) + : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance()) + , Settings(s) + { + } + + void SetInput(const TStringBufs& in) { + Input = in; + } + + const TBuildSettings& GetSettings() const { + return Settings; + } + + TBuildSettings& GetSettings() { + return Settings; + } + + void SetSettings(const TBuildSettings& s) { + Settings = s; + } + + TEntrySet& EntrySet() { + return *Current; + } + + const TEntrySet& EntrySet() const { + return *Current; + } + + THolder<TEntrySet> ReleaseEntrySet() { + return std::move(Current); + } + + ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); + + public: + void RebuildCounts(ui32 maxcand, bool final); + ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); + + static bool IsCompound(ui64 ent) { + return ent & 0xFFFFFFFF00000000ULL; + } + + static ui32 Next(ui64 ent) { + return ent; + } + static ui32 Prev(ui64 ent) { + return (ent >> 32) - 1; + } + + static ui64 Compose(ui32 prev, ui32 next) { + return ((prev + 1ULL) << 32) | next; + } + }; + +} diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp new file mode 100644 index 0000000000..2c315c7f7c --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp @@ -0,0 +1,98 @@ +#include "gd_entry.h" +#include "gd_stats.h" + +#include <util/generic/algorithm.h> +#include <util/generic/singleton.h> + +namespace NGreedyDict { + class TAlphas { + char Memory[512]; + + public: + TStringBufs Alphas; + + TAlphas() { + for (ui32 i = 0; i < 256; ++i) { + Memory[2 * i] = (char)i; + Memory[2 * i + 1] = 0; + + Alphas.push_back(TStringBuf(&Memory[2 * i], 1)); + } + } + }; + + void TEntrySet::InitWithAlpha() { + Pool.ClearKeepFirstChunk(); + const TStringBufs& a = Singleton<TAlphas>()->Alphas; + for (auto it : a) { + Add(it); + } + BuildHierarchy(); + } + + void TEntrySet::BuildHierarchy() { + Sort(begin(), end(), TEntry::StrLess); + + TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED); + + for (iterator it = begin(); it != end(); ++it) { + it->Number = (it - begin()); + TStringBuf suff = it->Str; + size_t len = 0; + ui32 val = 0; + + if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) { + it->NearestPrefix = val; + } + + builder.Add(suff.data(), suff.size(), it->Number); + } + + TBufferOutput bout; + builder.Save(bout); + Trie.Init(TBlob::FromBuffer(bout.Buffer())); + } + + TEntry* TEntrySet::FindPrefix(TStringBuf& str) { + size_t len = 0; + ui32 off = 0; + + if (!Trie.FindLongestPrefix(str, &len, &off)) { + return nullptr; + } + + str.Skip(len); + return &Get(off); + } + + void TEntrySet::SetModelP() { + for (iterator it = begin(); it != end(); ++it) { + TEntry& e = *it; + + if (!e.HasPrefix()) { + e.ModelP = 0; + continue; + } + + TStringBuf suff = e.Str; + const TEntry& p = Get(e.NearestPrefix); + suff.Skip(p.Len()); + + float modelp = float(p.Count + e.Count) / TotalCount; + + while (!!suff) { + TEntry* pp = FindPrefix(suff); + modelp *= float(pp->Count + e.Count) / TotalCount; + } + + e.ModelP = modelp; + } + } + + void TEntrySet::SetScores(EEntryScore s) { + for (auto& it : *this) { + it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount); + } + } + +} diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h new file mode 100644 index 0000000000..18b5be0e15 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_entry.h @@ -0,0 +1,103 @@ +#pragma once + +#include "gd_stats.h" + +#include <library/cpp/containers/comptrie/comptrie.h> + +#include <util/generic/ptr.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> + +#include <util/memory/pool.h> + +namespace NGreedyDict { + using TStringBufs = TVector<TStringBuf>; + + struct TEntry { + static const i32 NoPrefix = -1; + + TStringBuf Str; + + i32 NearestPrefix = NoPrefix; + ui32 Count = 0; + ui32 Number = 0; + float ModelP = 0; + float Score = 0; + + TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0) + : Str(b) + , Count(cnt) + { + } + + bool HasPrefix() const { + return NearestPrefix != NoPrefix; + } + ui32 Len() const { + return Str.size(); + } + + static bool StrLess(const TEntry& a, const TEntry& b) { + return a.Str < b.Str; + } + static bool NumberLess(const TEntry& a, const TEntry& b) { + return a.Number < b.Number; + } + static bool ScoreMore(const TEntry& a, const TEntry& b) { + return a.Score > b.Score; + } + }; + + class TEntrySet: public TVector<TEntry>, TNonCopyable { + TMemoryPool Pool{8112}; + TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie; + + public: + ui32 TotalCount = 0; + + void InitWithAlpha(); + + void Add(TStringBuf a) { + push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size())); + } + + void Add(TStringBuf a, TStringBuf b) { + size_t sz = a.size() + b.size(); + char* p = (char*)Pool.Allocate(sz); + memcpy(p, a.data(), a.size()); + memcpy(p + a.size(), b.data(), b.size()); + push_back(TStringBuf(p, sz)); + } + + TEntry& Get(ui32 idx) { + return (*this)[idx]; + } + + const TEntry& Get(ui32 idx) const { + return (*this)[idx]; + } + + void BuildHierarchy(); + + // longest prefix + TEntry* FindPrefix(TStringBuf& str); + + const TEntry* FindPrefix(TStringBuf& str) const { + return ((TEntrySet*)this)->FindPrefix(str); + } + + const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) { + if (!e.HasPrefix()) + return nullptr; + + const TEntry& p = Get(e.NearestPrefix); + suff = e.Str; + suff.Skip(p.Str.size()); + return &p; + } + + void SetModelP(); + void SetScores(EEntryScore); + }; + +} diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h new file mode 100644 index 0000000000..b63c4c38d2 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_stats.h @@ -0,0 +1,79 @@ +#pragma once + +#include <util/generic/ymath.h> +#include <util/generic/algorithm.h> +#include <util/generic/yexception.h> + +namespace NGreedyDict { + enum EEntryScore { + ES_COUNT, + ES_LEN_COUNT, + ES_SIMPLE, + ES_LEN_SIMPLE, + ES_SOLAR + }; + + enum EEntryStatTest { + EST_NONE = 0, + EST_SIMPLE_NORM = 2 + }; + + inline float ModelP(ui32 countA, ui32 countB, ui32 total) { + return float(countA) * countB / total / total; + } + + // P (ab | dependent) + inline float SimpleTest(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (realp - modelp); + } + + inline float SolarTest(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1)); + } + + // P (ab | dependent) / P (ab) + inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (realp - modelp) / realp; + } + + inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) { + if (!total) { + return 0; + } + switch (test) { + case EST_NONE: + return 1; + case EST_SIMPLE_NORM: + return SimpleTestNorm(modelp, countAB, total); + } + Y_FAIL("no way!"); + return 0; + } + + inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) { + if (!total) { + return 0; + } + ui32 m = 1; + switch (score) { + case ES_LEN_COUNT: + m = len; + [[fallthrough]]; + case ES_COUNT: + return m * count; + case ES_LEN_SIMPLE: + m = len; + [[fallthrough]]; + case ES_SIMPLE: + return m * SimpleTest(modelp, count, total); + case ES_SOLAR: + return SolarTest(modelp, count, total); + } + Y_FAIL("no way!"); + return 0; + } + +} diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp new file mode 100644 index 0000000000..679089a11b --- /dev/null +++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp @@ -0,0 +1,282 @@ +#include "gd_builder.h" + +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> +#include <util/string/printf.h> +#include <util/generic/ymath.h> + +class TGreedyDictTest: public TTestBase { + UNIT_TEST_SUITE(TGreedyDictTest); + UNIT_TEST(TestEntrySet) + UNIT_TEST(TestBuilder0) + UNIT_TEST(TestBuilder) + UNIT_TEST_SUITE_END(); + + void TestEntrySet() { + using namespace NGreedyDict; + + { + TEntrySet d; + + d.InitWithAlpha(); + + for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) { + UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix)); + UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin())); + } + + UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u); + TStringBuf s = "aaabbb"; + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); + UNIT_ASSERT_VALUES_EQUAL(s, "aabbb"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); + UNIT_ASSERT_VALUES_EQUAL(s, "abbb"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); + UNIT_ASSERT_VALUES_EQUAL(s, "bbb"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); + UNIT_ASSERT_VALUES_EQUAL(s, "bb"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); + UNIT_ASSERT_VALUES_EQUAL(s, "b"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + s = TStringBuf("", 1); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1)); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + s = "\xFF"; + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF"); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + } + { + TEntrySet d; + d.Add("a"); + d.Add("b"); + d.Add("b", "a"); + d.BuildHierarchy(); + + UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u); + + TStringBuf s = "bab"; + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba"); + UNIT_ASSERT_VALUES_EQUAL(s, "b"); + } + { + TEntrySet d; + + d.Add("a"); + d.Add("aa"); + d.Add("aaa"); + d.Add("aab"); + d.Add("b"); + d.Add("ba"); + + d.BuildHierarchy(); + + UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u); + { + TStringBuf s = "aaaaa"; + const TEntry* e = d.FindPrefix(s); + UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa"); + UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u); + UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1); + UNIT_ASSERT_VALUES_EQUAL(s, "aa"); + } + + { + TStringBuf s = "a"; + const TEntry* e = d.FindPrefix(s); + UNIT_ASSERT_VALUES_EQUAL(e->Str, "a"); + UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u); + UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + } + + { + TStringBuf s = "bab"; + const TEntry* e = d.FindPrefix(s); + UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba"); + UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u); + UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4); + UNIT_ASSERT_VALUES_EQUAL(s, "b"); + } + + { + TStringBuf s = "bba"; + const TEntry* e = d.FindPrefix(s); + UNIT_ASSERT_VALUES_EQUAL(e->Str, "b"); + UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u); + UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); + UNIT_ASSERT_VALUES_EQUAL(s, "ba"); + } + } + } + + void TestBuilder0() { + using namespace NGreedyDict; + ui32 a = 1, b = 11; + ui64 ab = TDictBuilder::Compose(a, b); + UNIT_ASSERT(TDictBuilder::IsCompound(ab)); + UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a); + UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b); + } + + void FillData(NGreedyDict::TStringBufs& data) { + static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"}; + data.clear(); + data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls)); + } + + typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict; + + TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s, + TDict& res) { + using namespace NGreedyDict; + + TStringBufs data; + FillData(data); + + TDictBuilder b(s); + b.SetInput(data); + b.Build(256 + 128); + + TEntrySet& set = b.EntrySet(); + + for (const auto& it : set) { + if (it.Score) { + res[it.Str] = it; + } + } + + return b.ReleaseEntrySet(); + } + + void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) { + TDict::const_iterator it = dict.find(entry); + UNIT_ASSERT_C(it != dict.end(), entry); + UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry); + UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry); + UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry); + } + + void TestBuilder() { + TAutoPtr<NGreedyDict::TEntrySet> set; + THashMap<TStringBuf, NGreedyDict::TEntry> res; + NGreedyDict::TBuildSettings s; + set = DoTestBuilder(s, res); + + UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u); + UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u); + + DoAssertEntry("%", 37, -1, 0.00375193, res); + DoAssertEntry("%7", 38, 37, 0.00513299, res); + DoAssertEntry("&", 39, -1, 0.00794527, res); + DoAssertEntry("+", 44, -1, 0.000441404, res); + DoAssertEntry(",", 45, -1, 0.000441404, res); + DoAssertEntry("-", 46, -1, 0.0417126, res); + DoAssertEntry(".", 47, -1, 0.0196425, res); + DoAssertEntry(".com/", 48, 47, 0.0374482, res); + DoAssertEntry(".html", 49, 47, 0.0496577, res); + DoAssertEntry(".html?", 50, 49, 0.0153908, res); + DoAssertEntry(".php", 51, 47, 0.0123585, res); + DoAssertEntry(".ru/", 52, 47, 0.0150027, res); + DoAssertEntry("/", 53, -1, 0.0452439, res); + DoAssertEntry("/index", 54, 53, 0.0158905, res); + DoAssertEntry("0", 55, -1, 0.00816597, res); + DoAssertEntry("1", 56, -1, 0.0167733, res); + DoAssertEntry("10", 57, 56, 0.00530474, res); + DoAssertEntry("2", 58, -1, 0.0101523, res); + DoAssertEntry("20", 59, 58, 0.00674234, res); + DoAssertEntry("3", 60, -1, 0.01258, res); + DoAssertEntry("32", 61, 60, 0.00490697, res); + DoAssertEntry("4", 62, -1, 0.00993158, res); + DoAssertEntry("5", 63, -1, 0.00617965, res); + DoAssertEntry("6", 64, -1, 0.00971088, res); + DoAssertEntry("7", 65, -1, 0.0101523, res); + DoAssertEntry("8", 66, -1, 0.00728316, res); + DoAssertEntry("9", 67, -1, 0.00728316, res); + DoAssertEntry(":", 68, -1, 0.000662106, res); + DoAssertEntry(";", 69, -1, 0.000882807, res); + DoAssertEntry("=", 71, -1, 0.01258, res); + DoAssertEntry("?", 73, -1, 0.00397263, res); + DoAssertEntry("A", 75, -1, 0.00264842, res); + DoAssertEntry("B", 76, -1, 0.00220702, res); + DoAssertEntry("C", 77, -1, 0.00353123, res); + DoAssertEntry("D", 78, -1, 0.00375193, res); + DoAssertEntry("E", 79, -1, 0.00286912, res); + DoAssertEntry("F", 80, -1, 0.00110351, res); + DoAssertEntry("G", 81, -1, 0.00110351, res); + DoAssertEntry("H", 82, -1, 0.000220702, res); + DoAssertEntry("I", 83, -1, 0.00198632, res); + DoAssertEntry("K", 85, -1, 0.000441404, res); + DoAssertEntry("L", 86, -1, 0.00198632, res); + DoAssertEntry("M", 87, -1, 0.00154491, res); + DoAssertEntry("N", 88, -1, 0.00154491, res); + DoAssertEntry("O", 89, -1, 0.00132421, res); + DoAssertEntry("P", 90, -1, 0.00308983, res); + DoAssertEntry("R", 92, -1, 0.000662106, res); + DoAssertEntry("S", 93, -1, 0.00264842, res); + DoAssertEntry("T", 94, -1, 0.00110351, res); + DoAssertEntry("U", 95, -1, 0.000220702, res); + DoAssertEntry("V", 96, -1, 0.000441404, res); + DoAssertEntry("W", 97, -1, 0.000441404, res); + DoAssertEntry("X", 98, -1, 0.000220702, res); + DoAssertEntry("Y", 99, -1, 0.000220702, res); + DoAssertEntry("_", 105, -1, 0.00904877, res); + DoAssertEntry("a", 107, -1, 0.0505407, res); + DoAssertEntry("an", 108, 107, 0.018273, res); + DoAssertEntry("ar", 109, 107, 0.0169385, res); + DoAssertEntry("b", 110, -1, 0.0156698, res); + DoAssertEntry("c", 111, -1, 0.018539, res); + DoAssertEntry("cat", 112, 111, 0.00846732, res); + DoAssertEntry("ch", 113, 111, 0.00644872, res); + DoAssertEntry("com", 114, 111, 0.00724235, res); + DoAssertEntry("ct", 115, 111, 0.00605729, res); + DoAssertEntry("d", 116, -1, 0.020746, res); + DoAssertEntry("di", 117, 116, 0.00730659, res); + DoAssertEntry("e", 118, -1, 0.0624586, res); + DoAssertEntry("en", 119, 118, 0.0108999, res); + DoAssertEntry("ent", 120, 119, 0.00616002, res); + DoAssertEntry("f", 121, -1, 0.00860737, res); + DoAssertEntry("fi", 122, 121, 0.00423196, res); + DoAssertEntry("g", 123, -1, 0.0180975, res); + DoAssertEntry("go", 124, 123, 0.00601862, res); + DoAssertEntry("h", 125, -1, 0.010373, res); + DoAssertEntry("ho", 126, 125, 0.00570298, res); + DoAssertEntry("http://", 127, 125, 0.0494372, res); + DoAssertEntry("http://www.", 128, 127, 0.0849702, res); + DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res); + DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res); + DoAssertEntry("i", 131, -1, 0.0258221, res); + DoAssertEntry("id=", 132, 131, 0.00725369, res); + DoAssertEntry("im", 133, 131, 0.00373318, res); + DoAssertEntry("in", 134, 131, 0.013625, res); + DoAssertEntry("ing", 135, 134, 0.00795491, res); + DoAssertEntry("ion", 136, 131, 0.00796149, res); + DoAssertEntry("it", 137, 131, 0.00953416, res); + DoAssertEntry("j", 138, -1, 0.00132421, res); + DoAssertEntry("k", 139, -1, 0.0134628, res); + DoAssertEntry("l", 140, -1, 0.0381814, res); + DoAssertEntry("m", 141, -1, 0.0174354, res); + DoAssertEntry("mer", 142, 141, 0.00711846, res); + DoAssertEntry("n", 143, -1, 0.0132421, res); + DoAssertEntry("o", 144, -1, 0.0302362, res); + DoAssertEntry("on", 145, 144, 0.00802271, res); + DoAssertEntry("ou", 146, 144, 0.00414545, res); + DoAssertEntry("p", 147, -1, 0.0225116, res); + DoAssertEntry("port", 148, 147, 0.0123532, res); + DoAssertEntry("q", 149, -1, 0.00176561, res); + DoAssertEntry("r", 150, -1, 0.0401677, res); + DoAssertEntry("ran", 151, 150, 0.00686918, res); + DoAssertEntry("s", 152, -1, 0.0487751, res); + DoAssertEntry("sho", 153, 152, 0.0113876, res); + DoAssertEntry("t", 154, -1, 0.0379607, res); + DoAssertEntry("u", 155, -1, 0.0211874, res); + DoAssertEntry("v", 156, -1, 0.00595895, res); + DoAssertEntry("vi", 157, 156, 0.00480673, res); + DoAssertEntry("w", 158, -1, 0.00816597, res); + DoAssertEntry("x", 159, -1, 0.00375193, res); + DoAssertEntry("y", 160, -1, 0.0130214, res); + DoAssertEntry("z", 161, -1, 0.00353123, res); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest); diff --git a/library/cpp/codecs/greedy_dict/ut/ya.make b/library/cpp/codecs/greedy_dict/ut/ya.make new file mode 100644 index 0000000000..bd67d1a452 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/codecs/greedy_dict) + +OWNER(velavokr) + +SRCS( + greedy_dict_ut.cpp +) + +END() diff --git a/library/cpp/codecs/greedy_dict/ya.make b/library/cpp/codecs/greedy_dict/ya.make new file mode 100644 index 0000000000..2a57224f7e --- /dev/null +++ b/library/cpp/codecs/greedy_dict/ya.make @@ -0,0 +1,15 @@ +OWNER(velavokr) + +LIBRARY() + +SRCS( + gd_builder.cpp + gd_entry.cpp +) + +PEERDIR( + library/cpp/containers/comptrie + library/cpp/string_utils/relaxed_escaper +) + +END() diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp new file mode 100644 index 0000000000..650fe7cdfd --- /dev/null +++ b/library/cpp/codecs/huffman_codec.cpp @@ -0,0 +1,592 @@ +#include "huffman_codec.h" +#include <library/cpp/bit_io/bitinput.h> +#include <library/cpp/bit_io/bitoutput.h> + +#include <util/generic/algorithm.h> +#include <util/generic/bitops.h> +#include <util/stream/buffer.h> +#include <util/stream/length.h> +#include <util/string/printf.h> + +namespace NCodecs { + template <typename T> + struct TCanonicalCmp { + bool operator()(const T& a, const T& b) const { + if (a.CodeLength == b.CodeLength) { + return a.Char < b.Char; + } else { + return a.CodeLength < b.CodeLength; + } + } + }; + + template <typename T> + struct TByCharCmp { + bool operator()(const T& a, const T& b) const { + return a.Char < b.Char; + } + }; + + struct TTreeEntry { + static const ui32 InvalidBranch = (ui32)-1; + + ui64 Freq = 0; + ui32 Branches[2]{InvalidBranch, InvalidBranch}; + + ui32 CodeLength = 0; + ui8 Char = 0; + bool Invalid = false; + + TTreeEntry() = default; + + static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) { + return a.Freq < b.Freq; + } + + static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) { + return a.Freq > b.Freq; + } + }; + + using TCodeTree = TVector<TTreeEntry>; + + void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) { + tree.reserve(255 * 256 / 2); // worst case - balanced tree + + for (ui32 i = 0; i < 256; ++i) { + tree.emplace_back(); + tree.back().Char = i; + tree.back().Freq = freqs[i]; + } + + StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq); + } + + void InitTree(TCodeTree& tree, ISequenceReader* in) { + using namespace NPrivate; + ui64 freqs[256]; + Zero(freqs); + + TStringBuf r; + while (in->NextRegion(r)) { + for (ui64 i = 0; i < r.size(); ++i) + ++freqs[(ui8)r[i]]; + } + + InitTreeByFreqs(tree, freqs); + } + + void CalculateCodeLengths(TCodeTree& tree) { + Y_ENSURE(tree.size() == 256, " "); + const ui32 firstbranch = tree.size(); + + ui32 curleaf = 0; + ui32 curbranch = firstbranch; + + // building code tree. two priority queues are combined in one. + while (firstbranch - curleaf + tree.size() - curbranch >= 2) { + TTreeEntry e; + + for (auto& branche : e.Branches) { + ui32 br; + + if (curleaf >= firstbranch) + br = curbranch++; + else if (curbranch >= tree.size()) + br = curleaf++; + else if (tree[curleaf].Freq < tree[curbranch].Freq) + br = curleaf++; + else + br = curbranch++; + + Y_ENSURE(br < tree.size(), " "); + branche = br; + e.Freq += tree[br].Freq; + } + + tree.push_back(e); + PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev); + } + + // computing code lengths + for (ui64 i = tree.size() - 1; i >= firstbranch; --i) { + TTreeEntry e = tree[i]; + + for (auto branche : e.Branches) + tree[branche].CodeLength = e.CodeLength + 1; + } + + // chopping off the branches + tree.resize(firstbranch); + + Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>()); + + // simplification: we are stripping codes longer than 64 bits + while (!tree.empty() && tree.back().CodeLength > 64) + tree.pop_back(); + + // will not compress + if (tree.empty()) + return; + + // special invalid code word + tree.back().Invalid = true; + } + + struct TEncoderEntry { + ui64 Code = 0; + + ui8 CodeLength = 0; + ui8 Char = 0; + ui8 Invalid = true; + + explicit TEncoderEntry(TTreeEntry e) + : CodeLength(e.CodeLength) + , Char(e.Char) + , Invalid(e.Invalid) + { + } + + TEncoderEntry() = default; + }; + + struct TEncoderTable { + TEncoderEntry Entries[256]; + + void Save(IOutputStream* out) const { + ui16 nval = 0; + + for (auto entrie : Entries) + nval += !entrie.Invalid; + + ::Save(out, nval); + + for (auto entrie : Entries) { + if (!entrie.Invalid) { + ::Save(out, entrie.Char); + ::Save(out, entrie.CodeLength); + } + } + } + + void Load(IInputStream* in) { + ui16 nval = 0; + ::Load(in, nval); + + for (ui32 i = 0; i < 256; ++i) + Entries[i].Char = i; + + for (ui32 i = 0; i < nval; ++i) { + ui8 ch = 0; + ui8 len = 0; + ::Load(in, ch); + ::Load(in, len); + Entries[ch].CodeLength = len; + Entries[ch].Invalid = false; + } + } + }; + + struct TDecoderEntry { + ui32 NextTable : 10; + ui32 Char : 8; + ui32 Invalid : 1; + ui32 Bad : 1; + + TDecoderEntry() + : NextTable() + , Char() + , Invalid() + , Bad() + { + } + }; + + struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> { + ui64 Length = 0; + ui64 BaseCode = 0; + + TDecoderEntry Entries[256]; + + TDecoderTable() { + Zero(Entries); + } + }; + + const int CACHE_BITS_COUNT = 16; + class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> { + TEncoderTable Encoder; + TDecoderTable Decoder[256]; + + TEncoderEntry Invalid; + + ui32 SubTablesNum; + + class THuffmanCache { + struct TCacheEntry { + int EndOffset : 24; + int BitsLeft : 8; + }; + TVector<char> DecodeCache; + TVector<TCacheEntry> CacheEntries; + const TImpl& Original; + + public: + THuffmanCache(const THuffmanCodec::TImpl& encoder); + + void Decode(NBitIO::TBitInput& in, TBuffer& out) const; + }; + + THolder<THuffmanCache> Cache; + + public: + TImpl() + : SubTablesNum(1) + { + Invalid.CodeLength = 255; + } + + ui8 Encode(TStringBuf in, TBuffer& out) const { + out.Clear(); + + if (in.empty()) { + return 0; + } + + out.Reserve(in.size() * 2); + + { + NBitIO::TBitOutputVector<TBuffer> bout(&out); + TStringBuf tin = in; + + // data is under compression + bout.Write(1, 1); + + for (auto t : tin) { + const TEncoderEntry& ce = Encoder.Entries[(ui8)t]; + + bout.Write(ce.Code, ce.CodeLength); + + if (ce.Invalid) { + bout.Write(t, 8); + } + } + + // in canonical huffman coding there cannot be a code having no 0 in the suffix + // and shorter than 8 bits. + bout.Write((ui64)-1, bout.GetByteReminder()); + return bout.GetByteReminder(); + } + } + + void Decode(TStringBuf in, TBuffer& out) const { + out.Clear(); + + if (in.empty()) { + return; + } + + NBitIO::TBitInput bin(in); + ui64 f = 0; + bin.ReadK<1>(f); + + // if data is uncompressed + if (!f) { + in.Skip(1); + out.Append(in.data(), in.size()); + } else { + out.Reserve(in.size() * 8); + + if (Cache.Get()) { + Cache->Decode(bin, out); + } else { + while (ReadNextChar(bin, out)) { + } + } + } + } + + Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const { + const TDecoderTable* table = Decoder; + TDecoderEntry e; + + int bitsRead = 0; + while (true) { + ui64 code = 0; + + if (Y_UNLIKELY(!bin.Read(code, table->Length))) + return 0; + bitsRead += table->Length; + + if (Y_UNLIKELY(code < table->BaseCode)) + return 0; + + code -= table->BaseCode; + + if (Y_UNLIKELY(code > 255)) + return 0; + + e = table->Entries[code]; + + if (Y_UNLIKELY(e.Bad)) + return 0; + + if (e.NextTable) { + table = Decoder + e.NextTable; + } else { + if (e.Invalid) { + code = 0; + bin.ReadK<8>(code); + bitsRead += 8; + out.Append((ui8)code); + } else { + out.Append((ui8)e.Char); + } + + return bitsRead; + } + } + + Y_ENSURE(false, " could not decode input"); + return 0; + } + + void GenerateEncoder(TCodeTree& tree) { + const ui64 sz = tree.size(); + + TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]); + + for (ui32 i = 1; i < sz; ++i) { + const TTreeEntry& te = tree[i]; + TEncoderEntry& e = Encoder.Entries[te.Char]; + e = TEncoderEntry(te); + + e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); + lastcode = e; + + e.Code = ReverseBits(e.Code, e.CodeLength); + + if (e.Invalid) + Invalid = e; + } + + for (auto& e : Encoder.Entries) { + if (e.Invalid) + e = Invalid; + + Y_ENSURE(e.CodeLength, " "); + } + } + + void RegenerateEncoder() { + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) + entrie.CodeLength = Invalid.CodeLength; + } + + Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>()); + + TEncoderEntry lastcode = Encoder.Entries[0]; + + for (ui32 i = 1; i < 256; ++i) { + TEncoderEntry& e = Encoder.Entries[i]; + e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); + lastcode = e; + + e.Code = ReverseBits(e.Code, e.CodeLength); + } + + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) { + Invalid = entrie; + break; + } + } + + Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>()); + + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) + entrie = Invalid; + } + } + + void BuildDecoder() { + TEncoderTable enc = Encoder; + Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>()); + + TEncoderEntry& e1 = enc.Entries[0]; + Decoder[0].BaseCode = e1.Code; + Decoder[0].Length = e1.CodeLength; + + for (auto e2 : enc.Entries) { + SetEntry(Decoder, e2.Code, e2.CodeLength, e2); + } + Cache.Reset(new THuffmanCache(*this)); + } + + void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) { + Y_ENSURE(len >= t->Length, len << " < " << t->Length); + + ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode; + TDecoderEntry& d = t->Entries[idx]; + + if (len == t->Length) { + Y_ENSURE(!d.NextTable, " "); + + d.Char = e.Char; + d.Invalid = e.Invalid; + return; + } + + if (!d.NextTable) { + Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " "); + d.NextTable = SubTablesNum++; + TDecoderTable* nt = Decoder + d.NextTable; + nt->Length = Min<ui64>(8, len - t->Length); + nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length); + } + + SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e); + } + + void Learn(ISequenceReader* in) { + { + TCodeTree tree; + InitTree(tree, in); + CalculateCodeLengths(tree); + Y_ENSURE(!tree.empty(), " "); + GenerateEncoder(tree); + } + BuildDecoder(); + } + + void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { + TCodeTree tree; + + ui64 freqsArray[256]; + Zero(freqsArray); + + for (const auto& freq : freqs) + freqsArray[static_cast<ui8>(freq.first)] += freq.second; + + InitTreeByFreqs(tree, freqsArray); + CalculateCodeLengths(tree); + + Y_ENSURE(!tree.empty(), " "); + + GenerateEncoder(tree); + BuildDecoder(); + } + + void Save(IOutputStream* out) { + ::Save(out, Invalid.CodeLength); + Encoder.Save(out); + } + + void Load(IInputStream* in) { + ::Load(in, Invalid.CodeLength); + Encoder.Load(in); + RegenerateEncoder(); + BuildDecoder(); + } + }; + + THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec) + : Original(codec) + { + CacheEntries.resize(1 << CACHE_BITS_COUNT); + DecodeCache.reserve(CacheEntries.size() * 2); + char buffer[2]; + TBuffer decoded; + for (size_t i = 0; i < CacheEntries.size(); i++) { + buffer[1] = i >> 8; + buffer[0] = i; + NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer)); + int totalBits = 0; + while (true) { + decoded.Resize(0); + int bits = codec.ReadNextChar(bin, decoded); + if (totalBits + bits > 16 || !bits) { + TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits}; + CacheEntries[i] = e; + break; + } + + for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) { + DecodeCache.push_back(*it); + } + totalBits += bits; + } + } + DecodeCache.push_back(0); + CacheEntries.shrink_to_fit(); + DecodeCache.shrink_to_fit(); + } + + void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const { + int bits = 0; + ui64 code = 0; + while (!bin.Eof()) { + ui64 f = 0; + const int toRead = 16 - bits; + if (toRead > 0 && bin.Read(f, toRead)) { + code = (code >> (16 - bits)) | (f << bits); + code &= 0xFFFF; + TCacheEntry entry = CacheEntries[code]; + int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0; + out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]); + bits = entry.BitsLeft; + } else { // should never happen until there are exceptions or unaligned input + bin.Back(bits); + if (!Original.ReadNextChar(bin, out)) + break; + + code = 0; + bits = 0; + } + } + } + + THuffmanCodec::THuffmanCodec() + : Impl(new TImpl) + { + MyTraits.NeedsTraining = true; + MyTraits.PreservesPrefixGrouping = true; + MyTraits.PaddingBit = 1; + MyTraits.SizeOnEncodeMultiplier = 2; + MyTraits.SizeOnDecodeMultiplier = 8; + MyTraits.RecommendedSampleSize = 1 << 21; + } + + THuffmanCodec::~THuffmanCodec() = default; + + ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const { + if (Y_UNLIKELY(!Trained)) + ythrow TCodecException() << " not trained"; + + return Impl->Encode(in, bbb); + } + + void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const { + Impl->Decode(in, bbb); + } + + void THuffmanCodec::Save(IOutputStream* out) const { + Impl->Save(out); + } + + void THuffmanCodec::Load(IInputStream* in) { + Impl->Load(in); + } + + void THuffmanCodec::DoLearn(ISequenceReader& in) { + Impl->Learn(&in); + } + + void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { + Impl->LearnByFreqs(freqs); + Trained = true; + } + +} diff --git a/library/cpp/codecs/huffman_codec.h b/library/cpp/codecs/huffman_codec.h new file mode 100644 index 0000000000..559545b90d --- /dev/null +++ b/library/cpp/codecs/huffman_codec.h @@ -0,0 +1,39 @@ +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> +#include <util/string/cast.h> + +namespace NCodecs { + // for types greater than char, pipeline with TFreqCodec. + + class THuffmanCodec: public ICodec { + class TImpl; + TIntrusivePtr<TImpl> Impl; + + public: + THuffmanCodec(); + ~THuffmanCodec() override; + + static TStringBuf MyName() { + return "huffman"; + } + + TString GetName() const override { + return ToString(MyName()); + } + + ui8 Encode(TStringBuf in, TBuffer& bbb) const override; + + void Decode(TStringBuf in, TBuffer& bbb) const override; + + void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs); + + protected: + void DoLearn(ISequenceReader& in) override; + void Save(IOutputStream* out) const override; + void Load(IInputStream* in) override; + }; + +} diff --git a/library/cpp/codecs/pfor_codec.cpp b/library/cpp/codecs/pfor_codec.cpp new file mode 100644 index 0000000000..f6b3b0920b --- /dev/null +++ b/library/cpp/codecs/pfor_codec.cpp @@ -0,0 +1,22 @@ +#include "pfor_codec.h" + +namespace NCodecs { + template <> + TStringBuf TPForCodec<ui64, true>::MyName() { + return "pfor-delta64-sorted"; + } + template <> + TStringBuf TPForCodec<ui32, true>::MyName() { + return "pfor-delta32-sorted"; + } + + template <> + TStringBuf TPForCodec<ui64, false>::MyName() { + return "pfor-ui64"; + } + template <> + TStringBuf TPForCodec<ui32, false>::MyName() { + return "pfor-ui32"; + } + +} diff --git a/library/cpp/codecs/pfor_codec.h b/library/cpp/codecs/pfor_codec.h new file mode 100644 index 0000000000..d7d4bb8bf4 --- /dev/null +++ b/library/cpp/codecs/pfor_codec.h @@ -0,0 +1,211 @@ +#pragma once + +#include "codecs.h" + +#include "delta_codec.h" +#include "tls_cache.h" + +#include <library/cpp/bit_io/bitinput.h> +#include <library/cpp/bit_io/bitoutput.h> +#include <util/string/cast.h> + +namespace NCodecs { + template <typename T, bool WithDelta = false> + class TPForCodec: public ICodec { + using TUnsigned = std::make_unsigned_t<T>; + typedef TDeltaCodec<TUnsigned> TDCodec; + + typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue; + static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value"); + + static const ui64 BitsInT = sizeof(TUnsigned) * 8; + + TDCodec DeltaCodec; + + public: + static TStringBuf MyName(); + + TPForCodec() { + MyTraits.AssumesStructuredInput = true; + MyTraits.SizeOfInputElement = sizeof(T); + MyTraits.SizeOnDecodeMultiplier = sizeof(T); + } + + TString GetName() const override { + return ToString(MyName()); + } + + ui8 Encode(TStringBuf s, TBuffer& b) const override { + b.Clear(); + if (s.empty()) { + return 0; + } + + b.Reserve(2 * s.size() + b.Size()); + + if (WithDelta) { + auto buffer = TBufferTlsCache::TlsInstance().Item(); + TBuffer& db = buffer.Get(); + db.Clear(); + db.Reserve(2 * s.size()); + DeltaCodec.Encode(s, db); + s = TStringBuf{db.data(), db.size()}; + } + + TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)}; + + const ui64 sz = tin.size(); + ui64 bitcounts[BitsInT + 1]; + Zero(bitcounts); + + ui32 zeros = 0; + + for (const TValue* it = tin.begin(); it != tin.end(); ++it) { + TUnsigned v = 1 + (TUnsigned)*it; + ui64 l = MostSignificantBit(v) + 1; + ++bitcounts[l]; + + if (!v) { + ++zeros; + } + } + + // cumulative bit counts + for (ui64 i = 0; i < BitsInT; ++i) { + bitcounts[i + 1] += bitcounts[i]; + } + + bool hasexceptions = zeros; + ui64 optimalbits = BitsInT; + + { + ui64 excsize = 0; + ui64 minsize = sz * BitsInT; + + for (ui64 current = BitsInT; current; --current) { + ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6); + + excsize += current * bitcounts[current]; + + if (size < minsize) { + minsize = size; + optimalbits = current; + hasexceptions = zeros || sz - bitcounts[current]; + } + } + } + + if (!optimalbits || BitsInT == optimalbits) { + b.Append((ui8)-1); + b.Append(s.data(), s.size()); + return 0; + } else { + NBitIO::TBitOutputVector<TBuffer> bout(&b); + bout.Write(0, 1); + bout.Write(hasexceptions, 1); + bout.Write(optimalbits, 6); + + for (const TValue* it = tin.begin(); it != tin.end(); ++it) { + TUnsigned word = 1 + (TUnsigned)*it; + ui64 len = MostSignificantBit(word) + 1; + if (len > optimalbits || !word) { + Y_ENSURE(hasexceptions, " "); + bout.Write(0, optimalbits); + bout.Write(len, 6); + bout.Write(word, len); + } else { + bout.Write(word, optimalbits); + } + } + + return bout.GetByteReminder(); + } // the rest of the last byte is zero padded. BitsInT is always > 7. + } + + void Decode(TStringBuf s, TBuffer& b) const override { + b.Clear(); + if (s.empty()) { + return; + } + + b.Reserve(s.size() * sizeof(T) + b.Size()); + + ui64 isplain = 0; + ui64 hasexceptions = 0; + ui64 bits = 0; + + NBitIO::TBitInput bin(s); + bin.ReadK<1>(isplain); + bin.ReadK<1>(hasexceptions); + bin.ReadK<6>(bits); + + if (Y_UNLIKELY(isplain)) { + s.Skip(1); + + if (WithDelta) { + DeltaCodec.Decode(s, b); + } else { + b.Append(s.data(), s.size()); + } + } else { + typename TDCodec::TDecoder decoder; + + if (hasexceptions) { + ui64 word = 0; + while (bin.Read(word, bits)) { + if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) { + --word; + + TValue t = word; + + if (WithDelta) { + if (decoder.Decode(t)) { + TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)}; + b.Append(r.data(), r.size()); + } + } else { + TStringBuf r{(char*)&t, sizeof(t)}; + b.Append(r.data(), r.size()); + } + } + } + } else { + ui64 word = 0; + T outarr[256 / sizeof(T)]; + ui32 cnt = 0; + while (true) { + ui64 v = bin.Read(word, bits); + + if ((!v) | (!word)) + break; + + --word; + TValue t = word; + + if (WithDelta) { + if (decoder.Decode(t)) { + outarr[cnt++] = decoder.Result; + } + } else { + outarr[cnt++] = t; + } + + if (cnt == Y_ARRAY_SIZE(outarr)) { + b.Append((const char*)outarr, sizeof(outarr)); + cnt = 0; + } + } + + if (cnt) { + b.Append((const char*)outarr, cnt * sizeof(T)); + } + } + } + } + + protected: + void DoLearn(ISequenceReader&) override { + } + }; + +} diff --git a/library/cpp/codecs/sample.h b/library/cpp/codecs/sample.h new file mode 100644 index 0000000000..15f03afcc5 --- /dev/null +++ b/library/cpp/codecs/sample.h @@ -0,0 +1,89 @@ +#pragma once + +#include <library/cpp/deprecated/accessors/accessors.h> + +#include <util/generic/buffer.h> +#include <util/generic/vector.h> +#include <util/random/fast.h> +#include <util/random/shuffle.h> + +#include <functional> +#include <type_traits> + +namespace NCodecs { + class ISequenceReader { + public: + virtual bool NextRegion(TStringBuf& s) = 0; + + virtual ~ISequenceReader() = default; + }; + + template <class TValue> + TStringBuf ValueToStringBuf(TValue&& t) { + return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)}; + } + + template <class TIter> + TStringBuf IterToStringBuf(TIter iter) { + return ValueToStringBuf(*iter); + } + + template <class TItem> + class TSimpleSequenceReader: public ISequenceReader { + const TVector<TItem>& Items; + size_t Idx = 0; + + public: + TSimpleSequenceReader(const TVector<TItem>& items) + : Items(items) + { + } + + bool NextRegion(TStringBuf& s) override { + if (Idx >= Items.size()) { + return false; + } + + s = ValueToStringBuf(Items[Idx++]); + return true; + } + }; + + template <class TIter, class TGetter> + size_t GetInputSize(TIter begin, TIter end, TGetter getter) { + size_t totalBytes = 0; + for (TIter iter = begin; iter != end; ++iter) { + totalBytes += getter(iter).size(); + } + return totalBytes; + } + + template <class TIter> + size_t GetInputSize(TIter begin, TIter end) { + return GetInputSize(begin, end, IterToStringBuf<TIter>); + } + + template <class TIter, class TGetter> + TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes, TGetter getter) { + TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce}; + + size_t totalBytes = GetInputSize(begin, end, getter); + double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes); + + TVector<TBuffer> result; + for (TIter iter = begin; iter != end; ++iter) { + if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) { + TStringBuf reg = getter(iter); + result.emplace_back(reg.data(), reg.size()); + } + } + Shuffle(result.begin(), result.end(), rng); + return result; + } + + template <class TIter> + TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes) { + return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>); + } + +} diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp new file mode 100644 index 0000000000..d0692fe2a4 --- /dev/null +++ b/library/cpp/codecs/solar_codec.cpp @@ -0,0 +1,133 @@ +#include "solar_codec.h" + +#include <library/cpp/codecs/greedy_dict/gd_builder.h> + +#include <library/cpp/containers/comptrie/comptrie_builder.h> +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> +#include <util/stream/length.h> +#include <util/string/printf.h> +#include <util/ysaveload.h> + +namespace NCodecs { + static inline ui32 Append(TBuffer& pool, TStringBuf data) { + pool.Append(data.data(), data.size()); + return pool.Size(); + } + + void TSolarCodec::DoLearn(ISequenceReader& r) { + using namespace NGreedyDict; + + Decoder.clear(); + Pool.Clear(); + + THolder<TEntrySet> set; + + { + TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); + TStringBufs bufs; + + TStringBuf m; + while (r.NextRegion(m)) { + bufs.push_back(pool.AppendString(m)); + } + + { + TDictBuilder b(Settings); + b.SetInput(bufs); + b.Build(MaxEntries, MaxIterations); + + set = b.ReleaseEntrySet(); + } + } + + set->SetScores(ES_LEN_COUNT); + + { + TVector<std::pair<float, TStringBuf>> tmp; + tmp.reserve(set->size()); + + for (const auto& it : *set) { + tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); + } + + Sort(tmp.begin(), tmp.end()); + + Decoder.reserve(tmp.size() + 1); + Decoder.push_back(0); + + for (const auto& it : tmp) { + Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); + ui32 endoff = Append(Pool, it.second); + Decoder.push_back(endoff); + } + } + + Pool.ShrinkToFit(); + Decoder.shrink_to_fit(); + + TBufferOutput bout; + + { + TVector<std::pair<TStringBuf, ui32>> tmp2; + tmp2.reserve(Decoder.size()); + + for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { + TStringBuf s = DoDecode(i); + tmp2.push_back(std::make_pair(s, i - 1)); + Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed"); + } + + Sort(tmp2.begin(), tmp2.end()); + + { + TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); + for (const auto& it : tmp2) { + builder.Add(it.first.data(), it.first.size(), it.second); + } + + builder.Save(bout); + } + } + + Encoder.Init(TBlob::FromBuffer(bout.Buffer())); + } + + void TSolarCodec::Save(IOutputStream* out) const { + TBlob b = Encoder.Data(); + ::Save(out, (ui32)b.Size()); + out->Write(b.Data(), b.Size()); + } + + void TSolarCodec::Load(IInputStream* in) { + ui32 sz; + ::Load(in, sz); + TLengthLimitedInput lin(in, sz); + Encoder.Init(TBlob::FromStream(lin)); + Pool.Clear(); + Decoder.clear(); + + TVector<std::pair<ui32, TString>> tmp; + + ui32 poolsz = 0; + for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { + const TString& s = it.GetKey(); + tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); + poolsz += Max<ui32>(s.size(), 1); + } + + Sort(tmp.begin(), tmp.end()); + + Pool.Reserve(poolsz); + Decoder.reserve(tmp.size() + 1); + Decoder.push_back(0); + + for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { + Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); + Decoder.push_back(Append(Pool, tmp[i].second)); + } + + Pool.ShrinkToFit(); + Decoder.shrink_to_fit(); + } + +} diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h new file mode 100644 index 0000000000..7158ae7926 --- /dev/null +++ b/library/cpp/codecs/solar_codec.h @@ -0,0 +1,244 @@ +#pragma once + +#include "codecs.h" +#include <library/cpp/containers/comptrie/comptrie_trie.h> +#include <library/cpp/codecs/greedy_dict/gd_builder.h> + +#include <util/string/cast.h> +#include <util/string/escape.h> + +namespace NCodecs { + // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. + // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. + + struct TVarIntTraits { + static const size_t MAX_VARINT32_BYTES = 5; + + static void Write(ui32 value, TBuffer& b) { + while (value > 0x7F) { + b.Append(static_cast<ui8>(value) | 0x80); + value >>= 7; + } + b.Append(static_cast<ui8>(value) & 0x7F); + } + + static void Read(TStringBuf& r, ui32& value) { + ui32 result = 0; + for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) { + const ui32 b = static_cast<ui8>(r[0]); + r.Skip(1); + result |= static_cast<ui32>(b & 0x7F) << (7 * count); + if (!(b & 0x80)) { + value = result; + return; + } else if (Y_UNLIKELY(r.empty())) { + break; + } + } + Y_ENSURE_EX(false, TCodecException() << "Bad data"); + } + }; + + struct TShortIntTraits { + static const size_t SHORTINT_SIZE_LIMIT = 0x8000; + + Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) { + Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method"); + if (value >= 0x80) { + b.Append(static_cast<ui8>(value >> 8) | 0x80); + } + b.Append(static_cast<ui8>(value)); + } + + Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { + ui32 result = static_cast<ui8>(r[0]); + r.Skip(1); + if (result >= 0x80) { + Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data"); + result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); + r.Skip(1); + } + value = result; + } + }; + + class TSolarCodec: public ICodec { + public: + static TStringBuf MyName8k() { + return TStringBuf("solar-8k"); + } + static TStringBuf MyName16k() { + return TStringBuf("solar-16k"); + } + static TStringBuf MyName32k() { + return TStringBuf("solar-32k"); + } + static TStringBuf MyName64k() { + return TStringBuf("solar-64k"); + } + static TStringBuf MyName256k() { + return TStringBuf("solar-256k"); + } + static TStringBuf MyName() { + return TStringBuf("solar"); + } + static TStringBuf MyName8kAdapt() { + return TStringBuf("solar-8k-a"); + } + static TStringBuf MyName16kAdapt() { + return TStringBuf("solar-16k-a"); + } + static TStringBuf MyName32kAdapt() { + return TStringBuf("solar-32k-a"); + } + static TStringBuf MyName64kAdapt() { + return TStringBuf("solar-64k-a"); + } + static TStringBuf MyName256kAdapt() { + return TStringBuf("solar-256k-a"); + } + static TStringBuf MyNameShortInt() { + return TStringBuf("solar-si"); + } + + explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : Settings(s) + , MaxEntries(maxentries) + , MaxIterations(maxiter) + { + MyTraits.NeedsTraining = true; + MyTraits.SizeOnDecodeMultiplier = 2; + MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + EncodeImpl<TVarIntTraits>(r, b); + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + DecodeImpl<TVarIntTraits>(r, b); + } + + TString GetName() const override { + return ToString(MyName()); + } + + protected: + void DoLearn(ISequenceReader&) override; + void Save(IOutputStream*) const override; + void Load(IInputStream*) override; + + Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const { + return TStringBuf(Pool.Data() + begoff, endoff - begoff); + } + + Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const { + return SubStr(Decoder[num - 1], Decoder[num]); + } + + template <class TTraits> + Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const { + b.Clear(); + b.Reserve(r.size()); + while (!r.empty()) { + size_t sz = 0; + ui32 val = (ui32)-1; + Encoder.FindLongestPrefix(r, &sz, &val); + TTraits::Write(val + 1, b); + r.Skip(Max<size_t>(sz, 1)); + } + } + + template <class TTraits> + Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { + b.Clear(); + b.Reserve(r.size()); + ui32 v = 0; + while (!r.empty()) { + TTraits::Read(r, v); + TStringBuf s = DoDecode(v); + b.Append(s.data(), s.size()); + } + } + + inline bool CanUseShortInt() const { + return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; + } + + private: + typedef TCompactTrie<char, ui32> TEncoder; + typedef TVector<ui32> TDecoder; + + TBuffer Pool; + TEncoder Encoder; + TDecoder Decoder; + + NGreedyDict::TBuildSettings Settings; + ui32 MaxEntries; + ui32 MaxIterations; + }; + + // Uses varints or shortints depending on the decoder size + class TAdaptiveSolarCodec: public TSolarCodec { + public: + explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : TSolarCodec(maxentries, maxiter, s) + { + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + if (CanUseShortInt()) { + EncodeImpl<TShortIntTraits>(r, b); + } else { + EncodeImpl<TVarIntTraits>(r, b); + } + + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + if (CanUseShortInt()) { + DecodeImpl<TShortIntTraits>(r, b); + } else { + DecodeImpl<TVarIntTraits>(r, b); + } + } + + TString GetName() const override { + if (CanUseShortInt()) { + return ToString(MyNameShortInt()); + } else { + return ToString(MyName()); + } + } + }; + + class TSolarCodecShortInt: public TSolarCodec { + public: + explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : TSolarCodec(maxentries, maxiter, s) + { + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + EncodeImpl<TShortIntTraits>(r, b); + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + DecodeImpl<TShortIntTraits>(r, b); + } + + TString GetName() const override { + return ToString(MyNameShortInt()); + } + + protected: + void Load(IInputStream* in) override { + TSolarCodec::Load(in); + Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data"); + } + }; + +} diff --git a/library/cpp/codecs/static/README b/library/cpp/codecs/static/README new file mode 100644 index 0000000000..1b07f02433 --- /dev/null +++ b/library/cpp/codecs/static/README @@ -0,0 +1 @@ +Support of static libraries in library/cpp/codecs. See library/cpp/codecs/static/example. diff --git a/library/cpp/codecs/static/builder.cpp b/library/cpp/codecs/static/builder.cpp new file mode 100644 index 0000000000..93e34a3edb --- /dev/null +++ b/library/cpp/codecs/static/builder.cpp @@ -0,0 +1,39 @@ +#include "builder.h" +#include "common.h" + +#include <library/cpp/codecs/static/static_codec_info.pb.h> + +#include <library/cpp/codecs/codecs.h> + +#include <util/generic/yexception.h> +#include <util/string/subst.h> + +namespace NCodecs { + TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo& info) { + TStaticCodecInfo result; + TCodecPtr codec = ICodec::GetInstance(info.CodecName); + Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed"); + + codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier); + { + TStringOutput sout{*result.MutableStoredCodec()}; + ICodec::Store(&sout, codec); + } + + auto& debugInfo = *result.MutableDebugInfo(); + debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec())); + debugInfo.SetCodecName(info.CodecName); + debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier); + debugInfo.SetTimestamp(info.Timestamp); + debugInfo.SetRevisionInfo(info.RevisionInfo); + debugInfo.SetTrainingSetComment(info.TrainingSetComment); + debugInfo.SetTrainingSetResId(info.TrainingSetResId); + return result; + } + + TString GetStandardFileName(const TStaticCodecInfo& info) { + TString cName = info.GetDebugInfo().GetCodecName(); + SubstGlobal(cName, ':', '.'); + return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info"; + } +} diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h new file mode 100644 index 0000000000..d7533be4d5 --- /dev/null +++ b/library/cpp/codecs/static/builder.h @@ -0,0 +1,29 @@ +#pragma once + +#include "static.h" + +#include <library/cpp/svnversion/svnversion.h> + +#include <util/datetime/base.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/string/builder.h> + +namespace NCodecs { + struct TCodecBuildInfo { + // optimal values from SEARCH-1655 + TString CodecName = "solar-8k-a:zstd08d-1"; + float SampleSizeMultiplier = 1; + + // debug info: + time_t Timestamp = TInstant::Now().TimeT(); + TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision())); + TString TrainingSetComment; // a human comment on the training data + TString TrainingSetResId; // sandbox resid of the training set + }; + + TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&); + + TString GetStandardFileName(const TStaticCodecInfo&); + +} diff --git a/library/cpp/codecs/static/common.h b/library/cpp/codecs/static/common.h new file mode 100644 index 0000000000..211de2a27d --- /dev/null +++ b/library/cpp/codecs/static/common.h @@ -0,0 +1,32 @@ +#pragma once + +#include <util/string/hex.h> +#include <util/digest/city.h> +#include <util/system/byteorder.h> + +namespace NCodecs { + template <class T> + ui64 DataSignature(const T& t) { + static_assert(!std::is_scalar<T>::value, "no scalars"); + return CityHash64(t.data(), t.size()); + } + + template <class T> + TString HexWriteScalar(T t) { + static_assert(std::is_scalar<T>::value, "scalars only"); + t = LittleToBig(t); + TString res = HexEncode(&t, sizeof(t)); + res.to_lower(); + return res; + } + + template <class T> + T HexReadScalar(TStringBuf s) { + static_assert(std::is_scalar<T>::value, "scalars only"); + T t = 0; + HexDecode(s.data(), Min(s.size(), sizeof(T)), &t); + t = BigToLittle(t); + return t; + } + +} diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp new file mode 100644 index 0000000000..5b750b717e --- /dev/null +++ b/library/cpp/codecs/static/example/example.cpp @@ -0,0 +1,43 @@ +#include "example.h" + +#include <library/cpp/codecs/static/static.h> + +#include <util/generic/yexception.h> + +extern "C" { +extern const ui8 codec_info_huff_20160707[]; +extern const ui32 codec_info_huff_20160707Size; +extern const ui8 codec_info_sa_huff_20160707[]; +extern const ui32 codec_info_sa_huff_20160707Size; +}; + +namespace NStaticCodecExample { + static const NCodecs::TCodecConstPtr CODECS[] = { + nullptr, + NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size), + NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size), + }; + + static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size"); + + void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) { + Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); + out.Clear(); + if (!in) { + return; + } + CODECS[dv]->Encode(in, out); + out.Append((char)dv); + } + + void Decode(TBuffer& out, TStringBuf in) { + out.Clear(); + if (!in) { + return; + } + EDictVersion dv = (EDictVersion)in.back(); + Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); + in.Chop(1); + CODECS[dv]->Decode(in, out); + } +} diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h new file mode 100644 index 0000000000..f9b3a7324b --- /dev/null +++ b/library/cpp/codecs/static/example/example.h @@ -0,0 +1,17 @@ +#pragma once + +#include <util/generic/strbuf.h> +#include <util/generic/buffer.h> + +namespace NStaticCodecExample { + enum EDictVersion : ui8 { + DV_NULL = 0, + DV_HUFF_20160707, + DV_SA_HUFF_20160707, + DV_COUNT + }; + + void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707); + + void Decode(TBuffer&, TStringBuf); +} diff --git a/library/cpp/codecs/static/example/huffman.1467494385.codec_info b/library/cpp/codecs/static/example/huffman.1467494385.codec_info Binary files differnew file mode 100644 index 0000000000..5fc18270a6 --- /dev/null +++ b/library/cpp/codecs/static/example/huffman.1467494385.codec_info diff --git a/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info Binary files differnew file mode 100644 index 0000000000..d36d8e24ec --- /dev/null +++ b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info diff --git a/library/cpp/codecs/static/example/ya.make b/library/cpp/codecs/static/example/ya.make new file mode 100644 index 0000000000..ca6c5fd900 --- /dev/null +++ b/library/cpp/codecs/static/example/ya.make @@ -0,0 +1,24 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( + GLOBAL example.cpp +) + +PEERDIR( + library/cpp/codecs + library/cpp/codecs/static +) + +ARCHIVE_ASM( + "solar-8k-a.huffman.1467494385.codec_info" + NAME codec_info_sa_huff_20160707 +) + +ARCHIVE_ASM( + "huffman.1467494385.codec_info" + NAME codec_info_huff_20160707 +) + +END() diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp new file mode 100644 index 0000000000..44a07dd73a --- /dev/null +++ b/library/cpp/codecs/static/static.cpp @@ -0,0 +1,98 @@ +#include "static.h" +#include "common.h" + +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/archive/yarchive.h> + +#include <util/draft/datetime.h> + +#include <util/string/builder.h> +#include <util/stream/buffer.h> +#include <util/stream/mem.h> +#include <util/string/hex.h> +#include <util/ysaveload.h> + +namespace NCodecs { + static constexpr TStringBuf STATIC_CODEC_INFO_MAGIC = "CodecInf"; + + static TStringBuf GetStaticCodecInfoMagic() { + return STATIC_CODEC_INFO_MAGIC; + } + + void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo& info) { + TBufferOutput bout; + info.SerializeToArcadiaStream(&bout); + ui64 hash = DataSignature(bout.Buffer()); + out.Write(GetStaticCodecInfoMagic()); + ::Save(&out, hash); + ::Save(&out, bout.Buffer()); + } + + TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in) { + { + TBuffer magic; + magic.Resize(GetStaticCodecInfoMagic().size()); + Y_ENSURE_EX(in.Read(magic.Data(), GetStaticCodecInfoMagic().size()) == GetStaticCodecInfoMagic().size(), + TCodecException() << "bad codec info"); + Y_ENSURE_EX(TStringBuf(magic.data(), magic.size()) == GetStaticCodecInfoMagic(), + TCodecException() << "bad codec info"); + } + + ui64 hash; + ::Load(&in, hash); + TBuffer info; + ::Load(&in, info); + Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info"); + + TStaticCodecInfo result; + Y_ENSURE_EX(result.ParseFromArray(info.data(), info.size()), TCodecException() << "bad codec info"); + + return result; + } + + TString SaveCodecInfoToString(const TStaticCodecInfo& info) { + TStringStream s; + SaveCodecInfoToStream(s, info); + return s.Str(); + } + + TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) { + TMemoryInput m{data.data(), data.size()}; + return LoadCodecInfoFromStream(m); + } + + TString FormatCodecInfo(const TStaticCodecInfo& ci) { + TStringBuilder s; + s << "codec name: " << ci.GetDebugInfo().GetCodecName() << Endl; + s << "codec hash: " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl; + s << "dict size: " << ci.GetStoredCodec().Size() << Endl; + s << "sample mult: " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl; + s << "orig.compress: " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl; + s << "timestamp: " << ci.GetDebugInfo().GetTimestamp() << " (" + << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString() + << ")" << Endl; + s << "revision: " << ci.GetDebugInfo().GetRevisionInfo() << Endl; + s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl; + s << "training set resId: " << ci.GetDebugInfo().GetTrainingSetResId() << Endl; + return s; + } + + TString LoadStringFromArchive(const ui8* begin, size_t size) { + TArchiveReader ar(TBlob::NoCopy(begin, size)); + Y_VERIFY(ar.Count() == 1, "invalid number of entries"); + auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0)); + return TString{blob.AsCharPtr(), blob.Size()}; + } + + TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) { + return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec()); + } + + TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) { + const auto& data = LoadStringFromArchive(begin, size); + const auto& info = LoadCodecInfoFromString(data); + const auto& codec = RestoreCodecFromCodecInfo(info); + Y_ENSURE_EX(codec, TCodecException() << "null codec"); + return codec; + } +} diff --git a/library/cpp/codecs/static/static.h b/library/cpp/codecs/static/static.h new file mode 100644 index 0000000000..c1eaed2a74 --- /dev/null +++ b/library/cpp/codecs/static/static.h @@ -0,0 +1,34 @@ +#pragma once + +#include <library/cpp/codecs/codecs.h> + +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/stream/output.h> + +namespace NCodecs { + class TStaticCodecInfo; + + // load + + TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&); + + TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data); + + TString LoadStringFromArchive(const ui8* begin, size_t size); + + TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size); + + // save + + TString SaveCodecInfoToString(const TStaticCodecInfo&); + + void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo&); + + // misc + + TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in); + + TString FormatCodecInfo(const TStaticCodecInfo&); + +} diff --git a/library/cpp/codecs/static/static_codec_info.proto b/library/cpp/codecs/static/static_codec_info.proto new file mode 100644 index 0000000000..362abb4dad --- /dev/null +++ b/library/cpp/codecs/static/static_codec_info.proto @@ -0,0 +1,17 @@ +package NCodecs; + +message TStaticCodecInfo { + message TDebugInfo { + optional string CodecName = 1; // the exact codec variant name + optional uint64 Timestamp = 2; // when the codec was built + optional string RevisionInfo = 3; // the arcadia revision info + optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression + optional float Compression = 5; // the compression on the training set ((raw_size - coded_size) / raw_size) + optional string TrainingSetComment = 6; // a human readable description of the training set + optional string TrainingSetResId = 7; // the training set sandbox resource id + optional uint64 StoredCodecHash = 8; // cityhash64(data) + } + + optional bytes StoredCodec = 1; // the data of the codec + optional TDebugInfo DebugInfo = 2; // misc debug info which could be useful in finding whereabouts later +} diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp new file mode 100644 index 0000000000..fe77691280 --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ct_common.cpp @@ -0,0 +1,74 @@ +#include "ct_common.h" + +#include <library/cpp/codecs/codecs.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/string_utils/base64/base64.h> + +#include <util/stream/output.h> +#include <util/string/builder.h> +#include <util/system/hp_timer.h> + +namespace NCodecs { + TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const { + TStringBuilder s; + s << "raw size/item: " << RawSizePerRecord() << Endl; + s << "enc.size/item: " << EncSizePerRecord() << Endl; + if (checkMode) { + s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl; + } + s << "enc time us/item: " << EncTimePerRecordUS() << Endl; + s << "dec time us/item: " << DecTimePerRecordUS() << Endl; + s << "dict size: " << info.GetStoredCodec().Size() << Endl; + s << "compression: " << AsPercent(Compression()) << " %" << Endl; + if (checkMode) { + s << "orig.compression: " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl; + } + return s; + } + + TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) { + TComprStats stats; + + TBuffer encodeBuffer; + TBuffer decodeBuffer; + for (const auto& data : input) { + encodeBuffer.Clear(); + decodeBuffer.Clear(); + + stats.Records += 1; + stats.RawSize += data.size(); + + THPTimer timer; + c.Encode(data, encodeBuffer); + stats.EncSize += encodeBuffer.size(); + stats.EncSeconds += timer.PassedReset(); + + c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer); + stats.DecSeconds += timer.PassedReset(); + Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records); + } + + return stats; + } + + void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) { + TStringBuf bin(blob.AsCharPtr(), blob.Size()); + TStringBuf line; + TString buffer; + while (bin.ReadLine(line)) { + if (DSF_BASE64_LF == fmt) { + Base64Decode(line, buffer); + line = buffer; + } + if (!line) { + continue; + } + result.emplace_back(line.data(), line.size()); + } + } + + TBlob GetInputBlob(const TString& dataFile) { + return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin); + } + +} diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h new file mode 100644 index 0000000000..9d3dcbda93 --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ct_common.h @@ -0,0 +1,75 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/memory/blob.h> +#include <cmath> + +namespace NCodecs { + class TStaticCodecInfo; + class ICodec; + + struct TComprStats { + double EncSeconds = 0; + double DecSeconds = 0; + size_t Records = 0; + size_t RawSize = 0; + size_t EncSize = 0; + + static double Round(double n, size_t decPlaces = 2) { + double p = pow(10, decPlaces); + return round(n * p) / p; + } + + static double AsPercent(double n) { + return Round(n * 100); + } + + static double AsMicroSecond(double s) { + return s * 1000000; + } + + double PerRecord(double n) const { + return Round((double)(Records ? n / Records : 0)); + } + + double Compression() const { + return ((double)RawSize - (double)EncSize) / RawSize; + } + + double EncTimePerRecordUS() const { + return PerRecord(AsMicroSecond(EncSeconds)); + } + + double DecTimePerRecordUS() const { + return PerRecord(AsMicroSecond(DecSeconds)); + } + + double RawSizePerRecord() const { + return PerRecord(RawSize); + } + + double EncSizePerRecord() const { + return PerRecord(EncSize); + } + + double OldEncSizePerRecord(double compr) const { + return PerRecord((1 - compr) * RawSize); + } + + TString Format(const TStaticCodecInfo&, bool checkMode) const; + }; + + TComprStats TestCodec(const ICodec&, const TVector<TString>& data); + + enum EDataStreamFormat { + DSF_NONE, + DSF_PLAIN_LF /* "plain" */, + DSF_BASE64_LF /* "base64" */, + }; + + void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&); + + TBlob GetInputBlob(const TString& dataFile); + +} diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make new file mode 100644 index 0000000000..d624222dad --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ya.make @@ -0,0 +1,19 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( + ct_common.cpp +) + +PEERDIR( + library/cpp/codecs + library/cpp/codecs/static + library/cpp/getopt/small + library/cpp/string_utils/base64 + util/draft +) + +GENERATE_ENUM_SERIALIZATION(ct_common.h) + +END() diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README new file mode 100644 index 0000000000..723a68300b --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/README @@ -0,0 +1,4 @@ +This is a viewer for generated codec and utility for verification of the compression quality on a new data. + +Usage: +static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp new file mode 100644 index 0000000000..9c8d568d82 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp @@ -0,0 +1,73 @@ +#include <library/cpp/codecs/static/tools/common/ct_common.h> +#include <library/cpp/codecs/static/static.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/codecs/codecs.h> +#include <library/cpp/getopt/small/last_getopt.h> + +#include <util/digest/city.h> +#include <util/generic/yexception.h> +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <util/stream/format.h> +#include <util/string/builder.h> + +int main(int argc, char** argv) { + NCodecs::TCodecPtr codecPtr; + NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; + TString codecFile; + bool testCompression = false; + + auto opts = NLastGetopt::TOpts::Default(); + opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator."); + opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt"); + NCodecs::TStaticCodecInfo codec; + + opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) { + codecFile = name; + codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll())); + codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); + }) + .Required() + .Help(".codec_info file with serialized static data for codec"); + + opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance"); + + opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format"); + + opts.SetFreeArgsMin(0); + opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files"); + + NLastGetopt::TOptsParseResult res(&opts, argc, argv); + + Cout << codecFile << Endl; + Cout << NCodecs::FormatCodecInfo(codec) << Endl; + + if (testCompression) { + if (NCodecs::DSF_NONE == fmt) { + Cerr << "Specify format (-f|--format) for testing set input" << Endl; + exit(1); + } + + Cout << "Reading testing set data ... " << Flush; + + TVector<TString> allData; + for (const auto& freeArg : res.GetFreeArgs()) { + NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); + } + + if (!res.GetFreeArgs()) { + NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); + } + + Cout << "Done" << Endl << Endl; + + Cout << "records: " << allData.size() << Endl; + Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; + + Cout << "Testing compression ... " << Flush; + auto stats = NCodecs::TestCodec(*codecPtr, allData); + Cout << "Done" << Endl << Endl; + + Cout << stats.Format(codec, true) << Endl; + } +} diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make new file mode 100644 index 0000000000..90e06ca448 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make @@ -0,0 +1,16 @@ +PROGRAM() + +OWNER(velavokr) + +SRCS( + static_codec_checker.cpp +) + +PEERDIR( + library/cpp/codecs + library/cpp/codecs/static + library/cpp/codecs/static/tools/common + library/cpp/getopt/small +) + +END() diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README new file mode 100644 index 0000000000..e6bb52b959 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/README @@ -0,0 +1,4 @@ +This is a utility for reproducible teaching of a codec. And also for saving it into a file with a unique name for a static compilation as a resource. + +Usage: +static_codec_generator -t -m 'the training data description' -f plain samples.txt diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp new file mode 100644 index 0000000000..45fdb5c5fe --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp @@ -0,0 +1,82 @@ +#include <library/cpp/codecs/static/tools/common/ct_common.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/codecs/static/builder.h> +#include <library/cpp/codecs/codecs.h> + +#include <library/cpp/getopt/small/last_getopt.h> + +#include <util/generic/yexception.h> +#include <util/stream/file.h> +#include <util/string/builder.h> + +int main(int argc, char** argv) { + NCodecs::TCodecBuildInfo info; + NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; + + auto opts = NLastGetopt::TOpts::Default(); + opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt"); + opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin"); + + opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set"); + + opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set"); + + opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName); + + opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size"); + + opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format"); + + opts.AddLongOption("list-codecs").NoArgument().Handler0([]() { + Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl; + exit(0); + }) + .Optional() + .Help("list available codecs"); + + opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info + + opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info + + opts.SetFreeArgsMin(0); + opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files"); + + NLastGetopt::TOptsParseResult res(&opts, argc, argv); + + Cout << "Reading training set data ... " << Flush; + TVector<TString> allData; + for (const auto& freeArg : res.GetFreeArgs()) { + NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); + } + + if (!res.GetFreeArgs()) { + NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); + } + Cout << "Done" << Endl << Endl; + + Cout << "records: " << allData.size() << Endl; + Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; + + Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush; + auto codec = NCodecs::BuildStaticCodec(allData, info); + Cout << "Done" << Endl; + + TString codecName = NCodecs::GetStandardFileName(codec); + NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); + + Cout << "Testing compression ... " << Flush; + auto stats = NCodecs::TestCodec(*codecPtr, allData); + Cout << "Done" << Endl << Endl; + + codec.MutableDebugInfo()->SetCompression(stats.Compression()); + + Cout << stats.Format(codec, false) << Endl; + + Cout << "Saving as " << codecName << " ... " << Flush; + { + TUnbufferedFileOutput fout{codecName}; + NCodecs::SaveCodecInfoToStream(fout, codec); + fout.Finish(); + } + Cout << "Done" << Endl << Endl; +} diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make new file mode 100644 index 0000000000..efbc440dd1 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make @@ -0,0 +1,17 @@ +PROGRAM() + +OWNER(velavokr) + +SRCS( + static_codec_generator.cpp +) + +PEERDIR( + library/cpp/codecs + library/cpp/codecs/static + library/cpp/codecs/static/tools/common + library/cpp/digest/md5 + library/cpp/getopt/small +) + +END() diff --git a/library/cpp/codecs/static/tools/tests/canondata/result.json b/library/cpp/codecs/static/tools/tests/canondata/result.json new file mode 100644 index 0000000000..7a637c6763 --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/canondata/result.json @@ -0,0 +1,6 @@ +{ + "static_codec_tools.test_static_codec_tools": { + "checksum": "960e3c8c57fb846ab53ccbd07e287233", + "uri": "sbr://144512644/static_codec_tools.test_static_codec_tools/solar-8k-a.huffman.1467494385.codec_info" + } +}
\ No newline at end of file diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py new file mode 100644 index 0000000000..db4140e370 --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +import yatest.common as tt +import os.path as op + +def test_static_codec_tools(): + tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")] + + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1", + "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"], + timeout=60) + assert(op.exists("solar-8k-a.huffman.1467494385.codec_info")) + tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"), + args=["-c", "solar-8k-a.huffman.1467494385.codec_info"], + timeout=60) + tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")] + + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"], + timeout=60) + return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info") diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make new file mode 100644 index 0000000000..c5324eaf53 --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/ya.make @@ -0,0 +1,20 @@ +PY2TEST() + +OWNER(velavokr) + +TEST_SRCS(static_codec_tools.py) + +DATA(sbr://143310406) + +TIMEOUT(4200) + +TAG(ya:not_autocheck) + +DEPENDS( + library/cpp/codecs/static/tools/static_codec_checker + library/cpp/codecs/static/tools/static_codec_generator +) + + + +END() diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make new file mode 100644 index 0000000000..dd3e8437aa --- /dev/null +++ b/library/cpp/codecs/static/tools/ya.make @@ -0,0 +1,5 @@ +RECURSE( + common + static_codec_generator + static_codec_checker +) diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp new file mode 100644 index 0000000000..b47c279ed1 --- /dev/null +++ b/library/cpp/codecs/static/ut/builder_ut.cpp @@ -0,0 +1,57 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/codecs/static/builder.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <util/string/vector.h> + +class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase { + UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest) + UNIT_TEST(TestBuild) + UNIT_TEST_SUITE_END(); + +private: + TVector<TString> PrepareData() { + TVector<TString> data; + for (ui32 i = 'a'; i <= 'z'; ++i) { + data.push_back(TString(1, (char)i)); + } + return data; + } + + void TestBuild() { + TVector<TString> data; + NCodecs::TCodecBuildInfo info; + info.CodecName = "huffman"; + info.SampleSizeMultiplier = 2; + info.Timestamp = 1467494385; + info.RevisionInfo = "r2385905"; + info.TrainingSetComment = "some dummy data"; + info.TrainingSetResId = "sbr://1234"; + auto res = NCodecs::BuildStaticCodec(PrepareData(), info); + UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(), + "StoredCodec: \"\\007\\000huffman@S\\000a" + "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o" + "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>" + "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8." + "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7" + "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0" + "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9" + "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" " + "DebugInfo { " + "CodecName: \"huffman\" " + "Timestamp: 1467494385 " + "RevisionInfo: \"r2385905\" " + "SampleSizeMultiplier: 2 " + "TrainingSetComment: \"some dummy data\" " + "TrainingSetResId: \"sbr://1234\" " + "StoredCodecHash: 2509195835471488613 " + "}"); + + UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info"); + UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL); + + auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res)); + UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString()); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest); diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp new file mode 100644 index 0000000000..57e1e62887 --- /dev/null +++ b/library/cpp/codecs/static/ut/static_ut.cpp @@ -0,0 +1,27 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/codecs/static/example/example.h> + +class TStaticCodecUsageTest: public NUnitTest::TTestBase { + UNIT_TEST_SUITE(TStaticCodecUsageTest) + UNIT_TEST(TestUsage) + UNIT_TEST_SUITE_END(); + +private: + void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) { + const TStringBuf letov = "Всё идёт по плану"; + + TBuffer outEnc, outDec; + NStaticCodecExample::Encode(outEnc, letov, dv); + NStaticCodecExample::Decode(outDec, TStringBuf{outEnc.data(), outEnc.size()}); + + UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize); + UNIT_ASSERT_EQUAL(TStringBuf(outDec.data(), outDec.size()), letov); + } + + void TestUsage() { + DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u); + DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest) diff --git a/library/cpp/codecs/static/ut/ya.make b/library/cpp/codecs/static/ut/ya.make new file mode 100644 index 0000000000..b9116097d8 --- /dev/null +++ b/library/cpp/codecs/static/ut/ya.make @@ -0,0 +1,14 @@ +UNITTEST_FOR(library/cpp/codecs/static) + +OWNER(velavokr) + +SRCS( + builder_ut.cpp + static_ut.cpp +) + +PEERDIR( + library/cpp/codecs/static/example +) + +END() diff --git a/library/cpp/codecs/static/ya.make b/library/cpp/codecs/static/ya.make new file mode 100644 index 0000000000..00e00fd8d4 --- /dev/null +++ b/library/cpp/codecs/static/ya.make @@ -0,0 +1,18 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( + builder.cpp + static_codec_info.proto + static.cpp +) + +PEERDIR( + library/cpp/codecs + library/cpp/archive + library/cpp/svnversion + util/draft +) + +END() diff --git a/library/cpp/codecs/tls_cache.cpp b/library/cpp/codecs/tls_cache.cpp new file mode 100644 index 0000000000..0a1b32bda1 --- /dev/null +++ b/library/cpp/codecs/tls_cache.cpp @@ -0,0 +1,4 @@ +#include "tls_cache.h" + +namespace NCodecs { +} diff --git a/library/cpp/codecs/tls_cache.h b/library/cpp/codecs/tls_cache.h new file mode 100644 index 0000000000..0184e4bb6c --- /dev/null +++ b/library/cpp/codecs/tls_cache.h @@ -0,0 +1,100 @@ +#pragma once + +#include <util/generic/buffer.h> +#include <util/generic/deque.h> +#include <util/generic/noncopyable.h> +#include <util/generic/strbuf.h> +#include <util/system/tls.h> +#include <util/thread/singleton.h> + +namespace NCodecs { + template <class TItem> + struct TClear { + void operator()(TItem& item) const { + item.Clear(); + } + }; + + template <class TItem, class TCleaner = TClear<TItem>> + class TTlsCache { + using TSelf = TTlsCache<TItem, TCleaner>; + + struct TItemHolder: public TIntrusiveListItem<TItemHolder> { + TItemHolder(TSelf& factory) + : Factory(factory) + { + } + + void Release() { + Factory.Release(*this); + } + + TSelf& Factory; + TItem Item; + }; + + class TItemGuard { + public: + explicit TItemGuard(TSelf& fact) + : Holder(fact.Acquire()) + { + } + + TItemGuard(TItemGuard&& other) noexcept { + *this = std::move(other); + } + + TItemGuard& operator=(TItemGuard&& other) noexcept { + if (&other != this) { + std::swap(Holder, other.Holder); + } + return *this; + } + + ~TItemGuard() { + if (Holder) { + Holder->Release(); + } + } + + TItem& Get() & { + Y_ASSERT(Holder); + return Holder->Item; + } + + TItem& Get() && = delete; + + private: + TItemHolder* Holder = nullptr; + }; + + public: + TItemGuard Item() { + return TItemGuard(*this); + } + + static TSelf& TlsInstance() { + return *FastTlsSingleton<TSelf>(); + } + + private: + TItemHolder* Acquire() { + if (Free.Empty()) { + return new TItemHolder(*this); + } else { + return Free.PopBack(); + } + } + + void Release(TItemHolder& item) { + Cleaner(item.Item); + Free.PushBack(&item); + } + + private: + TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free; + TCleaner Cleaner; + }; + + using TBufferTlsCache = TTlsCache<TBuffer>; +} diff --git a/library/cpp/codecs/ut/codecs_ut.cpp b/library/cpp/codecs/ut/codecs_ut.cpp new file mode 100644 index 0000000000..caf6089aef --- /dev/null +++ b/library/cpp/codecs/ut/codecs_ut.cpp @@ -0,0 +1,1360 @@ +#include <library/cpp/codecs/delta_codec.h> +#include <library/cpp/codecs/huffman_codec.h> +#include <library/cpp/codecs/pfor_codec.h> +#include <library/cpp/codecs/solar_codec.h> +#include <library/cpp/codecs/zstd_dict_codec.h> +#include <library/cpp/codecs/comptable_codec.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/buffer.h> +#include <util/string/util.h> +#include <util/string/hex.h> +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> + +namespace { + const char* TextValues[] = { + "! сентября газета", + "!(возмездие это)!", + "!(материнский капитал)", + "!(пермь березники)", + "!биография | !жизнь / + розинг | зворыгин & изобретение | телевидение | электронно лучевая трубка", + "!овсиенко николай павлович", + "!путин", + "\"i'm on you\" p. diddy тимати клип", + "\"билайн\" представит собственный планшет", + "\"в особо крупном размере\"", + "\"викиликс\" джулиан ассанж", + "\"вимм билль данн", + "\"газэнергосеть астрахань", + "\"газэнергосеть астрахань\"", + "\"домодедово\" ту-154", + "\"жилина\" \"спартак\" видео", + "\"зелёнsq шершнm\"", + "\"зелёного шершня\"", + "\"золотой граммофон\" марины яблоковой", + "\"золотой граммофон-2010\"", + "\"калинниковы\"", + "\"манчестер юнайтед\" (англия) \"валенсия\" (испания) 1:1 (0:1)", + "\"маркер\"", + "\"моника\" засыпает москву снегом", + "\"моника\" снегопад", + "\"о безопасности\",", + "\"памятку\" для пассажиров воздушных международных рейсов", + "\"петровский парк\" и \"ходынское поле\"", + "\"путинская\" трава", + "\"пятерочка\"купила \"копейку\"", + "\"пятёрочка\" и \"копейка\" объединились", + "\"реал\" \"осер\" 4:0", + "\"речь мутко\"", + "\"российский лес 2010\"", + "\"ростехинвентаризация федеральное бти\" рубцов", + "\"саня останется с нами\",", + "\"следопыт\" реалити шоу", + "\"слышишь\" молодые авторы", + "\"стадион\"", + "\"ходынское поле\" метро", + "\"хроники нарнии\"", + "\"чистая вода\"", + "\"школа деда мороза\"", + "# asus -1394", + "# сторонники wikileaks", + "#106#", + "#11", + "#8 какой цвет", + "#если клиент", + "$ 13,79", + "$ xnj ,s dct ,skb ljdjkmys !!!", + "$ в день", + "$ диск компьютера", + "$.ajax", + "$125 000", + "$курс", + "% в си", + "% влады", + "% годовых", + "% женщин и % мужчин в россии", + "% занятости персонала", + "% инфляции 2010", + "% инфляции в 2010 г.", + "% налога", + "% налогов в 2010г.", + "% общего количества", + "% от числа", + "% по налогу на прибыль организации", + "%24", + "%академия%", + "%комарова%татьяна", + "& в 1с", + "&& (+не существует | !такой проблемы)", + ">>>скачать | download c cs strikez.clan.su<<<", + ">hbq nbityrjd", + "< какой знак", + "< лицей | < техническая школа# < история#< лицей сегодня#< перечень профессий#< руководство лицея#< прием учащихся#< контакты#< схема проезда#< фотогалереяистория создания лицея и основные этапы путиулица купчинская дом 28", + "<<link>>", + "</storage>", + "<bfnkjy", + "<bktntd", + "<cr", + "<ddr3>", + "<e[ufknthcrbq abyfycjdsq", + "<fcctqys", + "<fhcf", + "<fhctkjyf he,by", + "<firbhbz", + "<fyr djphj;ltybt", + "<fyr vjcrds", + "<fyr резерв", + "<fyufkjh", + "<index>", + "<jkmifz jrhe;yfz rbtd", + "<kbpytws", + "<megafon> интернет", + "<thtpybrb gthvcrbq rhfq", + "<tkjxrf", + "<беларусь это мы", + "<бокс, версия ibf", + "designer tree svc", + "seriesg810", + "doll makers", + "rotten.com", + "evening gowns", + "discover", + "south carolina escorts", + "forkliftjobsinhousron", + "mailbox", + "alexis", + "espn.com mlb", + "gypsy.chat.2k", + "the man in the mirror", + "azteca", + "sebastian telfair - jamel thomas", + "kirby", + "java", + "trike motorcycles", + "piasecki helicopter", + "wicca binding spells", + "pier park panama city beach .com", + "continente europeo", + "asswatchers.com", + "asswatchers.com", + "easton stealth stiff flex cnt adult baseball bat - 3", + "facesofdeath", + "video of 9 11", + "profileedit.myspace.com", + "georgia snakes", + "yahoo.com", + "google", + "http wwwclassicindustries .corvettes-roadsters.com", + "arington training stable", + "find bred of dog", + "southpark contact tables for myspace", + "symptoms of laryngitis", + "suzuki stickers", + "avianca", + "radio shack", + "dominican republic pictures", + "recent", + "mapquest", + "http myspace .com", + "research chemicals supplies", + "winn dixie.com", + "drivers 20guide.com", + "dylan whitley north carolina", + "google com", + "order wild horses cigarettes", + "yahoocom", + "fl runners", + "aol companion install", + "nbc.comdond 59595 6", + "directv.com", + "motorsports insurance", + "cartoonnetwork", + "pop warner-victorville", + "black iorn spars", + "goog", + "the suns", + "ebay", + "pop warner", + "philadelphia cream cheese", + "oklahoma", + "doudleday books.com", + "javascript download", + "city of nacogdoches", + "sfyl", + "myspace.com", + "baptism pictures", + "games", + "depredadores sexuales", + "mycl.cravelyrics.com", + "become a bone marrow donner", + "vintage copies", + "ford dealership", + "candystand", + "smarthairypussyom", + "yahoo.com", + "vanderbilt.edu", + "ebay", + "grouper", + "mys", + "myrsa and birth defects", + "hatteras rentals", + "female escorts", + "ja rule", + "meat bluesheet", + "yahoo", + "american disability act court cases", + "clearview cinemas", + "hard69.com", + "make a living will for free", + "fat asses", + "flashback concert in atlanta ga", + "fucking", + "flat abdomen exercises", + "big brother facial", + "german dictionary", + "black dick", + "ebonymovies", + "airsoft rifles", + "best fishing days calander", + "tattoo", + "impressions", + "cs.com", + "northwest airlines reservations", + "halo 3", + "wallbaums", + "chat room listings", + "waterbury ct warrants", + "pictures of chad michael murry", + "yahoo", + "install wallpaper", + "halo 3", + "clits and tits", + "prothsmouth general circuit courts", + "old hawthorne columbia", + "jess lee photos", + "no deposit casino bonus", + "bbc gladiator dressed to kill", + "anemagazine.com", + "lyrics unfaithful", + "gold bars found", + "art.comhttp", + "free unlock key", + "man o war lost a race", + "blue cross and blue shield", + "phenergan", + "myspace.com", + "http www.constitutional court.com", + "monster trucks", + "the breeze fort myers fla.newspaper", + "davis origin name", + "upper deck.com", + "arizona", + "akira lane", + "ebaumsworld", + "union pacific jobs", + "google.cm", + "free bigt girls nudes", + "abcnews.com", + "tootse.com", + "az lyrics", + "freddy", + "georgia.com", + "johncombest.com", + "nelly", + "gussi mane", + "university of illinois", + "oregan valcano's", + "mythbusters", + "sailormoon hentai", + "international cub tractor", + "desert sky movie green valley az", + "evite", + "nelly nud epics", + "penndot.com", + "first banks", + "psp manual", + "google", + "jackieaudet hotmail.com", + "internet", + "shootinggames", + "shootinggames", + "montana western rendezvous of art", + "hello kitty layouts", + "yahoo", + "translation", + "glenn scott attorney", + "hallofshame", + "capitolone.com", + "recipe for popovers", + "pictures of demons", + "barnes and nobles.com", + "rbd", + "hart and hunnington tattoo shop", + "janepowellmovies.com", + "ged schools in the military", + "kelis", + "hvacagent", + "neat home organizer television show", + "2719 24-2-crime and courts", + "fsu", + "torpedo bomber games", + "love poems", + "polly pocket'toys", + "yweatherahoo.com", + "jungle gin", + "flemington new jersey real estate", + "milf hunter stories", + "budget.com", + "chopperstyle", + "keno player", + "up skirt", + "dogs", + "beerballers", + "phat white butt", + "phat white butt", + "va licensing for interpeters for the deaf", + "white page phone book maiden north carolina", + "controlled 20solutions 20corp.com", + "friedman jewelery", + "kelis", + "curtains", + "curtains", + "fuck me harder", + "naked girls", + "southwest airlines boarding pass", + "mailbox", + "1976 mavrick", + "adult diapers", + "horse nasal discharge", + "charles ludlam", + "google", + "himnos en espanol", + "quarter horses for sale in nebraska", + "cosmo", + "hi", + "mattel", + "aouto 20trader.com", + "sunsetter awnings", + "bl.cfm", + "at", + "tattoo designs", + "bubs", + "yahoo", + "free live gay cam chats", + "antibiotics", + "upgrade", + "aessuccess.org", + "yahoo", + "boobdex", + "the jackle", + "plus size lingerie magazines for home", + "lehigh valley little league", + "ancient trade coins", + "pillsbury", + "colorado springs", + "canada aviation jobs", + "free guitar tablature", + "kids aol", + "capitol community colage", + "kevin thomas bermuda", + "missouri lotto", + "homedepotsportscomplex.com", + "dr. franklin schneier", + "williamsburg va. hotels", + "aim", + "morningbuzz", + "probusines.com", + "wwwalbasoul.com", + "w.runehints.com", + "yahoo.com", + "yahoo.com", + "yahoo.com", + "fantasy 5", + "xxx rape", + "hawaiian gift baskets", + "madonna.com", + "myspace contact tables", + "white cock", + "safe space", + "drinks", + "o rly", + "dsl", + "wwww.uncc.edu", + "wwww.uncc.edu", + "wwww.uncc.edu", + "online overseas checkt.westernunion.com", + "angina", + "heba technologies", + "hebrew ancient coins", + "games", + "recent", + "international male.com", + "sex pics", + "paul wall layouts for myspace", + "health", + "wire lamp shade frames", + "windows", + "top business colleges", + "mary jo eustace", + "attored", + "oklahoma indian legal services", + "6arab", + "santo nino", + "10.1.0.199", + "http www.myspace.com daffydonn07", + "marine electrical", + "sandy creek cabins weekend new york", + "onionbutts", + "tucson classifieds", + "new york times", + "recently deleted screen names", + "goldeneagle.net", + "fta support forums", + "low protein in bloos", + "datring", + "lilwayne", + "free billiards games", + "yahoo", + "ako", + "a.tribalfusion.c script language", + "dustin davis", + "cooking", + "yahoo.com", + "universal studios", + "adult chat", + "santa monica flea market", + "carpevino.us", + "wine vinyard in stewertstown pa", + "y", + "craigslist", + "ups.com", + "1-866-347-3292", + "renegade boats", + "renegade boats", + "sunset state beach caping", + "artofstanlync.org", + "heart-i want make love to you video", + "triangles around the world", + "mycl.cravelyrics.com", + "in the bible what type of persons were forced to walk around in public and say unclean unclean", + "providence water fire", + "googlecom", + "yahoo.com", + "b.g", + "website de rebelde", + "stoplinks", + "allison 2000 transmission", + "thepriceanduseofgasoline.com", + "chamillinaire", + "veryspecialhomescom", + "crashbandicoot", + "a short sex story", + "yahoo.com", + "music now", + "east carolina university", + "vandalism in new york", + "the bainde soleil company", + "dicaprio movies", + "xxx dvds", + "visual basic scripting support", + "english bulldogs", + "travelocity.com", + "website for asstr.org", + "hypnotic slave training", + "pogo", + "university at buffalo addmissions", + "screen name services", + "superdrol", + "art institute", + "online business cards", + "aolfinancial", + "upgrade shop", + "anderson abrasive", + "weatherchannel.com", + "recent", + "ebay", + "diagram and xray of a normal shouldercheck out surgicalpoker.comfor more sports medicine and orthopedic information and images check out emedx.com by dr. allan mishranormal diagram normal x-ray", + "95 mustang gt chips", + "gold grills", + "hap housing in portland or", + "car sales", + "swimming with dolphins", + "jennifer lopez nude", + "wwwdubcnn.com", + "dominicks pizza", + "fl studio", + "http blackplanet .com", + "http blackplanet .com", + "http blackplanet .com", + "A$AP Rocky", + "benie mac", + "fujifilm.com", + "aol dialup setup", + "metal fabrication tools", + "internet", + "buy my painting", + "pulaski va classifieds", + "w.coj.net", + "postopia.com", + "no medical records hydrocodone", + "auto completes for deal or no deal contest", + "http www. big monster dicks .com", + "invacare wheelchairs", + "musicdownload.com", + "president bush", + "heavy equipment", + "inmate information", + "allina.com", + "megan law.gov", + "wwwl.eharmony.com", + "jobs in colombiaoqx0nq", + "beastsex", + "ferguisson", + "heart-i wanna make love to you vedio", + "west georgia university", + "west georgia university", + "hsn", + "bb&t", + "midas realty", + "yahoo", + "mytrip.com", + "donna texas mcdonalds", + "free picture of our lady", + "bubs", + "taken chemo for 5 month's cancer can still be seen on ct scan", + "porn 20video 20clips", + "lake monsters", + "freedj mix vibes", + "myspace.coim", + "la joya school district tx", + "colorado bungee jumping", + "yahoo", + "google.com", + "lafayette co vampire grave", + "ice cube", + "internet", + "tccd.edu", + "google", + "people", + "instructions on putting together a filing cabinet", + "click.babycenter.com", + "90minut", + "ramien noodles", + "lilwayne", + "danni virgin", + "nice sexy girls.com", + "guttural pouch", + "free male masturbating", + "good", + "rotton 20dot.com", + "fox sports", + "seth rogen", + "desb.mspaceads.com", + "betjc.com", + "pictures of quebec", + "gold in quartz", + "evergreen college", + "runescape", + "gastons white river resort", + "sunset beach santa cruz", + "auto parts", + "travelocity", + "myspace.com", + "laptops", + "beyaonce and j", + "free gay ebony knights webcams", + "google", + "derek watson", + "alice in wonderland tshirts", + "hippa p rivacy act", + "down payment mortgage", + "believe it or not", + "mys", + "datatreca", + "onesuite", + "names", + "lil john", + "scales of justice cuff links", + "localsales.com", + "alametris denise lipsey", + "adam for adam", + "flip flops crochet", + "arbors", + "heb hospital", + "myspae.com", + "midevil breast torture", + "askjeeves", + "assparade", + ".comhttp", + "weekly hotels reston virginia", + "noiceinparadise.com", + "pre diabetic diet", + "h.i.m.com", + "myspace", + "myspace", + "wwww.sex.lp.cpm", + "mcso mugshots", + "roush", + "wellfargo", + "lilwayne", + "hopecherie", + "frontgate.com", + "barbados registration department", + "american pitbull", + "free pc full flight simulation game downloads", + "google", + "vaginal secretion grey stuff", + "myspace layouts", + "kanye west", + "walmart", + "pain in hip and leg", + "tenneesseeaquarium.com", + "suncom.com", + "alysseandrachelwerehere", + "pimiclo", + "starmagazine.com", + "classifieds", + "mount rushmore in dakota", + "sams", + "disney com", + "beastyality", + "chief joseph paintings", + "henry scott", + "paris hilton", + "kb903235", + "autotrader", + "irish traveller", + "ajcobs.com", + "art of stanlync.org", + "fox news", + "freeporn", + "depo provera", + "air france", + "talk city active chats", + "codes for the gamecube game resident evil 4", + "good food to eat for sugar diabetes", + "warpmymind", + "arc jacksonville fl", + "7fwww.sendspace.com", + "j blackfoot", + "mcso madison street jail inmate", + "macys", + "eduscapes", + "free picture of our lady", + "http www.eastman.org", + "minneapolisstartribune localnews", + "minneapolisstartribune localnews", + "tennessee", + "foodtown", + "anti virous download", + "http www.mdland rec.net", + "ed edd eddy", + "maryjbilge", + "shipping services", + "baseball videogames", + "egyption ancient coins", + "internet", + "what is sodomy", + "international cub lowboy", + "mary j. bilge", + "scenic backgrounds", + "google.com", + "rosettalangueges.com", + "titanpoker.net", + "titie show", + "edelen realtor", + "lil cim", + "china.com", + "boost mobile", + "nc eipa", + "people's 20pharmacy 20guide 20to", + "costco", + "charles schultz drawings", + "nicisterling", + "a picture of author stephen crane", + "yahoo.com", + "sponge bob myspace layouts", + "g", + "calendar creator", + "careerbuilder.com", + "cool tex for web pages", + "yahoo.com", + "mcdougal littel", + "sign on", + "superman", + "radio", + "lajollaindians.com", + "mike tyson died", + "pink panther", + "lolita newgroups", + "nude girls", + "galveston 20texas", + "gerlach meat co.", + "thetakeover2006.com", + "yahoo", + "simpsons movie", + "saxy", + "yahoo", + "21st century realty", + "new zealand", + "dogs", + "weather", + "free porn sex", + "bugs bunny parties", + "mortal kombat 2 fatalities", + "sea life park hawaii", + "songs for middle school choir", + "rocky mountain jeep", + "householdbank.com", + "birdville isd", + "brutal dildo", + "brutal dildo", + "free live gay cam chats", + "wonder woman", + "ebay com", + "myspace.com", + "boost mobile", + "desktop themes sex", + "myspace.com", + "myspace.com", + "maroon chevy auto dealership", + "beyonce", + "cleopatra vii", + "accountcentralonline.com", + "juvenile", + "the game cock", + "pics of ashland city tennessee", + "coherent deos", + "microwsoft wireless connection", + "best buy", + "southwest airlines", + "southwest airlines", + "pogo games", + "family court record room in brooklyn newyork", + "60.ufc.net", + "us mint", + "people", + "firstcitycreditunion", + "washington mutual careers", + "beyonce", + "tab energy drink", + "http vemmabuilder.com", + "new york state lottery", + "yahoo", + "tmobile", + "yellow pages.com", + "az.central.com", + "pasco auto salvage", + "im help", + "home based businesses", + "studyisland", + "bible study from king james on 1 corinthians chapter 6 verses 18- 20", + "bellevue-ne", + "msn.com", + "aolsignupfree", + "the simsons", + "nevada", + "forsyth central high school", + "road state college", + "does my child have adhd", + "les tucanesde tijuana", + "yahoo.com", + "mexican pharmacy hyrocodone", + "ford motor co year end sales", + "google.com", + "google.com", + "person.com", + "marylyn monroe", + "nfl", + "the hun.net", + "nkena anderson", + "free netscape download", + "top fifty colleges", + "wil.", + "memphis tennessee", + "yahoo mail", + "corrections officer of juveniles", + "jada pinkett smith", + "mapquest.com", + "apartments", + "msn.com", + "msn.com", + "wasco state prison", + "solitaire", + "http", + "freeport seaman center", + "futbol soccer", + "screen names", + "kmov.com", + "survey.otxresearch.com", + "facial shaves", + "gle", + "flw.com", + "seasportboats.com", + "toysrus.com", + "animated sexy graphics", + "colombia", + "unitarian univeralist association", + "fr", + "google video.com", + "660-342-1072", + "suzan-lori parks", + "male facial", + "william bouguereau first kiss how much it is worth", + "streetfighter", + "nick.com", + "wonder woman", + "pentagram", + "mcafee virus protection", + "diary", + "037f34742140a5f761ad51d95180b4f8", + "free porn", + "no deposit casino bonus", + "spongebob the movie myspace layouts", + "on line banking", + "equestrian properties for sale", + "kazaa free muisc download", + "gay truckers", + "24", + "pay-pal", + "www yahoo.com", + "phatazz.white hoes", + "planets of the universe", + "free movies", + "budget rentals special", + "yahoogames", + "talaat pasha", + "mariah carey song lyrics don't forget about us", + "futbol soccer", + "msn groups", + "martha steward", + "martha steward", + "soap opera scoops cbs", + "cingular", + "stuwie", + "womengiving blowjobs", + "hear dancing queen by abba", + "love song", + "fhsaa.org", + "any dvd", + "any dvd", + "gallery.brookeskye.com", + "gibson ranch", + "wachovia com", + "kzg golf information", + "skylight curtains", + "c", + "123freeweblayouts.com", + "yahoo.com", + "allie.com", + "ghosts of bingham cemetery", + "resume maker", + "resume maker", + "resume maker", + "lymphomatoid papulosis", + "sez.com", + }; +} + +class TCodecsTest: public TTestBase { + UNIT_TEST_SUITE(TCodecsTest); + UNIT_TEST(TestPipeline) + UNIT_TEST(TestDelta) + UNIT_TEST(TestHuffman) + UNIT_TEST(TestZStdDict) + UNIT_TEST(TestCompTable) + UNIT_TEST(TestHuffmanLearnByFreqs) + UNIT_TEST(TestSolar) + UNIT_TEST(TestPFor) + UNIT_TEST(TestRegistry) + + UNIT_TEST_SUITE_END(); + +private: + TString PrintError(TStringBuf learn, TStringBuf test, TStringBuf codec, ui32 i) { + TString s; + TStringOutput sout(s); + sout << codec << ": " << i << ", " + << "\n"; + sout << HexEncode(learn.data(), learn.size()); //NEscJ::EscapeJ<true>(learn, sout); + sout << " != \n"; + sout << HexEncode(test.data(), test.size()); //NEscJ::EscapeJ<true>(test, sout); + + if (s.Size() > 1536) { + TString res = s.substr(0, 512); + res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">..."); + res.append(s.substr(s.size() - 512)); + } + + return s; + } + + TStringBuf AsStrBuf(const TBuffer& b) { + return TStringBuf(b.data(), b.size()); + } + + template <typename TCodec, bool testsaveload> + void TestCodec(const TVector<TBuffer>& inlearn = TVector<TBuffer>(), const TVector<TBuffer>& in = TVector<TBuffer>(), NCodecs::TCodecPtr c = new TCodec) { + using namespace NCodecs; + + TBuffer buff; + + { + TVector<TBuffer> out; + + c->Learn(inlearn.begin(), inlearn.end()); + + if (testsaveload) { + { + TBufferOutput bout(buff); + ICodec::Store(&bout, c); + } + + { + TBufferInput bin(buff); + c = ICodec::Restore(&bin); + UNIT_ASSERT(c->AlreadyTrained()); + } + } + + { + size_t insz = 0; + size_t outsz = buff.Size(); + + for (ui32 i = 0; i < inlearn.size(); ++i) { + out.emplace_back(); + c->Encode(AsStrBuf(inlearn[i]), out[i]); + + insz += inlearn[i].Size(); + outsz += out[i].Size(); + } + + TBuffer vecl; + for (ui32 i = 0; i < out.size(); ++i) { + vecl.Clear(); + c->Decode(AsStrBuf(out[i]), vecl); + + UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl), + PrintError(TStringBuf(inlearn[i].data(), inlearn[i].size()), + TStringBuf(vecl.data(), vecl.size()), c->GetName(), i)); + } + } + } + + { + if (testsaveload) { + TBufferInput bin(buff); + c = ICodec::Restore(&bin); + } + + size_t insz = 0; + size_t outsz = buff.Size(); + + TBuffer out, in1; + for (ui32 i = 0; i < in.size(); ++i) { + out.Clear(); + in1.Clear(); + c->Encode(AsStrBuf(in[i]), out); + insz += in[i].Size(); + outsz += out.Size(); + c->Decode(AsStrBuf(out), in1); + UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1), + PrintError(TStringBuf(in[i].data(), in[i].size()), + TStringBuf(in1.data(), in1.size()), c->GetName(), i)); + } + } + } + + template <class T> + void AppendTo(TBuffer& b, T t) { + b.Append((char*)&t, sizeof(t)); + } + + void TestDelta() { + using namespace NCodecs; + TVector<TBuffer> d; + + // 1. common case + d.emplace_back(); + AppendTo(d.back(), 1ULL); + AppendTo(d.back(), 10ULL); + AppendTo(d.back(), 100ULL); + AppendTo(d.back(), 1000ULL); + AppendTo(d.back(), 10000ULL); + AppendTo(d.back(), 100000ULL); + + // 2. delta overflow + d.emplace_back(); + AppendTo(d.back(), 1ULL); + AppendTo(d.back(), 10ULL); + AppendTo(d.back(), 100ULL); + AppendTo(d.back(), 1000ULL); + AppendTo(d.back(), (ui64)-100LL); + AppendTo(d.back(), (ui64)-10ULL); + + // 3. bad sorting + d.emplace_back(); + AppendTo(d.back(), 1ULL); + AppendTo(d.back(), 10ULL); + AppendTo(d.back(), 1000ULL); + AppendTo(d.back(), 100ULL); + AppendTo(d.back(), 10000ULL); + AppendTo(d.back(), 100000ULL); + + // all bad + d.emplace_back(); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + + TestCodec<TDeltaCodec<ui64, true>, false>(d); + TestCodec<TDeltaCodec<ui64, false>, false>(d); + } + + void TestPFor() { + using namespace NCodecs; + { + TVector<TBuffer> d; + d.emplace_back(); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + d.emplace_back(); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 2LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 2LL); + d.emplace_back(); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 2LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 2LL); + d.emplace_back(); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -2LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -2LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -2LL); + + TestCodec<TPForCodec<ui64>, false>(d); + TestCodec<TPForCodec<ui64, true>, true>(d); + } + { + TVector<TBuffer> d; + d.emplace_back(); + AppendTo(d.back(), -1); + AppendTo(d.back(), -1); + AppendTo(d.back(), -1); + AppendTo(d.back(), -1); + d.emplace_back(); + AppendTo(d.back(), 0); + AppendTo(d.back(), 1); + AppendTo(d.back(), 2); + AppendTo(d.back(), 1); + AppendTo(d.back(), -1); + AppendTo(d.back(), 0); + AppendTo(d.back(), 1); + AppendTo(d.back(), 2); + d.emplace_back(); + AppendTo(d.back(), 0); + AppendTo(d.back(), -1); + AppendTo(d.back(), -2); + AppendTo(d.back(), -1); + AppendTo(d.back(), -2); + AppendTo(d.back(), -1); + AppendTo(d.back(), 0); + AppendTo(d.back(), -1); + AppendTo(d.back(), -2); + + TestCodec<TPForCodec<ui32>, false>(d); + TestCodec<TPForCodec<ui32, true>, false>(d); + } + { + TVector<TBuffer> d; + d.emplace_back(); + for (auto& textValue : TextValues) { + AppendTo(d.back(), (ui32)strlen(textValue)); + } + + TestCodec<TPForCodec<ui32>, false>(d); + TestCodec<TPForCodec<ui32, true>, false>(d); + } + { + TVector<TBuffer> d; + d.emplace_back(); + for (auto& textValue : TextValues) { + AppendTo(d.back(), (ui64)strlen(textValue)); + } + + TestCodec<TPForCodec<ui64>, false>(d); + TestCodec<TPForCodec<ui64, true>, false>(d); + } + } + + template <class TCodec> + void DoTestSimpleCodec() { + using namespace NCodecs; + { + TVector<TBuffer> learn; + + for (auto& textValue : TextValues) { + learn.emplace_back(textValue, strlen(textValue)); + } + + TestCodec<TCodec, true>(learn); + } + { + TestCodec<TCodec, true>(); + } + + { + TVector<TBuffer> learn; + learn.emplace_back(); + learn.back().Append('a'); + + TVector<TBuffer> test; + test.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TCodec, true>(learn, test); + } + + { + TVector<TBuffer> learn; + learn.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + for (ui32 j = 0; j < i; ++j) { + learn.back().Append((ui8)i); + } + } + + TVector<TBuffer> test; + test.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TCodec, true>(learn, test); + } + + { + TVector<TBuffer> learn; + learn.emplace_back(); + for (ui32 i = 0; i < 128; ++i) { + for (ui32 j = 0; j < i; ++j) { + learn.back().Append((ui8)i); + } + } + + TVector<TBuffer> test; + test.emplace_back(); + for (ui32 i = 128; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TCodec, true>(learn, test); + } + } + + void TestHuffman() { + DoTestSimpleCodec<NCodecs::THuffmanCodec>(); + } + + void TestZStdDict() { + using namespace NCodecs; + { + TVector<TBuffer> learn; + + for (auto& textValue : TextValues) { + learn.emplace_back(textValue, strlen(textValue)); + } + + TestCodec<TZStdDictCodec, true>(learn); + } + + } + + void TestCompTable() { + DoTestSimpleCodec<NCodecs::TCompTableCodec>(); + } + + void TestHuffmanLearnByFreqs() { + using namespace NCodecs; + + TVector<TBuffer> data; + + for (auto& textValue : TextValues) { + data.emplace_back(textValue, strlen(textValue)); + } + + TVector<TBuffer> outLearn; + + { + THuffmanCodec codec; + static_cast<ICodec&>(codec).Learn(data.begin(), data.end()); + + for (ui32 i = 0; i < data.size(); ++i) { + outLearn.emplace_back(); + codec.Encode(AsStrBuf(data[i]), outLearn[i]); + } + } + + TVector<TBuffer> outLearnByFreqs; + + { + THuffmanCodec codec; + std::pair<char, ui64> freqs[256]; + + for (size_t i = 0; i < Y_ARRAY_SIZE(freqs); ++i) { + freqs[i].first = (char)i; + freqs[i].second = 0; + } + + for (auto& textValue : TextValues) { + size_t len = strlen(textValue); + for (size_t j = 0; j < len; ++j) { + ++freqs[(ui32)(0xFF & textValue[j])].second; + } + } + + codec.LearnByFreqs(TArrayRef<std::pair<char, ui64>>(freqs, Y_ARRAY_SIZE(freqs))); + + for (ui32 i = 0; i < data.size(); ++i) { + outLearnByFreqs.emplace_back(); + codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]); + } + } + + UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size()); + const size_t sz = outLearn.size(); + for (size_t n = 0; n < sz; ++n) { + UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n])); + } + } + + void TestSolar() { + using namespace NCodecs; + { + TVector<TBuffer> learn; + + for (auto& textValue : TextValues) { + learn.emplace_back(textValue, strlen(textValue)); + } + + TestCodec<TSolarCodec, true>(learn, TVector<TBuffer>(), new TSolarCodec(512, 8)); + TestCodec<TAdaptiveSolarCodec, false>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); + TestCodec<TAdaptiveSolarCodec, true>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); + TestCodec<TSolarCodecShortInt, true>(learn, TVector<TBuffer>(), new TSolarCodecShortInt(512, 8)); + } + { + TestCodec<TSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodec(512, 8)); + TestCodec<TAdaptiveSolarCodec, false>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); + TestCodec<TAdaptiveSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); + TestCodec<TSolarCodecShortInt, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodecShortInt(512, 8)); + } + + { + TVector<TBuffer> learn; + learn.emplace_back(); + learn.back().Append('a'); + + TVector<TBuffer> test; + test.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); + TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8)); + TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8)); + TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8)); + } + + { + TVector<TBuffer> learn; + learn.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + for (ui32 j = 0; j < i; ++j) { + learn.back().Append((ui8)i); + } + } + + TVector<TBuffer> test; + test.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); + TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8)); + TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8)); + TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8)); + } + } + + void TestPipeline() { + using namespace NCodecs; + { + TVector<TBuffer> learn; + learn.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + for (i32 j = i; j >= 0; --j) { + learn.back().Append((ui8)j); + } + } + + TVector<TBuffer> test; + test.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TPipelineCodec, true>(learn, test, + new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec)); + } + { + TVector<TBuffer> d; + d.emplace_back(); + for (ui32 i = 0; i < 256; ++i) { + for (i32 j = i; j >= 0; --j) { + d.back().Append(i * i); + } + } + + TestCodec<TPipelineCodec, false>(d, TVector<TBuffer>(), + new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>)); + } + } + + void TestRegistry() { + using namespace NCodecs; + TVector<TString> vs = ICodec::GetCodecsList(); + for (const auto& v : vs) { + TCodecPtr p = ICodec::GetInstance(v); + if (v == "none") { + UNIT_ASSERT(!p); + continue; + } + UNIT_ASSERT_C(!!p, v); + UNIT_ASSERT_C(TStringBuf(v).Head(3) == TStringBuf(p->GetName()).Head(3), v + " " + p->GetName()); + } + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TCodecsTest) diff --git a/library/cpp/codecs/ut/float_huffman_ut.cpp b/library/cpp/codecs/ut/float_huffman_ut.cpp new file mode 100644 index 0000000000..3156fb1f46 --- /dev/null +++ b/library/cpp/codecs/ut/float_huffman_ut.cpp @@ -0,0 +1,237 @@ +#include <library/cpp/codecs/float_huffman.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/stream/format.h> +#include <util/stream/output.h> +#include <library/cpp/string_utils/base64/base64.h> + +namespace fh = NCodecs::NFloatHuff; + +Y_UNIT_TEST_SUITE(FloatHuffmanTest) { + static const float Factors[] = { + 0.340582, 0.000974026, 0.487168, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0.411765, 0.921569, + 0.00390625, 0.109371, 0, 1, 0, 0, 0, 0, 0.523322, 0, 1, 0, 0, 0, 0, 0.285714, 1, + 0.008253, 1, 0, 0, 0.00993935, 0.450213, 0.000974026, 1, 1, 1, 1, 0, 0, 0.20564, + 0.97561, 0.913896, 1, 1, 0, 1, 0, 0, 0.5, 0, 0, 0, 0.1, 1, 0, 0, 0, 0, 0, 0.450923, + 0, 0.5, 0, 0, 0.20564, 0, 0.5, 0, 0, 0.20564, 0, 0, 0.0313726, 0, 1, 1, 1, 0.363636, + 0.5, 0.686073, 0.45121, 0.00574382, 0.366166, 0.413295, 1, 1, 1, 0, 0, 0, 0, 0.160784, + 0, 0.937255, 0.537255, 0.133333, 0, 0, 0, 0, 0.00392157, 0, 0.333333, 0.027451, 0.0156863, + 1, 0.105882, 1, 0.00220908, 0.000112501, 0.0111262, 0.102384, 0.00140808, 0.123581, + 0.29308, 6.57282e-06, 0.00489498, 2.10209e-05, 0.00140559, 5.907e-06, 0, 0.559322, + 0.559322, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.794765, 0, + 0.352648, 0.225904, 1, 0.047619, 0.0107276, 0.399461, 0.0304838, 0.292932, 0.00969929, + 0, 0, 0.886904, 0.714693, 0, 0.00223213, 0.000544069, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0.00507403, 0, 0, 0, 0, 0, 0.875, 0, 0, 1, 1, 1, 0, 0.20564, 0, 0.00176048, 0, + 0.000440121, 0, 0, 0, 0.000974026, 0.487168, 0, 0, 0.533333, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 1, 0, 0, 0.723187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0.206882, 0.00483367, 0.792983, 0.00126106, 1, 0.0313726, 0.470588, + 0.254902, 0.188235, 0.188235, 0.388235, 0.164706, 0, 0.870588, 0.843137, 0.635294, + 0.384314, 0.384314, 0.643137, 0, 0, 0, 0, 0, 0, 0, 0, 0.541176, 0, 0.541176, 0, 0, + 0.0532634, 1, 0, 0, 0, 0.015044, 1, 0, 1, 1, 1, 0.47451, 0.329412, 0.964706, 0, 0, + 0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0.0941176, 0.970588, 0.970588, 0, 0.970588, 0.97561, + 0, 0.0431373, 0.47451, 0.329412, 0.964706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0.231373, 0.00392157, 0, 0, 0, 0.054902, 0, 0, + 1, 0, 0, 0.0235294, 0, 1, 0, 0, 0, 0, 0.34902, 0.0352941, 0.925379, 0.623681, 0, + 0.954543, 0, 0, 0.00102756, 0.709804, 0.498039, 0.0901961, 0.631373, 0.847059, 0.270588, + 0.0156863, 0.133333, 0.980392, 1e-12, 1e-12, 1e-12, 1e-12, 0.497159, 0, 0.407487, + 0, 0, 0, 0.00392157, 0.00202156, 0.046875, 0.187159, 0.046875, 0.15625, 0.434232, + 0.15625, 0, 2.95083e-07, 0.20564, 0.20564, 0.97561, 0.913896, 0, 0, 0, 0, 0, 0, 0.00784314, + 0, 0.695525, 1, 0.07205, 0, 0, 0.176471, 0, 0, 0, 1, 1, 0.98, 0.01, 0.01, 0, 0.00690702, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.29078, 0.29078, 1, 0, 0, 0, 0, 0.192157, 0.188235, + 0.0941176, 0, 0.0313726, 0, 0.141176, 0.207843, 0.0901961, 0.00784314, 0.0784314, + 0, 0, 0, 0, 0, 0.203922, 0.0196078, 0.34902, 0.0235294, 0.0980392, 0.164706, 0.133333, + 0.368627, 0, 0.0941176, 0, 1, 0.313726, 0, 0, 0.433582, 0.384508, 0.0532186, 0.0833333, + 0.01609, 0, 1, 0, 0, 0, 0.0666667, 0, 0, 0, 0, 1, 0, 0.564706, 0.501961, 0, 0, 0, + 0, 0, 0.0516447, 0.000173065, 0, 0, 0, 0, 0, 0, 0, 0.996309, 0, 0, 0.00392157, 1, + 0, 0.01, 0, 0, 0, 0, 0, 0.439505, 0.206882, 0.206882, 0.260891, 0, 0.875, 0, 0, 0, + 0, 0, 0.185657, 1, 1, 0, 0, 0, 0.0332647, 0.206106, 0.0688878, 0.239216, 0, 0, 0, + 0, 0.054902, 0, 0.101961, 0.160784, 0.180392, 0, 0.737828, 0, 0, 0.875, 0.0142566, + 0, 0.662745, 1, 0, 0, 0, 0.225806, 0.99992, 0.631373, 0.00392157, 1, 0, 0.143647, + 0.00270085, 1, 0.231482, 0.246735, 0.0428062, 0, 0, 1, 0, 0.186441, 0.0115358, 0, + 0.221762, 0, 0.2, 0, 0.0156863, 0, 0, 0, 0.976471, 0, 0.231373, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0.00392157, 0.00392157, 0.0666667, 0, 0, 0, 0, 0.0117647, 0.580392, 0.98737, + 1, 1, 1, 0, 0, 0, 0.153, 0.847, 0.931373, 0.94697, 0.94697, 0, 0.946294, 0.408118, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0.99992, 0.97561, 0, 0, 0, 0, 0, 0, + 0.274677, 0.153017, 0, 0.642356, 0, 0, 0.1, 0, 0, 0, 0, 0.327944, 0.327944, 0, 0, + 0.815686, 0, 0, 0, 0, 0.206106, 0.439126, 0, 0, 0, 0, 0, 1, 1, 1, 0.00392157, 0.232788, + 0.232465, 0.999899, 0.00309296, 0.0636097, 0.445954, 0.156863, 0, 0, 0, 0, 0, 0, + 0.3796, 0.0784, 0.0651664, 0, 0, 0.254902, 0.266667, 1, 0, 0, 0, 0, 0, 0.596073, + 0.517876, 0.145833, 0.372549, 0, 0.991667, 0.602125, 0.161979, 0, 0, 0, 0, 0.0255146, + 0.947855, 0, 0, 0, 0, 0, 0, 0, 0, 0.847059, 0.679841, 0, 0.156863, 0, 0, 1, 0, 0, + 0, 0, 0.969697, 0, 0, 0.564706, 0, 0, 0, 0, 0, 1, 0.0367282, 0.0395228, 0, 0, 0, + 0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0}; + static const size_t FactorCount = Y_ARRAY_SIZE(Factors); + + static const ui8 CodedFactors[] = { + 0x24, 0x06, 0x73, 0xB5, 0xC7, 0x55, 0x7F, 0x3A, 0xB4, 0x70, 0xCB, 0xEF, 0xEE, 0xFE, 0xB3, 0x5B, + 0x5A, 0x1A, 0x93, 0x5F, 0x5F, 0x13, 0x00, 0x00, 0x10, 0x00, 0x3D, 0xEF, 0xFF, 0xEE, 0x0F, 0xDC, + 0xF0, 0xAB, 0x3F, 0x37, 0x92, 0x24, 0x5D, 0x5E, 0xDE, 0x1C, 0xF8, 0x12, 0x15, 0x5B, 0x84, 0x51, + 0x82, 0xE6, 0xF6, 0xB8, 0xEA, 0x4F, 0xC7, 0xDD, 0x7D, 0x2E, 0x4D, 0x4A, 0x21, 0xCA, 0xE0, 0xC4, + 0x2E, 0xEA, 0xD3, 0xBD, 0x0F, 0x00, 0x00, 0xE0, 0xDA, 0xCC, 0xCC, 0xEC, 0x9F, 0x61, 0xDF, 0xE6, + 0x01, 0x00, 0x00, 0xCC, 0xA5, 0x49, 0xA9, 0x00, 0x00, 0x00, 0xE6, 0xD2, 0xA4, 0xD4, 0xEA, 0x08, + 0x08, 0xD0, 0xDD, 0xF9, 0xE7, 0xA2, 0x0B, 0x00, 0x00, 0x40, 0xD8, 0x13, 0x7D, 0xFE, 0x13, 0x9C, + 0x9B, 0xA8, 0x36, 0xBC, 0x00, 0x90, 0x43, 0x6F, 0x97, 0x67, 0x9B, 0xD3, 0xEE, 0xFE, 0x84, 0x24, + 0x25, 0x89, 0xC9, 0xBF, 0x3F, 0x58, 0x4C, 0x4C, 0xCA, 0x21, 0x22, 0xBC, 0x39, 0x08, 0x08, 0x08, + 0x40, 0x7E, 0xAA, 0xAA, 0xCA, 0x75, 0x70, 0x70, 0xE9, 0x08, 0x08, 0xE8, 0x9A, 0x8A, 0x8D, 0xED, + 0xA6, 0x8D, 0x31, 0x04, 0x00, 0x96, 0xD0, 0x7D, 0x1D, 0x47, 0xAA, 0x2A, 0xD9, 0x28, 0xAD, 0x6B, + 0xB4, 0x9D, 0x7A, 0xC4, 0xD5, 0xD1, 0x04, 0x8C, 0x7E, 0x56, 0x3A, 0x58, 0x5A, 0x0C, 0x46, 0x6E, + 0x1B, 0x53, 0xC2, 0x0C, 0x14, 0x00, 0xAB, 0x60, 0x05, 0x7B, 0x63, 0x8D, 0x77, 0x70, 0x75, 0xAC, + 0x2F, 0x8D, 0xB1, 0x4D, 0xA0, 0xFB, 0xF2, 0x40, 0xF7, 0xE5, 0x7F, 0xDF, 0xDD, 0xFD, 0xBB, 0x1B, + 0xB8, 0x75, 0x9B, 0x47, 0x8E, 0xB4, 0x0C, 0x9B, 0x3A, 0x73, 0x25, 0x61, 0x18, 0x92, 0xD1, 0xC2, + 0x2F, 0x3C, 0x31, 0x64, 0x96, 0x2A, 0xB9, 0xF9, 0x7C, 0xD9, 0xAF, 0x94, 0xC5, 0xE9, 0x1E, 0x63, + 0x24, 0x0C, 0x03, 0x7F, 0xD8, 0x5B, 0xB3, 0x1D, 0x49, 0x02, 0x00, 0xAB, 0xFD, 0xE9, 0xA0, 0xF3, + 0xBF, 0xC9, 0x40, 0x64, 0x0A, 0xC0, 0xC7, 0x00, 0x00, 0x60, 0x77, 0xCF, 0xA5, 0x49, 0xA9, 0x16, + 0xFD, 0xD7, 0x5C, 0xA7, 0x55, 0x00, 0x36, 0xCF, 0xB9, 0x3D, 0xAE, 0xFA, 0xD3, 0xA1, 0x85, 0x5B, + 0xFE, 0x60, 0x10, 0x11, 0xFF, 0xF7, 0x7D, 0x38, 0x59, 0x24, 0xFF, 0xFF, 0xDF, 0x13, 0x1C, 0x7B, + 0xCA, 0x1C, 0x1E, 0xF3, 0x04, 0xC0, 0x78, 0x07, 0x58, 0x7B, 0xA2, 0x54, 0xAA, 0xE3, 0xEA, 0x08, + 0x08, 0xC0, 0x74, 0x78, 0x78, 0x88, 0x50, 0x50, 0xD8, 0x0A, 0x0C, 0xC4, 0x56, 0x60, 0x20, 0xF6, + 0x1A, 0x1B, 0x33, 0x16, 0x15, 0xA5, 0xB8, 0xED, 0xED, 0x22, 0xF5, 0xF5, 0x09, 0xA1, 0xA2, 0x42, + 0x67, 0x62, 0x62, 0x3A, 0x13, 0x13, 0x0B, 0xA0, 0xA4, 0xF4, 0x0F, 0x06, 0x15, 0x35, 0x18, 0x54, + 0xD4, 0x35, 0x57, 0x45, 0xCB, 0x2F, 0x39, 0xF6, 0xEC, 0xBC, 0xBB, 0x53, 0x5F, 0x5E, 0x9E, 0xB1, + 0xA8, 0xA8, 0x28, 0xDF, 0xDE, 0x3E, 0x00, 0x00, 0x80, 0x5F, 0x75, 0x81, 0x81, 0x51, 0x1D, 0x1E, + 0xA2, 0x3A, 0x3C, 0x8C, 0xEA, 0xF0, 0x10, 0x51, 0x06, 0x67, 0xED, 0x85, 0x85, 0xA1, 0xBE, 0xBC, + 0x3C, 0x63, 0x51, 0x51, 0x51, 0xBE, 0xBD, 0xFD, 0xFF, 0xFD, 0xFE, 0xCE, 0x85, 0x76, 0x36, 0x73, + 0x10, 0x10, 0x10, 0x80, 0xEB, 0x3A, 0x38, 0xD8, 0xBE, 0xD4, 0x05, 0x06, 0xEE, 0x4F, 0x60, 0x59, + 0x59, 0x65, 0x84, 0x84, 0xC0, 0x46, 0xCB, 0x19, 0x7F, 0x4C, 0xFD, 0xC8, 0x9D, 0x8B, 0xB6, 0x31, + 0xAF, 0x86, 0x3A, 0xF0, 0x6D, 0x6D, 0x11, 0xDF, 0xDF, 0x5F, 0x79, 0x71, 0x71, 0x85, 0xD4, 0xD0, + 0x10, 0xB9, 0xB1, 0x11, 0x1A, 0x54, 0x54, 0xE9, 0x08, 0x08, 0x48, 0x39, 0x44, 0x04, 0x84, 0xAF, + 0xAF, 0x96, 0x99, 0x97, 0x71, 0xC5, 0x32, 0xF3, 0x32, 0xAE, 0x58, 0x66, 0x5E, 0xC6, 0x15, 0xCB, + 0xCC, 0xCB, 0xB8, 0x42, 0xD0, 0x45, 0xFF, 0x1C, 0x11, 0x85, 0xBE, 0x39, 0x08, 0x08, 0x08, 0x80, + 0x69, 0xC2, 0x47, 0x00, 0x80, 0x02, 0x00, 0x00, 0x91, 0xD3, 0xF4, 0x47, 0x01, 0x00, 0x80, 0x08, + 0x00, 0x00, 0x42, 0xD4, 0x29, 0x6F, 0x02, 0x00, 0x80, 0xB4, 0xE6, 0x6B, 0x9E, 0x34, 0x5C, 0x9A, + 0x94, 0xE2, 0xD2, 0xA4, 0x14, 0xA2, 0x0C, 0x4E, 0xEC, 0xA2, 0x3E, 0x7F, 0x39, 0x08, 0x08, 0x10, + 0x6E, 0x6F, 0x10, 0xD7, 0x79, 0xC7, 0xC9, 0x09, 0x4D, 0x4B, 0x73, 0x77, 0x84, 0x14, 0xAE, 0x52, + 0xE1, 0x7A, 0x44, 0x2A, 0x5C, 0x8F, 0x34, 0x93, 0xA8, 0xC4, 0x01, 0xF8, 0x3F, 0x3D, 0xC2, 0x29, + 0xE9, 0x11, 0x4E, 0xE9, 0x4F, 0x67, 0x62, 0x22, 0xB6, 0x02, 0x03, 0xA9, 0x2E, 0x30, 0x70, 0x75, + 0x04, 0x04, 0xC8, 0x38, 0x48, 0x08, 0x32, 0x53, 0x53, 0x29, 0x2F, 0x2E, 0xAE, 0x1C, 0x04, 0x04, + 0x50, 0x52, 0x50, 0xD0, 0x4F, 0x77, 0x68, 0x28, 0x99, 0x08, 0x0A, 0x4A, 0x60, 0x59, 0x59, 0xA9, + 0x0B, 0x0C, 0xAC, 0xC7, 0xC8, 0xC8, 0x8C, 0x45, 0x45, 0xA1, 0x1C, 0x22, 0x02, 0x5D, 0x79, 0x79, + 0xAB, 0x2E, 0x30, 0x70, 0xA7, 0x2C, 0x28, 0xE8, 0xB4, 0xF3, 0xEF, 0x26, 0x8F, 0x37, 0xB1, 0xFE, + 0xEE, 0x67, 0xA9, 0xA9, 0xAA, 0xAA, 0x6C, 0x79, 0x1E, 0xEC, 0xD7, 0x46, 0x44, 0xC4, 0xF7, 0xF8, + 0x24, 0x24, 0x00, 0x42, 0x40, 0xF8, 0x5A, 0x96, 0x38, 0x65, 0x91, 0xF1, 0x6A, 0x72, 0xFE, 0x68, + 0xC3, 0xE1, 0x37, 0x07, 0x01, 0x01, 0x01, 0xF0, 0x52, 0xE1, 0x7A, 0xE4, 0xB3, 0xD9, 0x20, 0x9C, + 0xE0, 0xD8, 0x53, 0x04, 0xC7, 0x9E, 0x82, 0x02, 0x27, 0x2B, 0x06, 0x00, 0x00, 0x9F, 0xDE, 0x1C, + 0x3E, 0xEE, 0xD7, 0x48, 0x20, 0x04, 0xD2, 0x35, 0x4C, 0x29, 0x43, 0x45, 0x23, 0x15, 0xEA, 0xE9, + 0x5E, 0xD7, 0xC1, 0xC1, 0xAA, 0x3B, 0x34, 0x34, 0x21, 0x49, 0x49, 0xE8, 0x8A, 0x8B, 0x13, 0x66, + 0x12, 0xE7, 0x31, 0x00, 0x00, 0x90, 0x84, 0x94, 0x69, 0x05, 0xD4, 0xD4, 0xF4, 0x13, 0x36, 0xE7, + 0x0C, 0x09, 0xEB, 0xBF, 0x90, 0x1A, 0x1A, 0xE6, 0x20, 0x20, 0x20, 0x00, 0x9E, 0x33, 0x18, 0x13, + 0xA6, 0x2F, 0x40, 0x0C, 0x00, 0x4E, 0xCF, 0x84, 0x36, 0x6A, 0xA0, 0xF2, 0xA9, 0x63, 0xD5, 0xCB, + 0x9E, 0x64, 0xEA, 0x3E, 0xF2, 0x14, 0xA0, 0x27, 0x29, 0x2B, 0xC6, 0xB2, 0x99, 0x99, 0xA9, 0x74, + 0x04, 0x04, 0x3C, 0x0A, 0xD0, 0xCF, 0x5C, 0x68, 0x67, 0xFB, 0xDF, 0x1C, 0x04, 0x04, 0x04, 0xC0, + 0x1C, 0x04, 0x04, 0x04, 0x40, 0x1B, 0x11, 0x11, 0x5F, 0xEA, 0x02, 0x03, 0xE1, 0x92, 0x94, 0x84, + 0x90, 0x88, 0xD9, 0xDD, 0x4F, 0x04, 0x56, 0x0E, 0xD1, 0x9F, 0x1A, 0x31, 0x3B, 0x37, 0x47, 0xA0, + 0x6C, 0x82, 0x40, 0xD9, 0x24, 0x9A, 0x02, 0x12, 0x62, 0xD3, 0x43, 0xFF, 0xBF, 0x8F, 0x84, 0xF5, + 0x1F, 0x51, 0x06, 0xE7, 0x0F, 0xDD, 0x89, 0x32, 0xFB, 0x60, 0x39, 0x0A, 0x71, 0x71, 0xB4, 0x36, + 0x33, 0x33, 0x3F, 0x8F, 0xD0, 0x4F, 0x79, 0x84, 0x7E, 0xBA, 0xC8, 0x0C, 0x0D, 0x4F, 0xBA, 0x86, + 0x29, 0x82, 0x54, 0x83, 0x7F, 0x77, 0x37, 0x07, 0x01, 0x01, 0x01, 0xA0, 0xFE, 0x97, 0x1B, 0x9D, + 0x16, 0xDC, 0x90, 0x58, 0xFE, 0x9B, 0x42, 0xB3, 0x4A, 0x00, 0x68, 0x73, 0x91, 0x20, 0x2B, 0xA8, + 0xC8, 0x29, 0x0B, 0x0A, 0xF2, 0xD3, 0x5D, 0x4B, 0x58, 0x5D, 0x20, 0x41, 0xD5, 0xBE, 0xAE, 0x70, + 0x88, 0x50, 0x50, 0x20, 0x4A, 0x44, 0xF4, 0x8F, 0xF7, 0x60, 0x22, 0x30, 0x9C, 0x24, 0xFE, 0x54, + 0x55, 0xD0, 0xD7, 0xD7, 0x37, 0x1A, 0xEF, 0x6E, 0xBC, 0x9B, 0x44, 0x39, 0xDD, 0x5D, 0xF2, 0xF2, + 0x7F, 0x20, 0x1A, 0x81, 0x9A, 0xCA, 0xBF, 0xC8, 0x8D, 0x8D, 0xC2, 0x83, 0x82, 0xA7, 0x2C, 0x28, + 0xC8, 0xFE, 0x08, 0xC2, 0x07, 0xC7, 0x27, 0x21, 0xE1, 0xBB, 0x3E, 0xC1, 0x59, 0x68, 0xAA, 0x78, + 0xC8, 0x57, 0x5D, 0x60, 0x20, 0xC6, 0x41, 0x42, 0xE8, 0x3A, 0x38, 0xD8, 0x9B, 0xFF, 0xFF, 0xFF, + 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + static const size_t CodedSize = Y_ARRAY_SIZE(CodedFactors); + static const TStringBuf CodedFactorsBuf(reinterpret_cast<const char*>(CodedFactors), CodedSize); + + void FillWithGarbage(float* factors, size_t count) { + void* data = static_cast<void*>(factors); + memset(data, 0xAA, sizeof(float) * count); + } + + // Helper for dumping compressed values + void PrintCompressed(const TVector<ui8>& codedFactors) { + for (size_t i = 0; i < codedFactors.size(); ++i) { + if (i % 0x10 == 0) + Cerr << Endl; + Cerr << Hex(codedFactors[i]) << ", "; + } + Cerr << Endl; + } + + // Helper for dumping decompressed values + void PrintDecompressed(const TVector<float>& factors) { + TStringStream result; + TStringStream line; + + for (size_t i = 0; i < factors.size(); ++i) { + line << factors[i] << ", "; + if (line.Str().size() > 80) { + result << line.Str() << Endl; + line.Clear(); + } + } + Cerr << result.Str() << Endl; + } + + Y_UNIT_TEST(TestCompress) { + const auto codedFactors = fh::Encode(Factors); + UNIT_ASSERT_VALUES_EQUAL(codedFactors.size(), CodedSize); + for (size_t i = 0; i < Min(codedFactors.size(), CodedSize); ++i) + UNIT_ASSERT_VALUES_EQUAL((ui8)codedFactors[i], CodedFactors[i]); + //PrintCompressed(codedFactors); + } + + Y_UNIT_TEST(TestSimpleDecompress) { + TVector<float> factors = fh::Decode(CodedFactorsBuf); + UNIT_ASSERT_VALUES_EQUAL(factors.size(), FactorCount); + for (size_t i = 0; i < Min(factors.size(), FactorCount); ++i) + UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[i]); + //PrintDecompressed(factors); + } + + Y_UNIT_TEST(TestDecompressInParts) { + float factors[FactorCount]; + FillWithGarbage(factors, FactorCount); + fh::TDecoder decoder(CodedFactorsBuf); + const size_t firstPack = 100; + // unpack first pack + UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors, firstPack}), firstPack); + // unpack all the rest + UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors + firstPack, FactorCount - firstPack}), FactorCount - firstPack); + + for (size_t i = 0; i < FactorCount; ++i) + UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[i]); + //PrintDecompressed(factors); + } + + Y_UNIT_TEST(TestSkip) { + float factors[FactorCount]; + FillWithGarbage(factors, FactorCount); + fh::TDecoder decoder(CodedFactorsBuf); + const size_t firstPack = 100; + // unpack first pack + UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors, firstPack}), firstPack); + // skip some factors + const size_t skipCount = 60; + UNIT_ASSERT_VALUES_EQUAL(decoder.Skip(skipCount / 2), skipCount / 2); + // unpack all, except some factors in the end + const auto toDecode = FactorCount - firstPack - skipCount; + UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors + firstPack, toDecode}), toDecode); + UNIT_ASSERT_VALUES_EQUAL(decoder.Skip(skipCount / 2), skipCount / 2); + for (size_t i = 0; i < FactorCount - skipCount; ++i) { + size_t correctedI = i < firstPack ? i : i + skipCount / 2; + UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[correctedI]); + } + //PrintDecompressed(factors); + } + + Y_UNIT_TEST(TestDecompressForgedData) { + // this coredumps without end-of-coded-stream check, see SEARCH-1156 for details + TString brokenBase64Encoded = + "NLjYltUWs5pqnd3d3f05Li4OAwCAEqrP6mv06jDt7PiAUVu7Y+PiMpuZmdzeM" + "ArqOLxS2q4FKCII52dktcVs7y0zL+OKgeO9SOzEkFj7uPfFqqoCAAAAAADAtZ" + "mZ2fdmICAgANQXhi1WVRUAAAAAAAAGjvcWq6oKAAAAAAAAA8d7qe4rV3Nxcd3" + "d4ZfQZrETm3B+OxxB8bbnTPM5+qtbQ92mJ3fHPGj+iH5+8tzcnJuamry1tWUw" + "MBD693f07+9+DQQEkIGAgIgPetzN5yEbAGxWpbCNxXK/0JGTKRz2KkIoR7aM"; + UNIT_ASSERT_EXCEPTION( + fh::Decode(Base64Decode(brokenBase64Encoded)), + yexception); + } + + Y_UNIT_TEST(TestDecompressEmpty) { + UNIT_ASSERT_EXCEPTION(fh::Decode({}), yexception); + } +}; diff --git a/library/cpp/codecs/ut/tls_cache_ut.cpp b/library/cpp/codecs/ut/tls_cache_ut.cpp new file mode 100644 index 0000000000..8101af761f --- /dev/null +++ b/library/cpp/codecs/ut/tls_cache_ut.cpp @@ -0,0 +1,36 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/codecs/tls_cache.h> + +Y_UNIT_TEST_SUITE(CodecsBufferFactoryTest){ + void AssignToBuffer(TBuffer & buf, TStringBuf val){ + buf.Assign(val.data(), val.size()); +} + +TStringBuf AsStringBuf(const TBuffer& b) { + return TStringBuf(b.Data(), b.Size()); +} + +Y_UNIT_TEST(TestAcquireReleaseReuse) { + NCodecs::TBufferTlsCache factory; + // acquiring the first buffer + auto buf1 = factory.Item(); + AssignToBuffer(buf1.Get(), "Buffer_01"); + { + // acquiring the second buffer + auto buf2 = factory.Item(); + AssignToBuffer(buf2.Get(), "Buffer_02"); + } + // the first buffer should stay intact + UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01"); + { + // reacquiring the last released buffer + // expecting it zero sized but having the same memory + auto buf2 = factory.Item(); + UNIT_ASSERT_VALUES_EQUAL(buf2.Get().Size(), 0u); + buf2.Get().Resize(TStringBuf("Buffer_02").Size()); + UNIT_ASSERT_EQUAL(AsStringBuf(buf2.Get()), "Buffer_02"); + } + // when the factory dies we should see no leaks +} +} +; diff --git a/library/cpp/codecs/ut/ya.make b/library/cpp/codecs/ut/ya.make new file mode 100644 index 0000000000..90841b05ef --- /dev/null +++ b/library/cpp/codecs/ut/ya.make @@ -0,0 +1,20 @@ +UNITTEST() + +OWNER( + g:base + velavokr +) + +PEERDIR( + library/cpp/string_utils/base64 + library/cpp/codecs + library/cpp/string_utils/relaxed_escaper +) + +SRCS( + tls_cache_ut.cpp + codecs_ut.cpp + float_huffman_ut.cpp +) + +END() diff --git a/library/cpp/codecs/ya.make b/library/cpp/codecs/ya.make new file mode 100644 index 0000000000..7e76fb0c9a --- /dev/null +++ b/library/cpp/codecs/ya.make @@ -0,0 +1,33 @@ +LIBRARY() + +OWNER( + g:base + velavokr +) + +SRCS( + tls_cache.cpp + codecs.cpp + codecs_registry.cpp + comptable_codec.cpp + delta_codec.cpp + float_huffman.cpp + huffman_codec.cpp + pfor_codec.cpp + solar_codec.cpp + zstd_dict_codec.cpp +) + +PEERDIR( + contrib/libs/zstd + library/cpp/bit_io + library/cpp/blockcodecs + library/cpp/codecs/greedy_dict + library/cpp/comptable + library/cpp/containers/comptrie + library/cpp/deprecated/accessors + library/cpp/packers + library/cpp/string_utils/relaxed_escaper +) + +END() diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp new file mode 100644 index 0000000000..c42a2879e6 --- /dev/null +++ b/library/cpp/codecs/zstd_dict_codec.cpp @@ -0,0 +1,281 @@ +#include "zstd_dict_codec.h" + +#include <library/cpp/packers/packers.h> + +#include <util/generic/ptr.h> +#include <util/generic/refcount.h> +#include <util/generic/noncopyable.h> +#include <util/string/builder.h> +#include <util/system/src_location.h> +#include <util/ysaveload.h> + +#define ZDICT_STATIC_LINKING_ONLY + +#include <contrib/libs/zstd/include/zdict.h> +#include <contrib/libs/zstd/include/zstd.h> +#include <contrib/libs/zstd/include/zstd_errors.h> + +// See IGNIETFERRO-320 for possible bugs + +namespace NCodecs { + class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> { + template <class T, size_t Deleter(T*)> + class TPtrHolder : TMoveOnly { + T* Ptr = nullptr; + + public: + TPtrHolder() = default; + + TPtrHolder(T* dict) + : Ptr(dict) + { + } + + T* Get() { + return Ptr; + } + + const T* Get() const { + return Ptr; + } + + void Reset(T* dict) { + Dispose(); + Ptr = dict; + } + + void Dispose() { + if (Ptr) { + Deleter(Ptr); + Ptr = nullptr; + } + } + + ~TPtrHolder() { + Dispose(); + } + }; + + using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>; + using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>; + using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>; + using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>; + + using TSizePacker = NPackers::TPacker<ui64>; + + public: + static const ui32 SampleSize = (1 << 22) * 5; + + explicit TImpl(ui32 comprLevel) + : CompressionLevel(comprLevel) + { + const size_t zeroSz = TSizePacker().MeasureLeaf(0); + Zero.Resize(zeroSz); + TSizePacker().PackLeaf(Zero.data(), 0, zeroSz); + } + + ui32 GetCompressionLevel() const { + return CompressionLevel; + } + + ui8 Encode(TStringBuf in, TBuffer& outbuf) const { + outbuf.Clear(); + + if (in.empty()) { + return 0; + } + + TSizePacker packer; + + const char* rawBeg = in.data(); + const size_t rawSz = in.size(); + + const size_t szSz = packer.MeasureLeaf(rawSz); + const size_t maxDatSz = ZSTD_compressBound(rawSz); + + outbuf.Resize(szSz + maxDatSz); + packer.PackLeaf(outbuf.data(), rawSz, szSz); + + TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)}; + const size_t resSz = CheckSize(ZSTD_compress_usingCDict( + ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()), + __LOCATION__); + + if (resSz < rawSz) { + outbuf.Resize(resSz + szSz); + } else { + outbuf.Resize(Zero.size() + rawSz); + memcpy(outbuf.data(), Zero.data(), Zero.size()); + memcpy(outbuf.data() + Zero.size(), rawBeg, rawSz); + } + return 0; + } + + void Decode(TStringBuf in, TBuffer& outbuf) const { + outbuf.Clear(); + + if (in.empty()) { + return; + } + + TSizePacker packer; + + const char* rawBeg = in.data(); + size_t rawSz = in.size(); + + const size_t szSz = packer.SkipLeaf(rawBeg); + ui64 datSz = 0; + packer.UnpackLeaf(rawBeg, datSz); + + rawBeg += szSz; + rawSz -= szSz; + + if (!datSz) { + outbuf.Resize(rawSz); + memcpy(outbuf.data(), rawBeg, rawSz); + } else { + // size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz); + // Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz); + outbuf.Resize(datSz); + TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)}; + CheckSize(ZSTD_decompress_usingDDict( + ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()), + __LOCATION__); + outbuf.Resize(datSz); + } + } + + bool Learn(ISequenceReader& in, bool throwOnError) { + TBuffer data; + TVector<size_t> lens; + + data.Reserve(2 * SampleSize); + TStringBuf r; + while (in.NextRegion(r)) { + if (!r) { + continue; + } + data.Append(r.data(), r.size()); + lens.push_back(r.size()); + } + + ZDICT_legacy_params_t params; + memset(¶ms, 0, sizeof(params)); + params.zParams.compressionLevel = 1; + params.zParams.notificationLevel = 1; + Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size())); + + if (!lens) { + Dict.Reset(); + } else { + size_t trainResult = ZDICT_trainFromBuffer_legacy( + Dict.data(), Dict.size(), data.Data(), const_cast<const size_t*>(&lens[0]), lens.size(), params); + if (ZSTD_isError(trainResult)) { + if (!throwOnError) { + return false; + } + CheckSize(trainResult, __LOCATION__); + } + Dict.Resize(trainResult); + Dict.ShrinkToFit(); + } + InitContexts(); + return true; + } + + void Save(IOutputStream* out) const { + ::Save(out, Dict); + } + + void Load(IInputStream* in) { + ::Load(in, Dict); + InitContexts(); + } + + void InitContexts() { + CDict.Reset(CheckPtr(ZSTD_createCDict(Dict.data(), Dict.size(), CompressionLevel), __LOCATION__)); + DDict.Reset(CheckPtr(ZSTD_createDDict(Dict.data(), Dict.size()), __LOCATION__)); + } + + static size_t CheckSize(size_t sz, TSourceLocation loc) { + if (ZSTD_isError(sz)) { + ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")"; + } + return sz; + } + + template <class T> + static T* CheckPtr(T* t, TSourceLocation loc) { + Y_ENSURE_EX(t, TCodecException() << loc << " " + << "unexpected nullptr"); + return t; + } + + private: + ui32 CompressionLevel = 1; + + TBuffer Zero; + TBuffer Dict; + + TCDict CDict; + TDDict DDict; + }; + + TZStdDictCodec::TZStdDictCodec(ui32 comprLevel) + : Impl(new TImpl(comprLevel)) + { + MyTraits.NeedsTraining = true; + MyTraits.SizeOnEncodeMultiplier = 2; + MyTraits.SizeOnDecodeMultiplier = 10; + MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar + } + + TZStdDictCodec::~TZStdDictCodec() { + } + + TString TZStdDictCodec::GetName() const { + return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel(); + } + + ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const { + return Impl->Encode(in, out); + } + + void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const { + Impl->Decode(in, out); + } + + void TZStdDictCodec::DoLearn(ISequenceReader& in) { + Impl = new TImpl(Impl->GetCompressionLevel()); + Impl->Learn(in, true/*throwOnError*/); + } + + bool TZStdDictCodec::DoTryToLearn(ISequenceReader& in) { + Impl = new TImpl(Impl->GetCompressionLevel()); + return Impl->Learn(in, false/*throwOnError*/); + } + + void TZStdDictCodec::Save(IOutputStream* out) const { + Impl->Save(out); + } + + void TZStdDictCodec::Load(IInputStream* in) { + Impl->Load(in); + } + + TVector<TString> TZStdDictCodec::ListCompressionNames() { + TVector<TString> res; + for (int i = 1; i <= ZSTD_maxCLevel(); ++i) { + res.emplace_back(TStringBuilder() << MyName() << "-" << i); + } + return res; + } + + int TZStdDictCodec::ParseCompressionName(TStringBuf name) { + int c = 0; + TryFromString(name.After('-'), c); + Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name); + return c; + } + +} diff --git a/library/cpp/codecs/zstd_dict_codec.h b/library/cpp/codecs/zstd_dict_codec.h new file mode 100644 index 0000000000..59c1ad6c60 --- /dev/null +++ b/library/cpp/codecs/zstd_dict_codec.h @@ -0,0 +1,38 @@ +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> + +namespace NCodecs { + // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655 + + class TZStdDictCodec: public ICodec { + class TImpl; + TIntrusivePtr<TImpl> Impl; + + public: + explicit TZStdDictCodec(ui32 comprLevel = 1); + ~TZStdDictCodec() override; + + static TStringBuf MyName() { + return "zstd08d"; + } + + TString GetName() const override; + + ui8 Encode(TStringBuf in, TBuffer& out) const override; + + void Decode(TStringBuf in, TBuffer& out) const override; + + static TVector<TString> ListCompressionNames(); + static int ParseCompressionName(TStringBuf); + + protected: + void DoLearn(ISequenceReader& in) override; + bool DoTryToLearn(ISequenceReader& in) final; + void Save(IOutputStream* out) const override; + void Load(IInputStream* in) override; + }; + +} |