diff options
author | Ruslan Kovalev <ruslan.a.kovalev@gmail.com> | 2022-02-10 16:46:44 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:46:44 +0300 |
commit | 59e19371de37995fcb36beb16cd6ec030af960bc (patch) | |
tree | fa68e36093ebff8b805462e9e6d331fe9d348214 /library/cpp/codecs | |
parent | 89db6fe2fe2c32d2a832ddfeb04e8d078e301084 (diff) | |
download | ydb-59e19371de37995fcb36beb16cd6ec030af960bc.tar.gz |
Restoring authorship annotation for Ruslan Kovalev <ruslan.a.kovalev@gmail.com>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs')
58 files changed, 2702 insertions, 2702 deletions
diff --git a/library/cpp/codecs/README.md b/library/cpp/codecs/README.md index 42646ccd97..26fa96da59 100644 --- a/library/cpp/codecs/README.md +++ b/library/cpp/codecs/README.md @@ -1,8 +1,8 @@ This is a library of compression algorithms with a unified interface and serialization. See also library/cpp/codecs/static, where a support for statically compiled dictionaries is implemented. - + All algorithms have a common `ICodec` interface (described in codecs.h). - + The `ICodec` interface has the following methods:\ `virtual ui8 ICodec::Encode (TMemoryRegion, TBuffer&) const;`\ - Input - memory region. Output - filled buffer and the rest of the last byte, if it was not filled to the end.\ @@ -27,9 +27,9 @@ The `ICodec` interface has the following methods:\ For example, it allows you to save information about which combination of codecs was in use (see below).\ `virtual void Learn(ISequenceReader*);`\ - The interface for teaching codecs that use information about the distribution of data. - + In addition, the library has a number of utilities that allow a more flexible use of it. - + In the `ICodec` class the following methods are available:\ `static TCodecPtr GetInstance(const TString& name);`\ - Creation of a codec instance by a symbolic name\ diff --git a/library/cpp/codecs/codecs.cpp b/library/cpp/codecs/codecs.cpp index b17a3156d2..d2265dd9f9 100644 --- a/library/cpp/codecs/codecs.cpp +++ b/library/cpp/codecs/codecs.cpp @@ -1,69 +1,69 @@ -#include "codecs.h" -#include "tls_cache.h" - -#include <util/stream/mem.h> - -namespace NCodecs { +#include "codecs.h" +#include "tls_cache.h" + +#include <util/stream/mem.h> + +namespace NCodecs { void ICodec::Store(IOutputStream* out, TCodecPtr p) { if (!p.Get()) { ::Save(out, (ui16)0); return; } - + Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName()); const TString& n = p->GetName(); Y_VERIFY(n.size() <= Max<ui16>()); ::Save(out, (ui16)n.size()); out->Write(n.data(), n.size()); p->Save(out); - } - + } + TCodecPtr ICodec::Restore(IInputStream* in) { ui16 l = 0; ::Load(in, l); - + if (!l) { return nullptr; } - + TString n; n.resize(l); - + Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException()); - + TCodecPtr p = ICodec::GetInstance(n); p->Load(in); p->Trained = true; return p; } - + TCodecPtr ICodec::RestoreFromString(TStringBuf s) { TMemoryInput minp{s.data(), s.size()}; return Restore(&minp); } - + TString ICodec::GetNameSafe(TCodecPtr p) { return !p ? TString("none") : p->GetName(); } - + ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const { size_t res = Traits().ApproximateSizeOnEncode(in.size()); out.Reserve(res); out.Clear(); - + if (Pipeline.empty()) { out.Append(in.data(), in.size()); return 0; } else if (Pipeline.size() == 1) { return Pipeline.front()->Encode(in, out); } - + ui8 freelastbits = 0; - + auto buffer = TBufferTlsCache::TlsInstance().Item(); TBuffer& tmp = buffer.Get(); tmp.Reserve(res); - + for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) { if (it != Pipeline.begin()) { tmp.Clear(); @@ -72,15 +72,15 @@ namespace NCodecs { } freelastbits = (*it)->Encode(in, out); } - + return freelastbits; - } - + } + void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const { size_t res = Traits().ApproximateSizeOnDecode(in.size()); out.Reserve(res); out.Clear(); - + if (Pipeline.empty()) { out.Append(in.data(), in.size()); return; @@ -88,12 +88,12 @@ namespace NCodecs { Pipeline.front()->Decode(in, out); return; } - + auto buffer = TBufferTlsCache::TlsInstance().Item(); - + TBuffer& tmp = buffer.Get(); tmp.Reserve(res); - + for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) { if (it != Pipeline.rbegin()) { tmp.Clear(); @@ -101,40 +101,40 @@ namespace NCodecs { in = TStringBuf{tmp.data(), tmp.size()}; } (*it)->Decode(in, out); - } - } - + } + } + void TPipelineCodec::Save(IOutputStream* out) const { for (const auto& it : Pipeline) it->Save(out); - } - + } + void TPipelineCodec::Load(IInputStream* in) { for (const auto& it : Pipeline) { it->Load(in); it->SetTrained(true); } - } - + } + void TPipelineCodec::SetTrained(bool t) { for (const auto& it : Pipeline) { it->SetTrained(t); } - } - + } + TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) { if (!codec) return *this; - + TCodecTraits tr = codec->Traits(); - + if (!MyName) { MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput; MyTraits.SizeOfInputElement = tr.SizeOfInputElement; } else { MyName.append(':'); } - + MyName.append(codec->GetName()); MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping; MyTraits.PaddingBit = tr.PaddingBit; @@ -144,27 +144,27 @@ namespace NCodecs { MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier; MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier; MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize); - + Pipeline.push_back(codec); return *this; - } - + } + void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) { if (!Traits().NeedsTraining) { return; } - + if (Pipeline.size() == 1) { Pipeline.back()->Learn(in); return; } - + TVector<TBuffer> trainingInput; - + TStringBuf r; while (in.NextRegion(r)) { trainingInput.emplace_back(r.data(), r.size()); - } + } TBuffer buff; for (const auto& it : Pipeline) { @@ -176,8 +176,8 @@ namespace NCodecs { buff.Swap(bit); } } - } - + } + bool TPipelineCodec::AlreadyTrained() const { for (const auto& it : Pipeline) { if (!it->AlreadyTrained()) @@ -185,6 +185,6 @@ namespace NCodecs { } return true; - } - -} + } + +} diff --git a/library/cpp/codecs/codecs.h b/library/cpp/codecs/codecs.h index cc5e72b285..aa7c24b4c6 100644 --- a/library/cpp/codecs/codecs.h +++ b/library/cpp/codecs/codecs.h @@ -1,63 +1,63 @@ -#pragma once - -#include "sample.h" - -#include <util/generic/bt_exception.h> -#include <util/generic/hash.h> -#include <util/generic/ptr.h> -#include <util/generic/singleton.h> - -#include <util/stream/input.h> -#include <util/stream/output.h> - +#pragma once + +#include "sample.h" + +#include <util/generic/bt_exception.h> +#include <util/generic/hash.h> +#include <util/generic/ptr.h> +#include <util/generic/singleton.h> + +#include <util/stream/input.h> +#include <util/stream/output.h> + #include <util/string/cast.h> -#include <util/string/vector.h> -#include <util/system/tls.h> -#include <util/ysaveload.h> - -namespace NCodecs { +#include <util/string/vector.h> +#include <util/system/tls.h> +#include <util/ysaveload.h> + +namespace NCodecs { class TCodecException: public TWithBackTrace<yexception> {}; - + class ICodec; - + using TCodecPtr = TIntrusivePtr<ICodec>; using TCodecConstPtr = TIntrusiveConstPtr<ICodec>; - + struct TCodecTraits { ui32 RecommendedSampleSize = 0; ui16 SizeOfInputElement = 1; ui8 SizeOnEncodeMultiplier = 1; ui8 SizeOnEncodeAddition = 0; ui8 SizeOnDecodeMultiplier = 1; - + bool NeedsTraining = false; bool PreservesPrefixGrouping = false; bool Irreversible = false; bool PaddingBit = 0; bool AssumesStructuredInput = false; - + size_t ApproximateSizeOnEncode(size_t sz) const { return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition; } - + size_t ApproximateSizeOnDecode(size_t sz) const { return sz * SizeOnDecodeMultiplier; } }; - + class ICodec: public TAtomicRefCount<ICodec> { protected: bool Trained = false; TCodecTraits MyTraits; - + public: TCodecTraits Traits() const { return MyTraits; } - + // the name of the codec (or its variant) to be used in the codec registry virtual TString GetName() const = 0; - + virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0; virtual ui8 Encode(const TBuffer& input, TBuffer& output) const { return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output); @@ -66,16 +66,16 @@ namespace NCodecs { virtual void Decode(const TBuffer& input, TBuffer& output) const { Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output); } - + virtual ~ICodec() = default; - + virtual bool AlreadyTrained() const { return !Traits().NeedsTraining || Trained; } virtual void SetTrained(bool t) { Trained = t; } - + bool TryToLearn(ISequenceReader& r) { Trained = DoTryToLearn(r); return Trained; @@ -84,32 +84,32 @@ namespace NCodecs { void Learn(ISequenceReader& r) { LearnX(r, 1); } - + template <class TIter> void Learn(TIter beg, TIter end) { Learn(beg, end, IterToStringBuf<TIter>); } - + template <class TIter, class TGetter> void Learn(TIter beg, TIter end, TGetter getter) { auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter); TSimpleSequenceReader<TBuffer> reader{sample}; Learn(reader); } - + static TCodecPtr GetInstance(TStringBuf name); - + static TVector<TString> GetCodecsList(); - + static TString GetNameSafe(TCodecPtr p); - + static void Store(IOutputStream* out, TCodecPtr p); static TCodecPtr Restore(IInputStream* in); static TCodecPtr RestoreFromString(TStringBuf); - + protected: virtual void DoLearn(ISequenceReader&) = 0; - + virtual bool DoTryToLearn(ISequenceReader& r) { DoLearn(r); return true; @@ -119,20 +119,20 @@ namespace NCodecs { virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) { DoLearn(r); } - + virtual void Save(IOutputStream*) const { } virtual void Load(IInputStream*) { } friend class TPipelineCodec; - + public: // so the pipeline codec will know to adjust the sample for the subcodecs void LearnX(ISequenceReader& r, double sampleSizeMult) { DoLearnX(r, sampleSizeMult); Trained = true; } - + template <class TIter> void LearnX(TIter beg, TIter end, double sampleSizeMult) { auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult); @@ -140,54 +140,54 @@ namespace NCodecs { LearnX(reader, sampleSizeMult); } }; - + class TBasicTrivialCodec: public ICodec { public: ui8 Encode(TStringBuf in, TBuffer& out) const override { out.Assign(in.data(), in.size()); return 0; } - + void Decode(TStringBuf in, TBuffer& out) const override { Encode(in, out); } - + protected: void DoLearn(ISequenceReader&) override { } }; - + class TTrivialCodec: public TBasicTrivialCodec { public: TTrivialCodec() { MyTraits.PreservesPrefixGrouping = true; } - + static TStringBuf MyName() { return "trivial"; } - + TString GetName() const override { return ToString(MyName()); } }; - + class TTrivialTrainableCodec: public TBasicTrivialCodec { public: TTrivialTrainableCodec() { MyTraits.PreservesPrefixGrouping = true; MyTraits.NeedsTraining = true; } - + static TStringBuf MyName() { return "trivial-trainable"; } - + TString GetName() const override { return ToString(MyName()); } }; - + class TNullCodec: public ICodec { public: TNullCodec() { @@ -195,31 +195,31 @@ namespace NCodecs { MyTraits.SizeOnDecodeMultiplier = 0; MyTraits.SizeOnEncodeMultiplier = 0; } - + TString GetName() const override { return "null"; } - + ui8 Encode(TStringBuf, TBuffer& out) const override { out.Clear(); return 0; } - + void Decode(TStringBuf, TBuffer& out) const override { out.Clear(); } - + protected: void DoLearn(ISequenceReader&) override { } }; - + class TPipelineCodec: public ICodec { typedef TVector<TCodecPtr> TPipeline; - + TPipeline Pipeline; TString MyName; - + public: explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) { MyTraits.PreservesPrefixGrouping = true; @@ -228,32 +228,32 @@ namespace NCodecs { AddCodec(c2); AddCodec(c3); } - + TString GetName() const override { return MyName; } - + ui8 Encode(TStringBuf in, TBuffer& out) const override; void Decode(TStringBuf in, TBuffer& out) const override; - + public: /* - * Add codecs in the following order: - * uncompressed -> codec0 | codec1 | ... | codecN -> compressed - */ + * Add codecs in the following order: + * uncompressed -> codec0 | codec1 | ... | codecN -> compressed + */ TPipelineCodec& AddCodec(TCodecPtr codec); - + bool AlreadyTrained() const override; void SetTrained(bool t) override; - + protected: void DoLearn(ISequenceReader& in) override { DoLearnX(in, 1); } - + void DoLearnX(ISequenceReader& in, double sampleSizeMult) override; void Save(IOutputStream* out) const override; void Load(IInputStream* in) override; }; - -} + +} diff --git a/library/cpp/codecs/codecs_registry.cpp b/library/cpp/codecs/codecs_registry.cpp index 17d07062ab..7ccfd07a8a 100644 --- a/library/cpp/codecs/codecs_registry.cpp +++ b/library/cpp/codecs/codecs_registry.cpp @@ -1,104 +1,104 @@ -#include "codecs_registry.h" -#include "delta_codec.h" -#include "huffman_codec.h" -#include "pfor_codec.h" -#include "solar_codec.h" -#include "comptable_codec.h" -#include "zstd_dict_codec.h" - +#include "codecs_registry.h" +#include "delta_codec.h" +#include "huffman_codec.h" +#include "pfor_codec.h" +#include "solar_codec.h" +#include "comptable_codec.h" +#include "zstd_dict_codec.h" + #include <library/cpp/blockcodecs/codecs.h> - -#include <util/string/builder.h> + +#include <util/string/builder.h> #include <util/string/cast.h> - -namespace NCodecs { - TCodecPtr ICodec::GetInstance(TStringBuf name) { + +namespace NCodecs { + TCodecPtr ICodec::GetInstance(TStringBuf name) { return Singleton<NPrivate::TCodecRegistry>()->GetCodec(name); - } - + } + TVector<TString> ICodec::GetCodecsList() { return Singleton<NPrivate::TCodecRegistry>()->GetCodecsList(); - } - - namespace NPrivate { - void TCodecRegistry::RegisterFactory(TFactoryPtr fac) { + } + + namespace NPrivate { + void TCodecRegistry::RegisterFactory(TFactoryPtr fac) { TVector<TString> names = fac->ListNames(); - for (const auto& name : names) { + for (const auto& name : names) { Y_VERIFY(!Registry.contains(name), "already has %s", name.data()); - Registry[name] = fac; - } + Registry[name] = fac; + } } - TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const { - using namespace NPrivate; - - if (!name || "none" == name) { - return nullptr; - } - - if (TStringBuf::npos == name.find(':')) { + TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const { + using namespace NPrivate; + + if (!name || "none" == name) { + return nullptr; + } + + if (TStringBuf::npos == name.find(':')) { Y_ENSURE_EX(Registry.contains(name), TNoCodecException(name)); - return Registry.find(name)->second->MakeCodec(name); - } else { - TPipelineCodec* pipe = new TPipelineCodec; - + return Registry.find(name)->second->MakeCodec(name); + } else { + TPipelineCodec* pipe = new TPipelineCodec; + do { - TStringBuf v = name.NextTok(':'); - pipe->AddCodec(GetCodec(v)); - } while (name); - - return pipe; - } - } - + TStringBuf v = name.NextTok(':'); + pipe->AddCodec(GetCodec(v)); + } while (name); + + return pipe; + } + } + TVector<TString> TCodecRegistry::GetCodecsList() const { - using namespace NPrivate; + using namespace NPrivate; TVector<TString> vs; - vs.push_back("none"); - - for (const auto& it : Registry) { - vs.push_back(it.first); - } - - Sort(vs.begin(), vs.end()); - return vs; - } - + vs.push_back("none"); + + for (const auto& it : Registry) { + vs.push_back(it.first); + } + + Sort(vs.begin(), vs.end()); + return vs; + } + struct TSolarCodecFactory : ICodecFactory { - TCodecPtr MakeCodec(TStringBuf name) const override { - if (TSolarCodec::MyNameShortInt() == name) { - return new TSolarCodecShortInt(); - } - if (TSolarCodec::MyName() == name) { - return new TSolarCodec(); - } + TCodecPtr MakeCodec(TStringBuf name) const override { + if (TSolarCodec::MyNameShortInt() == name) { + return new TSolarCodecShortInt(); + } + if (TSolarCodec::MyName() == name) { + return new TSolarCodec(); + } if (name.EndsWith(TStringBuf("-a"))) { return MakeCodecImpl<TAdaptiveSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size()).Chop(2)); - } else { + } else { return MakeCodecImpl<TSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size())); - } - } - + } + } + template <class TCodecCls> - TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const { + TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const { if (TStringBuf("-8k") == type) { - return new TCodecCls(1 << 13); - } + return new TCodecCls(1 << 13); + } if (TStringBuf("-16k") == type) { - return new TCodecCls(1 << 14); - } + return new TCodecCls(1 << 14); + } if (TStringBuf("-32k") == type) { - return new TCodecCls(1 << 15); - } + return new TCodecCls(1 << 15); + } if (TStringBuf("-64k") == type) { - return new TCodecCls(1 << 16); - } + return new TCodecCls(1 << 16); + } if (TStringBuf("-256k") == type) { - return new TCodecCls(1 << 18); - } - ythrow TNoCodecException(name); - } - + return new TCodecCls(1 << 18); + } + ythrow TNoCodecException(name); + } + TVector<TString> ListNames() const override { TVector<TString> vs; vs.push_back(ToString(TSolarCodec::MyName())); @@ -113,114 +113,114 @@ namespace NCodecs { vs.push_back(ToString(TSolarCodec::MyName64kAdapt())); vs.push_back(ToString(TSolarCodec::MyName256kAdapt())); vs.push_back(ToString(TSolarCodec::MyNameShortInt())); - return vs; - } - }; - + return vs; + } + }; + struct TZStdDictCodecFactory : ICodecFactory { - TCodecPtr MakeCodec(TStringBuf name) const override { - return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name)); - } - + TCodecPtr MakeCodec(TStringBuf name) const override { + return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name)); + } + TVector<TString> ListNames() const override { - return TZStdDictCodec::ListCompressionNames(); - } - }; - + return TZStdDictCodec::ListCompressionNames(); + } + }; + struct TCompTableCodecFactory : ICodecFactory { - TCodecPtr MakeCodec(TStringBuf name) const override { - if (TCompTableCodec::MyNameHQ() == name) { - return new TCompTableCodec(TCompTableCodec::Q_HIGH); - } else if (TCompTableCodec::MyNameLQ() == name) { - return new TCompTableCodec(TCompTableCodec::Q_LOW); - } else { - Y_ENSURE_EX(false, TNoCodecException(name)); - return nullptr; - } - } - + TCodecPtr MakeCodec(TStringBuf name) const override { + if (TCompTableCodec::MyNameHQ() == name) { + return new TCompTableCodec(TCompTableCodec::Q_HIGH); + } else if (TCompTableCodec::MyNameLQ() == name) { + return new TCompTableCodec(TCompTableCodec::Q_LOW); + } else { + Y_ENSURE_EX(false, TNoCodecException(name)); + return nullptr; + } + } + TVector<TString> ListNames() const override { TVector<TString> vs; vs.push_back(ToString(TCompTableCodec::MyNameHQ())); vs.push_back(ToString(TCompTableCodec::MyNameLQ())); - return vs; - } - }; - + return vs; + } + }; + struct TBlockCodec : ICodec { - const NBlockCodecs::ICodec* Codec; - - TBlockCodec(TStringBuf name) + const NBlockCodecs::ICodec* Codec; + + TBlockCodec(TStringBuf name) : Codec(NBlockCodecs::Codec(name)) - { - } - + { + } + TString GetName() const override { return ToString(Codec->Name()); - } - - ui8 Encode(TStringBuf r, TBuffer& b) const override { - Codec->Encode(r, b); - return 0; - } - - void Decode(TStringBuf r, TBuffer& b) const override { - // TODO: throws exception that is not TCodecException - Codec->Decode(r, b); - } - - protected: - void DoLearn(ISequenceReader&) override { - } - }; - + } + + ui8 Encode(TStringBuf r, TBuffer& b) const override { + Codec->Encode(r, b); + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + // TODO: throws exception that is not TCodecException + Codec->Decode(r, b); + } + + protected: + void DoLearn(ISequenceReader&) override { + } + }; + struct TBlockCodecsFactory : ICodecFactory { using TRegistry = THashMap<TString, TCodecPtr>; - TRegistry Registry; - + TRegistry Registry; + TBlockCodecsFactory() { for (TStringBuf codec : NBlockCodecs::ListAllCodecs()) { Register(codec); } - } - - void Register(TStringBuf name) { - TCodecPtr p = Registry[name] = new TBlockCodec(name); - Registry[p->GetName()] = p; - } - - TCodecPtr MakeCodec(TStringBuf name) const override { + } + + void Register(TStringBuf name) { + TCodecPtr p = Registry[name] = new TBlockCodec(name); + Registry[p->GetName()] = p; + } + + TCodecPtr MakeCodec(TStringBuf name) const override { if (!Registry.contains(name)) { - ythrow TNoCodecException(name); - } - return Registry.find(name)->second; - } - + ythrow TNoCodecException(name); + } + return Registry.find(name)->second; + } + TVector<TString> ListNames() const override { TVector<TString> res; - for (const auto& it : Registry) { - res.push_back(it.first); - } - return res; - } - }; - + for (const auto& it : Registry) { + res.push_back(it.first); + } + return res; + } + }; + TCodecRegistry::TCodecRegistry() { - RegisterFactory(new TInstanceFactory<TTrivialCodec>); - RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>); - RegisterFactory(new TInstanceFactory<THuffmanCodec>); + RegisterFactory(new TInstanceFactory<TTrivialCodec>); + RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>); + RegisterFactory(new TInstanceFactory<THuffmanCodec>); RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>); RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>); - RegisterFactory(new TSolarCodecFactory); - RegisterFactory(new TZStdDictCodecFactory); - RegisterFactory(new TCompTableCodecFactory); - RegisterFactory(new TBlockCodecsFactory); - } - - } - - void RegisterCodecFactory(TCodecFactoryPtr fact) { - Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact); - } - -} + RegisterFactory(new TSolarCodecFactory); + RegisterFactory(new TZStdDictCodecFactory); + RegisterFactory(new TCompTableCodecFactory); + RegisterFactory(new TBlockCodecsFactory); + } + + } + + void RegisterCodecFactory(TCodecFactoryPtr fact) { + Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact); + } + +} diff --git a/library/cpp/codecs/codecs_registry.h b/library/cpp/codecs/codecs_registry.h index 53710310d5..31170afd62 100644 --- a/library/cpp/codecs/codecs_registry.h +++ b/library/cpp/codecs/codecs_registry.h @@ -1,60 +1,60 @@ -#pragma once - -#include "codecs.h" +#pragma once + +#include "codecs.h" #include <util/string/cast.h> - -namespace NCodecs { + +namespace NCodecs { struct TNoCodecException : TCodecException { TNoCodecException(TStringBuf name) { - (*this) << "unknown codec: " << name; - } - }; - + (*this) << "unknown codec: " << name; + } + }; + struct ICodecFactory : TAtomicRefCount<ICodecFactory> { - virtual ~ICodecFactory() = default; - virtual TCodecPtr MakeCodec(TStringBuf name) const = 0; + virtual ~ICodecFactory() = default; + virtual TCodecPtr MakeCodec(TStringBuf name) const = 0; virtual TVector<TString> ListNames() const = 0; - }; - - typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr; - - namespace NPrivate { + }; + + typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr; + + namespace NPrivate { template <typename TCodec> struct TInstanceFactory : ICodecFactory { - TCodecPtr MakeCodec(TStringBuf) const override { - return new TCodec; - } - + TCodecPtr MakeCodec(TStringBuf) const override { + return new TCodec; + } + TVector<TString> ListNames() const override { TVector<TString> vs; vs.push_back(ToString(TCodec::MyName())); - return vs; - } - }; - - class TCodecRegistry { + return vs; + } + }; + + class TCodecRegistry { using TRegistry = THashMap<TString, TIntrusivePtr<ICodecFactory>>; - TRegistry Registry; - - public: - using TFactoryPtr = TIntrusivePtr<ICodecFactory>; - - TCodecRegistry(); - - void RegisterFactory(TFactoryPtr fac); - - TCodecPtr GetCodec(TStringBuf name) const; - + TRegistry Registry; + + public: + using TFactoryPtr = TIntrusivePtr<ICodecFactory>; + + TCodecRegistry(); + + void RegisterFactory(TFactoryPtr fac); + + TCodecPtr GetCodec(TStringBuf name) const; + TVector<TString> GetCodecsList() const; - }; - - } - - void RegisterCodecFactory(TCodecFactoryPtr fact); - + }; + + } + + void RegisterCodecFactory(TCodecFactoryPtr fact); + template <typename TCodec> - void RegisterCodec() { - RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>()); - } - -} + void RegisterCodec() { + RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>()); + } + +} diff --git a/library/cpp/codecs/comptable_codec.cpp b/library/cpp/codecs/comptable_codec.cpp index 476b8ada80..1eca4354c6 100644 --- a/library/cpp/codecs/comptable_codec.cpp +++ b/library/cpp/codecs/comptable_codec.cpp @@ -1,108 +1,108 @@ -#include "comptable_codec.h" - +#include "comptable_codec.h" + #include <library/cpp/comptable/comptable.h> #include <util/string/cast.h> - -namespace NCodecs { + +namespace NCodecs { class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> { - public: - TImpl(EQuality q) - : Quality(q) + public: + TImpl(EQuality q) + : Quality(q) { } - - void Init() { - Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table}); - Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table}); - } - - ui8 Encode(TStringBuf in, TBuffer& out) const { - out.Clear(); - if (!in) { - return 0; - } - + + void Init() { + Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table}); + Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table}); + } + + ui8 Encode(TStringBuf in, TBuffer& out) const { + out.Clear(); + if (!in) { + return 0; + } + TVector<char> result; - Compressor->Compress(in, &result); - out.Assign(&result[0], result.size()); - return 0; - } - - void Decode(TStringBuf in, TBuffer& out) const { - out.Clear(); - if (!in) { - return; - } - + Compressor->Compress(in, &result); + out.Assign(&result[0], result.size()); + return 0; + } + + void Decode(TStringBuf in, TBuffer& out) const { + out.Clear(); + if (!in) { + return; + } + TVector<char> result; - Decompressor->Decompress(in, &result); - out.Assign(&result[0], result.size()); - } - - void DoLearn(ISequenceReader& in) { - NCompTable::TDataSampler sampler; - TStringBuf region; - while (in.NextRegion(region)) { - if (!region) { - continue; - } - - sampler.AddStat(region); - } - - sampler.BuildTable(Table); - Init(); - } - + Decompressor->Decompress(in, &result); + out.Assign(&result[0], result.size()); + } + + void DoLearn(ISequenceReader& in) { + NCompTable::TDataSampler sampler; + TStringBuf region; + while (in.NextRegion(region)) { + if (!region) { + continue; + } + + sampler.AddStat(region); + } + + sampler.BuildTable(Table); + Init(); + } + void Save(IOutputStream* out) const { - ::Save(out, Table); - } - + ::Save(out, Table); + } + void Load(IInputStream* in) { - ::Load(in, Table); - Init(); - } - - NCompTable::TCompressorTable Table; - THolder<NCompTable::TChunkCompressor> Compressor; - THolder<NCompTable::TChunkDecompressor> Decompressor; - const EQuality Quality; - static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5); - }; - - TCompTableCodec::TCompTableCodec(EQuality q) - : Impl(new TImpl{q}) - { - MyTraits.NeedsTraining = true; - MyTraits.SizeOnEncodeMultiplier = 2; - MyTraits.SizeOnDecodeMultiplier = 10; - MyTraits.RecommendedSampleSize = TImpl::SampleSize; - } - - TCompTableCodec::~TCompTableCodec() = default; - + ::Load(in, Table); + Init(); + } + + NCompTable::TCompressorTable Table; + THolder<NCompTable::TChunkCompressor> Compressor; + THolder<NCompTable::TChunkDecompressor> Decompressor; + const EQuality Quality; + static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5); + }; + + TCompTableCodec::TCompTableCodec(EQuality q) + : Impl(new TImpl{q}) + { + MyTraits.NeedsTraining = true; + MyTraits.SizeOnEncodeMultiplier = 2; + MyTraits.SizeOnDecodeMultiplier = 10; + MyTraits.RecommendedSampleSize = TImpl::SampleSize; + } + + TCompTableCodec::~TCompTableCodec() = default; + TString TCompTableCodec::GetName() const { return ToString(Impl->Quality ? MyNameHQ() : MyNameLQ()); - } - - ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const { - return Impl->Encode(in, out); - } - - void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const { - Impl->Decode(in, out); - } - - void TCompTableCodec::DoLearn(ISequenceReader& in) { - Impl->DoLearn(in); - } - + } + + ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const { + return Impl->Encode(in, out); + } + + void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const { + Impl->Decode(in, out); + } + + void TCompTableCodec::DoLearn(ISequenceReader& in) { + Impl->DoLearn(in); + } + void TCompTableCodec::Save(IOutputStream* out) const { - Impl->Save(out); - } - + Impl->Save(out); + } + void TCompTableCodec::Load(IInputStream* in) { - Impl->Load(in); - } - -} + Impl->Load(in); + } + +} diff --git a/library/cpp/codecs/comptable_codec.h b/library/cpp/codecs/comptable_codec.h index 7ba4f4c543..1a10c8241e 100644 --- a/library/cpp/codecs/comptable_codec.h +++ b/library/cpp/codecs/comptable_codec.h @@ -1,40 +1,40 @@ -#pragma once - -#include "codecs.h" - -#include <util/generic/ptr.h> - -namespace NCodecs { +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> + +namespace NCodecs { class TCompTableCodec: public ICodec { class TImpl; TIntrusivePtr<TImpl> Impl; - + public: enum EQuality { Q_LOW = 0, Q_HIGH = 1 }; - + explicit TCompTableCodec(EQuality q = Q_HIGH); ~TCompTableCodec() override; - + static TStringBuf MyNameHQ() { return "comptable-hq"; } static TStringBuf MyNameLQ() { return "comptable-lq"; } - + TString GetName() const override; - + ui8 Encode(TStringBuf in, TBuffer& out) const override; - + void Decode(TStringBuf in, TBuffer& out) const override; - + protected: void DoLearn(ISequenceReader& in) override; void Save(IOutputStream* out) const override; void Load(IInputStream* in) override; }; - -} + +} diff --git a/library/cpp/codecs/delta_codec.cpp b/library/cpp/codecs/delta_codec.cpp index 61606d6f6f..28d6b6e3bb 100644 --- a/library/cpp/codecs/delta_codec.cpp +++ b/library/cpp/codecs/delta_codec.cpp @@ -1,6 +1,6 @@ -#include "delta_codec.h" - -namespace NCodecs { +#include "delta_codec.h" + +namespace NCodecs { template <> TStringBuf TDeltaCodec<ui64, true>::MyName() { return "delta64-unsigned"; @@ -17,5 +17,5 @@ namespace NCodecs { TStringBuf TDeltaCodec<ui32, false>::MyName() { return "delta32-signed"; } - -} + +} diff --git a/library/cpp/codecs/delta_codec.h b/library/cpp/codecs/delta_codec.h index 21325825e6..7398b3ae80 100644 --- a/library/cpp/codecs/delta_codec.h +++ b/library/cpp/codecs/delta_codec.h @@ -1,102 +1,102 @@ -#pragma once - -#include "codecs.h" - +#pragma once + +#include "codecs.h" + #include <util/generic/array_ref.h> -#include <util/generic/typetraits.h> +#include <util/generic/typetraits.h> #include <util/generic/bitops.h> #include <util/string/cast.h> - -namespace NCodecs { + +namespace NCodecs { template <typename T = ui64, bool UnsignedDelta = true> class TDeltaCodec: public ICodec { static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value"); - + public: using TUnsigned = std::make_unsigned_t<T>; using TSigned = std::make_signed_t<T>; using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>; - + private: const TDelta MinDelta{Min<TDelta>()}; const TDelta MaxDelta{Max<TDelta>() - 1}; const TDelta InvalidDelta{MaxDelta + 1}; - + Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) { return a + b; } - + Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) { return a - b; } - + public: struct TDecoder { const TDelta InvalidDelta{Max<TDelta>()}; - + T Last = 0; T Result = 0; - + bool First = true; bool Invalid = false; - + Y_FORCE_INLINE bool Decode(TDelta t) { if (Y_UNLIKELY(First)) { First = false; Result = Last = t; return true; } - + if (Y_UNLIKELY(Invalid)) { Invalid = false; Last = 0; Result = t; return true; } - + Result = (Last += t); Invalid = t == InvalidDelta; return !Invalid; - } + } }; - + public: static TStringBuf MyName(); - + TDeltaCodec() { MyTraits.SizeOfInputElement = sizeof(T); MyTraits.AssumesStructuredInput = true; - } - + } + TString GetName() const override { return ToString(MyName()); } - + template <class TItem> static void AppendTo(TBuffer& b, TItem t) { b.Append((char*)&t, sizeof(t)); } - + ui8 Encode(TStringBuf s, TBuffer& b) const override { b.Clear(); if (s.empty()) { return 0; } - + b.Reserve(s.size()); TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; - + const T* it = tin.begin(); TDelta last = *(it++); AppendTo(b, last); - + TDelta maxt = SubSafe(MaxDelta, last); TDelta mint = AddSafe(MinDelta, last); - + for (; it != tin.end(); ++it) { TDelta t = *it; - + if (Y_LIKELY((t >= mint) & (t <= maxt))) { AppendTo(b, t - last); last = t; @@ -111,33 +111,33 @@ namespace NCodecs { mint = MinDelta; } } - + return 0; } - + void Decode(TStringBuf s, TBuffer& b) const override { b.Clear(); if (s.empty()) { return; - } - + } + b.Reserve(s.size()); TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; - + TDecoder dec; - + for (const T* it = tin.begin(); it != tin.end(); ++it) { T tmp; memcpy(&tmp, it, sizeof(tmp)); if (dec.Decode(tmp)) { AppendTo(b, dec.Result); } - } - } - + } + } + protected: void DoLearn(ISequenceReader&) override { } }; - -} + +} diff --git a/library/cpp/codecs/float_huffman.h b/library/cpp/codecs/float_huffman.h index 786a8eae1d..f03fc240ce 100644 --- a/library/cpp/codecs/float_huffman.h +++ b/library/cpp/codecs/float_huffman.h @@ -5,7 +5,7 @@ #include <util/generic/strbuf.h> #include <array> - + namespace NCodecs::NFloatHuff { TString Encode(TArrayRef<const float> factors); diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp index 561bfbca01..2fb46029bf 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.cpp +++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp @@ -1,85 +1,85 @@ -#include "gd_builder.h" - +#include "gd_builder.h" + #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> -#include <util/generic/algorithm.h> - -#include <util/random/shuffle.h> +#include <util/generic/algorithm.h> + +#include <util/random/shuffle.h> #include <util/stream/output.h> -#include <util/string/printf.h> -#include <util/system/rusage.h> - -namespace NGreedyDict { +#include <util/string/printf.h> +#include <util/system/rusage.h> + +namespace NGreedyDict { void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { if (!Current) { Current = MakeHolder<TEntrySet>(); Current->InitWithAlpha(); } - + TEntrySet& set = *Current; - + for (auto& it : set) it.Count = 0; - + CompoundCounts = nullptr; CompoundCountsPool.Clear(); - + if (!final) { CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool); CompoundCounts->reserve(maxcand); } - + Shuffle(Input.begin(), Input.end(), Rng); - + for (auto str : Input) { if (!final && CompoundCounts->size() > maxcand) break; - + i32 prev = -1; - + while (!!str) { TEntry* e = set.FindPrefix(str); ui32 num = e->Number; - + e->Count += 1; if (!final && prev >= 0) { (*CompoundCounts)[Compose(prev, num)] += 1; } - + prev = num; ++set.TotalCount; - } + } } - + Current->SetModelP(); - } - + } + ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { TAutoPtr<TEntrySet> newset = new TEntrySet; newset->InitWithAlpha(); maxent -= newset->size(); - + ui32 additions = 0; ui32 deletions = 0; - + { const TEntrySet& set = *Current; - + Candidates.clear(); const ui32 total = set.TotalCount; const float minpval = Settings.MinPValue; const EEntryStatTest test = Settings.StatTest; const EEntryScore score = Settings.Score; const ui32 mincnt = Settings.MinAbsCount; - + for (const auto& it : set) { const TEntry& e = it; float modelp = e.ModelP; ui32 cnt = e.Count; - + if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); } - + if (!!CompoundCounts) { for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { const TEntry& prev = set.Get(Prev(it->first)); @@ -89,13 +89,13 @@ namespace NGreedyDict { if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); } - } - + } + Sort(Candidates.begin(), Candidates.end()); - + if (Candidates.size() > maxent) Candidates.resize(maxent); - + for (const auto& candidate : Candidates) { if (IsCompound(candidate.second)) { additions++; @@ -103,40 +103,40 @@ namespace NGreedyDict { } else { newset->Add(set.Get(candidate.second).Str); } - } + } deletions = set.size() - (newset->size() - additions); - } - + } + Current = newset; Current->BuildHierarchy(); return deletions + additions; - } - + } + ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { size_t totalsz = 0; for (auto it : Input) totalsz += it.size(); - + while (maxiters) { maxiters--; - + RebuildCounts(maxentries * Settings.GrowLimit, false); - + if (Settings.Verbose) { TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl; } - + ui32 diff = BuildNextGeneration(maxentries); if (Current->size() == maxentries && diff < mindiff) break; - } - + } + RebuildCounts(0, true); Current->SetScores(Settings.Score); return maxiters; - } - -} + } + +} diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h index b8e9a5e37b..7f3cea88cb 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.h +++ b/library/cpp/codecs/greedy_dict/gd_builder.h @@ -1,94 +1,94 @@ -#pragma once - -#include "gd_entry.h" - -#include <util/generic/hash.h> -#include <util/random/fast.h> - -namespace NGreedyDict { +#pragma once + +#include "gd_entry.h" + +#include <util/generic/hash.h> +#include <util/random/fast.h> + +namespace NGreedyDict { struct TBuildSettings { EEntryStatTest StatTest = EST_SIMPLE_NORM; EEntryScore Score = ES_LEN_SIMPLE; - + float MinPValue = 0.75; ui32 MinAbsCount = 10; ui32 GrowLimit = 10; // times of maxentries bool Verbose = false; }; - + class TDictBuilder { using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>; using TCandidate = std::pair<float, ui64>; using TCandidates = TVector<TCandidate>; - + private: TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb}; TStringBufs Input; - + THolder<TEntrySet> Current; - + TMemoryPool CompoundCountsPool; THolder<TCompoundCounts> CompoundCounts; - + TCandidates Candidates; - + TBuildSettings Settings; - + public: TDictBuilder(const TBuildSettings& s = TBuildSettings()) : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance()) , Settings(s) { } - + void SetInput(const TStringBufs& in) { Input = in; } - + const TBuildSettings& GetSettings() const { return Settings; } - + TBuildSettings& GetSettings() { return Settings; } - + void SetSettings(const TBuildSettings& s) { Settings = s; } - + TEntrySet& EntrySet() { return *Current; } - + const TEntrySet& EntrySet() const { return *Current; } - + THolder<TEntrySet> ReleaseEntrySet() { return std::move(Current); } - + ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); - + public: void RebuildCounts(ui32 maxcand, bool final); ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); - + static bool IsCompound(ui64 ent) { return ent & 0xFFFFFFFF00000000ULL; } - + static ui32 Next(ui64 ent) { return ent; } static ui32 Prev(ui64 ent) { return (ent >> 32) - 1; } - + static ui64 Compose(ui32 prev, ui32 next) { return ((prev + 1ULL) << 32) | next; } }; - -} + +} diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp index 2c315c7f7c..f23a754976 100644 --- a/library/cpp/codecs/greedy_dict/gd_entry.cpp +++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp @@ -1,98 +1,98 @@ -#include "gd_entry.h" -#include "gd_stats.h" - -#include <util/generic/algorithm.h> -#include <util/generic/singleton.h> - -namespace NGreedyDict { +#include "gd_entry.h" +#include "gd_stats.h" + +#include <util/generic/algorithm.h> +#include <util/generic/singleton.h> + +namespace NGreedyDict { class TAlphas { char Memory[512]; - + public: TStringBufs Alphas; - + TAlphas() { for (ui32 i = 0; i < 256; ++i) { Memory[2 * i] = (char)i; Memory[2 * i + 1] = 0; - + Alphas.push_back(TStringBuf(&Memory[2 * i], 1)); } } }; - + void TEntrySet::InitWithAlpha() { Pool.ClearKeepFirstChunk(); const TStringBufs& a = Singleton<TAlphas>()->Alphas; for (auto it : a) { Add(it); - } + } BuildHierarchy(); - } - + } + void TEntrySet::BuildHierarchy() { Sort(begin(), end(), TEntry::StrLess); - + TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED); - + for (iterator it = begin(); it != end(); ++it) { it->Number = (it - begin()); TStringBuf suff = it->Str; size_t len = 0; ui32 val = 0; - + if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) { it->NearestPrefix = val; } - + builder.Add(suff.data(), suff.size(), it->Number); - } - + } + TBufferOutput bout; builder.Save(bout); Trie.Init(TBlob::FromBuffer(bout.Buffer())); - } - + } + TEntry* TEntrySet::FindPrefix(TStringBuf& str) { size_t len = 0; ui32 off = 0; - + if (!Trie.FindLongestPrefix(str, &len, &off)) { return nullptr; } - + str.Skip(len); return &Get(off); - } - + } + void TEntrySet::SetModelP() { for (iterator it = begin(); it != end(); ++it) { TEntry& e = *it; - + if (!e.HasPrefix()) { e.ModelP = 0; continue; } - + TStringBuf suff = e.Str; const TEntry& p = Get(e.NearestPrefix); suff.Skip(p.Len()); - + float modelp = float(p.Count + e.Count) / TotalCount; - + while (!!suff) { TEntry* pp = FindPrefix(suff); modelp *= float(pp->Count + e.Count) / TotalCount; } - + e.ModelP = modelp; - } - } - + } + } + void TEntrySet::SetScores(EEntryScore s) { for (auto& it : *this) { it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount); } - } - -} + } + +} diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h index 18b5be0e15..0362fd9f99 100644 --- a/library/cpp/codecs/greedy_dict/gd_entry.h +++ b/library/cpp/codecs/greedy_dict/gd_entry.h @@ -1,42 +1,42 @@ -#pragma once - -#include "gd_stats.h" - +#pragma once + +#include "gd_stats.h" + #include <library/cpp/containers/comptrie/comptrie.h> - -#include <util/generic/ptr.h> -#include <util/generic/strbuf.h> -#include <util/generic/vector.h> - -#include <util/memory/pool.h> - -namespace NGreedyDict { + +#include <util/generic/ptr.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> + +#include <util/memory/pool.h> + +namespace NGreedyDict { using TStringBufs = TVector<TStringBuf>; - + struct TEntry { static const i32 NoPrefix = -1; - + TStringBuf Str; - + i32 NearestPrefix = NoPrefix; ui32 Count = 0; ui32 Number = 0; float ModelP = 0; float Score = 0; - + TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0) : Str(b) , Count(cnt) { } - + bool HasPrefix() const { return NearestPrefix != NoPrefix; } ui32 Len() const { return Str.size(); } - + static bool StrLess(const TEntry& a, const TEntry& b) { return a.Str < b.Str; } @@ -47,20 +47,20 @@ namespace NGreedyDict { return a.Score > b.Score; } }; - + class TEntrySet: public TVector<TEntry>, TNonCopyable { TMemoryPool Pool{8112}; TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie; - + public: ui32 TotalCount = 0; - + void InitWithAlpha(); - + void Add(TStringBuf a) { push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size())); } - + void Add(TStringBuf a, TStringBuf b) { size_t sz = a.size() + b.size(); char* p = (char*)Pool.Allocate(sz); @@ -68,36 +68,36 @@ namespace NGreedyDict { memcpy(p + a.size(), b.data(), b.size()); push_back(TStringBuf(p, sz)); } - + TEntry& Get(ui32 idx) { return (*this)[idx]; } - + const TEntry& Get(ui32 idx) const { return (*this)[idx]; } - + void BuildHierarchy(); - + // longest prefix TEntry* FindPrefix(TStringBuf& str); - + const TEntry* FindPrefix(TStringBuf& str) const { return ((TEntrySet*)this)->FindPrefix(str); } - + const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) { if (!e.HasPrefix()) return nullptr; - + const TEntry& p = Get(e.NearestPrefix); suff = e.Str; suff.Skip(p.Str.size()); return &p; } - + void SetModelP(); void SetScores(EEntryScore); }; - -} + +} diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h index b63c4c38d2..3c209fc67d 100644 --- a/library/cpp/codecs/greedy_dict/gd_stats.h +++ b/library/cpp/codecs/greedy_dict/gd_stats.h @@ -1,10 +1,10 @@ -#pragma once - +#pragma once + #include <util/generic/ymath.h> -#include <util/generic/algorithm.h> -#include <util/generic/yexception.h> - -namespace NGreedyDict { +#include <util/generic/algorithm.h> +#include <util/generic/yexception.h> + +namespace NGreedyDict { enum EEntryScore { ES_COUNT, ES_LEN_COUNT, @@ -12,33 +12,33 @@ namespace NGreedyDict { ES_LEN_SIMPLE, ES_SOLAR }; - + enum EEntryStatTest { EST_NONE = 0, EST_SIMPLE_NORM = 2 }; - + inline float ModelP(ui32 countA, ui32 countB, ui32 total) { return float(countA) * countB / total / total; } - + // P (ab | dependent) inline float SimpleTest(float modelp, ui32 countAB, ui32 total) { float realp = float(countAB) / total; return modelp >= realp ? 0 : (realp - modelp); } - + inline float SolarTest(float modelp, ui32 countAB, ui32 total) { float realp = float(countAB) / total; return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1)); } - + // P (ab | dependent) / P (ab) inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) { float realp = float(countAB) / total; return modelp >= realp ? 0 : (realp - modelp) / realp; } - + inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) { if (!total) { return 0; @@ -50,9 +50,9 @@ namespace NGreedyDict { return SimpleTestNorm(modelp, countAB, total); } Y_FAIL("no way!"); - return 0; - } - + return 0; + } + inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) { if (!total) { return 0; @@ -73,7 +73,7 @@ namespace NGreedyDict { return SolarTest(modelp, count, total); } Y_FAIL("no way!"); - return 0; - } - -} + return 0; + } + +} diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp index 679089a11b..60ab9f7c30 100644 --- a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp +++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp @@ -1,282 +1,282 @@ -#include "gd_builder.h" - +#include "gd_builder.h" + #include <library/cpp/testing/unittest/registar.h> #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> -#include <util/string/printf.h> +#include <util/string/printf.h> #include <util/generic/ymath.h> - -class TGreedyDictTest: public TTestBase { + +class TGreedyDictTest: public TTestBase { UNIT_TEST_SUITE(TGreedyDictTest); - UNIT_TEST(TestEntrySet) - UNIT_TEST(TestBuilder0) - UNIT_TEST(TestBuilder) + UNIT_TEST(TestEntrySet) + UNIT_TEST(TestBuilder0) + UNIT_TEST(TestBuilder) UNIT_TEST_SUITE_END(); - - void TestEntrySet() { - using namespace NGreedyDict; - - { - TEntrySet d; - - d.InitWithAlpha(); - - for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) { - UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix)); - UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin())); - } - - UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u); - TStringBuf s = "aaabbb"; - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); - UNIT_ASSERT_VALUES_EQUAL(s, "aabbb"); - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); - UNIT_ASSERT_VALUES_EQUAL(s, "abbb"); - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); - UNIT_ASSERT_VALUES_EQUAL(s, "bbb"); - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); - UNIT_ASSERT_VALUES_EQUAL(s, "bb"); - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); - UNIT_ASSERT_VALUES_EQUAL(s, "b"); - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); - UNIT_ASSERT_VALUES_EQUAL(s, ""); - s = TStringBuf("", 1); - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1)); - UNIT_ASSERT_VALUES_EQUAL(s, ""); - s = "\xFF"; - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF"); - UNIT_ASSERT_VALUES_EQUAL(s, ""); - } - { - TEntrySet d; - d.Add("a"); - d.Add("b"); - d.Add("b", "a"); - d.BuildHierarchy(); - - UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u); - - TStringBuf s = "bab"; - UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba"); - UNIT_ASSERT_VALUES_EQUAL(s, "b"); - } - { - TEntrySet d; - - d.Add("a"); - d.Add("aa"); - d.Add("aaa"); - d.Add("aab"); - d.Add("b"); - d.Add("ba"); - - d.BuildHierarchy(); - - UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u); - { - TStringBuf s = "aaaaa"; - const TEntry* e = d.FindPrefix(s); - UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa"); - UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u); - UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1); - UNIT_ASSERT_VALUES_EQUAL(s, "aa"); - } - - { - TStringBuf s = "a"; - const TEntry* e = d.FindPrefix(s); - UNIT_ASSERT_VALUES_EQUAL(e->Str, "a"); - UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u); - UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); - UNIT_ASSERT_VALUES_EQUAL(s, ""); - } - - { - TStringBuf s = "bab"; - const TEntry* e = d.FindPrefix(s); - UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba"); - UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u); - UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4); - UNIT_ASSERT_VALUES_EQUAL(s, "b"); - } - - { - TStringBuf s = "bba"; - const TEntry* e = d.FindPrefix(s); - UNIT_ASSERT_VALUES_EQUAL(e->Str, "b"); - UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u); - UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); - UNIT_ASSERT_VALUES_EQUAL(s, "ba"); - } - } - } - - void TestBuilder0() { - using namespace NGreedyDict; - ui32 a = 1, b = 11; - ui64 ab = TDictBuilder::Compose(a, b); - UNIT_ASSERT(TDictBuilder::IsCompound(ab)); - UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a); - UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b); - } - - void FillData(NGreedyDict::TStringBufs& data) { + + void TestEntrySet() { + using namespace NGreedyDict; + + { + TEntrySet d; + + d.InitWithAlpha(); + + for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) { + UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix)); + UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin())); + } + + UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u); + TStringBuf s = "aaabbb"; + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); + UNIT_ASSERT_VALUES_EQUAL(s, "aabbb"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); + UNIT_ASSERT_VALUES_EQUAL(s, "abbb"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); + UNIT_ASSERT_VALUES_EQUAL(s, "bbb"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); + UNIT_ASSERT_VALUES_EQUAL(s, "bb"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); + UNIT_ASSERT_VALUES_EQUAL(s, "b"); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + s = TStringBuf("", 1); + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1)); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + s = "\xFF"; + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF"); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + } + { + TEntrySet d; + d.Add("a"); + d.Add("b"); + d.Add("b", "a"); + d.BuildHierarchy(); + + UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u); + + TStringBuf s = "bab"; + UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba"); + UNIT_ASSERT_VALUES_EQUAL(s, "b"); + } + { + TEntrySet d; + + d.Add("a"); + d.Add("aa"); + d.Add("aaa"); + d.Add("aab"); + d.Add("b"); + d.Add("ba"); + + d.BuildHierarchy(); + + UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u); + { + TStringBuf s = "aaaaa"; + const TEntry* e = d.FindPrefix(s); + UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa"); + UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u); + UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1); + UNIT_ASSERT_VALUES_EQUAL(s, "aa"); + } + + { + TStringBuf s = "a"; + const TEntry* e = d.FindPrefix(s); + UNIT_ASSERT_VALUES_EQUAL(e->Str, "a"); + UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u); + UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); + UNIT_ASSERT_VALUES_EQUAL(s, ""); + } + + { + TStringBuf s = "bab"; + const TEntry* e = d.FindPrefix(s); + UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba"); + UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u); + UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4); + UNIT_ASSERT_VALUES_EQUAL(s, "b"); + } + + { + TStringBuf s = "bba"; + const TEntry* e = d.FindPrefix(s); + UNIT_ASSERT_VALUES_EQUAL(e->Str, "b"); + UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u); + UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); + UNIT_ASSERT_VALUES_EQUAL(s, "ba"); + } + } + } + + void TestBuilder0() { + using namespace NGreedyDict; + ui32 a = 1, b = 11; + ui64 ab = TDictBuilder::Compose(a, b); + UNIT_ASSERT(TDictBuilder::IsCompound(ab)); + UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a); + UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b); + } + + void FillData(NGreedyDict::TStringBufs& data) { static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"}; - data.clear(); + data.clear(); data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls)); - } - + } + typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict; - - TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s, + + TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s, TDict& res) { - using namespace NGreedyDict; - - TStringBufs data; - FillData(data); - - TDictBuilder b(s); - b.SetInput(data); - b.Build(256 + 128); - - TEntrySet& set = b.EntrySet(); - + using namespace NGreedyDict; + + TStringBufs data; + FillData(data); + + TDictBuilder b(s); + b.SetInput(data); + b.Build(256 + 128); + + TEntrySet& set = b.EntrySet(); + for (const auto& it : set) { if (it.Score) { res[it.Str] = it; - } - } - - return b.ReleaseEntrySet(); - } - - void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) { - TDict::const_iterator it = dict.find(entry); - UNIT_ASSERT_C(it != dict.end(), entry); - UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry); - UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry); - UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry); - } - - void TestBuilder() { - TAutoPtr<NGreedyDict::TEntrySet> set; + } + } + + return b.ReleaseEntrySet(); + } + + void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) { + TDict::const_iterator it = dict.find(entry); + UNIT_ASSERT_C(it != dict.end(), entry); + UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry); + UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry); + UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry); + } + + void TestBuilder() { + TAutoPtr<NGreedyDict::TEntrySet> set; THashMap<TStringBuf, NGreedyDict::TEntry> res; - NGreedyDict::TBuildSettings s; - set = DoTestBuilder(s, res); - - UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u); - UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u); - - DoAssertEntry("%", 37, -1, 0.00375193, res); - DoAssertEntry("%7", 38, 37, 0.00513299, res); - DoAssertEntry("&", 39, -1, 0.00794527, res); - DoAssertEntry("+", 44, -1, 0.000441404, res); - DoAssertEntry(",", 45, -1, 0.000441404, res); - DoAssertEntry("-", 46, -1, 0.0417126, res); - DoAssertEntry(".", 47, -1, 0.0196425, res); - DoAssertEntry(".com/", 48, 47, 0.0374482, res); - DoAssertEntry(".html", 49, 47, 0.0496577, res); - DoAssertEntry(".html?", 50, 49, 0.0153908, res); - DoAssertEntry(".php", 51, 47, 0.0123585, res); - DoAssertEntry(".ru/", 52, 47, 0.0150027, res); - DoAssertEntry("/", 53, -1, 0.0452439, res); - DoAssertEntry("/index", 54, 53, 0.0158905, res); - DoAssertEntry("0", 55, -1, 0.00816597, res); - DoAssertEntry("1", 56, -1, 0.0167733, res); - DoAssertEntry("10", 57, 56, 0.00530474, res); - DoAssertEntry("2", 58, -1, 0.0101523, res); - DoAssertEntry("20", 59, 58, 0.00674234, res); - DoAssertEntry("3", 60, -1, 0.01258, res); - DoAssertEntry("32", 61, 60, 0.00490697, res); - DoAssertEntry("4", 62, -1, 0.00993158, res); - DoAssertEntry("5", 63, -1, 0.00617965, res); - DoAssertEntry("6", 64, -1, 0.00971088, res); - DoAssertEntry("7", 65, -1, 0.0101523, res); - DoAssertEntry("8", 66, -1, 0.00728316, res); - DoAssertEntry("9", 67, -1, 0.00728316, res); - DoAssertEntry(":", 68, -1, 0.000662106, res); - DoAssertEntry(";", 69, -1, 0.000882807, res); - DoAssertEntry("=", 71, -1, 0.01258, res); - DoAssertEntry("?", 73, -1, 0.00397263, res); - DoAssertEntry("A", 75, -1, 0.00264842, res); - DoAssertEntry("B", 76, -1, 0.00220702, res); - DoAssertEntry("C", 77, -1, 0.00353123, res); - DoAssertEntry("D", 78, -1, 0.00375193, res); - DoAssertEntry("E", 79, -1, 0.00286912, res); - DoAssertEntry("F", 80, -1, 0.00110351, res); - DoAssertEntry("G", 81, -1, 0.00110351, res); - DoAssertEntry("H", 82, -1, 0.000220702, res); - DoAssertEntry("I", 83, -1, 0.00198632, res); - DoAssertEntry("K", 85, -1, 0.000441404, res); - DoAssertEntry("L", 86, -1, 0.00198632, res); - DoAssertEntry("M", 87, -1, 0.00154491, res); - DoAssertEntry("N", 88, -1, 0.00154491, res); - DoAssertEntry("O", 89, -1, 0.00132421, res); - DoAssertEntry("P", 90, -1, 0.00308983, res); - DoAssertEntry("R", 92, -1, 0.000662106, res); - DoAssertEntry("S", 93, -1, 0.00264842, res); - DoAssertEntry("T", 94, -1, 0.00110351, res); - DoAssertEntry("U", 95, -1, 0.000220702, res); - DoAssertEntry("V", 96, -1, 0.000441404, res); - DoAssertEntry("W", 97, -1, 0.000441404, res); - DoAssertEntry("X", 98, -1, 0.000220702, res); - DoAssertEntry("Y", 99, -1, 0.000220702, res); - DoAssertEntry("_", 105, -1, 0.00904877, res); - DoAssertEntry("a", 107, -1, 0.0505407, res); - DoAssertEntry("an", 108, 107, 0.018273, res); - DoAssertEntry("ar", 109, 107, 0.0169385, res); - DoAssertEntry("b", 110, -1, 0.0156698, res); - DoAssertEntry("c", 111, -1, 0.018539, res); - DoAssertEntry("cat", 112, 111, 0.00846732, res); - DoAssertEntry("ch", 113, 111, 0.00644872, res); - DoAssertEntry("com", 114, 111, 0.00724235, res); - DoAssertEntry("ct", 115, 111, 0.00605729, res); - DoAssertEntry("d", 116, -1, 0.020746, res); - DoAssertEntry("di", 117, 116, 0.00730659, res); - DoAssertEntry("e", 118, -1, 0.0624586, res); - DoAssertEntry("en", 119, 118, 0.0108999, res); - DoAssertEntry("ent", 120, 119, 0.00616002, res); - DoAssertEntry("f", 121, -1, 0.00860737, res); - DoAssertEntry("fi", 122, 121, 0.00423196, res); - DoAssertEntry("g", 123, -1, 0.0180975, res); - DoAssertEntry("go", 124, 123, 0.00601862, res); - DoAssertEntry("h", 125, -1, 0.010373, res); - DoAssertEntry("ho", 126, 125, 0.00570298, res); - DoAssertEntry("http://", 127, 125, 0.0494372, res); - DoAssertEntry("http://www.", 128, 127, 0.0849702, res); - DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res); - DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res); - DoAssertEntry("i", 131, -1, 0.0258221, res); - DoAssertEntry("id=", 132, 131, 0.00725369, res); - DoAssertEntry("im", 133, 131, 0.00373318, res); - DoAssertEntry("in", 134, 131, 0.013625, res); - DoAssertEntry("ing", 135, 134, 0.00795491, res); - DoAssertEntry("ion", 136, 131, 0.00796149, res); - DoAssertEntry("it", 137, 131, 0.00953416, res); - DoAssertEntry("j", 138, -1, 0.00132421, res); - DoAssertEntry("k", 139, -1, 0.0134628, res); - DoAssertEntry("l", 140, -1, 0.0381814, res); - DoAssertEntry("m", 141, -1, 0.0174354, res); - DoAssertEntry("mer", 142, 141, 0.00711846, res); - DoAssertEntry("n", 143, -1, 0.0132421, res); - DoAssertEntry("o", 144, -1, 0.0302362, res); - DoAssertEntry("on", 145, 144, 0.00802271, res); - DoAssertEntry("ou", 146, 144, 0.00414545, res); - DoAssertEntry("p", 147, -1, 0.0225116, res); - DoAssertEntry("port", 148, 147, 0.0123532, res); - DoAssertEntry("q", 149, -1, 0.00176561, res); - DoAssertEntry("r", 150, -1, 0.0401677, res); - DoAssertEntry("ran", 151, 150, 0.00686918, res); - DoAssertEntry("s", 152, -1, 0.0487751, res); - DoAssertEntry("sho", 153, 152, 0.0113876, res); - DoAssertEntry("t", 154, -1, 0.0379607, res); - DoAssertEntry("u", 155, -1, 0.0211874, res); - DoAssertEntry("v", 156, -1, 0.00595895, res); - DoAssertEntry("vi", 157, 156, 0.00480673, res); - DoAssertEntry("w", 158, -1, 0.00816597, res); - DoAssertEntry("x", 159, -1, 0.00375193, res); - DoAssertEntry("y", 160, -1, 0.0130214, res); - DoAssertEntry("z", 161, -1, 0.00353123, res); - } -}; - -UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest); + NGreedyDict::TBuildSettings s; + set = DoTestBuilder(s, res); + + UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u); + UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u); + + DoAssertEntry("%", 37, -1, 0.00375193, res); + DoAssertEntry("%7", 38, 37, 0.00513299, res); + DoAssertEntry("&", 39, -1, 0.00794527, res); + DoAssertEntry("+", 44, -1, 0.000441404, res); + DoAssertEntry(",", 45, -1, 0.000441404, res); + DoAssertEntry("-", 46, -1, 0.0417126, res); + DoAssertEntry(".", 47, -1, 0.0196425, res); + DoAssertEntry(".com/", 48, 47, 0.0374482, res); + DoAssertEntry(".html", 49, 47, 0.0496577, res); + DoAssertEntry(".html?", 50, 49, 0.0153908, res); + DoAssertEntry(".php", 51, 47, 0.0123585, res); + DoAssertEntry(".ru/", 52, 47, 0.0150027, res); + DoAssertEntry("/", 53, -1, 0.0452439, res); + DoAssertEntry("/index", 54, 53, 0.0158905, res); + DoAssertEntry("0", 55, -1, 0.00816597, res); + DoAssertEntry("1", 56, -1, 0.0167733, res); + DoAssertEntry("10", 57, 56, 0.00530474, res); + DoAssertEntry("2", 58, -1, 0.0101523, res); + DoAssertEntry("20", 59, 58, 0.00674234, res); + DoAssertEntry("3", 60, -1, 0.01258, res); + DoAssertEntry("32", 61, 60, 0.00490697, res); + DoAssertEntry("4", 62, -1, 0.00993158, res); + DoAssertEntry("5", 63, -1, 0.00617965, res); + DoAssertEntry("6", 64, -1, 0.00971088, res); + DoAssertEntry("7", 65, -1, 0.0101523, res); + DoAssertEntry("8", 66, -1, 0.00728316, res); + DoAssertEntry("9", 67, -1, 0.00728316, res); + DoAssertEntry(":", 68, -1, 0.000662106, res); + DoAssertEntry(";", 69, -1, 0.000882807, res); + DoAssertEntry("=", 71, -1, 0.01258, res); + DoAssertEntry("?", 73, -1, 0.00397263, res); + DoAssertEntry("A", 75, -1, 0.00264842, res); + DoAssertEntry("B", 76, -1, 0.00220702, res); + DoAssertEntry("C", 77, -1, 0.00353123, res); + DoAssertEntry("D", 78, -1, 0.00375193, res); + DoAssertEntry("E", 79, -1, 0.00286912, res); + DoAssertEntry("F", 80, -1, 0.00110351, res); + DoAssertEntry("G", 81, -1, 0.00110351, res); + DoAssertEntry("H", 82, -1, 0.000220702, res); + DoAssertEntry("I", 83, -1, 0.00198632, res); + DoAssertEntry("K", 85, -1, 0.000441404, res); + DoAssertEntry("L", 86, -1, 0.00198632, res); + DoAssertEntry("M", 87, -1, 0.00154491, res); + DoAssertEntry("N", 88, -1, 0.00154491, res); + DoAssertEntry("O", 89, -1, 0.00132421, res); + DoAssertEntry("P", 90, -1, 0.00308983, res); + DoAssertEntry("R", 92, -1, 0.000662106, res); + DoAssertEntry("S", 93, -1, 0.00264842, res); + DoAssertEntry("T", 94, -1, 0.00110351, res); + DoAssertEntry("U", 95, -1, 0.000220702, res); + DoAssertEntry("V", 96, -1, 0.000441404, res); + DoAssertEntry("W", 97, -1, 0.000441404, res); + DoAssertEntry("X", 98, -1, 0.000220702, res); + DoAssertEntry("Y", 99, -1, 0.000220702, res); + DoAssertEntry("_", 105, -1, 0.00904877, res); + DoAssertEntry("a", 107, -1, 0.0505407, res); + DoAssertEntry("an", 108, 107, 0.018273, res); + DoAssertEntry("ar", 109, 107, 0.0169385, res); + DoAssertEntry("b", 110, -1, 0.0156698, res); + DoAssertEntry("c", 111, -1, 0.018539, res); + DoAssertEntry("cat", 112, 111, 0.00846732, res); + DoAssertEntry("ch", 113, 111, 0.00644872, res); + DoAssertEntry("com", 114, 111, 0.00724235, res); + DoAssertEntry("ct", 115, 111, 0.00605729, res); + DoAssertEntry("d", 116, -1, 0.020746, res); + DoAssertEntry("di", 117, 116, 0.00730659, res); + DoAssertEntry("e", 118, -1, 0.0624586, res); + DoAssertEntry("en", 119, 118, 0.0108999, res); + DoAssertEntry("ent", 120, 119, 0.00616002, res); + DoAssertEntry("f", 121, -1, 0.00860737, res); + DoAssertEntry("fi", 122, 121, 0.00423196, res); + DoAssertEntry("g", 123, -1, 0.0180975, res); + DoAssertEntry("go", 124, 123, 0.00601862, res); + DoAssertEntry("h", 125, -1, 0.010373, res); + DoAssertEntry("ho", 126, 125, 0.00570298, res); + DoAssertEntry("http://", 127, 125, 0.0494372, res); + DoAssertEntry("http://www.", 128, 127, 0.0849702, res); + DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res); + DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res); + DoAssertEntry("i", 131, -1, 0.0258221, res); + DoAssertEntry("id=", 132, 131, 0.00725369, res); + DoAssertEntry("im", 133, 131, 0.00373318, res); + DoAssertEntry("in", 134, 131, 0.013625, res); + DoAssertEntry("ing", 135, 134, 0.00795491, res); + DoAssertEntry("ion", 136, 131, 0.00796149, res); + DoAssertEntry("it", 137, 131, 0.00953416, res); + DoAssertEntry("j", 138, -1, 0.00132421, res); + DoAssertEntry("k", 139, -1, 0.0134628, res); + DoAssertEntry("l", 140, -1, 0.0381814, res); + DoAssertEntry("m", 141, -1, 0.0174354, res); + DoAssertEntry("mer", 142, 141, 0.00711846, res); + DoAssertEntry("n", 143, -1, 0.0132421, res); + DoAssertEntry("o", 144, -1, 0.0302362, res); + DoAssertEntry("on", 145, 144, 0.00802271, res); + DoAssertEntry("ou", 146, 144, 0.00414545, res); + DoAssertEntry("p", 147, -1, 0.0225116, res); + DoAssertEntry("port", 148, 147, 0.0123532, res); + DoAssertEntry("q", 149, -1, 0.00176561, res); + DoAssertEntry("r", 150, -1, 0.0401677, res); + DoAssertEntry("ran", 151, 150, 0.00686918, res); + DoAssertEntry("s", 152, -1, 0.0487751, res); + DoAssertEntry("sho", 153, 152, 0.0113876, res); + DoAssertEntry("t", 154, -1, 0.0379607, res); + DoAssertEntry("u", 155, -1, 0.0211874, res); + DoAssertEntry("v", 156, -1, 0.00595895, res); + DoAssertEntry("vi", 157, 156, 0.00480673, res); + DoAssertEntry("w", 158, -1, 0.00816597, res); + DoAssertEntry("x", 159, -1, 0.00375193, res); + DoAssertEntry("y", 160, -1, 0.0130214, res); + DoAssertEntry("z", 161, -1, 0.00353123, res); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest); diff --git a/library/cpp/codecs/greedy_dict/ut/ya.make b/library/cpp/codecs/greedy_dict/ut/ya.make index bd67d1a452..e5d597a083 100644 --- a/library/cpp/codecs/greedy_dict/ut/ya.make +++ b/library/cpp/codecs/greedy_dict/ut/ya.make @@ -1,5 +1,5 @@ UNITTEST_FOR(library/cpp/codecs/greedy_dict) - + OWNER(velavokr) SRCS( diff --git a/library/cpp/codecs/greedy_dict/ya.make b/library/cpp/codecs/greedy_dict/ya.make index 2a57224f7e..6904a354de 100644 --- a/library/cpp/codecs/greedy_dict/ya.make +++ b/library/cpp/codecs/greedy_dict/ya.make @@ -1,15 +1,15 @@ OWNER(velavokr) -LIBRARY() - -SRCS( - gd_builder.cpp - gd_entry.cpp -) - -PEERDIR( +LIBRARY() + +SRCS( + gd_builder.cpp + gd_entry.cpp +) + +PEERDIR( library/cpp/containers/comptrie library/cpp/string_utils/relaxed_escaper -) - +) + END() diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp index 650fe7cdfd..391662fb0d 100644 --- a/library/cpp/codecs/huffman_codec.cpp +++ b/library/cpp/codecs/huffman_codec.cpp @@ -1,14 +1,14 @@ -#include "huffman_codec.h" +#include "huffman_codec.h" #include <library/cpp/bit_io/bitinput.h> #include <library/cpp/bit_io/bitoutput.h> - -#include <util/generic/algorithm.h> + +#include <util/generic/algorithm.h> #include <util/generic/bitops.h> -#include <util/stream/buffer.h> -#include <util/stream/length.h> -#include <util/string/printf.h> - -namespace NCodecs { +#include <util/stream/buffer.h> +#include <util/stream/length.h> +#include <util/string/printf.h> + +namespace NCodecs { template <typename T> struct TCanonicalCmp { bool operator()(const T& a, const T& b) const { @@ -19,40 +19,40 @@ namespace NCodecs { } } }; - + template <typename T> struct TByCharCmp { bool operator()(const T& a, const T& b) const { - return a.Char < b.Char; - } + return a.Char < b.Char; + } }; - + struct TTreeEntry { static const ui32 InvalidBranch = (ui32)-1; - + ui64 Freq = 0; ui32 Branches[2]{InvalidBranch, InvalidBranch}; - + ui32 CodeLength = 0; ui8 Char = 0; bool Invalid = false; - + TTreeEntry() = default; - + static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) { return a.Freq < b.Freq; } - + static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) { return a.Freq > b.Freq; } }; - + using TCodeTree = TVector<TTreeEntry>; - + void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) { tree.reserve(255 * 256 / 2); // worst case - balanced tree - + for (ui32 i = 0; i < 256; ++i) { tree.emplace_back(); tree.back().Char = i; @@ -72,24 +72,24 @@ namespace NCodecs { for (ui64 i = 0; i < r.size(); ++i) ++freqs[(ui8)r[i]]; } - + InitTreeByFreqs(tree, freqs); - } - + } + void CalculateCodeLengths(TCodeTree& tree) { Y_ENSURE(tree.size() == 256, " "); const ui32 firstbranch = tree.size(); - + ui32 curleaf = 0; ui32 curbranch = firstbranch; - + // building code tree. two priority queues are combined in one. while (firstbranch - curleaf + tree.size() - curbranch >= 2) { TTreeEntry e; - + for (auto& branche : e.Branches) { ui32 br; - + if (curleaf >= firstbranch) br = curbranch++; else if (curbranch >= tree.size()) @@ -98,84 +98,84 @@ namespace NCodecs { br = curleaf++; else br = curbranch++; - + Y_ENSURE(br < tree.size(), " "); branche = br; e.Freq += tree[br].Freq; } - + tree.push_back(e); PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev); - } - + } + // computing code lengths for (ui64 i = tree.size() - 1; i >= firstbranch; --i) { TTreeEntry e = tree[i]; - + for (auto branche : e.Branches) tree[branche].CodeLength = e.CodeLength + 1; } - + // chopping off the branches tree.resize(firstbranch); - + Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>()); - + // simplification: we are stripping codes longer than 64 bits while (!tree.empty() && tree.back().CodeLength > 64) tree.pop_back(); - + // will not compress if (tree.empty()) return; - + // special invalid code word tree.back().Invalid = true; } - + struct TEncoderEntry { ui64 Code = 0; - + ui8 CodeLength = 0; ui8 Char = 0; ui8 Invalid = true; - + explicit TEncoderEntry(TTreeEntry e) : CodeLength(e.CodeLength) , Char(e.Char) , Invalid(e.Invalid) { } - + TEncoderEntry() = default; }; - + struct TEncoderTable { TEncoderEntry Entries[256]; - + void Save(IOutputStream* out) const { ui16 nval = 0; - + for (auto entrie : Entries) nval += !entrie.Invalid; - + ::Save(out, nval); - + for (auto entrie : Entries) { if (!entrie.Invalid) { ::Save(out, entrie.Char); ::Save(out, entrie.CodeLength); } - } - } - + } + } + void Load(IInputStream* in) { ui16 nval = 0; ::Load(in, nval); - + for (ui32 i = 0; i < 256; ++i) Entries[i].Char = i; - + for (ui32 i = 0; i < nval; ++i) { ui8 ch = 0; ui8 len = 0; @@ -184,15 +184,15 @@ namespace NCodecs { Entries[ch].CodeLength = len; Entries[ch].Invalid = false; } - } + } }; - + struct TDecoderEntry { ui32 NextTable : 10; ui32 Char : 8; ui32 Invalid : 1; ui32 Bad : 1; - + TDecoderEntry() : NextTable() , Char() @@ -201,27 +201,27 @@ namespace NCodecs { { } }; - + struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> { ui64 Length = 0; ui64 BaseCode = 0; - + TDecoderEntry Entries[256]; - + TDecoderTable() { Zero(Entries); } }; - + const int CACHE_BITS_COUNT = 16; class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> { TEncoderTable Encoder; TDecoderTable Decoder[256]; - + TEncoderEntry Invalid; - + ui32 SubTablesNum; - + class THuffmanCache { struct TCacheEntry { int EndOffset : 24; @@ -230,7 +230,7 @@ namespace NCodecs { TVector<char> DecodeCache; TVector<TCacheEntry> CacheEntries; const TImpl& Original; - + public: THuffmanCache(const THuffmanCodec::TImpl& encoder); @@ -252,51 +252,51 @@ namespace NCodecs { if (in.empty()) { return 0; } - + out.Reserve(in.size() * 2); - + { NBitIO::TBitOutputVector<TBuffer> bout(&out); TStringBuf tin = in; - + // data is under compression bout.Write(1, 1); - + for (auto t : tin) { const TEncoderEntry& ce = Encoder.Entries[(ui8)t]; - + bout.Write(ce.Code, ce.CodeLength); - + if (ce.Invalid) { bout.Write(t, 8); } } - + // in canonical huffman coding there cannot be a code having no 0 in the suffix // and shorter than 8 bits. bout.Write((ui64)-1, bout.GetByteReminder()); return bout.GetByteReminder(); - } - } - + } + } + void Decode(TStringBuf in, TBuffer& out) const { out.Clear(); - + if (in.empty()) { return; } - + NBitIO::TBitInput bin(in); ui64 f = 0; bin.ReadK<1>(f); - + // if data is uncompressed if (!f) { in.Skip(1); out.Append(in.data(), in.size()); } else { out.Reserve(in.size() * 8); - + if (Cache.Get()) { Cache->Decode(bin, out); } else { @@ -304,36 +304,36 @@ namespace NCodecs { } } } - } - + } + Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const { const TDecoderTable* table = Decoder; TDecoderEntry e; - + int bitsRead = 0; while (true) { ui64 code = 0; - + if (Y_UNLIKELY(!bin.Read(code, table->Length))) return 0; bitsRead += table->Length; - + if (Y_UNLIKELY(code < table->BaseCode)) return 0; - + code -= table->BaseCode; - + if (Y_UNLIKELY(code > 255)) return 0; - + e = table->Entries[code]; - + if (Y_UNLIKELY(e.Bad)) return 0; - + if (e.NextTable) { table = Decoder + e.NextTable; - } else { + } else { if (e.Invalid) { code = 0; bin.ReadK<8>(code); @@ -344,77 +344,77 @@ namespace NCodecs { } return bitsRead; - } + } } - + Y_ENSURE(false, " could not decode input"); return 0; - } - + } + void GenerateEncoder(TCodeTree& tree) { const ui64 sz = tree.size(); - + TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]); - + for (ui32 i = 1; i < sz; ++i) { const TTreeEntry& te = tree[i]; TEncoderEntry& e = Encoder.Entries[te.Char]; e = TEncoderEntry(te); - + e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); lastcode = e; - + e.Code = ReverseBits(e.Code, e.CodeLength); - + if (e.Invalid) Invalid = e; } - + for (auto& e : Encoder.Entries) { if (e.Invalid) e = Invalid; Y_ENSURE(e.CodeLength, " "); } - } - + } + void RegenerateEncoder() { for (auto& entrie : Encoder.Entries) { if (entrie.Invalid) entrie.CodeLength = Invalid.CodeLength; } - + Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>()); - + TEncoderEntry lastcode = Encoder.Entries[0]; - + for (ui32 i = 1; i < 256; ++i) { TEncoderEntry& e = Encoder.Entries[i]; e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); lastcode = e; - + e.Code = ReverseBits(e.Code, e.CodeLength); } - + for (auto& entrie : Encoder.Entries) { if (entrie.Invalid) { Invalid = entrie; break; } } - + Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>()); - + for (auto& entrie : Encoder.Entries) { if (entrie.Invalid) entrie = Invalid; - } - } - + } + } + void BuildDecoder() { TEncoderTable enc = Encoder; Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>()); - + TEncoderEntry& e1 = enc.Entries[0]; Decoder[0].BaseCode = e1.Code; Decoder[0].Length = e1.CodeLength; @@ -423,22 +423,22 @@ namespace NCodecs { SetEntry(Decoder, e2.Code, e2.CodeLength, e2); } Cache.Reset(new THuffmanCache(*this)); - } - + } + void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) { Y_ENSURE(len >= t->Length, len << " < " << t->Length); - + ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode; TDecoderEntry& d = t->Entries[idx]; - + if (len == t->Length) { Y_ENSURE(!d.NextTable, " "); - + d.Char = e.Char; d.Invalid = e.Invalid; return; } - + if (!d.NextTable) { Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " "); d.NextTable = SubTablesNum++; @@ -446,10 +446,10 @@ namespace NCodecs { nt->Length = Min<ui64>(8, len - t->Length); nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length); } - + SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e); - } - + } + void Learn(ISequenceReader* in) { { TCodeTree tree; @@ -459,11 +459,11 @@ namespace NCodecs { GenerateEncoder(tree); } BuildDecoder(); - } - + } + void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { - TCodeTree tree; - + TCodeTree tree; + ui64 freqsArray[256]; Zero(freqsArray); @@ -491,7 +491,7 @@ namespace NCodecs { BuildDecoder(); } }; - + THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec) : Original(codec) { @@ -512,7 +512,7 @@ namespace NCodecs { CacheEntries[i] = e; break; } - + for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) { DecodeCache.push_back(*it); } @@ -558,32 +558,32 @@ namespace NCodecs { MyTraits.SizeOnDecodeMultiplier = 8; MyTraits.RecommendedSampleSize = 1 << 21; } - + THuffmanCodec::~THuffmanCodec() = default; - + ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const { if (Y_UNLIKELY(!Trained)) ythrow TCodecException() << " not trained"; - + return Impl->Encode(in, bbb); } - + void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const { Impl->Decode(in, bbb); } - + void THuffmanCodec::Save(IOutputStream* out) const { Impl->Save(out); } - + void THuffmanCodec::Load(IInputStream* in) { Impl->Load(in); } - + void THuffmanCodec::DoLearn(ISequenceReader& in) { Impl->Learn(&in); } - + void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { Impl->LearnByFreqs(freqs); Trained = true; diff --git a/library/cpp/codecs/huffman_codec.h b/library/cpp/codecs/huffman_codec.h index 559545b90d..1c00a80637 100644 --- a/library/cpp/codecs/huffman_codec.h +++ b/library/cpp/codecs/huffman_codec.h @@ -1,33 +1,33 @@ -#pragma once - -#include "codecs.h" - -#include <util/generic/ptr.h> +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> #include <util/string/cast.h> - -namespace NCodecs { + +namespace NCodecs { // for types greater than char, pipeline with TFreqCodec. - + class THuffmanCodec: public ICodec { class TImpl; TIntrusivePtr<TImpl> Impl; - + public: THuffmanCodec(); ~THuffmanCodec() override; - + static TStringBuf MyName() { return "huffman"; } - + TString GetName() const override { return ToString(MyName()); } - + ui8 Encode(TStringBuf in, TBuffer& bbb) const override; - + void Decode(TStringBuf in, TBuffer& bbb) const override; - + void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs); protected: @@ -35,5 +35,5 @@ namespace NCodecs { void Save(IOutputStream* out) const override; void Load(IInputStream* in) override; }; - -} + +} diff --git a/library/cpp/codecs/pfor_codec.cpp b/library/cpp/codecs/pfor_codec.cpp index f6b3b0920b..3b51c99afa 100644 --- a/library/cpp/codecs/pfor_codec.cpp +++ b/library/cpp/codecs/pfor_codec.cpp @@ -1,6 +1,6 @@ -#include "pfor_codec.h" - -namespace NCodecs { +#include "pfor_codec.h" + +namespace NCodecs { template <> TStringBuf TPForCodec<ui64, true>::MyName() { return "pfor-delta64-sorted"; @@ -9,7 +9,7 @@ namespace NCodecs { TStringBuf TPForCodec<ui32, true>::MyName() { return "pfor-delta32-sorted"; } - + template <> TStringBuf TPForCodec<ui64, false>::MyName() { return "pfor-ui64"; @@ -18,5 +18,5 @@ namespace NCodecs { TStringBuf TPForCodec<ui32, false>::MyName() { return "pfor-ui32"; } - -} + +} diff --git a/library/cpp/codecs/pfor_codec.h b/library/cpp/codecs/pfor_codec.h index d7d4bb8bf4..b0207512ac 100644 --- a/library/cpp/codecs/pfor_codec.h +++ b/library/cpp/codecs/pfor_codec.h @@ -1,48 +1,48 @@ -#pragma once - -#include "codecs.h" - -#include "delta_codec.h" -#include "tls_cache.h" - +#pragma once + +#include "codecs.h" + +#include "delta_codec.h" +#include "tls_cache.h" + #include <library/cpp/bit_io/bitinput.h> #include <library/cpp/bit_io/bitoutput.h> #include <util/string/cast.h> - -namespace NCodecs { + +namespace NCodecs { template <typename T, bool WithDelta = false> class TPForCodec: public ICodec { using TUnsigned = std::make_unsigned_t<T>; typedef TDeltaCodec<TUnsigned> TDCodec; - + typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue; static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value"); - + static const ui64 BitsInT = sizeof(TUnsigned) * 8; - + TDCodec DeltaCodec; - + public: static TStringBuf MyName(); - + TPForCodec() { MyTraits.AssumesStructuredInput = true; MyTraits.SizeOfInputElement = sizeof(T); MyTraits.SizeOnDecodeMultiplier = sizeof(T); } - + TString GetName() const override { return ToString(MyName()); } - + ui8 Encode(TStringBuf s, TBuffer& b) const override { b.Clear(); if (s.empty()) { return 0; } - + b.Reserve(2 * s.size() + b.Size()); - + if (WithDelta) { auto buffer = TBufferTlsCache::TlsInstance().Item(); TBuffer& db = buffer.Get(); @@ -51,50 +51,50 @@ namespace NCodecs { DeltaCodec.Encode(s, db); s = TStringBuf{db.data(), db.size()}; } - + TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)}; - + const ui64 sz = tin.size(); ui64 bitcounts[BitsInT + 1]; Zero(bitcounts); - + ui32 zeros = 0; - + for (const TValue* it = tin.begin(); it != tin.end(); ++it) { TUnsigned v = 1 + (TUnsigned)*it; ui64 l = MostSignificantBit(v) + 1; ++bitcounts[l]; - + if (!v) { ++zeros; } } - + // cumulative bit counts for (ui64 i = 0; i < BitsInT; ++i) { bitcounts[i + 1] += bitcounts[i]; - } - + } + bool hasexceptions = zeros; ui64 optimalbits = BitsInT; - + { ui64 excsize = 0; ui64 minsize = sz * BitsInT; - + for (ui64 current = BitsInT; current; --current) { ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6); - + excsize += current * bitcounts[current]; - + if (size < minsize) { minsize = size; optimalbits = current; hasexceptions = zeros || sz - bitcounts[current]; } - } - } - + } + } + if (!optimalbits || BitsInT == optimalbits) { b.Append((ui8)-1); b.Append(s.data(), s.size()); @@ -104,7 +104,7 @@ namespace NCodecs { bout.Write(0, 1); bout.Write(hasexceptions, 1); bout.Write(optimalbits, 6); - + for (const TValue* it = tin.begin(); it != tin.end(); ++it) { TUnsigned word = 1 + (TUnsigned)*it; ui64 len = MostSignificantBit(word) + 1; @@ -116,29 +116,29 @@ namespace NCodecs { } else { bout.Write(word, optimalbits); } - } - + } + return bout.GetByteReminder(); } // the rest of the last byte is zero padded. BitsInT is always > 7. - } - + } + void Decode(TStringBuf s, TBuffer& b) const override { b.Clear(); if (s.empty()) { return; } - + b.Reserve(s.size() * sizeof(T) + b.Size()); - + ui64 isplain = 0; ui64 hasexceptions = 0; ui64 bits = 0; - + NBitIO::TBitInput bin(s); bin.ReadK<1>(isplain); bin.ReadK<1>(hasexceptions); bin.ReadK<6>(bits); - + if (Y_UNLIKELY(isplain)) { s.Skip(1); @@ -147,17 +147,17 @@ namespace NCodecs { } else { b.Append(s.data(), s.size()); } - } else { + } else { typename TDCodec::TDecoder decoder; - + if (hasexceptions) { ui64 word = 0; while (bin.Read(word, bits)) { if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) { --word; - + TValue t = word; - + if (WithDelta) { if (decoder.Decode(t)) { TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)}; @@ -166,46 +166,46 @@ namespace NCodecs { } else { TStringBuf r{(char*)&t, sizeof(t)}; b.Append(r.data(), r.size()); - } - } - } + } + } + } } else { ui64 word = 0; T outarr[256 / sizeof(T)]; ui32 cnt = 0; while (true) { ui64 v = bin.Read(word, bits); - + if ((!v) | (!word)) break; - + --word; TValue t = word; - + if (WithDelta) { if (decoder.Decode(t)) { outarr[cnt++] = decoder.Result; } } else { outarr[cnt++] = t; - } + } if (cnt == Y_ARRAY_SIZE(outarr)) { b.Append((const char*)outarr, sizeof(outarr)); cnt = 0; } - } - + } + if (cnt) { b.Append((const char*)outarr, cnt * sizeof(T)); - } - } - } - } - + } + } + } + } + protected: void DoLearn(ISequenceReader&) override { } }; - -} + +} diff --git a/library/cpp/codecs/sample.h b/library/cpp/codecs/sample.h index 15f03afcc5..bce37e6a2c 100644 --- a/library/cpp/codecs/sample.h +++ b/library/cpp/codecs/sample.h @@ -1,89 +1,89 @@ -#pragma once - +#pragma once + #include <library/cpp/deprecated/accessors/accessors.h> - -#include <util/generic/buffer.h> -#include <util/generic/vector.h> -#include <util/random/fast.h> -#include <util/random/shuffle.h> - -#include <functional> -#include <type_traits> - -namespace NCodecs { - class ISequenceReader { - public: - virtual bool NextRegion(TStringBuf& s) = 0; - - virtual ~ISequenceReader() = default; - }; - - template <class TValue> - TStringBuf ValueToStringBuf(TValue&& t) { - return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)}; - } - - template <class TIter> + +#include <util/generic/buffer.h> +#include <util/generic/vector.h> +#include <util/random/fast.h> +#include <util/random/shuffle.h> + +#include <functional> +#include <type_traits> + +namespace NCodecs { + class ISequenceReader { + public: + virtual bool NextRegion(TStringBuf& s) = 0; + + virtual ~ISequenceReader() = default; + }; + + template <class TValue> + TStringBuf ValueToStringBuf(TValue&& t) { + return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)}; + } + + template <class TIter> TStringBuf IterToStringBuf(TIter iter) { - return ValueToStringBuf(*iter); - } - - template <class TItem> + return ValueToStringBuf(*iter); + } + + template <class TItem> class TSimpleSequenceReader: public ISequenceReader { const TVector<TItem>& Items; - size_t Idx = 0; - - public: + size_t Idx = 0; + + public: TSimpleSequenceReader(const TVector<TItem>& items) - : Items(items) + : Items(items) { } - - bool NextRegion(TStringBuf& s) override { - if (Idx >= Items.size()) { - return false; - } - - s = ValueToStringBuf(Items[Idx++]); - return true; - } - }; - - template <class TIter, class TGetter> - size_t GetInputSize(TIter begin, TIter end, TGetter getter) { - size_t totalBytes = 0; - for (TIter iter = begin; iter != end; ++iter) { - totalBytes += getter(iter).size(); - } - return totalBytes; - } - - template <class TIter> - size_t GetInputSize(TIter begin, TIter end) { - return GetInputSize(begin, end, IterToStringBuf<TIter>); - } - - template <class TIter, class TGetter> + + bool NextRegion(TStringBuf& s) override { + if (Idx >= Items.size()) { + return false; + } + + s = ValueToStringBuf(Items[Idx++]); + return true; + } + }; + + template <class TIter, class TGetter> + size_t GetInputSize(TIter begin, TIter end, TGetter getter) { + size_t totalBytes = 0; + for (TIter iter = begin; iter != end; ++iter) { + totalBytes += getter(iter).size(); + } + return totalBytes; + } + + template <class TIter> + size_t GetInputSize(TIter begin, TIter end) { + return GetInputSize(begin, end, IterToStringBuf<TIter>); + } + + template <class TIter, class TGetter> TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes, TGetter getter) { - TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce}; - - size_t totalBytes = GetInputSize(begin, end, getter); - double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes); - + TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce}; + + size_t totalBytes = GetInputSize(begin, end, getter); + double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes); + TVector<TBuffer> result; - for (TIter iter = begin; iter != end; ++iter) { - if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) { - TStringBuf reg = getter(iter); + for (TIter iter = begin; iter != end; ++iter) { + if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) { + TStringBuf reg = getter(iter); result.emplace_back(reg.data(), reg.size()); - } - } - Shuffle(result.begin(), result.end(), rng); - return result; - } - - template <class TIter> + } + } + Shuffle(result.begin(), result.end(), rng); + return result; + } + + template <class TIter> TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes) { - return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>); - } - -} + return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>); + } + +} diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp index d0692fe2a4..088bdead19 100644 --- a/library/cpp/codecs/solar_codec.cpp +++ b/library/cpp/codecs/solar_codec.cpp @@ -1,36 +1,36 @@ -#include "solar_codec.h" - +#include "solar_codec.h" + #include <library/cpp/codecs/greedy_dict/gd_builder.h> - + #include <library/cpp/containers/comptrie/comptrie_builder.h> #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> -#include <util/stream/length.h> -#include <util/string/printf.h> -#include <util/ysaveload.h> - -namespace NCodecs { +#include <util/stream/length.h> +#include <util/string/printf.h> +#include <util/ysaveload.h> + +namespace NCodecs { static inline ui32 Append(TBuffer& pool, TStringBuf data) { pool.Append(data.data(), data.size()); return pool.Size(); } - + void TSolarCodec::DoLearn(ISequenceReader& r) { using namespace NGreedyDict; - + Decoder.clear(); Pool.Clear(); - + THolder<TEntrySet> set; - + { TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); TStringBufs bufs; - + TStringBuf m; while (r.NextRegion(m)) { bufs.push_back(pool.AppendString(m)); } - + { TDictBuilder b(Settings); b.SetInput(bufs); @@ -38,66 +38,66 @@ namespace NCodecs { set = b.ReleaseEntrySet(); } - } - + } + set->SetScores(ES_LEN_COUNT); - { + { TVector<std::pair<float, TStringBuf>> tmp; tmp.reserve(set->size()); - + for (const auto& it : *set) { tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); } - + Sort(tmp.begin(), tmp.end()); - + Decoder.reserve(tmp.size() + 1); Decoder.push_back(0); - + for (const auto& it : tmp) { Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); ui32 endoff = Append(Pool, it.second); Decoder.push_back(endoff); } - } - + } + Pool.ShrinkToFit(); Decoder.shrink_to_fit(); - + TBufferOutput bout; - + { TVector<std::pair<TStringBuf, ui32>> tmp2; tmp2.reserve(Decoder.size()); - + for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { TStringBuf s = DoDecode(i); tmp2.push_back(std::make_pair(s, i - 1)); Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed"); } - + Sort(tmp2.begin(), tmp2.end()); - + { TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); for (const auto& it : tmp2) { builder.Add(it.first.data(), it.first.size(), it.second); } - + builder.Save(bout); - } - } - + } + } + Encoder.Init(TBlob::FromBuffer(bout.Buffer())); - } - + } + void TSolarCodec::Save(IOutputStream* out) const { TBlob b = Encoder.Data(); ::Save(out, (ui32)b.Size()); out->Write(b.Data(), b.Size()); } - + void TSolarCodec::Load(IInputStream* in) { ui32 sz; ::Load(in, sz); @@ -105,29 +105,29 @@ namespace NCodecs { Encoder.Init(TBlob::FromStream(lin)); Pool.Clear(); Decoder.clear(); - + TVector<std::pair<ui32, TString>> tmp; - + ui32 poolsz = 0; for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { const TString& s = it.GetKey(); tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); poolsz += Max<ui32>(s.size(), 1); } - + Sort(tmp.begin(), tmp.end()); - + Pool.Reserve(poolsz); Decoder.reserve(tmp.size() + 1); Decoder.push_back(0); - + for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); Decoder.push_back(Append(Pool, tmp[i].second)); } - + Pool.ShrinkToFit(); Decoder.shrink_to_fit(); - } - -} + } + +} diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h index 7158ae7926..e6c0b891ad 100644 --- a/library/cpp/codecs/solar_codec.h +++ b/library/cpp/codecs/solar_codec.h @@ -1,16 +1,16 @@ -#pragma once - -#include "codecs.h" +#pragma once + +#include "codecs.h" #include <library/cpp/containers/comptrie/comptrie_trie.h> #include <library/cpp/codecs/greedy_dict/gd_builder.h> - + #include <util/string/cast.h> -#include <util/string/escape.h> - -namespace NCodecs { - // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. - // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. - +#include <util/string/escape.h> + +namespace NCodecs { + // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. + // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. + struct TVarIntTraits { static const size_t MAX_VARINT32_BYTES = 5; @@ -52,7 +52,7 @@ namespace NCodecs { Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { ui32 result = static_cast<ui8>(r[0]); - r.Skip(1); + r.Skip(1); if (result >= 0x80) { Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data"); result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); @@ -100,7 +100,7 @@ namespace NCodecs { static TStringBuf MyNameShortInt() { return TStringBuf("solar-si"); } - + explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) : Settings(s) , MaxEntries(maxentries) @@ -110,7 +110,7 @@ namespace NCodecs { MyTraits.SizeOnDecodeMultiplier = 2; MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; } - + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { EncodeImpl<TVarIntTraits>(r, b); return 0; @@ -148,8 +148,8 @@ namespace NCodecs { TTraits::Write(val + 1, b); r.Skip(Max<size_t>(sz, 1)); } - } - + } + template <class TTraits> Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { b.Clear(); @@ -160,25 +160,25 @@ namespace NCodecs { TStringBuf s = DoDecode(v); b.Append(s.data(), s.size()); } - } - + } + inline bool CanUseShortInt() const { return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; } - + private: typedef TCompactTrie<char, ui32> TEncoder; typedef TVector<ui32> TDecoder; - + TBuffer Pool; TEncoder Encoder; TDecoder Decoder; - + NGreedyDict::TBuildSettings Settings; ui32 MaxEntries; ui32 MaxIterations; }; - + // Uses varints or shortints depending on the decoder size class TAdaptiveSolarCodec: public TSolarCodec { public: @@ -186,7 +186,7 @@ namespace NCodecs { : TSolarCodec(maxentries, maxiter, s) { } - + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { if (CanUseShortInt()) { EncodeImpl<TShortIntTraits>(r, b); @@ -225,7 +225,7 @@ namespace NCodecs { EncodeImpl<TShortIntTraits>(r, b); return 0; } - + void Decode(TStringBuf r, TBuffer& b) const override { DecodeImpl<TShortIntTraits>(r, b); } @@ -241,4 +241,4 @@ namespace NCodecs { } }; -} +} diff --git a/library/cpp/codecs/static/builder.cpp b/library/cpp/codecs/static/builder.cpp index 93e34a3edb..083f0fc6f6 100644 --- a/library/cpp/codecs/static/builder.cpp +++ b/library/cpp/codecs/static/builder.cpp @@ -1,39 +1,39 @@ -#include "builder.h" -#include "common.h" - +#include "builder.h" +#include "common.h" + #include <library/cpp/codecs/static/static_codec_info.pb.h> - + #include <library/cpp/codecs/codecs.h> - -#include <util/generic/yexception.h> -#include <util/string/subst.h> - -namespace NCodecs { + +#include <util/generic/yexception.h> +#include <util/string/subst.h> + +namespace NCodecs { TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo& info) { - TStaticCodecInfo result; - TCodecPtr codec = ICodec::GetInstance(info.CodecName); - Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed"); - - codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier); - { - TStringOutput sout{*result.MutableStoredCodec()}; - ICodec::Store(&sout, codec); - } - - auto& debugInfo = *result.MutableDebugInfo(); - debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec())); - debugInfo.SetCodecName(info.CodecName); - debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier); - debugInfo.SetTimestamp(info.Timestamp); - debugInfo.SetRevisionInfo(info.RevisionInfo); - debugInfo.SetTrainingSetComment(info.TrainingSetComment); - debugInfo.SetTrainingSetResId(info.TrainingSetResId); - return result; - } - + TStaticCodecInfo result; + TCodecPtr codec = ICodec::GetInstance(info.CodecName); + Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed"); + + codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier); + { + TStringOutput sout{*result.MutableStoredCodec()}; + ICodec::Store(&sout, codec); + } + + auto& debugInfo = *result.MutableDebugInfo(); + debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec())); + debugInfo.SetCodecName(info.CodecName); + debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier); + debugInfo.SetTimestamp(info.Timestamp); + debugInfo.SetRevisionInfo(info.RevisionInfo); + debugInfo.SetTrainingSetComment(info.TrainingSetComment); + debugInfo.SetTrainingSetResId(info.TrainingSetResId); + return result; + } + TString GetStandardFileName(const TStaticCodecInfo& info) { TString cName = info.GetDebugInfo().GetCodecName(); - SubstGlobal(cName, ':', '.'); - return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info"; - } -} + SubstGlobal(cName, ':', '.'); + return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info"; + } +} diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h index d7533be4d5..234ad42dff 100644 --- a/library/cpp/codecs/static/builder.h +++ b/library/cpp/codecs/static/builder.h @@ -1,29 +1,29 @@ -#pragma once - -#include "static.h" - +#pragma once + +#include "static.h" + #include <library/cpp/svnversion/svnversion.h> - -#include <util/datetime/base.h> + +#include <util/datetime/base.h> #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/string/builder.h> - -namespace NCodecs { - struct TCodecBuildInfo { - // optimal values from SEARCH-1655 +#include <util/generic/vector.h> +#include <util/string/builder.h> + +namespace NCodecs { + struct TCodecBuildInfo { + // optimal values from SEARCH-1655 TString CodecName = "solar-8k-a:zstd08d-1"; - float SampleSizeMultiplier = 1; - - // debug info: - time_t Timestamp = TInstant::Now().TimeT(); + float SampleSizeMultiplier = 1; + + // debug info: + time_t Timestamp = TInstant::Now().TimeT(); TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision())); TString TrainingSetComment; // a human comment on the training data TString TrainingSetResId; // sandbox resid of the training set - }; - + }; + TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&); - + TString GetStandardFileName(const TStaticCodecInfo&); - -} + +} diff --git a/library/cpp/codecs/static/common.h b/library/cpp/codecs/static/common.h index 211de2a27d..84b0349d82 100644 --- a/library/cpp/codecs/static/common.h +++ b/library/cpp/codecs/static/common.h @@ -1,32 +1,32 @@ -#pragma once - -#include <util/string/hex.h> -#include <util/digest/city.h> -#include <util/system/byteorder.h> - -namespace NCodecs { - template <class T> - ui64 DataSignature(const T& t) { - static_assert(!std::is_scalar<T>::value, "no scalars"); +#pragma once + +#include <util/string/hex.h> +#include <util/digest/city.h> +#include <util/system/byteorder.h> + +namespace NCodecs { + template <class T> + ui64 DataSignature(const T& t) { + static_assert(!std::is_scalar<T>::value, "no scalars"); return CityHash64(t.data(), t.size()); - } - - template <class T> + } + + template <class T> TString HexWriteScalar(T t) { - static_assert(std::is_scalar<T>::value, "scalars only"); - t = LittleToBig(t); + static_assert(std::is_scalar<T>::value, "scalars only"); + t = LittleToBig(t); TString res = HexEncode(&t, sizeof(t)); - res.to_lower(); - return res; - } - - template <class T> - T HexReadScalar(TStringBuf s) { - static_assert(std::is_scalar<T>::value, "scalars only"); - T t = 0; + res.to_lower(); + return res; + } + + template <class T> + T HexReadScalar(TStringBuf s) { + static_assert(std::is_scalar<T>::value, "scalars only"); + T t = 0; HexDecode(s.data(), Min(s.size(), sizeof(T)), &t); - t = BigToLittle(t); - return t; - } - -} + t = BigToLittle(t); + return t; + } + +} diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp index 5b750b717e..0c50a1a5be 100644 --- a/library/cpp/codecs/static/example/example.cpp +++ b/library/cpp/codecs/static/example/example.cpp @@ -1,43 +1,43 @@ -#include "example.h" - +#include "example.h" + #include <library/cpp/codecs/static/static.h> - -#include <util/generic/yexception.h> - -extern "C" { + +#include <util/generic/yexception.h> + +extern "C" { extern const ui8 codec_info_huff_20160707[]; extern const ui32 codec_info_huff_20160707Size; extern const ui8 codec_info_sa_huff_20160707[]; extern const ui32 codec_info_sa_huff_20160707Size; -}; - -namespace NStaticCodecExample { - static const NCodecs::TCodecConstPtr CODECS[] = { - nullptr, - NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size), - NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size), - }; - - static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size"); - - void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) { - Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); - out.Clear(); - if (!in) { - return; - } - CODECS[dv]->Encode(in, out); - out.Append((char)dv); - } - - void Decode(TBuffer& out, TStringBuf in) { - out.Clear(); - if (!in) { - return; - } - EDictVersion dv = (EDictVersion)in.back(); - Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); - in.Chop(1); - CODECS[dv]->Decode(in, out); - } -} +}; + +namespace NStaticCodecExample { + static const NCodecs::TCodecConstPtr CODECS[] = { + nullptr, + NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size), + NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size), + }; + + static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size"); + + void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) { + Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); + out.Clear(); + if (!in) { + return; + } + CODECS[dv]->Encode(in, out); + out.Append((char)dv); + } + + void Decode(TBuffer& out, TStringBuf in) { + out.Clear(); + if (!in) { + return; + } + EDictVersion dv = (EDictVersion)in.back(); + Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); + in.Chop(1); + CODECS[dv]->Decode(in, out); + } +} diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h index f9b3a7324b..070ca90f02 100644 --- a/library/cpp/codecs/static/example/example.h +++ b/library/cpp/codecs/static/example/example.h @@ -1,17 +1,17 @@ -#pragma once - -#include <util/generic/strbuf.h> -#include <util/generic/buffer.h> - -namespace NStaticCodecExample { +#pragma once + +#include <util/generic/strbuf.h> +#include <util/generic/buffer.h> + +namespace NStaticCodecExample { enum EDictVersion : ui8 { DV_NULL = 0, DV_HUFF_20160707, DV_SA_HUFF_20160707, DV_COUNT - }; - - void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707); - - void Decode(TBuffer&, TStringBuf); -} + }; + + void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707); + + void Decode(TBuffer&, TStringBuf); +} diff --git a/library/cpp/codecs/static/example/ya.make b/library/cpp/codecs/static/example/ya.make index ca6c5fd900..85dc222624 100644 --- a/library/cpp/codecs/static/example/ya.make +++ b/library/cpp/codecs/static/example/ya.make @@ -1,24 +1,24 @@ -LIBRARY() - -OWNER(velavokr) - -SRCS( - GLOBAL example.cpp -) - -PEERDIR( +LIBRARY() + +OWNER(velavokr) + +SRCS( + GLOBAL example.cpp +) + +PEERDIR( library/cpp/codecs library/cpp/codecs/static -) - -ARCHIVE_ASM( +) + +ARCHIVE_ASM( "solar-8k-a.huffman.1467494385.codec_info" NAME codec_info_sa_huff_20160707 -) - -ARCHIVE_ASM( +) + +ARCHIVE_ASM( "huffman.1467494385.codec_info" NAME codec_info_huff_20160707 -) - -END() +) + +END() diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp index 44a07dd73a..d2c99a15ee 100644 --- a/library/cpp/codecs/static/static.cpp +++ b/library/cpp/codecs/static/static.cpp @@ -1,98 +1,98 @@ -#include "static.h" -#include "common.h" - +#include "static.h" +#include "common.h" + #include <library/cpp/codecs/static/static_codec_info.pb.h> #include <library/cpp/archive/yarchive.h> - -#include <util/draft/datetime.h> - -#include <util/string/builder.h> -#include <util/stream/buffer.h> -#include <util/stream/mem.h> -#include <util/string/hex.h> -#include <util/ysaveload.h> - -namespace NCodecs { + +#include <util/draft/datetime.h> + +#include <util/string/builder.h> +#include <util/stream/buffer.h> +#include <util/stream/mem.h> +#include <util/string/hex.h> +#include <util/ysaveload.h> + +namespace NCodecs { static constexpr TStringBuf STATIC_CODEC_INFO_MAGIC = "CodecInf"; - - static TStringBuf GetStaticCodecInfoMagic() { + + static TStringBuf GetStaticCodecInfoMagic() { return STATIC_CODEC_INFO_MAGIC; - } - + } + void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo& info) { - TBufferOutput bout; + TBufferOutput bout; info.SerializeToArcadiaStream(&bout); - ui64 hash = DataSignature(bout.Buffer()); - out.Write(GetStaticCodecInfoMagic()); - ::Save(&out, hash); - ::Save(&out, bout.Buffer()); - } - + ui64 hash = DataSignature(bout.Buffer()); + out.Write(GetStaticCodecInfoMagic()); + ::Save(&out, hash); + ::Save(&out, bout.Buffer()); + } + TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in) { - { - TBuffer magic; + { + TBuffer magic; magic.Resize(GetStaticCodecInfoMagic().size()); Y_ENSURE_EX(in.Read(magic.Data(), GetStaticCodecInfoMagic().size()) == GetStaticCodecInfoMagic().size(), - TCodecException() << "bad codec info"); + TCodecException() << "bad codec info"); Y_ENSURE_EX(TStringBuf(magic.data(), magic.size()) == GetStaticCodecInfoMagic(), - TCodecException() << "bad codec info"); - } - - ui64 hash; - ::Load(&in, hash); - TBuffer info; - ::Load(&in, info); - Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info"); - - TStaticCodecInfo result; + TCodecException() << "bad codec info"); + } + + ui64 hash; + ::Load(&in, hash); + TBuffer info; + ::Load(&in, info); + Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info"); + + TStaticCodecInfo result; Y_ENSURE_EX(result.ParseFromArray(info.data(), info.size()), TCodecException() << "bad codec info"); - - return result; - } - + + return result; + } + TString SaveCodecInfoToString(const TStaticCodecInfo& info) { - TStringStream s; - SaveCodecInfoToStream(s, info); - return s.Str(); - } - - TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) { + TStringStream s; + SaveCodecInfoToStream(s, info); + return s.Str(); + } + + TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) { TMemoryInput m{data.data(), data.size()}; - return LoadCodecInfoFromStream(m); - } - + return LoadCodecInfoFromStream(m); + } + TString FormatCodecInfo(const TStaticCodecInfo& ci) { - TStringBuilder s; - s << "codec name: " << ci.GetDebugInfo().GetCodecName() << Endl; - s << "codec hash: " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl; - s << "dict size: " << ci.GetStoredCodec().Size() << Endl; - s << "sample mult: " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl; - s << "orig.compress: " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl; - s << "timestamp: " << ci.GetDebugInfo().GetTimestamp() << " (" + TStringBuilder s; + s << "codec name: " << ci.GetDebugInfo().GetCodecName() << Endl; + s << "codec hash: " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl; + s << "dict size: " << ci.GetStoredCodec().Size() << Endl; + s << "sample mult: " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl; + s << "orig.compress: " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl; + s << "timestamp: " << ci.GetDebugInfo().GetTimestamp() << " (" << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString() << ")" << Endl; - s << "revision: " << ci.GetDebugInfo().GetRevisionInfo() << Endl; - s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl; - s << "training set resId: " << ci.GetDebugInfo().GetTrainingSetResId() << Endl; - return s; - } - + s << "revision: " << ci.GetDebugInfo().GetRevisionInfo() << Endl; + s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl; + s << "training set resId: " << ci.GetDebugInfo().GetTrainingSetResId() << Endl; + return s; + } + TString LoadStringFromArchive(const ui8* begin, size_t size) { - TArchiveReader ar(TBlob::NoCopy(begin, size)); - Y_VERIFY(ar.Count() == 1, "invalid number of entries"); - auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0)); + TArchiveReader ar(TBlob::NoCopy(begin, size)); + Y_VERIFY(ar.Count() == 1, "invalid number of entries"); + auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0)); return TString{blob.AsCharPtr(), blob.Size()}; - } - - TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) { - return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec()); - } - - TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) { - const auto& data = LoadStringFromArchive(begin, size); - const auto& info = LoadCodecInfoFromString(data); - const auto& codec = RestoreCodecFromCodecInfo(info); - Y_ENSURE_EX(codec, TCodecException() << "null codec"); - return codec; - } -} + } + + TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) { + return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec()); + } + + TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) { + const auto& data = LoadStringFromArchive(begin, size); + const auto& info = LoadCodecInfoFromString(data); + const auto& codec = RestoreCodecFromCodecInfo(info); + Y_ENSURE_EX(codec, TCodecException() << "null codec"); + return codec; + } +} diff --git a/library/cpp/codecs/static/static.h b/library/cpp/codecs/static/static.h index c1eaed2a74..efa9c60c22 100644 --- a/library/cpp/codecs/static/static.h +++ b/library/cpp/codecs/static/static.h @@ -1,34 +1,34 @@ -#pragma once - +#pragma once + #include <library/cpp/codecs/codecs.h> - -#include <util/generic/strbuf.h> + +#include <util/generic/strbuf.h> #include <util/generic/string.h> #include <util/stream/output.h> - -namespace NCodecs { - class TStaticCodecInfo; - - // load - - TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&); - - TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data); - + +namespace NCodecs { + class TStaticCodecInfo; + + // load + + TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&); + + TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data); + TString LoadStringFromArchive(const ui8* begin, size_t size); - - TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size); - - // save - + + TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size); + + // save + TString SaveCodecInfoToString(const TStaticCodecInfo&); - + void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo&); - - // misc - + + // misc + TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in); - + TString FormatCodecInfo(const TStaticCodecInfo&); - -} + +} diff --git a/library/cpp/codecs/static/static_codec_info.proto b/library/cpp/codecs/static/static_codec_info.proto index 362abb4dad..178459784b 100644 --- a/library/cpp/codecs/static/static_codec_info.proto +++ b/library/cpp/codecs/static/static_codec_info.proto @@ -1,17 +1,17 @@ -package NCodecs; - -message TStaticCodecInfo { - message TDebugInfo { - optional string CodecName = 1; // the exact codec variant name - optional uint64 Timestamp = 2; // when the codec was built - optional string RevisionInfo = 3; // the arcadia revision info - optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression - optional float Compression = 5; // the compression on the training set ((raw_size - coded_size) / raw_size) - optional string TrainingSetComment = 6; // a human readable description of the training set - optional string TrainingSetResId = 7; // the training set sandbox resource id - optional uint64 StoredCodecHash = 8; // cityhash64(data) - } - - optional bytes StoredCodec = 1; // the data of the codec - optional TDebugInfo DebugInfo = 2; // misc debug info which could be useful in finding whereabouts later -} +package NCodecs; + +message TStaticCodecInfo { + message TDebugInfo { + optional string CodecName = 1; // the exact codec variant name + optional uint64 Timestamp = 2; // when the codec was built + optional string RevisionInfo = 3; // the arcadia revision info + optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression + optional float Compression = 5; // the compression on the training set ((raw_size - coded_size) / raw_size) + optional string TrainingSetComment = 6; // a human readable description of the training set + optional string TrainingSetResId = 7; // the training set sandbox resource id + optional uint64 StoredCodecHash = 8; // cityhash64(data) + } + + optional bytes StoredCodec = 1; // the data of the codec + optional TDebugInfo DebugInfo = 2; // misc debug info which could be useful in finding whereabouts later +} diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp index fe77691280..cea40506e1 100644 --- a/library/cpp/codecs/static/tools/common/ct_common.cpp +++ b/library/cpp/codecs/static/tools/common/ct_common.cpp @@ -1,74 +1,74 @@ -#include "ct_common.h" - +#include "ct_common.h" + #include <library/cpp/codecs/codecs.h> #include <library/cpp/codecs/static/static_codec_info.pb.h> #include <library/cpp/string_utils/base64/base64.h> - + #include <util/stream/output.h> -#include <util/string/builder.h> -#include <util/system/hp_timer.h> - -namespace NCodecs { +#include <util/string/builder.h> +#include <util/system/hp_timer.h> + +namespace NCodecs { TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const { - TStringBuilder s; - s << "raw size/item: " << RawSizePerRecord() << Endl; - s << "enc.size/item: " << EncSizePerRecord() << Endl; - if (checkMode) { - s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl; - } - s << "enc time us/item: " << EncTimePerRecordUS() << Endl; - s << "dec time us/item: " << DecTimePerRecordUS() << Endl; - s << "dict size: " << info.GetStoredCodec().Size() << Endl; - s << "compression: " << AsPercent(Compression()) << " %" << Endl; - if (checkMode) { - s << "orig.compression: " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl; - } - return s; - } - + TStringBuilder s; + s << "raw size/item: " << RawSizePerRecord() << Endl; + s << "enc.size/item: " << EncSizePerRecord() << Endl; + if (checkMode) { + s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl; + } + s << "enc time us/item: " << EncTimePerRecordUS() << Endl; + s << "dec time us/item: " << DecTimePerRecordUS() << Endl; + s << "dict size: " << info.GetStoredCodec().Size() << Endl; + s << "compression: " << AsPercent(Compression()) << " %" << Endl; + if (checkMode) { + s << "orig.compression: " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl; + } + return s; + } + TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) { - TComprStats stats; - - TBuffer encodeBuffer; - TBuffer decodeBuffer; - for (const auto& data : input) { - encodeBuffer.Clear(); - decodeBuffer.Clear(); - - stats.Records += 1; + TComprStats stats; + + TBuffer encodeBuffer; + TBuffer decodeBuffer; + for (const auto& data : input) { + encodeBuffer.Clear(); + decodeBuffer.Clear(); + + stats.Records += 1; stats.RawSize += data.size(); - - THPTimer timer; - c.Encode(data, encodeBuffer); + + THPTimer timer; + c.Encode(data, encodeBuffer); stats.EncSize += encodeBuffer.size(); - stats.EncSeconds += timer.PassedReset(); - + stats.EncSeconds += timer.PassedReset(); + c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer); - stats.DecSeconds += timer.PassedReset(); + stats.DecSeconds += timer.PassedReset(); Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records); - } - - return stats; - } - + } + + return stats; + } + void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) { TStringBuf bin(blob.AsCharPtr(), blob.Size()); - TStringBuf line; + TStringBuf line; TString buffer; - while (bin.ReadLine(line)) { - if (DSF_BASE64_LF == fmt) { - Base64Decode(line, buffer); - line = buffer; - } - if (!line) { - continue; - } + while (bin.ReadLine(line)) { + if (DSF_BASE64_LF == fmt) { + Base64Decode(line, buffer); + line = buffer; + } + if (!line) { + continue; + } result.emplace_back(line.data(), line.size()); - } - } - + } + } + TBlob GetInputBlob(const TString& dataFile) { - return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin); - } - -} + return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin); + } + +} diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h index 9d3dcbda93..de531b27e6 100644 --- a/library/cpp/codecs/static/tools/common/ct_common.h +++ b/library/cpp/codecs/static/tools/common/ct_common.h @@ -1,75 +1,75 @@ -#pragma once - +#pragma once + #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/memory/blob.h> -#include <cmath> - -namespace NCodecs { - class TStaticCodecInfo; - class ICodec; - - struct TComprStats { - double EncSeconds = 0; - double DecSeconds = 0; - size_t Records = 0; - size_t RawSize = 0; - size_t EncSize = 0; - - static double Round(double n, size_t decPlaces = 2) { - double p = pow(10, decPlaces); - return round(n * p) / p; - } - - static double AsPercent(double n) { - return Round(n * 100); - } - - static double AsMicroSecond(double s) { - return s * 1000000; - } - - double PerRecord(double n) const { - return Round((double)(Records ? n / Records : 0)); - } - - double Compression() const { - return ((double)RawSize - (double)EncSize) / RawSize; - } - - double EncTimePerRecordUS() const { - return PerRecord(AsMicroSecond(EncSeconds)); - } - - double DecTimePerRecordUS() const { - return PerRecord(AsMicroSecond(DecSeconds)); - } - - double RawSizePerRecord() const { - return PerRecord(RawSize); - } - - double EncSizePerRecord() const { - return PerRecord(EncSize); - } - - double OldEncSizePerRecord(double compr) const { - return PerRecord((1 - compr) * RawSize); - } - +#include <util/generic/vector.h> +#include <util/memory/blob.h> +#include <cmath> + +namespace NCodecs { + class TStaticCodecInfo; + class ICodec; + + struct TComprStats { + double EncSeconds = 0; + double DecSeconds = 0; + size_t Records = 0; + size_t RawSize = 0; + size_t EncSize = 0; + + static double Round(double n, size_t decPlaces = 2) { + double p = pow(10, decPlaces); + return round(n * p) / p; + } + + static double AsPercent(double n) { + return Round(n * 100); + } + + static double AsMicroSecond(double s) { + return s * 1000000; + } + + double PerRecord(double n) const { + return Round((double)(Records ? n / Records : 0)); + } + + double Compression() const { + return ((double)RawSize - (double)EncSize) / RawSize; + } + + double EncTimePerRecordUS() const { + return PerRecord(AsMicroSecond(EncSeconds)); + } + + double DecTimePerRecordUS() const { + return PerRecord(AsMicroSecond(DecSeconds)); + } + + double RawSizePerRecord() const { + return PerRecord(RawSize); + } + + double EncSizePerRecord() const { + return PerRecord(EncSize); + } + + double OldEncSizePerRecord(double compr) const { + return PerRecord((1 - compr) * RawSize); + } + TString Format(const TStaticCodecInfo&, bool checkMode) const; - }; - + }; + TComprStats TestCodec(const ICodec&, const TVector<TString>& data); - - enum EDataStreamFormat { - DSF_NONE, - DSF_PLAIN_LF /* "plain" */, - DSF_BASE64_LF /* "base64" */, - }; - + + enum EDataStreamFormat { + DSF_NONE, + DSF_PLAIN_LF /* "plain" */, + DSF_BASE64_LF /* "base64" */, + }; + void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&); - + TBlob GetInputBlob(const TString& dataFile); - -} + +} diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make index d624222dad..5f575a2f28 100644 --- a/library/cpp/codecs/static/tools/common/ya.make +++ b/library/cpp/codecs/static/tools/common/ya.make @@ -1,19 +1,19 @@ -LIBRARY() - +LIBRARY() + OWNER(velavokr) - -SRCS( - ct_common.cpp -) - -PEERDIR( + +SRCS( + ct_common.cpp +) + +PEERDIR( library/cpp/codecs library/cpp/codecs/static library/cpp/getopt/small library/cpp/string_utils/base64 - util/draft -) - + util/draft +) + GENERATE_ENUM_SERIALIZATION(ct_common.h) - -END() + +END() diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README index 723a68300b..c66703227d 100644 --- a/library/cpp/codecs/static/tools/static_codec_checker/README +++ b/library/cpp/codecs/static/tools/static_codec_checker/README @@ -1,4 +1,4 @@ This is a viewer for generated codec and utility for verification of the compression quality on a new data. - + Usage: -static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt +static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp index 9c8d568d82..5ae901d8f8 100644 --- a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp +++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp @@ -3,25 +3,25 @@ #include <library/cpp/codecs/static/static_codec_info.pb.h> #include <library/cpp/codecs/codecs.h> #include <library/cpp/getopt/small/last_getopt.h> - -#include <util/digest/city.h> -#include <util/generic/yexception.h> -#include <util/stream/file.h> -#include <util/stream/buffer.h> -#include <util/stream/format.h> -#include <util/string/builder.h> - -int main(int argc, char** argv) { - NCodecs::TCodecPtr codecPtr; - NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; + +#include <util/digest/city.h> +#include <util/generic/yexception.h> +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <util/stream/format.h> +#include <util/string/builder.h> + +int main(int argc, char** argv) { + NCodecs::TCodecPtr codecPtr; + NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; TString codecFile; - bool testCompression = false; - - auto opts = NLastGetopt::TOpts::Default(); - opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator."); - opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt"); - NCodecs::TStaticCodecInfo codec; - + bool testCompression = false; + + auto opts = NLastGetopt::TOpts::Default(); + opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator."); + opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt"); + NCodecs::TStaticCodecInfo codec; + opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) { codecFile = name; codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll())); @@ -29,45 +29,45 @@ int main(int argc, char** argv) { }) .Required() .Help(".codec_info file with serialized static data for codec"); - + opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance"); - + opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format"); - - opts.SetFreeArgsMin(0); - opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files"); - - NLastGetopt::TOptsParseResult res(&opts, argc, argv); - - Cout << codecFile << Endl; - Cout << NCodecs::FormatCodecInfo(codec) << Endl; - - if (testCompression) { - if (NCodecs::DSF_NONE == fmt) { - Cerr << "Specify format (-f|--format) for testing set input" << Endl; - exit(1); - } - - Cout << "Reading testing set data ... " << Flush; - + + opts.SetFreeArgsMin(0); + opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files"); + + NLastGetopt::TOptsParseResult res(&opts, argc, argv); + + Cout << codecFile << Endl; + Cout << NCodecs::FormatCodecInfo(codec) << Endl; + + if (testCompression) { + if (NCodecs::DSF_NONE == fmt) { + Cerr << "Specify format (-f|--format) for testing set input" << Endl; + exit(1); + } + + Cout << "Reading testing set data ... " << Flush; + TVector<TString> allData; - for (const auto& freeArg : res.GetFreeArgs()) { - NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); - } - - if (!res.GetFreeArgs()) { - NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); - } - - Cout << "Done" << Endl << Endl; - - Cout << "records: " << allData.size() << Endl; - Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; - - Cout << "Testing compression ... " << Flush; - auto stats = NCodecs::TestCodec(*codecPtr, allData); - Cout << "Done" << Endl << Endl; - - Cout << stats.Format(codec, true) << Endl; - } -} + for (const auto& freeArg : res.GetFreeArgs()) { + NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); + } + + if (!res.GetFreeArgs()) { + NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); + } + + Cout << "Done" << Endl << Endl; + + Cout << "records: " << allData.size() << Endl; + Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; + + Cout << "Testing compression ... " << Flush; + auto stats = NCodecs::TestCodec(*codecPtr, allData); + Cout << "Done" << Endl << Endl; + + Cout << stats.Format(codec, true) << Endl; + } +} diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make index 90e06ca448..86b73dff6c 100644 --- a/library/cpp/codecs/static/tools/static_codec_checker/ya.make +++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make @@ -1,16 +1,16 @@ -PROGRAM() - +PROGRAM() + OWNER(velavokr) - -SRCS( - static_codec_checker.cpp -) - -PEERDIR( + +SRCS( + static_codec_checker.cpp +) + +PEERDIR( library/cpp/codecs library/cpp/codecs/static library/cpp/codecs/static/tools/common library/cpp/getopt/small -) - -END() +) + +END() diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README index e6bb52b959..f0fffd745a 100644 --- a/library/cpp/codecs/static/tools/static_codec_generator/README +++ b/library/cpp/codecs/static/tools/static_codec_generator/README @@ -1,4 +1,4 @@ This is a utility for reproducible teaching of a codec. And also for saving it into a file with a unique name for a static compilation as a resource. - + Usage: -static_codec_generator -t -m 'the training data description' -f plain samples.txt +static_codec_generator -t -m 'the training data description' -f plain samples.txt diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp index 45fdb5c5fe..b37a0f686d 100644 --- a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp +++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp @@ -2,81 +2,81 @@ #include <library/cpp/codecs/static/static_codec_info.pb.h> #include <library/cpp/codecs/static/builder.h> #include <library/cpp/codecs/codecs.h> - + #include <library/cpp/getopt/small/last_getopt.h> - -#include <util/generic/yexception.h> -#include <util/stream/file.h> -#include <util/string/builder.h> - -int main(int argc, char** argv) { - NCodecs::TCodecBuildInfo info; - NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; - - auto opts = NLastGetopt::TOpts::Default(); - opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt"); - opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin"); - + +#include <util/generic/yexception.h> +#include <util/stream/file.h> +#include <util/string/builder.h> + +int main(int argc, char** argv) { + NCodecs::TCodecBuildInfo info; + NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; + + auto opts = NLastGetopt::TOpts::Default(); + opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt"); + opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin"); + opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set"); - + opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set"); - + opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName); - + opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size"); - + opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format"); - + opts.AddLongOption("list-codecs").NoArgument().Handler0([]() { Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl; exit(0); }) .Optional() .Help("list available codecs"); - + opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info - + opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info - - opts.SetFreeArgsMin(0); - opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files"); - - NLastGetopt::TOptsParseResult res(&opts, argc, argv); - - Cout << "Reading training set data ... " << Flush; + + opts.SetFreeArgsMin(0); + opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files"); + + NLastGetopt::TOptsParseResult res(&opts, argc, argv); + + Cout << "Reading training set data ... " << Flush; TVector<TString> allData; - for (const auto& freeArg : res.GetFreeArgs()) { - NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); - } - - if (!res.GetFreeArgs()) { - NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); - } - Cout << "Done" << Endl << Endl; - - Cout << "records: " << allData.size() << Endl; - Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; - - Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush; - auto codec = NCodecs::BuildStaticCodec(allData, info); - Cout << "Done" << Endl; - + for (const auto& freeArg : res.GetFreeArgs()) { + NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); + } + + if (!res.GetFreeArgs()) { + NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); + } + Cout << "Done" << Endl << Endl; + + Cout << "records: " << allData.size() << Endl; + Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; + + Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush; + auto codec = NCodecs::BuildStaticCodec(allData, info); + Cout << "Done" << Endl; + TString codecName = NCodecs::GetStandardFileName(codec); - NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); - - Cout << "Testing compression ... " << Flush; - auto stats = NCodecs::TestCodec(*codecPtr, allData); - Cout << "Done" << Endl << Endl; - - codec.MutableDebugInfo()->SetCompression(stats.Compression()); - - Cout << stats.Format(codec, false) << Endl; - - Cout << "Saving as " << codecName << " ... " << Flush; - { + NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); + + Cout << "Testing compression ... " << Flush; + auto stats = NCodecs::TestCodec(*codecPtr, allData); + Cout << "Done" << Endl << Endl; + + codec.MutableDebugInfo()->SetCompression(stats.Compression()); + + Cout << stats.Format(codec, false) << Endl; + + Cout << "Saving as " << codecName << " ... " << Flush; + { TUnbufferedFileOutput fout{codecName}; - NCodecs::SaveCodecInfoToStream(fout, codec); - fout.Finish(); - } - Cout << "Done" << Endl << Endl; -} + NCodecs::SaveCodecInfoToStream(fout, codec); + fout.Finish(); + } + Cout << "Done" << Endl << Endl; +} diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make index efbc440dd1..21750dde49 100644 --- a/library/cpp/codecs/static/tools/static_codec_generator/ya.make +++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make @@ -1,17 +1,17 @@ -PROGRAM() - +PROGRAM() + OWNER(velavokr) - -SRCS( - static_codec_generator.cpp -) - -PEERDIR( + +SRCS( + static_codec_generator.cpp +) + +PEERDIR( library/cpp/codecs library/cpp/codecs/static library/cpp/codecs/static/tools/common library/cpp/digest/md5 library/cpp/getopt/small -) - -END() +) + +END() diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py index db4140e370..a5baa262f7 100644 --- a/library/cpp/codecs/static/tools/tests/static_codec_tools.py +++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py @@ -1,18 +1,18 @@ -#!/usr/bin/env python - -import yatest.common as tt -import os.path as op - -def test_static_codec_tools(): +#!/usr/bin/env python + +import yatest.common as tt +import os.path as op + +def test_static_codec_tools(): tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")] - + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1", - "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"], - timeout=60) - assert(op.exists("solar-8k-a.huffman.1467494385.codec_info")) + + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1", + "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"], + timeout=60) + assert(op.exists("solar-8k-a.huffman.1467494385.codec_info")) tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"), - args=["-c", "solar-8k-a.huffman.1467494385.codec_info"], - timeout=60) + args=["-c", "solar-8k-a.huffman.1467494385.codec_info"], + timeout=60) tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")] - + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"], - timeout=60) - return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info") + + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"], + timeout=60) + return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info") diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make index c5324eaf53..5555d90cae 100644 --- a/library/cpp/codecs/static/tools/tests/ya.make +++ b/library/cpp/codecs/static/tools/tests/ya.make @@ -1,20 +1,20 @@ PY2TEST() - -OWNER(velavokr) - -TEST_SRCS(static_codec_tools.py) - -DATA(sbr://143310406) - -TIMEOUT(4200) - + +OWNER(velavokr) + +TEST_SRCS(static_codec_tools.py) + +DATA(sbr://143310406) + +TIMEOUT(4200) + TAG(ya:not_autocheck) -DEPENDS( +DEPENDS( library/cpp/codecs/static/tools/static_codec_checker library/cpp/codecs/static/tools/static_codec_generator -) - - +) + + -END() +END() diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make index dd3e8437aa..ab72769153 100644 --- a/library/cpp/codecs/static/tools/ya.make +++ b/library/cpp/codecs/static/tools/ya.make @@ -1,5 +1,5 @@ -RECURSE( - common - static_codec_generator - static_codec_checker -) +RECURSE( + common + static_codec_generator + static_codec_checker +) diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp index b47c279ed1..48d5c98d5d 100644 --- a/library/cpp/codecs/static/ut/builder_ut.cpp +++ b/library/cpp/codecs/static/ut/builder_ut.cpp @@ -1,57 +1,57 @@ #include <library/cpp/testing/unittest/registar.h> #include <library/cpp/codecs/static/builder.h> #include <library/cpp/codecs/static/static_codec_info.pb.h> -#include <util/string/vector.h> - +#include <util/string/vector.h> + class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase { - UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest) + UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest) UNIT_TEST(TestBuild) - UNIT_TEST_SUITE_END(); + UNIT_TEST_SUITE_END(); -private: +private: TVector<TString> PrepareData() { TVector<TString> data; - for (ui32 i = 'a'; i <= 'z'; ++i) { + for (ui32 i = 'a'; i <= 'z'; ++i) { data.push_back(TString(1, (char)i)); - } - return data; - } - - void TestBuild() { + } + return data; + } + + void TestBuild() { TVector<TString> data; - NCodecs::TCodecBuildInfo info; - info.CodecName = "huffman"; - info.SampleSizeMultiplier = 2; - info.Timestamp = 1467494385; - info.RevisionInfo = "r2385905"; - info.TrainingSetComment = "some dummy data"; - info.TrainingSetResId = "sbr://1234"; - auto res = NCodecs::BuildStaticCodec(PrepareData(), info); - UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(), - "StoredCodec: \"\\007\\000huffman@S\\000a" - "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o" - "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>" - "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8." - "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7" - "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0" - "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9" - "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" " - "DebugInfo { " - "CodecName: \"huffman\" " - "Timestamp: 1467494385 " - "RevisionInfo: \"r2385905\" " - "SampleSizeMultiplier: 2 " - "TrainingSetComment: \"some dummy data\" " - "TrainingSetResId: \"sbr://1234\" " - "StoredCodecHash: 2509195835471488613 " - "}"); - - UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info"); - UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL); - - auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res)); - UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString()); - } -}; - -UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest); + NCodecs::TCodecBuildInfo info; + info.CodecName = "huffman"; + info.SampleSizeMultiplier = 2; + info.Timestamp = 1467494385; + info.RevisionInfo = "r2385905"; + info.TrainingSetComment = "some dummy data"; + info.TrainingSetResId = "sbr://1234"; + auto res = NCodecs::BuildStaticCodec(PrepareData(), info); + UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(), + "StoredCodec: \"\\007\\000huffman@S\\000a" + "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o" + "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>" + "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8." + "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7" + "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0" + "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9" + "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" " + "DebugInfo { " + "CodecName: \"huffman\" " + "Timestamp: 1467494385 " + "RevisionInfo: \"r2385905\" " + "SampleSizeMultiplier: 2 " + "TrainingSetComment: \"some dummy data\" " + "TrainingSetResId: \"sbr://1234\" " + "StoredCodecHash: 2509195835471488613 " + "}"); + + UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info"); + UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL); + + auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res)); + UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString()); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest); diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp index 57e1e62887..fda9ffcccb 100644 --- a/library/cpp/codecs/static/ut/static_ut.cpp +++ b/library/cpp/codecs/static/ut/static_ut.cpp @@ -1,27 +1,27 @@ #include <library/cpp/testing/unittest/registar.h> #include <library/cpp/codecs/static/example/example.h> - + class TStaticCodecUsageTest: public NUnitTest::TTestBase { - UNIT_TEST_SUITE(TStaticCodecUsageTest) + UNIT_TEST_SUITE(TStaticCodecUsageTest) UNIT_TEST(TestUsage) - UNIT_TEST_SUITE_END(); + UNIT_TEST_SUITE_END(); -private: - void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) { +private: + void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) { const TStringBuf letov = "Всё идёт по плану"; - - TBuffer outEnc, outDec; - NStaticCodecExample::Encode(outEnc, letov, dv); + + TBuffer outEnc, outDec; + NStaticCodecExample::Encode(outEnc, letov, dv); NStaticCodecExample::Decode(outDec, TStringBuf{outEnc.data(), outEnc.size()}); - - UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize); + + UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize); UNIT_ASSERT_EQUAL(TStringBuf(outDec.data(), outDec.size()), letov); - } - - void TestUsage() { - DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u); - DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u); - } -}; - -UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest) + } + + void TestUsage() { + DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u); + DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u); + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest) diff --git a/library/cpp/codecs/static/ut/ya.make b/library/cpp/codecs/static/ut/ya.make index b9116097d8..5bb2017fac 100644 --- a/library/cpp/codecs/static/ut/ya.make +++ b/library/cpp/codecs/static/ut/ya.make @@ -1,14 +1,14 @@ UNITTEST_FOR(library/cpp/codecs/static) - -OWNER(velavokr) - -SRCS( - builder_ut.cpp - static_ut.cpp -) - -PEERDIR( + +OWNER(velavokr) + +SRCS( + builder_ut.cpp + static_ut.cpp +) + +PEERDIR( library/cpp/codecs/static/example -) - -END() +) + +END() diff --git a/library/cpp/codecs/static/ya.make b/library/cpp/codecs/static/ya.make index 00e00fd8d4..a2698b9432 100644 --- a/library/cpp/codecs/static/ya.make +++ b/library/cpp/codecs/static/ya.make @@ -1,18 +1,18 @@ -LIBRARY() - -OWNER(velavokr) - -SRCS( - builder.cpp - static_codec_info.proto - static.cpp -) - -PEERDIR( +LIBRARY() + +OWNER(velavokr) + +SRCS( + builder.cpp + static_codec_info.proto + static.cpp +) + +PEERDIR( library/cpp/codecs library/cpp/archive library/cpp/svnversion - util/draft -) - -END() + util/draft +) + +END() diff --git a/library/cpp/codecs/tls_cache.cpp b/library/cpp/codecs/tls_cache.cpp index 0a1b32bda1..d54339d869 100644 --- a/library/cpp/codecs/tls_cache.cpp +++ b/library/cpp/codecs/tls_cache.cpp @@ -1,4 +1,4 @@ -#include "tls_cache.h" - -namespace NCodecs { -} +#include "tls_cache.h" + +namespace NCodecs { +} diff --git a/library/cpp/codecs/tls_cache.h b/library/cpp/codecs/tls_cache.h index 0184e4bb6c..fa166729c5 100644 --- a/library/cpp/codecs/tls_cache.h +++ b/library/cpp/codecs/tls_cache.h @@ -1,100 +1,100 @@ -#pragma once - -#include <util/generic/buffer.h> -#include <util/generic/deque.h> -#include <util/generic/noncopyable.h> -#include <util/generic/strbuf.h> -#include <util/system/tls.h> -#include <util/thread/singleton.h> - -namespace NCodecs { - template <class TItem> - struct TClear { - void operator()(TItem& item) const { - item.Clear(); - } - }; - +#pragma once + +#include <util/generic/buffer.h> +#include <util/generic/deque.h> +#include <util/generic/noncopyable.h> +#include <util/generic/strbuf.h> +#include <util/system/tls.h> +#include <util/thread/singleton.h> + +namespace NCodecs { + template <class TItem> + struct TClear { + void operator()(TItem& item) const { + item.Clear(); + } + }; + template <class TItem, class TCleaner = TClear<TItem>> - class TTlsCache { - using TSelf = TTlsCache<TItem, TCleaner>; - + class TTlsCache { + using TSelf = TTlsCache<TItem, TCleaner>; + struct TItemHolder: public TIntrusiveListItem<TItemHolder> { - TItemHolder(TSelf& factory) - : Factory(factory) + TItemHolder(TSelf& factory) + : Factory(factory) { } - - void Release() { - Factory.Release(*this); - } - - TSelf& Factory; - TItem Item; - }; - - class TItemGuard { - public: - explicit TItemGuard(TSelf& fact) - : Holder(fact.Acquire()) + + void Release() { + Factory.Release(*this); + } + + TSelf& Factory; + TItem Item; + }; + + class TItemGuard { + public: + explicit TItemGuard(TSelf& fact) + : Holder(fact.Acquire()) { } - + TItemGuard(TItemGuard&& other) noexcept { - *this = std::move(other); - } - + *this = std::move(other); + } + TItemGuard& operator=(TItemGuard&& other) noexcept { - if (&other != this) { - std::swap(Holder, other.Holder); - } - return *this; - } - - ~TItemGuard() { - if (Holder) { - Holder->Release(); - } - } - - TItem& Get() & { - Y_ASSERT(Holder); - return Holder->Item; - } - - TItem& Get() && = delete; - - private: - TItemHolder* Holder = nullptr; - }; - - public: - TItemGuard Item() { - return TItemGuard(*this); - } - - static TSelf& TlsInstance() { - return *FastTlsSingleton<TSelf>(); - } - - private: - TItemHolder* Acquire() { - if (Free.Empty()) { - return new TItemHolder(*this); - } else { - return Free.PopBack(); - } - } - - void Release(TItemHolder& item) { - Cleaner(item.Item); - Free.PushBack(&item); - } - - private: - TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free; - TCleaner Cleaner; - }; - - using TBufferTlsCache = TTlsCache<TBuffer>; -} + if (&other != this) { + std::swap(Holder, other.Holder); + } + return *this; + } + + ~TItemGuard() { + if (Holder) { + Holder->Release(); + } + } + + TItem& Get() & { + Y_ASSERT(Holder); + return Holder->Item; + } + + TItem& Get() && = delete; + + private: + TItemHolder* Holder = nullptr; + }; + + public: + TItemGuard Item() { + return TItemGuard(*this); + } + + static TSelf& TlsInstance() { + return *FastTlsSingleton<TSelf>(); + } + + private: + TItemHolder* Acquire() { + if (Free.Empty()) { + return new TItemHolder(*this); + } else { + return Free.PopBack(); + } + } + + void Release(TItemHolder& item) { + Cleaner(item.Item); + Free.PushBack(&item); + } + + private: + TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free; + TCleaner Cleaner; + }; + + using TBufferTlsCache = TTlsCache<TBuffer>; +} diff --git a/library/cpp/codecs/ut/codecs_ut.cpp b/library/cpp/codecs/ut/codecs_ut.cpp index caf6089aef..1938202400 100644 --- a/library/cpp/codecs/ut/codecs_ut.cpp +++ b/library/cpp/codecs/ut/codecs_ut.cpp @@ -4,15 +4,15 @@ #include <library/cpp/codecs/solar_codec.h> #include <library/cpp/codecs/zstd_dict_codec.h> #include <library/cpp/codecs/comptable_codec.h> - + #include <library/cpp/testing/unittest/registar.h> - -#include <util/generic/buffer.h> -#include <util/string/util.h> -#include <util/string/hex.h> + +#include <util/generic/buffer.h> +#include <util/string/util.h> +#include <util/string/hex.h> #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> - -namespace { + +namespace { const char* TextValues[] = { "! сентября газета", "!(возмездие это)!", @@ -855,328 +855,328 @@ namespace { "lymphomatoid papulosis", "sez.com", }; -} - -class TCodecsTest: public TTestBase { +} + +class TCodecsTest: public TTestBase { UNIT_TEST_SUITE(TCodecsTest); - UNIT_TEST(TestPipeline) - UNIT_TEST(TestDelta) - UNIT_TEST(TestHuffman) - UNIT_TEST(TestZStdDict) - UNIT_TEST(TestCompTable) + UNIT_TEST(TestPipeline) + UNIT_TEST(TestDelta) + UNIT_TEST(TestHuffman) + UNIT_TEST(TestZStdDict) + UNIT_TEST(TestCompTable) UNIT_TEST(TestHuffmanLearnByFreqs) - UNIT_TEST(TestSolar) - UNIT_TEST(TestPFor) - UNIT_TEST(TestRegistry) - + UNIT_TEST(TestSolar) + UNIT_TEST(TestPFor) + UNIT_TEST(TestRegistry) + UNIT_TEST_SUITE_END(); - -private: + +private: TString PrintError(TStringBuf learn, TStringBuf test, TStringBuf codec, ui32 i) { TString s; - TStringOutput sout(s); + TStringOutput sout(s); sout << codec << ": " << i << ", " << "\n"; sout << HexEncode(learn.data(), learn.size()); //NEscJ::EscapeJ<true>(learn, sout); - sout << " != \n"; + sout << " != \n"; sout << HexEncode(test.data(), test.size()); //NEscJ::EscapeJ<true>(test, sout); - - if (s.Size() > 1536) { + + if (s.Size() > 1536) { TString res = s.substr(0, 512); - res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">..."); - res.append(s.substr(s.size() - 512)); - } - - return s; - } - - TStringBuf AsStrBuf(const TBuffer& b) { + res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">..."); + res.append(s.substr(s.size() - 512)); + } + + return s; + } + + TStringBuf AsStrBuf(const TBuffer& b) { return TStringBuf(b.data(), b.size()); - } - - template <typename TCodec, bool testsaveload> + } + + template <typename TCodec, bool testsaveload> void TestCodec(const TVector<TBuffer>& inlearn = TVector<TBuffer>(), const TVector<TBuffer>& in = TVector<TBuffer>(), NCodecs::TCodecPtr c = new TCodec) { - using namespace NCodecs; - - TBuffer buff; - - { + using namespace NCodecs; + + TBuffer buff; + + { TVector<TBuffer> out; - - c->Learn(inlearn.begin(), inlearn.end()); - - if (testsaveload) { - { - TBufferOutput bout(buff); - ICodec::Store(&bout, c); - } - - { - TBufferInput bin(buff); - c = ICodec::Restore(&bin); + + c->Learn(inlearn.begin(), inlearn.end()); + + if (testsaveload) { + { + TBufferOutput bout(buff); + ICodec::Store(&bout, c); + } + + { + TBufferInput bin(buff); + c = ICodec::Restore(&bin); UNIT_ASSERT(c->AlreadyTrained()); - } - } - - { - size_t insz = 0; - size_t outsz = buff.Size(); - - for (ui32 i = 0; i < inlearn.size(); ++i) { + } + } + + { + size_t insz = 0; + size_t outsz = buff.Size(); + + for (ui32 i = 0; i < inlearn.size(); ++i) { out.emplace_back(); - c->Encode(AsStrBuf(inlearn[i]), out[i]); - - insz += inlearn[i].Size(); - outsz += out[i].Size(); - } - - TBuffer vecl; - for (ui32 i = 0; i < out.size(); ++i) { - vecl.Clear(); - c->Decode(AsStrBuf(out[i]), vecl); - - UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl), + c->Encode(AsStrBuf(inlearn[i]), out[i]); + + insz += inlearn[i].Size(); + outsz += out[i].Size(); + } + + TBuffer vecl; + for (ui32 i = 0; i < out.size(); ++i) { + vecl.Clear(); + c->Decode(AsStrBuf(out[i]), vecl); + + UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl), PrintError(TStringBuf(inlearn[i].data(), inlearn[i].size()), TStringBuf(vecl.data(), vecl.size()), c->GetName(), i)); - } - } - } - - { - if (testsaveload) { - TBufferInput bin(buff); - c = ICodec::Restore(&bin); - } - - size_t insz = 0; - size_t outsz = buff.Size(); - - TBuffer out, in1; - for (ui32 i = 0; i < in.size(); ++i) { - out.Clear(); - in1.Clear(); - c->Encode(AsStrBuf(in[i]), out); - insz += in[i].Size(); - outsz += out.Size(); - c->Decode(AsStrBuf(out), in1); - UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1), + } + } + } + + { + if (testsaveload) { + TBufferInput bin(buff); + c = ICodec::Restore(&bin); + } + + size_t insz = 0; + size_t outsz = buff.Size(); + + TBuffer out, in1; + for (ui32 i = 0; i < in.size(); ++i) { + out.Clear(); + in1.Clear(); + c->Encode(AsStrBuf(in[i]), out); + insz += in[i].Size(); + outsz += out.Size(); + c->Decode(AsStrBuf(out), in1); + UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1), PrintError(TStringBuf(in[i].data(), in[i].size()), TStringBuf(in1.data(), in1.size()), c->GetName(), i)); - } - } - } - - template <class T> - void AppendTo(TBuffer& b, T t) { - b.Append((char*)&t, sizeof(t)); - } - - void TestDelta() { - using namespace NCodecs; + } + } + } + + template <class T> + void AppendTo(TBuffer& b, T t) { + b.Append((char*)&t, sizeof(t)); + } + + void TestDelta() { + using namespace NCodecs; TVector<TBuffer> d; - - // 1. common case + + // 1. common case d.emplace_back(); - AppendTo(d.back(), 1ULL); - AppendTo(d.back(), 10ULL); - AppendTo(d.back(), 100ULL); - AppendTo(d.back(), 1000ULL); - AppendTo(d.back(), 10000ULL); - AppendTo(d.back(), 100000ULL); - - // 2. delta overflow + AppendTo(d.back(), 1ULL); + AppendTo(d.back(), 10ULL); + AppendTo(d.back(), 100ULL); + AppendTo(d.back(), 1000ULL); + AppendTo(d.back(), 10000ULL); + AppendTo(d.back(), 100000ULL); + + // 2. delta overflow d.emplace_back(); - AppendTo(d.back(), 1ULL); - AppendTo(d.back(), 10ULL); - AppendTo(d.back(), 100ULL); - AppendTo(d.back(), 1000ULL); - AppendTo(d.back(), (ui64)-100LL); - AppendTo(d.back(), (ui64)-10ULL); - - // 3. bad sorting + AppendTo(d.back(), 1ULL); + AppendTo(d.back(), 10ULL); + AppendTo(d.back(), 100ULL); + AppendTo(d.back(), 1000ULL); + AppendTo(d.back(), (ui64)-100LL); + AppendTo(d.back(), (ui64)-10ULL); + + // 3. bad sorting d.emplace_back(); - AppendTo(d.back(), 1ULL); - AppendTo(d.back(), 10ULL); - AppendTo(d.back(), 1000ULL); - AppendTo(d.back(), 100ULL); - AppendTo(d.back(), 10000ULL); - AppendTo(d.back(), 100000ULL); - - // all bad + AppendTo(d.back(), 1ULL); + AppendTo(d.back(), 10ULL); + AppendTo(d.back(), 1000ULL); + AppendTo(d.back(), 100ULL); + AppendTo(d.back(), 10000ULL); + AppendTo(d.back(), 100000ULL); + + // all bad d.emplace_back(); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -1LL); - + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + TestCodec<TDeltaCodec<ui64, true>, false>(d); TestCodec<TDeltaCodec<ui64, false>, false>(d); - } - - void TestPFor() { - using namespace NCodecs; - { + } + + void TestPFor() { + using namespace NCodecs; + { TVector<TBuffer> d; d.emplace_back(); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -1LL); d.emplace_back(); - AppendTo(d.back(), 0LL); - AppendTo(d.back(), 1LL); - AppendTo(d.back(), 2LL); - AppendTo(d.back(), 1LL); - AppendTo(d.back(), 0LL); - AppendTo(d.back(), 1LL); - AppendTo(d.back(), 2LL); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 2LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 2LL); d.emplace_back(); - AppendTo(d.back(), 0LL); - AppendTo(d.back(), 1LL); - AppendTo(d.back(), 2LL); - AppendTo(d.back(), 1LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), 0LL); - AppendTo(d.back(), 1LL); - AppendTo(d.back(), 2LL); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 2LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), 1LL); + AppendTo(d.back(), 2LL); d.emplace_back(); - AppendTo(d.back(), 0LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -2LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -2LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), 0LL); - AppendTo(d.back(), -1LL); - AppendTo(d.back(), -2LL); - + AppendTo(d.back(), 0LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -2LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -2LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), 0LL); + AppendTo(d.back(), -1LL); + AppendTo(d.back(), -2LL); + TestCodec<TPForCodec<ui64>, false>(d); - TestCodec<TPForCodec<ui64, true>, true>(d); - } - { + TestCodec<TPForCodec<ui64, true>, true>(d); + } + { TVector<TBuffer> d; d.emplace_back(); - AppendTo(d.back(), -1); - AppendTo(d.back(), -1); - AppendTo(d.back(), -1); - AppendTo(d.back(), -1); + AppendTo(d.back(), -1); + AppendTo(d.back(), -1); + AppendTo(d.back(), -1); + AppendTo(d.back(), -1); d.emplace_back(); - AppendTo(d.back(), 0); - AppendTo(d.back(), 1); - AppendTo(d.back(), 2); - AppendTo(d.back(), 1); - AppendTo(d.back(), -1); - AppendTo(d.back(), 0); - AppendTo(d.back(), 1); - AppendTo(d.back(), 2); + AppendTo(d.back(), 0); + AppendTo(d.back(), 1); + AppendTo(d.back(), 2); + AppendTo(d.back(), 1); + AppendTo(d.back(), -1); + AppendTo(d.back(), 0); + AppendTo(d.back(), 1); + AppendTo(d.back(), 2); d.emplace_back(); - AppendTo(d.back(), 0); - AppendTo(d.back(), -1); - AppendTo(d.back(), -2); - AppendTo(d.back(), -1); - AppendTo(d.back(), -2); - AppendTo(d.back(), -1); - AppendTo(d.back(), 0); - AppendTo(d.back(), -1); - AppendTo(d.back(), -2); - + AppendTo(d.back(), 0); + AppendTo(d.back(), -1); + AppendTo(d.back(), -2); + AppendTo(d.back(), -1); + AppendTo(d.back(), -2); + AppendTo(d.back(), -1); + AppendTo(d.back(), 0); + AppendTo(d.back(), -1); + AppendTo(d.back(), -2); + TestCodec<TPForCodec<ui32>, false>(d); - TestCodec<TPForCodec<ui32, true>, false>(d); - } - { + TestCodec<TPForCodec<ui32, true>, false>(d); + } + { TVector<TBuffer> d; d.emplace_back(); - for (auto& textValue : TextValues) { - AppendTo(d.back(), (ui32)strlen(textValue)); - } - - TestCodec<TPForCodec<ui32>, false>(d); - TestCodec<TPForCodec<ui32, true>, false>(d); - } - { + for (auto& textValue : TextValues) { + AppendTo(d.back(), (ui32)strlen(textValue)); + } + + TestCodec<TPForCodec<ui32>, false>(d); + TestCodec<TPForCodec<ui32, true>, false>(d); + } + { TVector<TBuffer> d; d.emplace_back(); - for (auto& textValue : TextValues) { - AppendTo(d.back(), (ui64)strlen(textValue)); - } - - TestCodec<TPForCodec<ui64>, false>(d); - TestCodec<TPForCodec<ui64, true>, false>(d); - } - } - - template <class TCodec> - void DoTestSimpleCodec() { - using namespace NCodecs; - { + for (auto& textValue : TextValues) { + AppendTo(d.back(), (ui64)strlen(textValue)); + } + + TestCodec<TPForCodec<ui64>, false>(d); + TestCodec<TPForCodec<ui64, true>, false>(d); + } + } + + template <class TCodec> + void DoTestSimpleCodec() { + using namespace NCodecs; + { TVector<TBuffer> learn; - + for (auto& textValue : TextValues) { learn.emplace_back(textValue, strlen(textValue)); - } - - TestCodec<TCodec, true>(learn); - } - { - TestCodec<TCodec, true>(); - } - - { + } + + TestCodec<TCodec, true>(learn); + } + { + TestCodec<TCodec, true>(); + } + + { TVector<TBuffer> learn; learn.emplace_back(); - learn.back().Append('a'); - + learn.back().Append('a'); + TVector<TBuffer> test; test.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - test.back().Append((ui8)i); - } - - TestCodec<TCodec, true>(learn, test); - } - - { + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TCodec, true>(learn, test); + } + + { TVector<TBuffer> learn; learn.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - for (ui32 j = 0; j < i; ++j) { + for (ui32 i = 0; i < 256; ++i) { + for (ui32 j = 0; j < i; ++j) { learn.back().Append((ui8)i); - } - } - + } + } + TVector<TBuffer> test; test.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - test.back().Append((ui8)i); - } - - TestCodec<TCodec, true>(learn, test); - } - - { + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TCodec, true>(learn, test); + } + + { TVector<TBuffer> learn; learn.emplace_back(); - for (ui32 i = 0; i < 128; ++i) { - for (ui32 j = 0; j < i; ++j) { - learn.back().Append((ui8)i); - } - } - + for (ui32 i = 0; i < 128; ++i) { + for (ui32 j = 0; j < i; ++j) { + learn.back().Append((ui8)i); + } + } + TVector<TBuffer> test; test.emplace_back(); - for (ui32 i = 128; i < 256; ++i) { - test.back().Append((ui8)i); - } - - TestCodec<TCodec, true>(learn, test); - } - } - - void TestHuffman() { - DoTestSimpleCodec<NCodecs::THuffmanCodec>(); - } - - void TestZStdDict() { + for (ui32 i = 128; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TCodec, true>(learn, test); + } + } + + void TestHuffman() { + DoTestSimpleCodec<NCodecs::THuffmanCodec>(); + } + + void TestZStdDict() { using namespace NCodecs; { TVector<TBuffer> learn; @@ -1188,12 +1188,12 @@ private: TestCodec<TZStdDictCodec, true>(learn); } - } - - void TestCompTable() { - DoTestSimpleCodec<NCodecs::TCompTableCodec>(); - } - + } + + void TestCompTable() { + DoTestSimpleCodec<NCodecs::TCompTableCodec>(); + } + void TestHuffmanLearnByFreqs() { using namespace NCodecs; @@ -1211,7 +1211,7 @@ private: for (ui32 i = 0; i < data.size(); ++i) { outLearn.emplace_back(); - codec.Encode(AsStrBuf(data[i]), outLearn[i]); + codec.Encode(AsStrBuf(data[i]), outLearn[i]); } } @@ -1228,133 +1228,133 @@ private: for (auto& textValue : TextValues) { size_t len = strlen(textValue); - for (size_t j = 0; j < len; ++j) { + for (size_t j = 0; j < len; ++j) { ++freqs[(ui32)(0xFF & textValue[j])].second; - } + } } codec.LearnByFreqs(TArrayRef<std::pair<char, ui64>>(freqs, Y_ARRAY_SIZE(freqs))); for (ui32 i = 0; i < data.size(); ++i) { outLearnByFreqs.emplace_back(); - codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]); + codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]); } } - UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size()); - const size_t sz = outLearn.size(); - for (size_t n = 0; n < sz; ++n) { - UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n])); - } + UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size()); + const size_t sz = outLearn.size(); + for (size_t n = 0; n < sz; ++n) { + UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n])); + } } - void TestSolar() { - using namespace NCodecs; - { + void TestSolar() { + using namespace NCodecs; + { TVector<TBuffer> learn; - + for (auto& textValue : TextValues) { learn.emplace_back(textValue, strlen(textValue)); - } - + } + TestCodec<TSolarCodec, true>(learn, TVector<TBuffer>(), new TSolarCodec(512, 8)); TestCodec<TAdaptiveSolarCodec, false>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); TestCodec<TAdaptiveSolarCodec, true>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); TestCodec<TSolarCodecShortInt, true>(learn, TVector<TBuffer>(), new TSolarCodecShortInt(512, 8)); - } - { + } + { TestCodec<TSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodec(512, 8)); TestCodec<TAdaptiveSolarCodec, false>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); TestCodec<TAdaptiveSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); TestCodec<TSolarCodecShortInt, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodecShortInt(512, 8)); - } - - { + } + + { TVector<TBuffer> learn; learn.emplace_back(); - learn.back().Append('a'); - + learn.back().Append('a'); + TVector<TBuffer> test; test.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - test.back().Append((ui8)i); - } - - TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8)); TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8)); TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8)); - } - - { + } + + { TVector<TBuffer> learn; learn.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - for (ui32 j = 0; j < i; ++j) { - learn.back().Append((ui8)i); - } - } - + for (ui32 i = 0; i < 256; ++i) { + for (ui32 j = 0; j < i; ++j) { + learn.back().Append((ui8)i); + } + } + TVector<TBuffer> test; test.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - test.back().Append((ui8)i); - } - - TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8)); TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8)); TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8)); - } - } - - void TestPipeline() { - using namespace NCodecs; - { + } + } + + void TestPipeline() { + using namespace NCodecs; + { TVector<TBuffer> learn; learn.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - for (i32 j = i; j >= 0; --j) { - learn.back().Append((ui8)j); - } - } - + for (ui32 i = 0; i < 256; ++i) { + for (i32 j = i; j >= 0; --j) { + learn.back().Append((ui8)j); + } + } + TVector<TBuffer> test; test.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - test.back().Append((ui8)i); - } - - TestCodec<TPipelineCodec, true>(learn, test, + for (ui32 i = 0; i < 256; ++i) { + test.back().Append((ui8)i); + } + + TestCodec<TPipelineCodec, true>(learn, test, new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec)); - } - { + } + { TVector<TBuffer> d; d.emplace_back(); - for (ui32 i = 0; i < 256; ++i) { - for (i32 j = i; j >= 0; --j) { - d.back().Append(i * i); - } - } - + for (ui32 i = 0; i < 256; ++i) { + for (i32 j = i; j >= 0; --j) { + d.back().Append(i * i); + } + } + TestCodec<TPipelineCodec, false>(d, TVector<TBuffer>(), new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>)); - } - } - - void TestRegistry() { - using namespace NCodecs; + } + } + + void TestRegistry() { + using namespace NCodecs; TVector<TString> vs = ICodec::GetCodecsList(); for (const auto& v : vs) { TCodecPtr p = ICodec::GetInstance(v); if (v == "none") { - UNIT_ASSERT(!p); - continue; - } + UNIT_ASSERT(!p); + continue; + } UNIT_ASSERT_C(!!p, v); UNIT_ASSERT_C(TStringBuf(v).Head(3) == TStringBuf(p->GetName()).Head(3), v + " " + p->GetName()); - } - } -}; - -UNIT_TEST_SUITE_REGISTRATION(TCodecsTest) + } + } +}; + +UNIT_TEST_SUITE_REGISTRATION(TCodecsTest) diff --git a/library/cpp/codecs/ut/tls_cache_ut.cpp b/library/cpp/codecs/ut/tls_cache_ut.cpp index 8101af761f..11dd5da53c 100644 --- a/library/cpp/codecs/ut/tls_cache_ut.cpp +++ b/library/cpp/codecs/ut/tls_cache_ut.cpp @@ -1,15 +1,15 @@ #include <library/cpp/testing/unittest/registar.h> #include <library/cpp/codecs/tls_cache.h> - + Y_UNIT_TEST_SUITE(CodecsBufferFactoryTest){ void AssignToBuffer(TBuffer & buf, TStringBuf val){ buf.Assign(val.data(), val.size()); } - + TStringBuf AsStringBuf(const TBuffer& b) { return TStringBuf(b.Data(), b.Size()); } - + Y_UNIT_TEST(TestAcquireReleaseReuse) { NCodecs::TBufferTlsCache factory; // acquiring the first buffer @@ -19,7 +19,7 @@ Y_UNIT_TEST(TestAcquireReleaseReuse) { // acquiring the second buffer auto buf2 = factory.Item(); AssignToBuffer(buf2.Get(), "Buffer_02"); - } + } // the first buffer should stay intact UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01"); { diff --git a/library/cpp/codecs/ut/ya.make b/library/cpp/codecs/ut/ya.make index 90841b05ef..0b53eba9e5 100644 --- a/library/cpp/codecs/ut/ya.make +++ b/library/cpp/codecs/ut/ya.make @@ -12,7 +12,7 @@ PEERDIR( ) SRCS( - tls_cache_ut.cpp + tls_cache_ut.cpp codecs_ut.cpp float_huffman_ut.cpp ) diff --git a/library/cpp/codecs/ya.make b/library/cpp/codecs/ya.make index 7e76fb0c9a..d105d6925e 100644 --- a/library/cpp/codecs/ya.make +++ b/library/cpp/codecs/ya.make @@ -1,24 +1,24 @@ -LIBRARY() - +LIBRARY() + OWNER( g:base velavokr ) -SRCS( - tls_cache.cpp - codecs.cpp - codecs_registry.cpp - comptable_codec.cpp - delta_codec.cpp +SRCS( + tls_cache.cpp + codecs.cpp + codecs_registry.cpp + comptable_codec.cpp + delta_codec.cpp float_huffman.cpp - huffman_codec.cpp - pfor_codec.cpp - solar_codec.cpp - zstd_dict_codec.cpp -) - -PEERDIR( + huffman_codec.cpp + pfor_codec.cpp + solar_codec.cpp + zstd_dict_codec.cpp +) + +PEERDIR( contrib/libs/zstd library/cpp/bit_io library/cpp/blockcodecs @@ -28,6 +28,6 @@ PEERDIR( library/cpp/deprecated/accessors library/cpp/packers library/cpp/string_utils/relaxed_escaper -) - +) + END() diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp index c42a2879e6..d543736b3d 100644 --- a/library/cpp/codecs/zstd_dict_codec.cpp +++ b/library/cpp/codecs/zstd_dict_codec.cpp @@ -1,173 +1,173 @@ -#include "zstd_dict_codec.h" - +#include "zstd_dict_codec.h" + #include <library/cpp/packers/packers.h> - -#include <util/generic/ptr.h> -#include <util/generic/refcount.h> -#include <util/generic/noncopyable.h> -#include <util/string/builder.h> -#include <util/system/src_location.h> -#include <util/ysaveload.h> - -#define ZDICT_STATIC_LINKING_ONLY - + +#include <util/generic/ptr.h> +#include <util/generic/refcount.h> +#include <util/generic/noncopyable.h> +#include <util/string/builder.h> +#include <util/system/src_location.h> +#include <util/ysaveload.h> + +#define ZDICT_STATIC_LINKING_ONLY + #include <contrib/libs/zstd/include/zdict.h> #include <contrib/libs/zstd/include/zstd.h> #include <contrib/libs/zstd/include/zstd_errors.h> - -// See IGNIETFERRO-320 for possible bugs - -namespace NCodecs { - class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> { - template <class T, size_t Deleter(T*)> - class TPtrHolder : TMoveOnly { - T* Ptr = nullptr; - - public: - TPtrHolder() = default; - - TPtrHolder(T* dict) - : Ptr(dict) + +// See IGNIETFERRO-320 for possible bugs + +namespace NCodecs { + class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> { + template <class T, size_t Deleter(T*)> + class TPtrHolder : TMoveOnly { + T* Ptr = nullptr; + + public: + TPtrHolder() = default; + + TPtrHolder(T* dict) + : Ptr(dict) { } - - T* Get() { - return Ptr; - } - - const T* Get() const { - return Ptr; - } - - void Reset(T* dict) { - Dispose(); - Ptr = dict; - } - - void Dispose() { - if (Ptr) { - Deleter(Ptr); - Ptr = nullptr; - } - } - - ~TPtrHolder() { - Dispose(); - } - }; - - using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>; - using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>; - using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>; - using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>; - - using TSizePacker = NPackers::TPacker<ui64>; - - public: - static const ui32 SampleSize = (1 << 22) * 5; - - explicit TImpl(ui32 comprLevel) - : CompressionLevel(comprLevel) - { - const size_t zeroSz = TSizePacker().MeasureLeaf(0); - Zero.Resize(zeroSz); + + T* Get() { + return Ptr; + } + + const T* Get() const { + return Ptr; + } + + void Reset(T* dict) { + Dispose(); + Ptr = dict; + } + + void Dispose() { + if (Ptr) { + Deleter(Ptr); + Ptr = nullptr; + } + } + + ~TPtrHolder() { + Dispose(); + } + }; + + using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>; + using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>; + using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>; + using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>; + + using TSizePacker = NPackers::TPacker<ui64>; + + public: + static const ui32 SampleSize = (1 << 22) * 5; + + explicit TImpl(ui32 comprLevel) + : CompressionLevel(comprLevel) + { + const size_t zeroSz = TSizePacker().MeasureLeaf(0); + Zero.Resize(zeroSz); TSizePacker().PackLeaf(Zero.data(), 0, zeroSz); - } - - ui32 GetCompressionLevel() const { - return CompressionLevel; - } - - ui8 Encode(TStringBuf in, TBuffer& outbuf) const { - outbuf.Clear(); - + } + + ui32 GetCompressionLevel() const { + return CompressionLevel; + } + + ui8 Encode(TStringBuf in, TBuffer& outbuf) const { + outbuf.Clear(); + if (in.empty()) { - return 0; - } - - TSizePacker packer; - + return 0; + } + + TSizePacker packer; + const char* rawBeg = in.data(); const size_t rawSz = in.size(); - - const size_t szSz = packer.MeasureLeaf(rawSz); - const size_t maxDatSz = ZSTD_compressBound(rawSz); - - outbuf.Resize(szSz + maxDatSz); + + const size_t szSz = packer.MeasureLeaf(rawSz); + const size_t maxDatSz = ZSTD_compressBound(rawSz); + + outbuf.Resize(szSz + maxDatSz); packer.PackLeaf(outbuf.data(), rawSz, szSz); - - TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)}; - const size_t resSz = CheckSize(ZSTD_compress_usingCDict( + + TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)}; + const size_t resSz = CheckSize(ZSTD_compress_usingCDict( ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()), __LOCATION__); - - if (resSz < rawSz) { - outbuf.Resize(resSz + szSz); - } else { + + if (resSz < rawSz) { + outbuf.Resize(resSz + szSz); + } else { outbuf.Resize(Zero.size() + rawSz); memcpy(outbuf.data(), Zero.data(), Zero.size()); memcpy(outbuf.data() + Zero.size(), rawBeg, rawSz); - } - return 0; - } - - void Decode(TStringBuf in, TBuffer& outbuf) const { - outbuf.Clear(); - + } + return 0; + } + + void Decode(TStringBuf in, TBuffer& outbuf) const { + outbuf.Clear(); + if (in.empty()) { - return; - } - - TSizePacker packer; - + return; + } + + TSizePacker packer; + const char* rawBeg = in.data(); size_t rawSz = in.size(); - - const size_t szSz = packer.SkipLeaf(rawBeg); - ui64 datSz = 0; - packer.UnpackLeaf(rawBeg, datSz); - - rawBeg += szSz; - rawSz -= szSz; - - if (!datSz) { - outbuf.Resize(rawSz); + + const size_t szSz = packer.SkipLeaf(rawBeg); + ui64 datSz = 0; + packer.UnpackLeaf(rawBeg, datSz); + + rawBeg += szSz; + rawSz -= szSz; + + if (!datSz) { + outbuf.Resize(rawSz); memcpy(outbuf.data(), rawBeg, rawSz); - } else { + } else { // size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz); // Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz); - outbuf.Resize(datSz); - TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)}; - CheckSize(ZSTD_decompress_usingDDict( + outbuf.Resize(datSz); + TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)}; + CheckSize(ZSTD_decompress_usingDDict( ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()), __LOCATION__); - outbuf.Resize(datSz); - } - } - + outbuf.Resize(datSz); + } + } + bool Learn(ISequenceReader& in, bool throwOnError) { - TBuffer data; + TBuffer data; TVector<size_t> lens; - - data.Reserve(2 * SampleSize); - TStringBuf r; - while (in.NextRegion(r)) { - if (!r) { - continue; - } + + data.Reserve(2 * SampleSize); + TStringBuf r; + while (in.NextRegion(r)) { + if (!r) { + continue; + } data.Append(r.data(), r.size()); lens.push_back(r.size()); - } - + } + ZDICT_legacy_params_t params; - memset(¶ms, 0, sizeof(params)); + memset(¶ms, 0, sizeof(params)); params.zParams.compressionLevel = 1; params.zParams.notificationLevel = 1; - Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size())); - - if (!lens) { - Dict.Reset(); - } else { + Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size())); + + if (!lens) { + Dict.Reset(); + } else { size_t trainResult = ZDICT_trainFromBuffer_legacy( Dict.data(), Dict.size(), data.Data(), const_cast<const size_t*>(&lens[0]), lens.size(), params); if (ZSTD_isError(trainResult)) { @@ -177,105 +177,105 @@ namespace NCodecs { CheckSize(trainResult, __LOCATION__); } Dict.Resize(trainResult); - Dict.ShrinkToFit(); - } - InitContexts(); + Dict.ShrinkToFit(); + } + InitContexts(); return true; - } - + } + void Save(IOutputStream* out) const { - ::Save(out, Dict); - } - + ::Save(out, Dict); + } + void Load(IInputStream* in) { - ::Load(in, Dict); - InitContexts(); - } - - void InitContexts() { + ::Load(in, Dict); + InitContexts(); + } + + void InitContexts() { CDict.Reset(CheckPtr(ZSTD_createCDict(Dict.data(), Dict.size(), CompressionLevel), __LOCATION__)); DDict.Reset(CheckPtr(ZSTD_createDDict(Dict.data(), Dict.size()), __LOCATION__)); - } - - static size_t CheckSize(size_t sz, TSourceLocation loc) { - if (ZSTD_isError(sz)) { - ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")"; - } - return sz; - } - - template <class T> - static T* CheckPtr(T* t, TSourceLocation loc) { + } + + static size_t CheckSize(size_t sz, TSourceLocation loc) { + if (ZSTD_isError(sz)) { + ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")"; + } + return sz; + } + + template <class T> + static T* CheckPtr(T* t, TSourceLocation loc) { Y_ENSURE_EX(t, TCodecException() << loc << " " << "unexpected nullptr"); - return t; - } - - private: - ui32 CompressionLevel = 1; - - TBuffer Zero; - TBuffer Dict; - - TCDict CDict; - TDDict DDict; - }; - - TZStdDictCodec::TZStdDictCodec(ui32 comprLevel) - : Impl(new TImpl(comprLevel)) - { - MyTraits.NeedsTraining = true; - MyTraits.SizeOnEncodeMultiplier = 2; - MyTraits.SizeOnDecodeMultiplier = 10; - MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar - } - + return t; + } + + private: + ui32 CompressionLevel = 1; + + TBuffer Zero; + TBuffer Dict; + + TCDict CDict; + TDDict DDict; + }; + + TZStdDictCodec::TZStdDictCodec(ui32 comprLevel) + : Impl(new TImpl(comprLevel)) + { + MyTraits.NeedsTraining = true; + MyTraits.SizeOnEncodeMultiplier = 2; + MyTraits.SizeOnDecodeMultiplier = 10; + MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar + } + TZStdDictCodec::~TZStdDictCodec() { } - + TString TZStdDictCodec::GetName() const { return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel(); - } - - ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const { - return Impl->Encode(in, out); - } - - void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const { - Impl->Decode(in, out); - } - - void TZStdDictCodec::DoLearn(ISequenceReader& in) { - Impl = new TImpl(Impl->GetCompressionLevel()); + } + + ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const { + return Impl->Encode(in, out); + } + + void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const { + Impl->Decode(in, out); + } + + void TZStdDictCodec::DoLearn(ISequenceReader& in) { + Impl = new TImpl(Impl->GetCompressionLevel()); Impl->Learn(in, true/*throwOnError*/); - } - + } + bool TZStdDictCodec::DoTryToLearn(ISequenceReader& in) { Impl = new TImpl(Impl->GetCompressionLevel()); return Impl->Learn(in, false/*throwOnError*/); } void TZStdDictCodec::Save(IOutputStream* out) const { - Impl->Save(out); - } - + Impl->Save(out); + } + void TZStdDictCodec::Load(IInputStream* in) { - Impl->Load(in); - } - + Impl->Load(in); + } + TVector<TString> TZStdDictCodec::ListCompressionNames() { TVector<TString> res; - for (int i = 1; i <= ZSTD_maxCLevel(); ++i) { - res.emplace_back(TStringBuilder() << MyName() << "-" << i); - } - return res; - } - - int TZStdDictCodec::ParseCompressionName(TStringBuf name) { - int c = 0; - TryFromString(name.After('-'), c); - Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name); - return c; - } - -} + for (int i = 1; i <= ZSTD_maxCLevel(); ++i) { + res.emplace_back(TStringBuilder() << MyName() << "-" << i); + } + return res; + } + + int TZStdDictCodec::ParseCompressionName(TStringBuf name) { + int c = 0; + TryFromString(name.After('-'), c); + Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name); + return c; + } + +} diff --git a/library/cpp/codecs/zstd_dict_codec.h b/library/cpp/codecs/zstd_dict_codec.h index 59c1ad6c60..cdfc5c8285 100644 --- a/library/cpp/codecs/zstd_dict_codec.h +++ b/library/cpp/codecs/zstd_dict_codec.h @@ -1,38 +1,38 @@ -#pragma once - -#include "codecs.h" - -#include <util/generic/ptr.h> - -namespace NCodecs { +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> + +namespace NCodecs { // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655 - + class TZStdDictCodec: public ICodec { class TImpl; TIntrusivePtr<TImpl> Impl; - + public: explicit TZStdDictCodec(ui32 comprLevel = 1); ~TZStdDictCodec() override; - + static TStringBuf MyName() { return "zstd08d"; } - + TString GetName() const override; - + ui8 Encode(TStringBuf in, TBuffer& out) const override; - + void Decode(TStringBuf in, TBuffer& out) const override; - + static TVector<TString> ListCompressionNames(); static int ParseCompressionName(TStringBuf); - + protected: void DoLearn(ISequenceReader& in) override; bool DoTryToLearn(ISequenceReader& in) final; void Save(IOutputStream* out) const override; void Load(IInputStream* in) override; }; - -} + +} |