diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/codecs | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs')
37 files changed, 1952 insertions, 1952 deletions
diff --git a/library/cpp/codecs/codecs.cpp b/library/cpp/codecs/codecs.cpp index b17a3156d2..bc60d10cf3 100644 --- a/library/cpp/codecs/codecs.cpp +++ b/library/cpp/codecs/codecs.cpp @@ -4,187 +4,187 @@ #include <util/stream/mem.h> namespace NCodecs { - void ICodec::Store(IOutputStream* out, TCodecPtr p) { - if (!p.Get()) { - ::Save(out, (ui16)0); - return; - } - - Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName()); - const TString& n = p->GetName(); + void ICodec::Store(IOutputStream* out, TCodecPtr p) { + if (!p.Get()) { + ::Save(out, (ui16)0); + return; + } + + Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName()); + const TString& n = p->GetName(); Y_VERIFY(n.size() <= Max<ui16>()); ::Save(out, (ui16)n.size()); out->Write(n.data(), n.size()); - p->Save(out); + p->Save(out); } - TCodecPtr ICodec::Restore(IInputStream* in) { - ui16 l = 0; - ::Load(in, l); + TCodecPtr ICodec::Restore(IInputStream* in) { + ui16 l = 0; + ::Load(in, l); - if (!l) { - return nullptr; - } + if (!l) { + return nullptr; + } - TString n; - n.resize(l); + TString n; + n.resize(l); - Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException()); + Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException()); - TCodecPtr p = ICodec::GetInstance(n); - p->Load(in); - p->Trained = true; - return p; - } + TCodecPtr p = ICodec::GetInstance(n); + p->Load(in); + p->Trained = true; + return p; + } - TCodecPtr ICodec::RestoreFromString(TStringBuf s) { + TCodecPtr ICodec::RestoreFromString(TStringBuf s) { TMemoryInput minp{s.data(), s.size()}; - return Restore(&minp); - } + return Restore(&minp); + } - TString ICodec::GetNameSafe(TCodecPtr p) { - return !p ? TString("none") : p->GetName(); - } + TString ICodec::GetNameSafe(TCodecPtr p) { + return !p ? TString("none") : p->GetName(); + } - ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const { + ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const { size_t res = Traits().ApproximateSizeOnEncode(in.size()); - out.Reserve(res); - out.Clear(); + out.Reserve(res); + out.Clear(); - if (Pipeline.empty()) { + if (Pipeline.empty()) { out.Append(in.data(), in.size()); - return 0; - } else if (Pipeline.size() == 1) { - return Pipeline.front()->Encode(in, out); - } + return 0; + } else if (Pipeline.size() == 1) { + return Pipeline.front()->Encode(in, out); + } - ui8 freelastbits = 0; + ui8 freelastbits = 0; - auto buffer = TBufferTlsCache::TlsInstance().Item(); - TBuffer& tmp = buffer.Get(); - tmp.Reserve(res); + auto buffer = TBufferTlsCache::TlsInstance().Item(); + TBuffer& tmp = buffer.Get(); + tmp.Reserve(res); - for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) { - if (it != Pipeline.begin()) { - tmp.Clear(); - tmp.Swap(out); + for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) { + if (it != Pipeline.begin()) { + tmp.Clear(); + tmp.Swap(out); in = TStringBuf{tmp.data(), tmp.size()}; - } - freelastbits = (*it)->Encode(in, out); - } + } + freelastbits = (*it)->Encode(in, out); + } - return freelastbits; + return freelastbits; } - void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const { + void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const { size_t res = Traits().ApproximateSizeOnDecode(in.size()); - out.Reserve(res); - out.Clear(); + out.Reserve(res); + out.Clear(); - if (Pipeline.empty()) { + if (Pipeline.empty()) { out.Append(in.data(), in.size()); - return; - } else if (Pipeline.size() == 1) { - Pipeline.front()->Decode(in, out); - return; - } + return; + } else if (Pipeline.size() == 1) { + Pipeline.front()->Decode(in, out); + return; + } - auto buffer = TBufferTlsCache::TlsInstance().Item(); + auto buffer = TBufferTlsCache::TlsInstance().Item(); - TBuffer& tmp = buffer.Get(); - tmp.Reserve(res); + TBuffer& tmp = buffer.Get(); + tmp.Reserve(res); - for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) { - if (it != Pipeline.rbegin()) { - tmp.Clear(); - tmp.Swap(out); + for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) { + if (it != Pipeline.rbegin()) { + tmp.Clear(); + tmp.Swap(out); in = TStringBuf{tmp.data(), tmp.size()}; - } - (*it)->Decode(in, out); + } + (*it)->Decode(in, out); } } - void TPipelineCodec::Save(IOutputStream* out) const { - for (const auto& it : Pipeline) - it->Save(out); + void TPipelineCodec::Save(IOutputStream* out) const { + for (const auto& it : Pipeline) + it->Save(out); } - void TPipelineCodec::Load(IInputStream* in) { - for (const auto& it : Pipeline) { - it->Load(in); - it->SetTrained(true); - } + void TPipelineCodec::Load(IInputStream* in) { + for (const auto& it : Pipeline) { + it->Load(in); + it->SetTrained(true); + } } - void TPipelineCodec::SetTrained(bool t) { - for (const auto& it : Pipeline) { - it->SetTrained(t); - } + void TPipelineCodec::SetTrained(bool t) { + for (const auto& it : Pipeline) { + it->SetTrained(t); + } } - TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) { - if (!codec) - return *this; - - TCodecTraits tr = codec->Traits(); - - if (!MyName) { - MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput; - MyTraits.SizeOfInputElement = tr.SizeOfInputElement; - } else { - MyName.append(':'); - } - - MyName.append(codec->GetName()); - MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping; - MyTraits.PaddingBit = tr.PaddingBit; - MyTraits.NeedsTraining |= tr.NeedsTraining; - MyTraits.Irreversible |= tr.Irreversible; - MyTraits.SizeOnEncodeAddition = MyTraits.SizeOnEncodeAddition * tr.SizeOnEncodeMultiplier + tr.SizeOnEncodeAddition; - MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier; - MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier; - MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize); - - Pipeline.push_back(codec); - return *this; + TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) { + if (!codec) + return *this; + + TCodecTraits tr = codec->Traits(); + + if (!MyName) { + MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput; + MyTraits.SizeOfInputElement = tr.SizeOfInputElement; + } else { + MyName.append(':'); + } + + MyName.append(codec->GetName()); + MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping; + MyTraits.PaddingBit = tr.PaddingBit; + MyTraits.NeedsTraining |= tr.NeedsTraining; + MyTraits.Irreversible |= tr.Irreversible; + MyTraits.SizeOnEncodeAddition = MyTraits.SizeOnEncodeAddition * tr.SizeOnEncodeMultiplier + tr.SizeOnEncodeAddition; + MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier; + MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier; + MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize); + + Pipeline.push_back(codec); + return *this; } - void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) { - if (!Traits().NeedsTraining) { - return; - } + void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) { + if (!Traits().NeedsTraining) { + return; + } - if (Pipeline.size() == 1) { - Pipeline.back()->Learn(in); - return; - } + if (Pipeline.size() == 1) { + Pipeline.back()->Learn(in); + return; + } - TVector<TBuffer> trainingInput; + TVector<TBuffer> trainingInput; - TStringBuf r; - while (in.NextRegion(r)) { + TStringBuf r; + while (in.NextRegion(r)) { trainingInput.emplace_back(r.data(), r.size()); } - - TBuffer buff; - for (const auto& it : Pipeline) { - it->LearnX(trainingInput.begin(), trainingInput.end(), sampleSizeMult); - - for (auto& bit : trainingInput) { - buff.Clear(); + + TBuffer buff; + for (const auto& it : Pipeline) { + it->LearnX(trainingInput.begin(), trainingInput.end(), sampleSizeMult); + + for (auto& bit : trainingInput) { + buff.Clear(); it->Encode(TStringBuf{bit.data(), bit.size()}, buff); - buff.Swap(bit); - } - } + buff.Swap(bit); + } + } } - bool TPipelineCodec::AlreadyTrained() const { - for (const auto& it : Pipeline) { - if (!it->AlreadyTrained()) - return false; - } - - return true; + bool TPipelineCodec::AlreadyTrained() const { + for (const auto& it : Pipeline) { + if (!it->AlreadyTrained()) + return false; + } + + return true; } } diff --git a/library/cpp/codecs/codecs.h b/library/cpp/codecs/codecs.h index cc5e72b285..08ea9beb44 100644 --- a/library/cpp/codecs/codecs.h +++ b/library/cpp/codecs/codecs.h @@ -16,244 +16,244 @@ #include <util/ysaveload.h> namespace NCodecs { - class TCodecException: public TWithBackTrace<yexception> {}; + class TCodecException: public TWithBackTrace<yexception> {}; - class ICodec; + class ICodec; - using TCodecPtr = TIntrusivePtr<ICodec>; - using TCodecConstPtr = TIntrusiveConstPtr<ICodec>; + using TCodecPtr = TIntrusivePtr<ICodec>; + using TCodecConstPtr = TIntrusiveConstPtr<ICodec>; - struct TCodecTraits { - ui32 RecommendedSampleSize = 0; - ui16 SizeOfInputElement = 1; - ui8 SizeOnEncodeMultiplier = 1; - ui8 SizeOnEncodeAddition = 0; - ui8 SizeOnDecodeMultiplier = 1; + struct TCodecTraits { + ui32 RecommendedSampleSize = 0; + ui16 SizeOfInputElement = 1; + ui8 SizeOnEncodeMultiplier = 1; + ui8 SizeOnEncodeAddition = 0; + ui8 SizeOnDecodeMultiplier = 1; - bool NeedsTraining = false; - bool PreservesPrefixGrouping = false; - bool Irreversible = false; - bool PaddingBit = 0; - bool AssumesStructuredInput = false; + bool NeedsTraining = false; + bool PreservesPrefixGrouping = false; + bool Irreversible = false; + bool PaddingBit = 0; + bool AssumesStructuredInput = false; - size_t ApproximateSizeOnEncode(size_t sz) const { - return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition; - } + size_t ApproximateSizeOnEncode(size_t sz) const { + return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition; + } - size_t ApproximateSizeOnDecode(size_t sz) const { - return sz * SizeOnDecodeMultiplier; - } - }; + size_t ApproximateSizeOnDecode(size_t sz) const { + return sz * SizeOnDecodeMultiplier; + } + }; - class ICodec: public TAtomicRefCount<ICodec> { - protected: - bool Trained = false; - TCodecTraits MyTraits; + class ICodec: public TAtomicRefCount<ICodec> { + protected: + bool Trained = false; + TCodecTraits MyTraits; - public: - TCodecTraits Traits() const { - return MyTraits; - } + public: + TCodecTraits Traits() const { + return MyTraits; + } - // the name of the codec (or its variant) to be used in the codec registry - virtual TString GetName() const = 0; + // the name of the codec (or its variant) to be used in the codec registry + virtual TString GetName() const = 0; - virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0; + virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0; virtual ui8 Encode(const TBuffer& input, TBuffer& output) const { return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output); } - virtual void Decode(TStringBuf, TBuffer&) const = 0; + virtual void Decode(TStringBuf, TBuffer&) const = 0; virtual void Decode(const TBuffer& input, TBuffer& output) const { Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output); } - virtual ~ICodec() = default; + virtual ~ICodec() = default; - virtual bool AlreadyTrained() const { - return !Traits().NeedsTraining || Trained; - } - virtual void SetTrained(bool t) { - Trained = t; - } + virtual bool AlreadyTrained() const { + return !Traits().NeedsTraining || Trained; + } + virtual void SetTrained(bool t) { + Trained = t; + } bool TryToLearn(ISequenceReader& r) { Trained = DoTryToLearn(r); return Trained; } - void Learn(ISequenceReader& r) { - LearnX(r, 1); - } + void Learn(ISequenceReader& r) { + LearnX(r, 1); + } - template <class TIter> - void Learn(TIter beg, TIter end) { - Learn(beg, end, IterToStringBuf<TIter>); - } + template <class TIter> + void Learn(TIter beg, TIter end) { + Learn(beg, end, IterToStringBuf<TIter>); + } - template <class TIter, class TGetter> - void Learn(TIter beg, TIter end, TGetter getter) { - auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter); - TSimpleSequenceReader<TBuffer> reader{sample}; - Learn(reader); - } + template <class TIter, class TGetter> + void Learn(TIter beg, TIter end, TGetter getter) { + auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter); + TSimpleSequenceReader<TBuffer> reader{sample}; + Learn(reader); + } - static TCodecPtr GetInstance(TStringBuf name); + static TCodecPtr GetInstance(TStringBuf name); - static TVector<TString> GetCodecsList(); + static TVector<TString> GetCodecsList(); - static TString GetNameSafe(TCodecPtr p); + static TString GetNameSafe(TCodecPtr p); - static void Store(IOutputStream* out, TCodecPtr p); - static TCodecPtr Restore(IInputStream* in); - static TCodecPtr RestoreFromString(TStringBuf); + static void Store(IOutputStream* out, TCodecPtr p); + static TCodecPtr Restore(IInputStream* in); + static TCodecPtr RestoreFromString(TStringBuf); - protected: - virtual void DoLearn(ISequenceReader&) = 0; + protected: + virtual void DoLearn(ISequenceReader&) = 0; virtual bool DoTryToLearn(ISequenceReader& r) { DoLearn(r); return true; } - // so the pipeline codec will know to adjust the sample for the subcodecs - virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) { - DoLearn(r); - } - - virtual void Save(IOutputStream*) const { - } - virtual void Load(IInputStream*) { - } - friend class TPipelineCodec; - - public: - // so the pipeline codec will know to adjust the sample for the subcodecs - void LearnX(ISequenceReader& r, double sampleSizeMult) { - DoLearnX(r, sampleSizeMult); - Trained = true; - } - - template <class TIter> - void LearnX(TIter beg, TIter end, double sampleSizeMult) { - auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult); - TSimpleSequenceReader<TBuffer> reader{sample}; - LearnX(reader, sampleSizeMult); - } - }; - - class TBasicTrivialCodec: public ICodec { - public: - ui8 Encode(TStringBuf in, TBuffer& out) const override { + // so the pipeline codec will know to adjust the sample for the subcodecs + virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) { + DoLearn(r); + } + + virtual void Save(IOutputStream*) const { + } + virtual void Load(IInputStream*) { + } + friend class TPipelineCodec; + + public: + // so the pipeline codec will know to adjust the sample for the subcodecs + void LearnX(ISequenceReader& r, double sampleSizeMult) { + DoLearnX(r, sampleSizeMult); + Trained = true; + } + + template <class TIter> + void LearnX(TIter beg, TIter end, double sampleSizeMult) { + auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult); + TSimpleSequenceReader<TBuffer> reader{sample}; + LearnX(reader, sampleSizeMult); + } + }; + + class TBasicTrivialCodec: public ICodec { + public: + ui8 Encode(TStringBuf in, TBuffer& out) const override { out.Assign(in.data(), in.size()); - return 0; - } + return 0; + } - void Decode(TStringBuf in, TBuffer& out) const override { - Encode(in, out); - } + void Decode(TStringBuf in, TBuffer& out) const override { + Encode(in, out); + } - protected: - void DoLearn(ISequenceReader&) override { - } - }; + protected: + void DoLearn(ISequenceReader&) override { + } + }; - class TTrivialCodec: public TBasicTrivialCodec { - public: - TTrivialCodec() { - MyTraits.PreservesPrefixGrouping = true; - } + class TTrivialCodec: public TBasicTrivialCodec { + public: + TTrivialCodec() { + MyTraits.PreservesPrefixGrouping = true; + } - static TStringBuf MyName() { - return "trivial"; - } + static TStringBuf MyName() { + return "trivial"; + } - TString GetName() const override { + TString GetName() const override { return ToString(MyName()); - } - }; + } + }; - class TTrivialTrainableCodec: public TBasicTrivialCodec { - public: - TTrivialTrainableCodec() { - MyTraits.PreservesPrefixGrouping = true; - MyTraits.NeedsTraining = true; - } + class TTrivialTrainableCodec: public TBasicTrivialCodec { + public: + TTrivialTrainableCodec() { + MyTraits.PreservesPrefixGrouping = true; + MyTraits.NeedsTraining = true; + } - static TStringBuf MyName() { - return "trivial-trainable"; - } + static TStringBuf MyName() { + return "trivial-trainable"; + } - TString GetName() const override { + TString GetName() const override { return ToString(MyName()); - } - }; - - class TNullCodec: public ICodec { - public: - TNullCodec() { - MyTraits.Irreversible = true; - MyTraits.SizeOnDecodeMultiplier = 0; - MyTraits.SizeOnEncodeMultiplier = 0; - } - - TString GetName() const override { - return "null"; - } - - ui8 Encode(TStringBuf, TBuffer& out) const override { - out.Clear(); - return 0; - } - - void Decode(TStringBuf, TBuffer& out) const override { - out.Clear(); - } - - protected: - void DoLearn(ISequenceReader&) override { - } - }; - - class TPipelineCodec: public ICodec { - typedef TVector<TCodecPtr> TPipeline; - - TPipeline Pipeline; - TString MyName; - - public: - explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) { - MyTraits.PreservesPrefixGrouping = true; - AddCodec(c0); - AddCodec(c1); - AddCodec(c2); - AddCodec(c3); - } - - TString GetName() const override { - return MyName; - } - - ui8 Encode(TStringBuf in, TBuffer& out) const override; - void Decode(TStringBuf in, TBuffer& out) const override; - - public: - /* + } + }; + + class TNullCodec: public ICodec { + public: + TNullCodec() { + MyTraits.Irreversible = true; + MyTraits.SizeOnDecodeMultiplier = 0; + MyTraits.SizeOnEncodeMultiplier = 0; + } + + TString GetName() const override { + return "null"; + } + + ui8 Encode(TStringBuf, TBuffer& out) const override { + out.Clear(); + return 0; + } + + void Decode(TStringBuf, TBuffer& out) const override { + out.Clear(); + } + + protected: + void DoLearn(ISequenceReader&) override { + } + }; + + class TPipelineCodec: public ICodec { + typedef TVector<TCodecPtr> TPipeline; + + TPipeline Pipeline; + TString MyName; + + public: + explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) { + MyTraits.PreservesPrefixGrouping = true; + AddCodec(c0); + AddCodec(c1); + AddCodec(c2); + AddCodec(c3); + } + + TString GetName() const override { + return MyName; + } + + ui8 Encode(TStringBuf in, TBuffer& out) const override; + void Decode(TStringBuf in, TBuffer& out) const override; + + public: + /* * Add codecs in the following order: * uncompressed -> codec0 | codec1 | ... | codecN -> compressed */ - TPipelineCodec& AddCodec(TCodecPtr codec); + TPipelineCodec& AddCodec(TCodecPtr codec); - bool AlreadyTrained() const override; - void SetTrained(bool t) override; + bool AlreadyTrained() const override; + void SetTrained(bool t) override; - protected: - void DoLearn(ISequenceReader& in) override { - DoLearnX(in, 1); - } + protected: + void DoLearn(ISequenceReader& in) override { + DoLearnX(in, 1); + } - void DoLearnX(ISequenceReader& in, double sampleSizeMult) override; - void Save(IOutputStream* out) const override; - void Load(IInputStream* in) override; - }; + void DoLearnX(ISequenceReader& in, double sampleSizeMult) override; + void Save(IOutputStream* out) const override; + void Load(IInputStream* in) override; + }; } diff --git a/library/cpp/codecs/codecs_registry.cpp b/library/cpp/codecs/codecs_registry.cpp index 17d07062ab..c8941ec337 100644 --- a/library/cpp/codecs/codecs_registry.cpp +++ b/library/cpp/codecs/codecs_registry.cpp @@ -42,7 +42,7 @@ namespace NCodecs { } else { TPipelineCodec* pipe = new TPipelineCodec; - do { + do { TStringBuf v = name.NextTok(':'); pipe->AddCodec(GetCodec(v)); } while (name); @@ -64,7 +64,7 @@ namespace NCodecs { return vs; } - struct TSolarCodecFactory : ICodecFactory { + struct TSolarCodecFactory : ICodecFactory { TCodecPtr MakeCodec(TStringBuf name) const override { if (TSolarCodec::MyNameShortInt() == name) { return new TSolarCodecShortInt(); @@ -79,7 +79,7 @@ namespace NCodecs { } } - template <class TCodecCls> + template <class TCodecCls> TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const { if (TStringBuf("-8k") == type) { return new TCodecCls(1 << 13); @@ -117,7 +117,7 @@ namespace NCodecs { } }; - struct TZStdDictCodecFactory : ICodecFactory { + struct TZStdDictCodecFactory : ICodecFactory { TCodecPtr MakeCodec(TStringBuf name) const override { return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name)); } @@ -127,7 +127,7 @@ namespace NCodecs { } }; - struct TCompTableCodecFactory : ICodecFactory { + struct TCompTableCodecFactory : ICodecFactory { TCodecPtr MakeCodec(TStringBuf name) const override { if (TCompTableCodec::MyNameHQ() == name) { return new TCompTableCodec(TCompTableCodec::Q_HIGH); @@ -147,11 +147,11 @@ namespace NCodecs { } }; - struct TBlockCodec : ICodec { + struct TBlockCodec : ICodec { const NBlockCodecs::ICodec* Codec; TBlockCodec(TStringBuf name) - : Codec(NBlockCodecs::Codec(name)) + : Codec(NBlockCodecs::Codec(name)) { } @@ -174,11 +174,11 @@ namespace NCodecs { } }; - struct TBlockCodecsFactory : ICodecFactory { + struct TBlockCodecsFactory : ICodecFactory { using TRegistry = THashMap<TString, TCodecPtr>; TRegistry Registry; - TBlockCodecsFactory() { + TBlockCodecsFactory() { for (TStringBuf codec : NBlockCodecs::ListAllCodecs()) { Register(codec); } @@ -205,12 +205,12 @@ namespace NCodecs { } }; - TCodecRegistry::TCodecRegistry() { + TCodecRegistry::TCodecRegistry() { RegisterFactory(new TInstanceFactory<TTrivialCodec>); RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>); RegisterFactory(new TInstanceFactory<THuffmanCodec>); - RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>); - RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>); + RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>); + RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>); RegisterFactory(new TSolarCodecFactory); RegisterFactory(new TZStdDictCodecFactory); RegisterFactory(new TCompTableCodecFactory); diff --git a/library/cpp/codecs/codecs_registry.h b/library/cpp/codecs/codecs_registry.h index 53710310d5..abd4a38cc5 100644 --- a/library/cpp/codecs/codecs_registry.h +++ b/library/cpp/codecs/codecs_registry.h @@ -4,13 +4,13 @@ #include <util/string/cast.h> namespace NCodecs { - struct TNoCodecException : TCodecException { - TNoCodecException(TStringBuf name) { + struct TNoCodecException : TCodecException { + TNoCodecException(TStringBuf name) { (*this) << "unknown codec: " << name; } }; - struct ICodecFactory : TAtomicRefCount<ICodecFactory> { + struct ICodecFactory : TAtomicRefCount<ICodecFactory> { virtual ~ICodecFactory() = default; virtual TCodecPtr MakeCodec(TStringBuf name) const = 0; virtual TVector<TString> ListNames() const = 0; @@ -19,8 +19,8 @@ namespace NCodecs { typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr; namespace NPrivate { - template <typename TCodec> - struct TInstanceFactory : ICodecFactory { + template <typename TCodec> + struct TInstanceFactory : ICodecFactory { TCodecPtr MakeCodec(TStringBuf) const override { return new TCodec; } @@ -52,7 +52,7 @@ namespace NCodecs { void RegisterCodecFactory(TCodecFactoryPtr fact); - template <typename TCodec> + template <typename TCodec> void RegisterCodec() { RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>()); } diff --git a/library/cpp/codecs/comptable_codec.cpp b/library/cpp/codecs/comptable_codec.cpp index 476b8ada80..cf747121ba 100644 --- a/library/cpp/codecs/comptable_codec.cpp +++ b/library/cpp/codecs/comptable_codec.cpp @@ -4,12 +4,12 @@ #include <util/string/cast.h> namespace NCodecs { - class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> { + class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> { public: TImpl(EQuality q) : Quality(q) - { - } + { + } void Init() { Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table}); diff --git a/library/cpp/codecs/comptable_codec.h b/library/cpp/codecs/comptable_codec.h index 7ba4f4c543..d0f4361780 100644 --- a/library/cpp/codecs/comptable_codec.h +++ b/library/cpp/codecs/comptable_codec.h @@ -5,36 +5,36 @@ #include <util/generic/ptr.h> namespace NCodecs { - class TCompTableCodec: public ICodec { - class TImpl; - TIntrusivePtr<TImpl> Impl; + class TCompTableCodec: public ICodec { + class TImpl; + TIntrusivePtr<TImpl> Impl; - public: - enum EQuality { - Q_LOW = 0, - Q_HIGH = 1 - }; + public: + enum EQuality { + Q_LOW = 0, + Q_HIGH = 1 + }; - explicit TCompTableCodec(EQuality q = Q_HIGH); - ~TCompTableCodec() override; + explicit TCompTableCodec(EQuality q = Q_HIGH); + ~TCompTableCodec() override; - static TStringBuf MyNameHQ() { - return "comptable-hq"; - } - static TStringBuf MyNameLQ() { - return "comptable-lq"; - } + static TStringBuf MyNameHQ() { + return "comptable-hq"; + } + static TStringBuf MyNameLQ() { + return "comptable-lq"; + } - TString GetName() const override; + TString GetName() const override; - ui8 Encode(TStringBuf in, TBuffer& out) const override; + ui8 Encode(TStringBuf in, TBuffer& out) const override; - void Decode(TStringBuf in, TBuffer& out) const override; + void Decode(TStringBuf in, TBuffer& out) const override; - protected: - void DoLearn(ISequenceReader& in) override; - void Save(IOutputStream* out) const override; - void Load(IInputStream* in) override; - }; + protected: + void DoLearn(ISequenceReader& in) override; + void Save(IOutputStream* out) const override; + void Load(IInputStream* in) override; + }; } diff --git a/library/cpp/codecs/delta_codec.cpp b/library/cpp/codecs/delta_codec.cpp index 61606d6f6f..b9ed146dcb 100644 --- a/library/cpp/codecs/delta_codec.cpp +++ b/library/cpp/codecs/delta_codec.cpp @@ -1,21 +1,21 @@ #include "delta_codec.h" namespace NCodecs { - template <> - TStringBuf TDeltaCodec<ui64, true>::MyName() { - return "delta64-unsigned"; - } - template <> - TStringBuf TDeltaCodec<ui32, true>::MyName() { - return "delta32-unsigned"; - } - template <> - TStringBuf TDeltaCodec<ui64, false>::MyName() { - return "delta64-signed"; - } - template <> - TStringBuf TDeltaCodec<ui32, false>::MyName() { - return "delta32-signed"; - } + template <> + TStringBuf TDeltaCodec<ui64, true>::MyName() { + return "delta64-unsigned"; + } + template <> + TStringBuf TDeltaCodec<ui32, true>::MyName() { + return "delta32-unsigned"; + } + template <> + TStringBuf TDeltaCodec<ui64, false>::MyName() { + return "delta64-signed"; + } + template <> + TStringBuf TDeltaCodec<ui32, false>::MyName() { + return "delta32-signed"; + } } diff --git a/library/cpp/codecs/delta_codec.h b/library/cpp/codecs/delta_codec.h index 21325825e6..4e5dbb8f75 100644 --- a/library/cpp/codecs/delta_codec.h +++ b/library/cpp/codecs/delta_codec.h @@ -8,136 +8,136 @@ #include <util/string/cast.h> namespace NCodecs { - template <typename T = ui64, bool UnsignedDelta = true> - class TDeltaCodec: public ICodec { - static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value"); - - public: - using TUnsigned = std::make_unsigned_t<T>; - using TSigned = std::make_signed_t<T>; - using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>; - - private: - const TDelta MinDelta{Min<TDelta>()}; - const TDelta MaxDelta{Max<TDelta>() - 1}; - const TDelta InvalidDelta{MaxDelta + 1}; - - Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) { - return a + b; - } - - Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) { - return a - b; - } - - public: - struct TDecoder { - const TDelta InvalidDelta{Max<TDelta>()}; - - T Last = 0; - T Result = 0; - - bool First = true; - bool Invalid = false; - - Y_FORCE_INLINE bool Decode(TDelta t) { - if (Y_UNLIKELY(First)) { - First = false; - Result = Last = t; - return true; - } - - if (Y_UNLIKELY(Invalid)) { - Invalid = false; - Last = 0; - Result = t; - return true; - } - - Result = (Last += t); - Invalid = t == InvalidDelta; - - return !Invalid; + template <typename T = ui64, bool UnsignedDelta = true> + class TDeltaCodec: public ICodec { + static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value"); + + public: + using TUnsigned = std::make_unsigned_t<T>; + using TSigned = std::make_signed_t<T>; + using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>; + + private: + const TDelta MinDelta{Min<TDelta>()}; + const TDelta MaxDelta{Max<TDelta>() - 1}; + const TDelta InvalidDelta{MaxDelta + 1}; + + Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) { + return a + b; + } + + Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) { + return a - b; + } + + public: + struct TDecoder { + const TDelta InvalidDelta{Max<TDelta>()}; + + T Last = 0; + T Result = 0; + + bool First = true; + bool Invalid = false; + + Y_FORCE_INLINE bool Decode(TDelta t) { + if (Y_UNLIKELY(First)) { + First = false; + Result = Last = t; + return true; + } + + if (Y_UNLIKELY(Invalid)) { + Invalid = false; + Last = 0; + Result = t; + return true; + } + + Result = (Last += t); + Invalid = t == InvalidDelta; + + return !Invalid; } - }; + }; - public: - static TStringBuf MyName(); + public: + static TStringBuf MyName(); - TDeltaCodec() { - MyTraits.SizeOfInputElement = sizeof(T); - MyTraits.AssumesStructuredInput = true; + TDeltaCodec() { + MyTraits.SizeOfInputElement = sizeof(T); + MyTraits.AssumesStructuredInput = true; } - TString GetName() const override { + TString GetName() const override { return ToString(MyName()); - } + } - template <class TItem> - static void AppendTo(TBuffer& b, TItem t) { - b.Append((char*)&t, sizeof(t)); - } + template <class TItem> + static void AppendTo(TBuffer& b, TItem t) { + b.Append((char*)&t, sizeof(t)); + } - ui8 Encode(TStringBuf s, TBuffer& b) const override { - b.Clear(); + ui8 Encode(TStringBuf s, TBuffer& b) const override { + b.Clear(); if (s.empty()) { - return 0; - } + return 0; + } b.Reserve(s.size()); TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; const T* it = tin.begin(); - TDelta last = *(it++); - AppendTo(b, last); + TDelta last = *(it++); + AppendTo(b, last); - TDelta maxt = SubSafe(MaxDelta, last); - TDelta mint = AddSafe(MinDelta, last); + TDelta maxt = SubSafe(MaxDelta, last); + TDelta mint = AddSafe(MinDelta, last); for (; it != tin.end(); ++it) { - TDelta t = *it; - - if (Y_LIKELY((t >= mint) & (t <= maxt))) { - AppendTo(b, t - last); - last = t; - maxt = SubSafe(MaxDelta, last); - mint = AddSafe(MinDelta, last); - } else { - // delta overflow - AppendTo(b, InvalidDelta); - AppendTo(b, t); - last = 0; - maxt = MaxDelta; - mint = MinDelta; - } - } - - return 0; - } - - void Decode(TStringBuf s, TBuffer& b) const override { - b.Clear(); + TDelta t = *it; + + if (Y_LIKELY((t >= mint) & (t <= maxt))) { + AppendTo(b, t - last); + last = t; + maxt = SubSafe(MaxDelta, last); + mint = AddSafe(MinDelta, last); + } else { + // delta overflow + AppendTo(b, InvalidDelta); + AppendTo(b, t); + last = 0; + maxt = MaxDelta; + mint = MinDelta; + } + } + + return 0; + } + + void Decode(TStringBuf s, TBuffer& b) const override { + b.Clear(); if (s.empty()) { - return; + return; } b.Reserve(s.size()); TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; - TDecoder dec; + TDecoder dec; for (const T* it = tin.begin(); it != tin.end(); ++it) { - T tmp; - memcpy(&tmp, it, sizeof(tmp)); - if (dec.Decode(tmp)) { - AppendTo(b, dec.Result); - } + T tmp; + memcpy(&tmp, it, sizeof(tmp)); + if (dec.Decode(tmp)) { + AppendTo(b, dec.Result); + } } } - protected: - void DoLearn(ISequenceReader&) override { - } - }; + protected: + void DoLearn(ISequenceReader&) override { + } + }; } diff --git a/library/cpp/codecs/float_huffman.cpp b/library/cpp/codecs/float_huffman.cpp index c4a8bd228f..a95ca5b41d 100644 --- a/library/cpp/codecs/float_huffman.cpp +++ b/library/cpp/codecs/float_huffman.cpp @@ -55,7 +55,7 @@ namespace NCodecs::NFloatHuff { {0x3c000000, 0x12, 5, 24}, // [0.0078125, 0.03125), 29 bits, prefix [01001] {0x3b000000, 0x26, 6, 34}, // [0.001953125, end of range), 40 bits, prefix [011001] {0x00000000, 0x16, 5, 32}, // whole range, 37 bits, prefix [01101] - }; + }; [[noreturn]] Y_NO_INLINE void ThrowInvalidOffset(size_t size, size_t byteOffset) { ythrow yexception() << diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp index 561bfbca01..802c721753 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.cpp +++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp @@ -9,134 +9,134 @@ #include <util/system/rusage.h> namespace NGreedyDict { - void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { - if (!Current) { + void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { + if (!Current) { Current = MakeHolder<TEntrySet>(); - Current->InitWithAlpha(); - } + Current->InitWithAlpha(); + } - TEntrySet& set = *Current; + TEntrySet& set = *Current; - for (auto& it : set) - it.Count = 0; + for (auto& it : set) + it.Count = 0; - CompoundCounts = nullptr; - CompoundCountsPool.Clear(); + CompoundCounts = nullptr; + CompoundCountsPool.Clear(); - if (!final) { + if (!final) { CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool); - CompoundCounts->reserve(maxcand); - } + CompoundCounts->reserve(maxcand); + } - Shuffle(Input.begin(), Input.end(), Rng); + Shuffle(Input.begin(), Input.end(), Rng); - for (auto str : Input) { - if (!final && CompoundCounts->size() > maxcand) - break; + for (auto str : Input) { + if (!final && CompoundCounts->size() > maxcand) + break; - i32 prev = -1; + i32 prev = -1; - while (!!str) { - TEntry* e = set.FindPrefix(str); - ui32 num = e->Number; + while (!!str) { + TEntry* e = set.FindPrefix(str); + ui32 num = e->Number; - e->Count += 1; - if (!final && prev >= 0) { - (*CompoundCounts)[Compose(prev, num)] += 1; - } + e->Count += 1; + if (!final && prev >= 0) { + (*CompoundCounts)[Compose(prev, num)] += 1; + } - prev = num; - ++set.TotalCount; + prev = num; + ++set.TotalCount; } - } + } - Current->SetModelP(); + Current->SetModelP(); } - ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { - TAutoPtr<TEntrySet> newset = new TEntrySet; - newset->InitWithAlpha(); - maxent -= newset->size(); - - ui32 additions = 0; - ui32 deletions = 0; - - { - const TEntrySet& set = *Current; - - Candidates.clear(); - const ui32 total = set.TotalCount; - const float minpval = Settings.MinPValue; - const EEntryStatTest test = Settings.StatTest; - const EEntryScore score = Settings.Score; - const ui32 mincnt = Settings.MinAbsCount; - - for (const auto& it : set) { - const TEntry& e = it; - float modelp = e.ModelP; - ui32 cnt = e.Count; - - if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) - Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); - } - - if (!!CompoundCounts) { - for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { - const TEntry& prev = set.Get(Prev(it->first)); - const TEntry& next = set.Get(Next(it->first)); - float modelp = ModelP(prev.Count, next.Count, total); - ui32 cnt = it->second; - if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) - Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); - } + ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { + TAutoPtr<TEntrySet> newset = new TEntrySet; + newset->InitWithAlpha(); + maxent -= newset->size(); + + ui32 additions = 0; + ui32 deletions = 0; + + { + const TEntrySet& set = *Current; + + Candidates.clear(); + const ui32 total = set.TotalCount; + const float minpval = Settings.MinPValue; + const EEntryStatTest test = Settings.StatTest; + const EEntryScore score = Settings.Score; + const ui32 mincnt = Settings.MinAbsCount; + + for (const auto& it : set) { + const TEntry& e = it; + float modelp = e.ModelP; + ui32 cnt = e.Count; + + if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) + Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); + } + + if (!!CompoundCounts) { + for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { + const TEntry& prev = set.Get(Prev(it->first)); + const TEntry& next = set.Get(Next(it->first)); + float modelp = ModelP(prev.Count, next.Count, total); + ui32 cnt = it->second; + if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) + Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); + } } - Sort(Candidates.begin(), Candidates.end()); + Sort(Candidates.begin(), Candidates.end()); - if (Candidates.size() > maxent) - Candidates.resize(maxent); + if (Candidates.size() > maxent) + Candidates.resize(maxent); - for (const auto& candidate : Candidates) { - if (IsCompound(candidate.second)) { - additions++; - newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str); - } else { - newset->Add(set.Get(candidate.second).Str); - } + for (const auto& candidate : Candidates) { + if (IsCompound(candidate.second)) { + additions++; + newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str); + } else { + newset->Add(set.Get(candidate.second).Str); + } } - - deletions = set.size() - (newset->size() - additions); + + deletions = set.size() - (newset->size() - additions); } - Current = newset; - Current->BuildHierarchy(); - return deletions + additions; + Current = newset; + Current->BuildHierarchy(); + return deletions + additions; } - ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { - size_t totalsz = 0; - for (auto it : Input) + ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { + size_t totalsz = 0; + for (auto it : Input) totalsz += it.size(); - while (maxiters) { - maxiters--; + while (maxiters) { + maxiters--; - RebuildCounts(maxentries * Settings.GrowLimit, false); + RebuildCounts(maxentries * Settings.GrowLimit, false); - if (Settings.Verbose) { - TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); + if (Settings.Verbose) { + TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl; - } - - ui32 diff = BuildNextGeneration(maxentries); + } - if (Current->size() == maxentries && diff < mindiff) - break; + ui32 diff = BuildNextGeneration(maxentries); + + if (Current->size() == maxentries && diff < mindiff) + break; } - RebuildCounts(0, true); - Current->SetScores(Settings.Score); - return maxiters; + RebuildCounts(0, true); + Current->SetScores(Settings.Score); + return maxiters; } } diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h index b8e9a5e37b..ab0057e1ca 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.h +++ b/library/cpp/codecs/greedy_dict/gd_builder.h @@ -6,89 +6,89 @@ #include <util/random/fast.h> namespace NGreedyDict { - struct TBuildSettings { - EEntryStatTest StatTest = EST_SIMPLE_NORM; - EEntryScore Score = ES_LEN_SIMPLE; - - float MinPValue = 0.75; - ui32 MinAbsCount = 10; - ui32 GrowLimit = 10; // times of maxentries - bool Verbose = false; - }; - - class TDictBuilder { - using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>; - using TCandidate = std::pair<float, ui64>; - using TCandidates = TVector<TCandidate>; - - private: - TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb}; - TStringBufs Input; - - THolder<TEntrySet> Current; - - TMemoryPool CompoundCountsPool; - THolder<TCompoundCounts> CompoundCounts; - - TCandidates Candidates; - - TBuildSettings Settings; - - public: - TDictBuilder(const TBuildSettings& s = TBuildSettings()) - : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance()) - , Settings(s) - { - } - - void SetInput(const TStringBufs& in) { - Input = in; - } - - const TBuildSettings& GetSettings() const { - return Settings; - } - - TBuildSettings& GetSettings() { - return Settings; - } - - void SetSettings(const TBuildSettings& s) { - Settings = s; - } - - TEntrySet& EntrySet() { - return *Current; - } - - const TEntrySet& EntrySet() const { - return *Current; - } - - THolder<TEntrySet> ReleaseEntrySet() { - return std::move(Current); - } - - ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); - - public: - void RebuildCounts(ui32 maxcand, bool final); - ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); - - static bool IsCompound(ui64 ent) { - return ent & 0xFFFFFFFF00000000ULL; - } - - static ui32 Next(ui64 ent) { - return ent; - } - static ui32 Prev(ui64 ent) { - return (ent >> 32) - 1; - } - - static ui64 Compose(ui32 prev, ui32 next) { - return ((prev + 1ULL) << 32) | next; - } - }; + struct TBuildSettings { + EEntryStatTest StatTest = EST_SIMPLE_NORM; + EEntryScore Score = ES_LEN_SIMPLE; + + float MinPValue = 0.75; + ui32 MinAbsCount = 10; + ui32 GrowLimit = 10; // times of maxentries + bool Verbose = false; + }; + + class TDictBuilder { + using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>; + using TCandidate = std::pair<float, ui64>; + using TCandidates = TVector<TCandidate>; + + private: + TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb}; + TStringBufs Input; + + THolder<TEntrySet> Current; + + TMemoryPool CompoundCountsPool; + THolder<TCompoundCounts> CompoundCounts; + + TCandidates Candidates; + + TBuildSettings Settings; + + public: + TDictBuilder(const TBuildSettings& s = TBuildSettings()) + : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance()) + , Settings(s) + { + } + + void SetInput(const TStringBufs& in) { + Input = in; + } + + const TBuildSettings& GetSettings() const { + return Settings; + } + + TBuildSettings& GetSettings() { + return Settings; + } + + void SetSettings(const TBuildSettings& s) { + Settings = s; + } + + TEntrySet& EntrySet() { + return *Current; + } + + const TEntrySet& EntrySet() const { + return *Current; + } + + THolder<TEntrySet> ReleaseEntrySet() { + return std::move(Current); + } + + ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); + + public: + void RebuildCounts(ui32 maxcand, bool final); + ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); + + static bool IsCompound(ui64 ent) { + return ent & 0xFFFFFFFF00000000ULL; + } + + static ui32 Next(ui64 ent) { + return ent; + } + static ui32 Prev(ui64 ent) { + return (ent >> 32) - 1; + } + + static ui64 Compose(ui32 prev, ui32 next) { + return ((prev + 1ULL) << 32) | next; + } + }; } diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp index 2c315c7f7c..0603a9fca8 100644 --- a/library/cpp/codecs/greedy_dict/gd_entry.cpp +++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp @@ -5,94 +5,94 @@ #include <util/generic/singleton.h> namespace NGreedyDict { - class TAlphas { - char Memory[512]; - - public: - TStringBufs Alphas; - - TAlphas() { - for (ui32 i = 0; i < 256; ++i) { - Memory[2 * i] = (char)i; - Memory[2 * i + 1] = 0; - - Alphas.push_back(TStringBuf(&Memory[2 * i], 1)); - } + class TAlphas { + char Memory[512]; + + public: + TStringBufs Alphas; + + TAlphas() { + for (ui32 i = 0; i < 256; ++i) { + Memory[2 * i] = (char)i; + Memory[2 * i + 1] = 0; + + Alphas.push_back(TStringBuf(&Memory[2 * i], 1)); + } + } + }; + + void TEntrySet::InitWithAlpha() { + Pool.ClearKeepFirstChunk(); + const TStringBufs& a = Singleton<TAlphas>()->Alphas; + for (auto it : a) { + Add(it); } - }; - - void TEntrySet::InitWithAlpha() { - Pool.ClearKeepFirstChunk(); - const TStringBufs& a = Singleton<TAlphas>()->Alphas; - for (auto it : a) { - Add(it); - } - BuildHierarchy(); + BuildHierarchy(); } - void TEntrySet::BuildHierarchy() { - Sort(begin(), end(), TEntry::StrLess); + void TEntrySet::BuildHierarchy() { + Sort(begin(), end(), TEntry::StrLess); - TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED); + TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED); - for (iterator it = begin(); it != end(); ++it) { - it->Number = (it - begin()); - TStringBuf suff = it->Str; - size_t len = 0; - ui32 val = 0; + for (iterator it = begin(); it != end(); ++it) { + it->Number = (it - begin()); + TStringBuf suff = it->Str; + size_t len = 0; + ui32 val = 0; if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) { - it->NearestPrefix = val; - } + it->NearestPrefix = val; + } builder.Add(suff.data(), suff.size(), it->Number); } - TBufferOutput bout; - builder.Save(bout); - Trie.Init(TBlob::FromBuffer(bout.Buffer())); + TBufferOutput bout; + builder.Save(bout); + Trie.Init(TBlob::FromBuffer(bout.Buffer())); } - TEntry* TEntrySet::FindPrefix(TStringBuf& str) { - size_t len = 0; - ui32 off = 0; + TEntry* TEntrySet::FindPrefix(TStringBuf& str) { + size_t len = 0; + ui32 off = 0; - if (!Trie.FindLongestPrefix(str, &len, &off)) { - return nullptr; - } + if (!Trie.FindLongestPrefix(str, &len, &off)) { + return nullptr; + } - str.Skip(len); - return &Get(off); + str.Skip(len); + return &Get(off); } - void TEntrySet::SetModelP() { - for (iterator it = begin(); it != end(); ++it) { - TEntry& e = *it; + void TEntrySet::SetModelP() { + for (iterator it = begin(); it != end(); ++it) { + TEntry& e = *it; - if (!e.HasPrefix()) { - e.ModelP = 0; - continue; - } + if (!e.HasPrefix()) { + e.ModelP = 0; + continue; + } - TStringBuf suff = e.Str; - const TEntry& p = Get(e.NearestPrefix); - suff.Skip(p.Len()); + TStringBuf suff = e.Str; + const TEntry& p = Get(e.NearestPrefix); + suff.Skip(p.Len()); - float modelp = float(p.Count + e.Count) / TotalCount; + float modelp = float(p.Count + e.Count) / TotalCount; - while (!!suff) { - TEntry* pp = FindPrefix(suff); - modelp *= float(pp->Count + e.Count) / TotalCount; - } + while (!!suff) { + TEntry* pp = FindPrefix(suff); + modelp *= float(pp->Count + e.Count) / TotalCount; + } - e.ModelP = modelp; + e.ModelP = modelp; } } - void TEntrySet::SetScores(EEntryScore s) { - for (auto& it : *this) { - it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount); - } + void TEntrySet::SetScores(EEntryScore s) { + for (auto& it : *this) { + it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount); + } } } diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h index 18b5be0e15..e123c66b4a 100644 --- a/library/cpp/codecs/greedy_dict/gd_entry.h +++ b/library/cpp/codecs/greedy_dict/gd_entry.h @@ -11,93 +11,93 @@ #include <util/memory/pool.h> namespace NGreedyDict { - using TStringBufs = TVector<TStringBuf>; + using TStringBufs = TVector<TStringBuf>; - struct TEntry { - static const i32 NoPrefix = -1; + struct TEntry { + static const i32 NoPrefix = -1; - TStringBuf Str; + TStringBuf Str; - i32 NearestPrefix = NoPrefix; - ui32 Count = 0; - ui32 Number = 0; - float ModelP = 0; - float Score = 0; + i32 NearestPrefix = NoPrefix; + ui32 Count = 0; + ui32 Number = 0; + float ModelP = 0; + float Score = 0; - TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0) - : Str(b) - , Count(cnt) - { - } + TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0) + : Str(b) + , Count(cnt) + { + } - bool HasPrefix() const { - return NearestPrefix != NoPrefix; - } - ui32 Len() const { + bool HasPrefix() const { + return NearestPrefix != NoPrefix; + } + ui32 Len() const { return Str.size(); - } + } - static bool StrLess(const TEntry& a, const TEntry& b) { - return a.Str < b.Str; - } - static bool NumberLess(const TEntry& a, const TEntry& b) { - return a.Number < b.Number; - } - static bool ScoreMore(const TEntry& a, const TEntry& b) { - return a.Score > b.Score; - } - }; + static bool StrLess(const TEntry& a, const TEntry& b) { + return a.Str < b.Str; + } + static bool NumberLess(const TEntry& a, const TEntry& b) { + return a.Number < b.Number; + } + static bool ScoreMore(const TEntry& a, const TEntry& b) { + return a.Score > b.Score; + } + }; - class TEntrySet: public TVector<TEntry>, TNonCopyable { - TMemoryPool Pool{8112}; - TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie; + class TEntrySet: public TVector<TEntry>, TNonCopyable { + TMemoryPool Pool{8112}; + TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie; - public: - ui32 TotalCount = 0; + public: + ui32 TotalCount = 0; - void InitWithAlpha(); + void InitWithAlpha(); - void Add(TStringBuf a) { + void Add(TStringBuf a) { push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size())); - } + } - void Add(TStringBuf a, TStringBuf b) { + void Add(TStringBuf a, TStringBuf b) { size_t sz = a.size() + b.size(); - char* p = (char*)Pool.Allocate(sz); + char* p = (char*)Pool.Allocate(sz); memcpy(p, a.data(), a.size()); memcpy(p + a.size(), b.data(), b.size()); - push_back(TStringBuf(p, sz)); - } + push_back(TStringBuf(p, sz)); + } - TEntry& Get(ui32 idx) { - return (*this)[idx]; - } + TEntry& Get(ui32 idx) { + return (*this)[idx]; + } - const TEntry& Get(ui32 idx) const { - return (*this)[idx]; - } + const TEntry& Get(ui32 idx) const { + return (*this)[idx]; + } - void BuildHierarchy(); + void BuildHierarchy(); - // longest prefix - TEntry* FindPrefix(TStringBuf& str); + // longest prefix + TEntry* FindPrefix(TStringBuf& str); - const TEntry* FindPrefix(TStringBuf& str) const { - return ((TEntrySet*)this)->FindPrefix(str); - } + const TEntry* FindPrefix(TStringBuf& str) const { + return ((TEntrySet*)this)->FindPrefix(str); + } - const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) { - if (!e.HasPrefix()) - return nullptr; + const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) { + if (!e.HasPrefix()) + return nullptr; - const TEntry& p = Get(e.NearestPrefix); - suff = e.Str; + const TEntry& p = Get(e.NearestPrefix); + suff = e.Str; suff.Skip(p.Str.size()); - return &p; - } + return &p; + } - void SetModelP(); - void SetScores(EEntryScore); - }; + void SetModelP(); + void SetScores(EEntryScore); + }; } diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h index b63c4c38d2..90f46a0fb9 100644 --- a/library/cpp/codecs/greedy_dict/gd_stats.h +++ b/library/cpp/codecs/greedy_dict/gd_stats.h @@ -1,78 +1,78 @@ #pragma once -#include <util/generic/ymath.h> +#include <util/generic/ymath.h> #include <util/generic/algorithm.h> #include <util/generic/yexception.h> namespace NGreedyDict { - enum EEntryScore { - ES_COUNT, - ES_LEN_COUNT, - ES_SIMPLE, - ES_LEN_SIMPLE, - ES_SOLAR - }; + enum EEntryScore { + ES_COUNT, + ES_LEN_COUNT, + ES_SIMPLE, + ES_LEN_SIMPLE, + ES_SOLAR + }; - enum EEntryStatTest { - EST_NONE = 0, - EST_SIMPLE_NORM = 2 - }; + enum EEntryStatTest { + EST_NONE = 0, + EST_SIMPLE_NORM = 2 + }; - inline float ModelP(ui32 countA, ui32 countB, ui32 total) { - return float(countA) * countB / total / total; - } + inline float ModelP(ui32 countA, ui32 countB, ui32 total) { + return float(countA) * countB / total / total; + } - // P (ab | dependent) - inline float SimpleTest(float modelp, ui32 countAB, ui32 total) { - float realp = float(countAB) / total; - return modelp >= realp ? 0 : (realp - modelp); - } + // P (ab | dependent) + inline float SimpleTest(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (realp - modelp); + } - inline float SolarTest(float modelp, ui32 countAB, ui32 total) { - float realp = float(countAB) / total; - return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1)); - } + inline float SolarTest(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1)); + } - // P (ab | dependent) / P (ab) - inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) { - float realp = float(countAB) / total; - return modelp >= realp ? 0 : (realp - modelp) / realp; - } + // P (ab | dependent) / P (ab) + inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) { + float realp = float(countAB) / total; + return modelp >= realp ? 0 : (realp - modelp) / realp; + } - inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) { - if (!total) { - return 0; - } - switch (test) { - case EST_NONE: - return 1; - case EST_SIMPLE_NORM: - return SimpleTestNorm(modelp, countAB, total); - } - Y_FAIL("no way!"); + inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) { + if (!total) { + return 0; + } + switch (test) { + case EST_NONE: + return 1; + case EST_SIMPLE_NORM: + return SimpleTestNorm(modelp, countAB, total); + } + Y_FAIL("no way!"); return 0; } - inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) { - if (!total) { - return 0; - } - ui32 m = 1; - switch (score) { - case ES_LEN_COUNT: - m = len; + inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) { + if (!total) { + return 0; + } + ui32 m = 1; + switch (score) { + case ES_LEN_COUNT: + m = len; [[fallthrough]]; - case ES_COUNT: - return m * count; - case ES_LEN_SIMPLE: - m = len; + case ES_COUNT: + return m * count; + case ES_LEN_SIMPLE: + m = len; [[fallthrough]]; - case ES_SIMPLE: - return m * SimpleTest(modelp, count, total); - case ES_SOLAR: - return SolarTest(modelp, count, total); - } - Y_FAIL("no way!"); + case ES_SIMPLE: + return m * SimpleTest(modelp, count, total); + case ES_SOLAR: + return SolarTest(modelp, count, total); + } + Y_FAIL("no way!"); return 0; } diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp index 679089a11b..e33976d333 100644 --- a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp +++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp @@ -6,11 +6,11 @@ #include <util/generic/ymath.h> class TGreedyDictTest: public TTestBase { - UNIT_TEST_SUITE(TGreedyDictTest); + UNIT_TEST_SUITE(TGreedyDictTest); UNIT_TEST(TestEntrySet) UNIT_TEST(TestBuilder0) UNIT_TEST(TestBuilder) - UNIT_TEST_SUITE_END(); + UNIT_TEST_SUITE_END(); void TestEntrySet() { using namespace NGreedyDict; @@ -120,7 +120,7 @@ class TGreedyDictTest: public TTestBase { } void FillData(NGreedyDict::TStringBufs& data) { - static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"}; + static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"}; data.clear(); data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls)); } @@ -128,7 +128,7 @@ class TGreedyDictTest: public TTestBase { typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict; TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s, - TDict& res) { + TDict& res) { using namespace NGreedyDict; TStringBufs data; diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp index 650fe7cdfd..c8b126ccd0 100644 --- a/library/cpp/codecs/huffman_codec.cpp +++ b/library/cpp/codecs/huffman_codec.cpp @@ -9,584 +9,584 @@ #include <util/string/printf.h> namespace NCodecs { - template <typename T> - struct TCanonicalCmp { - bool operator()(const T& a, const T& b) const { - if (a.CodeLength == b.CodeLength) { - return a.Char < b.Char; - } else { - return a.CodeLength < b.CodeLength; - } - } - }; - - template <typename T> - struct TByCharCmp { - bool operator()(const T& a, const T& b) const { + template <typename T> + struct TCanonicalCmp { + bool operator()(const T& a, const T& b) const { + if (a.CodeLength == b.CodeLength) { + return a.Char < b.Char; + } else { + return a.CodeLength < b.CodeLength; + } + } + }; + + template <typename T> + struct TByCharCmp { + bool operator()(const T& a, const T& b) const { return a.Char < b.Char; } - }; + }; - struct TTreeEntry { - static const ui32 InvalidBranch = (ui32)-1; + struct TTreeEntry { + static const ui32 InvalidBranch = (ui32)-1; - ui64 Freq = 0; - ui32 Branches[2]{InvalidBranch, InvalidBranch}; + ui64 Freq = 0; + ui32 Branches[2]{InvalidBranch, InvalidBranch}; - ui32 CodeLength = 0; - ui8 Char = 0; - bool Invalid = false; + ui32 CodeLength = 0; + ui8 Char = 0; + bool Invalid = false; - TTreeEntry() = default; + TTreeEntry() = default; - static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) { - return a.Freq < b.Freq; - } + static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) { + return a.Freq < b.Freq; + } - static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) { - return a.Freq > b.Freq; - } - }; + static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) { + return a.Freq > b.Freq; + } + }; - using TCodeTree = TVector<TTreeEntry>; + using TCodeTree = TVector<TTreeEntry>; - void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) { - tree.reserve(255 * 256 / 2); // worst case - balanced tree + void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) { + tree.reserve(255 * 256 / 2); // worst case - balanced tree - for (ui32 i = 0; i < 256; ++i) { - tree.emplace_back(); - tree.back().Char = i; - tree.back().Freq = freqs[i]; - } + for (ui32 i = 0; i < 256; ++i) { + tree.emplace_back(); + tree.back().Char = i; + tree.back().Freq = freqs[i]; + } - StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq); + StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq); } - void InitTree(TCodeTree& tree, ISequenceReader* in) { - using namespace NPrivate; - ui64 freqs[256]; - Zero(freqs); + void InitTree(TCodeTree& tree, ISequenceReader* in) { + using namespace NPrivate; + ui64 freqs[256]; + Zero(freqs); - TStringBuf r; - while (in->NextRegion(r)) { + TStringBuf r; + while (in->NextRegion(r)) { for (ui64 i = 0; i < r.size(); ++i) - ++freqs[(ui8)r[i]]; - } + ++freqs[(ui8)r[i]]; + } - InitTreeByFreqs(tree, freqs); + InitTreeByFreqs(tree, freqs); } - void CalculateCodeLengths(TCodeTree& tree) { - Y_ENSURE(tree.size() == 256, " "); - const ui32 firstbranch = tree.size(); + void CalculateCodeLengths(TCodeTree& tree) { + Y_ENSURE(tree.size() == 256, " "); + const ui32 firstbranch = tree.size(); - ui32 curleaf = 0; - ui32 curbranch = firstbranch; + ui32 curleaf = 0; + ui32 curbranch = firstbranch; - // building code tree. two priority queues are combined in one. - while (firstbranch - curleaf + tree.size() - curbranch >= 2) { - TTreeEntry e; + // building code tree. two priority queues are combined in one. + while (firstbranch - curleaf + tree.size() - curbranch >= 2) { + TTreeEntry e; - for (auto& branche : e.Branches) { - ui32 br; + for (auto& branche : e.Branches) { + ui32 br; - if (curleaf >= firstbranch) - br = curbranch++; - else if (curbranch >= tree.size()) - br = curleaf++; - else if (tree[curleaf].Freq < tree[curbranch].Freq) - br = curleaf++; - else - br = curbranch++; + if (curleaf >= firstbranch) + br = curbranch++; + else if (curbranch >= tree.size()) + br = curleaf++; + else if (tree[curleaf].Freq < tree[curbranch].Freq) + br = curleaf++; + else + br = curbranch++; - Y_ENSURE(br < tree.size(), " "); - branche = br; - e.Freq += tree[br].Freq; - } + Y_ENSURE(br < tree.size(), " "); + branche = br; + e.Freq += tree[br].Freq; + } - tree.push_back(e); - PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev); + tree.push_back(e); + PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev); } - // computing code lengths - for (ui64 i = tree.size() - 1; i >= firstbranch; --i) { - TTreeEntry e = tree[i]; + // computing code lengths + for (ui64 i = tree.size() - 1; i >= firstbranch; --i) { + TTreeEntry e = tree[i]; - for (auto branche : e.Branches) - tree[branche].CodeLength = e.CodeLength + 1; - } - - // chopping off the branches - tree.resize(firstbranch); + for (auto branche : e.Branches) + tree[branche].CodeLength = e.CodeLength + 1; + } - Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>()); + // chopping off the branches + tree.resize(firstbranch); - // simplification: we are stripping codes longer than 64 bits - while (!tree.empty() && tree.back().CodeLength > 64) - tree.pop_back(); + Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>()); - // will not compress - if (tree.empty()) - return; + // simplification: we are stripping codes longer than 64 bits + while (!tree.empty() && tree.back().CodeLength > 64) + tree.pop_back(); - // special invalid code word - tree.back().Invalid = true; - } + // will not compress + if (tree.empty()) + return; - struct TEncoderEntry { - ui64 Code = 0; + // special invalid code word + tree.back().Invalid = true; + } - ui8 CodeLength = 0; - ui8 Char = 0; - ui8 Invalid = true; + struct TEncoderEntry { + ui64 Code = 0; - explicit TEncoderEntry(TTreeEntry e) - : CodeLength(e.CodeLength) - , Char(e.Char) - , Invalid(e.Invalid) - { - } + ui8 CodeLength = 0; + ui8 Char = 0; + ui8 Invalid = true; - TEncoderEntry() = default; - }; + explicit TEncoderEntry(TTreeEntry e) + : CodeLength(e.CodeLength) + , Char(e.Char) + , Invalid(e.Invalid) + { + } - struct TEncoderTable { - TEncoderEntry Entries[256]; + TEncoderEntry() = default; + }; - void Save(IOutputStream* out) const { - ui16 nval = 0; + struct TEncoderTable { + TEncoderEntry Entries[256]; - for (auto entrie : Entries) - nval += !entrie.Invalid; - - ::Save(out, nval); - - for (auto entrie : Entries) { - if (!entrie.Invalid) { - ::Save(out, entrie.Char); - ::Save(out, entrie.CodeLength); - } - } - } + void Save(IOutputStream* out) const { + ui16 nval = 0; - void Load(IInputStream* in) { - ui16 nval = 0; - ::Load(in, nval); + for (auto entrie : Entries) + nval += !entrie.Invalid; - for (ui32 i = 0; i < 256; ++i) - Entries[i].Char = i; + ::Save(out, nval); - for (ui32 i = 0; i < nval; ++i) { - ui8 ch = 0; - ui8 len = 0; - ::Load(in, ch); - ::Load(in, len); - Entries[ch].CodeLength = len; - Entries[ch].Invalid = false; + for (auto entrie : Entries) { + if (!entrie.Invalid) { + ::Save(out, entrie.Char); + ::Save(out, entrie.CodeLength); + } } } - }; - - struct TDecoderEntry { - ui32 NextTable : 10; - ui32 Char : 8; - ui32 Invalid : 1; - ui32 Bad : 1; - - TDecoderEntry() - : NextTable() - , Char() - , Invalid() - , Bad() - { - } - }; - - struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> { - ui64 Length = 0; - ui64 BaseCode = 0; - TDecoderEntry Entries[256]; - - TDecoderTable() { - Zero(Entries); + void Load(IInputStream* in) { + ui16 nval = 0; + ::Load(in, nval); + + for (ui32 i = 0; i < 256; ++i) + Entries[i].Char = i; + + for (ui32 i = 0; i < nval; ++i) { + ui8 ch = 0; + ui8 len = 0; + ::Load(in, ch); + ::Load(in, len); + Entries[ch].CodeLength = len; + Entries[ch].Invalid = false; + } } - }; - - const int CACHE_BITS_COUNT = 16; - class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> { - TEncoderTable Encoder; - TDecoderTable Decoder[256]; - - TEncoderEntry Invalid; - - ui32 SubTablesNum; - - class THuffmanCache { - struct TCacheEntry { - int EndOffset : 24; - int BitsLeft : 8; - }; - TVector<char> DecodeCache; - TVector<TCacheEntry> CacheEntries; - const TImpl& Original; - - public: - THuffmanCache(const THuffmanCodec::TImpl& encoder); - - void Decode(NBitIO::TBitInput& in, TBuffer& out) const; + }; + + struct TDecoderEntry { + ui32 NextTable : 10; + ui32 Char : 8; + ui32 Invalid : 1; + ui32 Bad : 1; + + TDecoderEntry() + : NextTable() + , Char() + , Invalid() + , Bad() + { + } + }; + + struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> { + ui64 Length = 0; + ui64 BaseCode = 0; + + TDecoderEntry Entries[256]; + + TDecoderTable() { + Zero(Entries); + } + }; + + const int CACHE_BITS_COUNT = 16; + class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> { + TEncoderTable Encoder; + TDecoderTable Decoder[256]; + + TEncoderEntry Invalid; + + ui32 SubTablesNum; + + class THuffmanCache { + struct TCacheEntry { + int EndOffset : 24; + int BitsLeft : 8; + }; + TVector<char> DecodeCache; + TVector<TCacheEntry> CacheEntries; + const TImpl& Original; + + public: + THuffmanCache(const THuffmanCodec::TImpl& encoder); + + void Decode(NBitIO::TBitInput& in, TBuffer& out) const; }; - THolder<THuffmanCache> Cache; + THolder<THuffmanCache> Cache; - public: - TImpl() - : SubTablesNum(1) - { - Invalid.CodeLength = 255; - } + public: + TImpl() + : SubTablesNum(1) + { + Invalid.CodeLength = 255; + } - ui8 Encode(TStringBuf in, TBuffer& out) const { - out.Clear(); + ui8 Encode(TStringBuf in, TBuffer& out) const { + out.Clear(); if (in.empty()) { - return 0; - } + return 0; + } out.Reserve(in.size() * 2); - { - NBitIO::TBitOutputVector<TBuffer> bout(&out); - TStringBuf tin = in; + { + NBitIO::TBitOutputVector<TBuffer> bout(&out); + TStringBuf tin = in; - // data is under compression - bout.Write(1, 1); + // data is under compression + bout.Write(1, 1); - for (auto t : tin) { - const TEncoderEntry& ce = Encoder.Entries[(ui8)t]; + for (auto t : tin) { + const TEncoderEntry& ce = Encoder.Entries[(ui8)t]; - bout.Write(ce.Code, ce.CodeLength); + bout.Write(ce.Code, ce.CodeLength); - if (ce.Invalid) { - bout.Write(t, 8); - } - } + if (ce.Invalid) { + bout.Write(t, 8); + } + } - // in canonical huffman coding there cannot be a code having no 0 in the suffix - // and shorter than 8 bits. - bout.Write((ui64)-1, bout.GetByteReminder()); - return bout.GetByteReminder(); + // in canonical huffman coding there cannot be a code having no 0 in the suffix + // and shorter than 8 bits. + bout.Write((ui64)-1, bout.GetByteReminder()); + return bout.GetByteReminder(); } } - void Decode(TStringBuf in, TBuffer& out) const { - out.Clear(); + void Decode(TStringBuf in, TBuffer& out) const { + out.Clear(); if (in.empty()) { - return; - } + return; + } - NBitIO::TBitInput bin(in); - ui64 f = 0; - bin.ReadK<1>(f); + NBitIO::TBitInput bin(in); + ui64 f = 0; + bin.ReadK<1>(f); - // if data is uncompressed - if (!f) { - in.Skip(1); + // if data is uncompressed + if (!f) { + in.Skip(1); out.Append(in.data(), in.size()); - } else { + } else { out.Reserve(in.size() * 8); - if (Cache.Get()) { - Cache->Decode(bin, out); - } else { - while (ReadNextChar(bin, out)) { - } + if (Cache.Get()) { + Cache->Decode(bin, out); + } else { + while (ReadNextChar(bin, out)) { + } } } } - Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const { - const TDecoderTable* table = Decoder; - TDecoderEntry e; + Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const { + const TDecoderTable* table = Decoder; + TDecoderEntry e; - int bitsRead = 0; - while (true) { - ui64 code = 0; + int bitsRead = 0; + while (true) { + ui64 code = 0; - if (Y_UNLIKELY(!bin.Read(code, table->Length))) - return 0; - bitsRead += table->Length; + if (Y_UNLIKELY(!bin.Read(code, table->Length))) + return 0; + bitsRead += table->Length; - if (Y_UNLIKELY(code < table->BaseCode)) - return 0; + if (Y_UNLIKELY(code < table->BaseCode)) + return 0; - code -= table->BaseCode; + code -= table->BaseCode; - if (Y_UNLIKELY(code > 255)) - return 0; + if (Y_UNLIKELY(code > 255)) + return 0; - e = table->Entries[code]; + e = table->Entries[code]; - if (Y_UNLIKELY(e.Bad)) - return 0; + if (Y_UNLIKELY(e.Bad)) + return 0; - if (e.NextTable) { - table = Decoder + e.NextTable; + if (e.NextTable) { + table = Decoder + e.NextTable; } else { - if (e.Invalid) { - code = 0; - bin.ReadK<8>(code); - bitsRead += 8; - out.Append((ui8)code); - } else { - out.Append((ui8)e.Char); - } - - return bitsRead; + if (e.Invalid) { + code = 0; + bin.ReadK<8>(code); + bitsRead += 8; + out.Append((ui8)code); + } else { + out.Append((ui8)e.Char); + } + + return bitsRead; } - } + } - Y_ENSURE(false, " could not decode input"); - return 0; + Y_ENSURE(false, " could not decode input"); + return 0; } - void GenerateEncoder(TCodeTree& tree) { - const ui64 sz = tree.size(); + void GenerateEncoder(TCodeTree& tree) { + const ui64 sz = tree.size(); - TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]); + TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]); - for (ui32 i = 1; i < sz; ++i) { - const TTreeEntry& te = tree[i]; - TEncoderEntry& e = Encoder.Entries[te.Char]; - e = TEncoderEntry(te); + for (ui32 i = 1; i < sz; ++i) { + const TTreeEntry& te = tree[i]; + TEncoderEntry& e = Encoder.Entries[te.Char]; + e = TEncoderEntry(te); - e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); - lastcode = e; + e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); + lastcode = e; - e.Code = ReverseBits(e.Code, e.CodeLength); - - if (e.Invalid) - Invalid = e; - } + e.Code = ReverseBits(e.Code, e.CodeLength); - for (auto& e : Encoder.Entries) { - if (e.Invalid) - e = Invalid; + if (e.Invalid) + Invalid = e; + } - Y_ENSURE(e.CodeLength, " "); - } + for (auto& e : Encoder.Entries) { + if (e.Invalid) + e = Invalid; + + Y_ENSURE(e.CodeLength, " "); + } } - void RegenerateEncoder() { - for (auto& entrie : Encoder.Entries) { - if (entrie.Invalid) - entrie.CodeLength = Invalid.CodeLength; - } + void RegenerateEncoder() { + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) + entrie.CodeLength = Invalid.CodeLength; + } - Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>()); + Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>()); - TEncoderEntry lastcode = Encoder.Entries[0]; + TEncoderEntry lastcode = Encoder.Entries[0]; - for (ui32 i = 1; i < 256; ++i) { - TEncoderEntry& e = Encoder.Entries[i]; - e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); - lastcode = e; + for (ui32 i = 1; i < 256; ++i) { + TEncoderEntry& e = Encoder.Entries[i]; + e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); + lastcode = e; - e.Code = ReverseBits(e.Code, e.CodeLength); - } + e.Code = ReverseBits(e.Code, e.CodeLength); + } - for (auto& entrie : Encoder.Entries) { - if (entrie.Invalid) { - Invalid = entrie; - break; - } - } + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) { + Invalid = entrie; + break; + } + } - Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>()); + Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>()); - for (auto& entrie : Encoder.Entries) { - if (entrie.Invalid) - entrie = Invalid; + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) + entrie = Invalid; } } - void BuildDecoder() { - TEncoderTable enc = Encoder; - Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>()); - - TEncoderEntry& e1 = enc.Entries[0]; - Decoder[0].BaseCode = e1.Code; - Decoder[0].Length = e1.CodeLength; - - for (auto e2 : enc.Entries) { - SetEntry(Decoder, e2.Code, e2.CodeLength, e2); - } - Cache.Reset(new THuffmanCache(*this)); + void BuildDecoder() { + TEncoderTable enc = Encoder; + Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>()); + + TEncoderEntry& e1 = enc.Entries[0]; + Decoder[0].BaseCode = e1.Code; + Decoder[0].Length = e1.CodeLength; + + for (auto e2 : enc.Entries) { + SetEntry(Decoder, e2.Code, e2.CodeLength, e2); + } + Cache.Reset(new THuffmanCache(*this)); } - void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) { - Y_ENSURE(len >= t->Length, len << " < " << t->Length); + void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) { + Y_ENSURE(len >= t->Length, len << " < " << t->Length); - ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode; - TDecoderEntry& d = t->Entries[idx]; + ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode; + TDecoderEntry& d = t->Entries[idx]; - if (len == t->Length) { - Y_ENSURE(!d.NextTable, " "); + if (len == t->Length) { + Y_ENSURE(!d.NextTable, " "); - d.Char = e.Char; - d.Invalid = e.Invalid; - return; - } + d.Char = e.Char; + d.Invalid = e.Invalid; + return; + } - if (!d.NextTable) { - Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " "); - d.NextTable = SubTablesNum++; - TDecoderTable* nt = Decoder + d.NextTable; - nt->Length = Min<ui64>(8, len - t->Length); - nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length); - } + if (!d.NextTable) { + Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " "); + d.NextTable = SubTablesNum++; + TDecoderTable* nt = Decoder + d.NextTable; + nt->Length = Min<ui64>(8, len - t->Length); + nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length); + } - SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e); + SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e); } - void Learn(ISequenceReader* in) { - { - TCodeTree tree; - InitTree(tree, in); - CalculateCodeLengths(tree); - Y_ENSURE(!tree.empty(), " "); - GenerateEncoder(tree); - } - BuildDecoder(); + void Learn(ISequenceReader* in) { + { + TCodeTree tree; + InitTree(tree, in); + CalculateCodeLengths(tree); + Y_ENSURE(!tree.empty(), " "); + GenerateEncoder(tree); + } + BuildDecoder(); } void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { TCodeTree tree; - ui64 freqsArray[256]; - Zero(freqsArray); + ui64 freqsArray[256]; + Zero(freqsArray); - for (const auto& freq : freqs) - freqsArray[static_cast<ui8>(freq.first)] += freq.second; + for (const auto& freq : freqs) + freqsArray[static_cast<ui8>(freq.first)] += freq.second; - InitTreeByFreqs(tree, freqsArray); - CalculateCodeLengths(tree); + InitTreeByFreqs(tree, freqsArray); + CalculateCodeLengths(tree); - Y_ENSURE(!tree.empty(), " "); + Y_ENSURE(!tree.empty(), " "); - GenerateEncoder(tree); - BuildDecoder(); - } + GenerateEncoder(tree); + BuildDecoder(); + } - void Save(IOutputStream* out) { - ::Save(out, Invalid.CodeLength); - Encoder.Save(out); - } + void Save(IOutputStream* out) { + ::Save(out, Invalid.CodeLength); + Encoder.Save(out); + } - void Load(IInputStream* in) { - ::Load(in, Invalid.CodeLength); - Encoder.Load(in); - RegenerateEncoder(); - BuildDecoder(); - } - }; + void Load(IInputStream* in) { + ::Load(in, Invalid.CodeLength); + Encoder.Load(in); + RegenerateEncoder(); + BuildDecoder(); + } + }; - THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec) - : Original(codec) - { - CacheEntries.resize(1 << CACHE_BITS_COUNT); + THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec) + : Original(codec) + { + CacheEntries.resize(1 << CACHE_BITS_COUNT); DecodeCache.reserve(CacheEntries.size() * 2); - char buffer[2]; - TBuffer decoded; + char buffer[2]; + TBuffer decoded; for (size_t i = 0; i < CacheEntries.size(); i++) { - buffer[1] = i >> 8; - buffer[0] = i; - NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer)); - int totalBits = 0; - while (true) { - decoded.Resize(0); - int bits = codec.ReadNextChar(bin, decoded); - if (totalBits + bits > 16 || !bits) { - TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits}; - CacheEntries[i] = e; - break; - } - - for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) { - DecodeCache.push_back(*it); - } - totalBits += bits; + buffer[1] = i >> 8; + buffer[0] = i; + NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer)); + int totalBits = 0; + while (true) { + decoded.Resize(0); + int bits = codec.ReadNextChar(bin, decoded); + if (totalBits + bits > 16 || !bits) { + TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits}; + CacheEntries[i] = e; + break; + } + + for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) { + DecodeCache.push_back(*it); + } + totalBits += bits; } } - DecodeCache.push_back(0); - CacheEntries.shrink_to_fit(); - DecodeCache.shrink_to_fit(); + DecodeCache.push_back(0); + CacheEntries.shrink_to_fit(); + DecodeCache.shrink_to_fit(); } - void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const { - int bits = 0; - ui64 code = 0; - while (!bin.Eof()) { - ui64 f = 0; - const int toRead = 16 - bits; - if (toRead > 0 && bin.Read(f, toRead)) { - code = (code >> (16 - bits)) | (f << bits); - code &= 0xFFFF; - TCacheEntry entry = CacheEntries[code]; - int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0; - out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]); - bits = entry.BitsLeft; - } else { // should never happen until there are exceptions or unaligned input - bin.Back(bits); - if (!Original.ReadNextChar(bin, out)) - break; - - code = 0; - bits = 0; - } + void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const { + int bits = 0; + ui64 code = 0; + while (!bin.Eof()) { + ui64 f = 0; + const int toRead = 16 - bits; + if (toRead > 0 && bin.Read(f, toRead)) { + code = (code >> (16 - bits)) | (f << bits); + code &= 0xFFFF; + TCacheEntry entry = CacheEntries[code]; + int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0; + out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]); + bits = entry.BitsLeft; + } else { // should never happen until there are exceptions or unaligned input + bin.Back(bits); + if (!Original.ReadNextChar(bin, out)) + break; + + code = 0; + bits = 0; + } } } - THuffmanCodec::THuffmanCodec() - : Impl(new TImpl) - { - MyTraits.NeedsTraining = true; - MyTraits.PreservesPrefixGrouping = true; - MyTraits.PaddingBit = 1; - MyTraits.SizeOnEncodeMultiplier = 2; - MyTraits.SizeOnDecodeMultiplier = 8; - MyTraits.RecommendedSampleSize = 1 << 21; - } + THuffmanCodec::THuffmanCodec() + : Impl(new TImpl) + { + MyTraits.NeedsTraining = true; + MyTraits.PreservesPrefixGrouping = true; + MyTraits.PaddingBit = 1; + MyTraits.SizeOnEncodeMultiplier = 2; + MyTraits.SizeOnDecodeMultiplier = 8; + MyTraits.RecommendedSampleSize = 1 << 21; + } - THuffmanCodec::~THuffmanCodec() = default; + THuffmanCodec::~THuffmanCodec() = default; - ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const { - if (Y_UNLIKELY(!Trained)) - ythrow TCodecException() << " not trained"; + ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const { + if (Y_UNLIKELY(!Trained)) + ythrow TCodecException() << " not trained"; - return Impl->Encode(in, bbb); - } + return Impl->Encode(in, bbb); + } - void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const { - Impl->Decode(in, bbb); - } + void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const { + Impl->Decode(in, bbb); + } - void THuffmanCodec::Save(IOutputStream* out) const { - Impl->Save(out); - } + void THuffmanCodec::Save(IOutputStream* out) const { + Impl->Save(out); + } - void THuffmanCodec::Load(IInputStream* in) { - Impl->Load(in); - } + void THuffmanCodec::Load(IInputStream* in) { + Impl->Load(in); + } - void THuffmanCodec::DoLearn(ISequenceReader& in) { - Impl->Learn(&in); - } + void THuffmanCodec::DoLearn(ISequenceReader& in) { + Impl->Learn(&in); + } void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { - Impl->LearnByFreqs(freqs); - Trained = true; - } + Impl->LearnByFreqs(freqs); + Trained = true; + } } diff --git a/library/cpp/codecs/huffman_codec.h b/library/cpp/codecs/huffman_codec.h index 559545b90d..24f8397694 100644 --- a/library/cpp/codecs/huffman_codec.h +++ b/library/cpp/codecs/huffman_codec.h @@ -6,34 +6,34 @@ #include <util/string/cast.h> namespace NCodecs { - // for types greater than char, pipeline with TFreqCodec. + // for types greater than char, pipeline with TFreqCodec. - class THuffmanCodec: public ICodec { - class TImpl; - TIntrusivePtr<TImpl> Impl; + class THuffmanCodec: public ICodec { + class TImpl; + TIntrusivePtr<TImpl> Impl; - public: - THuffmanCodec(); - ~THuffmanCodec() override; + public: + THuffmanCodec(); + ~THuffmanCodec() override; - static TStringBuf MyName() { - return "huffman"; - } + static TStringBuf MyName() { + return "huffman"; + } - TString GetName() const override { + TString GetName() const override { return ToString(MyName()); - } + } - ui8 Encode(TStringBuf in, TBuffer& bbb) const override; + ui8 Encode(TStringBuf in, TBuffer& bbb) const override; - void Decode(TStringBuf in, TBuffer& bbb) const override; + void Decode(TStringBuf in, TBuffer& bbb) const override; void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs); - protected: - void DoLearn(ISequenceReader& in) override; - void Save(IOutputStream* out) const override; - void Load(IInputStream* in) override; - }; + protected: + void DoLearn(ISequenceReader& in) override; + void Save(IOutputStream* out) const override; + void Load(IInputStream* in) override; + }; } diff --git a/library/cpp/codecs/pfor_codec.cpp b/library/cpp/codecs/pfor_codec.cpp index f6b3b0920b..d5dbc5a7fa 100644 --- a/library/cpp/codecs/pfor_codec.cpp +++ b/library/cpp/codecs/pfor_codec.cpp @@ -1,22 +1,22 @@ #include "pfor_codec.h" namespace NCodecs { - template <> - TStringBuf TPForCodec<ui64, true>::MyName() { - return "pfor-delta64-sorted"; - } - template <> - TStringBuf TPForCodec<ui32, true>::MyName() { - return "pfor-delta32-sorted"; - } + template <> + TStringBuf TPForCodec<ui64, true>::MyName() { + return "pfor-delta64-sorted"; + } + template <> + TStringBuf TPForCodec<ui32, true>::MyName() { + return "pfor-delta32-sorted"; + } - template <> - TStringBuf TPForCodec<ui64, false>::MyName() { - return "pfor-ui64"; - } - template <> - TStringBuf TPForCodec<ui32, false>::MyName() { - return "pfor-ui32"; - } + template <> + TStringBuf TPForCodec<ui64, false>::MyName() { + return "pfor-ui64"; + } + template <> + TStringBuf TPForCodec<ui32, false>::MyName() { + return "pfor-ui32"; + } } diff --git a/library/cpp/codecs/pfor_codec.h b/library/cpp/codecs/pfor_codec.h index d7d4bb8bf4..a1f2bf9f9a 100644 --- a/library/cpp/codecs/pfor_codec.h +++ b/library/cpp/codecs/pfor_codec.h @@ -10,202 +10,202 @@ #include <util/string/cast.h> namespace NCodecs { - template <typename T, bool WithDelta = false> - class TPForCodec: public ICodec { - using TUnsigned = std::make_unsigned_t<T>; - typedef TDeltaCodec<TUnsigned> TDCodec; + template <typename T, bool WithDelta = false> + class TPForCodec: public ICodec { + using TUnsigned = std::make_unsigned_t<T>; + typedef TDeltaCodec<TUnsigned> TDCodec; - typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue; - static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value"); + typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue; + static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value"); - static const ui64 BitsInT = sizeof(TUnsigned) * 8; + static const ui64 BitsInT = sizeof(TUnsigned) * 8; - TDCodec DeltaCodec; + TDCodec DeltaCodec; - public: - static TStringBuf MyName(); + public: + static TStringBuf MyName(); - TPForCodec() { - MyTraits.AssumesStructuredInput = true; - MyTraits.SizeOfInputElement = sizeof(T); - MyTraits.SizeOnDecodeMultiplier = sizeof(T); - } + TPForCodec() { + MyTraits.AssumesStructuredInput = true; + MyTraits.SizeOfInputElement = sizeof(T); + MyTraits.SizeOnDecodeMultiplier = sizeof(T); + } - TString GetName() const override { + TString GetName() const override { return ToString(MyName()); - } + } - ui8 Encode(TStringBuf s, TBuffer& b) const override { - b.Clear(); + ui8 Encode(TStringBuf s, TBuffer& b) const override { + b.Clear(); if (s.empty()) { - return 0; - } + return 0; + } b.Reserve(2 * s.size() + b.Size()); - if (WithDelta) { - auto buffer = TBufferTlsCache::TlsInstance().Item(); - TBuffer& db = buffer.Get(); - db.Clear(); + if (WithDelta) { + auto buffer = TBufferTlsCache::TlsInstance().Item(); + TBuffer& db = buffer.Get(); + db.Clear(); db.Reserve(2 * s.size()); - DeltaCodec.Encode(s, db); + DeltaCodec.Encode(s, db); s = TStringBuf{db.data(), db.size()}; - } + } TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)}; const ui64 sz = tin.size(); - ui64 bitcounts[BitsInT + 1]; - Zero(bitcounts); + ui64 bitcounts[BitsInT + 1]; + Zero(bitcounts); - ui32 zeros = 0; + ui32 zeros = 0; for (const TValue* it = tin.begin(); it != tin.end(); ++it) { - TUnsigned v = 1 + (TUnsigned)*it; - ui64 l = MostSignificantBit(v) + 1; - ++bitcounts[l]; - - if (!v) { - ++zeros; - } - } - - // cumulative bit counts - for (ui64 i = 0; i < BitsInT; ++i) { - bitcounts[i + 1] += bitcounts[i]; + TUnsigned v = 1 + (TUnsigned)*it; + ui64 l = MostSignificantBit(v) + 1; + ++bitcounts[l]; + + if (!v) { + ++zeros; + } + } + + // cumulative bit counts + for (ui64 i = 0; i < BitsInT; ++i) { + bitcounts[i + 1] += bitcounts[i]; } - bool hasexceptions = zeros; - ui64 optimalbits = BitsInT; + bool hasexceptions = zeros; + ui64 optimalbits = BitsInT; - { - ui64 excsize = 0; - ui64 minsize = sz * BitsInT; + { + ui64 excsize = 0; + ui64 minsize = sz * BitsInT; - for (ui64 current = BitsInT; current; --current) { - ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6); + for (ui64 current = BitsInT; current; --current) { + ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6); - excsize += current * bitcounts[current]; + excsize += current * bitcounts[current]; - if (size < minsize) { - minsize = size; - optimalbits = current; - hasexceptions = zeros || sz - bitcounts[current]; - } + if (size < minsize) { + minsize = size; + optimalbits = current; + hasexceptions = zeros || sz - bitcounts[current]; + } } } - if (!optimalbits || BitsInT == optimalbits) { - b.Append((ui8)-1); + if (!optimalbits || BitsInT == optimalbits) { + b.Append((ui8)-1); b.Append(s.data(), s.size()); - return 0; - } else { - NBitIO::TBitOutputVector<TBuffer> bout(&b); - bout.Write(0, 1); - bout.Write(hasexceptions, 1); - bout.Write(optimalbits, 6); + return 0; + } else { + NBitIO::TBitOutputVector<TBuffer> bout(&b); + bout.Write(0, 1); + bout.Write(hasexceptions, 1); + bout.Write(optimalbits, 6); for (const TValue* it = tin.begin(); it != tin.end(); ++it) { - TUnsigned word = 1 + (TUnsigned)*it; - ui64 len = MostSignificantBit(word) + 1; - if (len > optimalbits || !word) { - Y_ENSURE(hasexceptions, " "); - bout.Write(0, optimalbits); - bout.Write(len, 6); - bout.Write(word, len); - } else { - bout.Write(word, optimalbits); - } + TUnsigned word = 1 + (TUnsigned)*it; + ui64 len = MostSignificantBit(word) + 1; + if (len > optimalbits || !word) { + Y_ENSURE(hasexceptions, " "); + bout.Write(0, optimalbits); + bout.Write(len, 6); + bout.Write(word, len); + } else { + bout.Write(word, optimalbits); + } } - return bout.GetByteReminder(); - } // the rest of the last byte is zero padded. BitsInT is always > 7. + return bout.GetByteReminder(); + } // the rest of the last byte is zero padded. BitsInT is always > 7. } - void Decode(TStringBuf s, TBuffer& b) const override { - b.Clear(); + void Decode(TStringBuf s, TBuffer& b) const override { + b.Clear(); if (s.empty()) { - return; - } + return; + } b.Reserve(s.size() * sizeof(T) + b.Size()); - ui64 isplain = 0; - ui64 hasexceptions = 0; - ui64 bits = 0; - - NBitIO::TBitInput bin(s); - bin.ReadK<1>(isplain); - bin.ReadK<1>(hasexceptions); - bin.ReadK<6>(bits); - - if (Y_UNLIKELY(isplain)) { - s.Skip(1); - - if (WithDelta) { - DeltaCodec.Decode(s, b); - } else { + ui64 isplain = 0; + ui64 hasexceptions = 0; + ui64 bits = 0; + + NBitIO::TBitInput bin(s); + bin.ReadK<1>(isplain); + bin.ReadK<1>(hasexceptions); + bin.ReadK<6>(bits); + + if (Y_UNLIKELY(isplain)) { + s.Skip(1); + + if (WithDelta) { + DeltaCodec.Decode(s, b); + } else { b.Append(s.data(), s.size()); - } + } } else { - typename TDCodec::TDecoder decoder; + typename TDCodec::TDecoder decoder; - if (hasexceptions) { - ui64 word = 0; - while (bin.Read(word, bits)) { - if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) { - --word; + if (hasexceptions) { + ui64 word = 0; + while (bin.Read(word, bits)) { + if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) { + --word; - TValue t = word; + TValue t = word; - if (WithDelta) { - if (decoder.Decode(t)) { - TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)}; + if (WithDelta) { + if (decoder.Decode(t)) { + TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)}; b.Append(r.data(), r.size()); - } - } else { - TStringBuf r{(char*)&t, sizeof(t)}; + } + } else { + TStringBuf r{(char*)&t, sizeof(t)}; b.Append(r.data(), r.size()); } } } - } else { - ui64 word = 0; - T outarr[256 / sizeof(T)]; - ui32 cnt = 0; - while (true) { - ui64 v = bin.Read(word, bits); - - if ((!v) | (!word)) - break; - - --word; - TValue t = word; - - if (WithDelta) { - if (decoder.Decode(t)) { - outarr[cnt++] = decoder.Result; - } - } else { - outarr[cnt++] = t; - } - - if (cnt == Y_ARRAY_SIZE(outarr)) { - b.Append((const char*)outarr, sizeof(outarr)); - cnt = 0; + } else { + ui64 word = 0; + T outarr[256 / sizeof(T)]; + ui32 cnt = 0; + while (true) { + ui64 v = bin.Read(word, bits); + + if ((!v) | (!word)) + break; + + --word; + TValue t = word; + + if (WithDelta) { + if (decoder.Decode(t)) { + outarr[cnt++] = decoder.Result; + } + } else { + outarr[cnt++] = t; } + + if (cnt == Y_ARRAY_SIZE(outarr)) { + b.Append((const char*)outarr, sizeof(outarr)); + cnt = 0; + } } - if (cnt) { - b.Append((const char*)outarr, cnt * sizeof(T)); + if (cnt) { + b.Append((const char*)outarr, cnt * sizeof(T)); } } } } - protected: - void DoLearn(ISequenceReader&) override { - } - }; + protected: + void DoLearn(ISequenceReader&) override { + } + }; } diff --git a/library/cpp/codecs/sample.h b/library/cpp/codecs/sample.h index 15f03afcc5..5d3ab57f78 100644 --- a/library/cpp/codecs/sample.h +++ b/library/cpp/codecs/sample.h @@ -24,20 +24,20 @@ namespace NCodecs { } template <class TIter> - TStringBuf IterToStringBuf(TIter iter) { + TStringBuf IterToStringBuf(TIter iter) { return ValueToStringBuf(*iter); } template <class TItem> - class TSimpleSequenceReader: public ISequenceReader { + class TSimpleSequenceReader: public ISequenceReader { const TVector<TItem>& Items; size_t Idx = 0; public: TSimpleSequenceReader(const TVector<TItem>& items) : Items(items) - { - } + { + } bool NextRegion(TStringBuf& s) override { if (Idx >= Items.size()) { diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp index d0692fe2a4..6c08b9e7bd 100644 --- a/library/cpp/codecs/solar_codec.cpp +++ b/library/cpp/codecs/solar_codec.cpp @@ -9,125 +9,125 @@ #include <util/ysaveload.h> namespace NCodecs { - static inline ui32 Append(TBuffer& pool, TStringBuf data) { + static inline ui32 Append(TBuffer& pool, TStringBuf data) { pool.Append(data.data(), data.size()); - return pool.Size(); - } - - void TSolarCodec::DoLearn(ISequenceReader& r) { - using namespace NGreedyDict; + return pool.Size(); + } - Decoder.clear(); - Pool.Clear(); + void TSolarCodec::DoLearn(ISequenceReader& r) { + using namespace NGreedyDict; - THolder<TEntrySet> set; + Decoder.clear(); + Pool.Clear(); - { - TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); - TStringBufs bufs; + THolder<TEntrySet> set; - TStringBuf m; - while (r.NextRegion(m)) { - bufs.push_back(pool.AppendString(m)); - } + { + TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); + TStringBufs bufs; - { - TDictBuilder b(Settings); - b.SetInput(bufs); - b.Build(MaxEntries, MaxIterations); + TStringBuf m; + while (r.NextRegion(m)) { + bufs.push_back(pool.AppendString(m)); + } - set = b.ReleaseEntrySet(); - } + { + TDictBuilder b(Settings); + b.SetInput(bufs); + b.Build(MaxEntries, MaxIterations); + + set = b.ReleaseEntrySet(); + } } - set->SetScores(ES_LEN_COUNT); - + set->SetScores(ES_LEN_COUNT); + { - TVector<std::pair<float, TStringBuf>> tmp; - tmp.reserve(set->size()); + TVector<std::pair<float, TStringBuf>> tmp; + tmp.reserve(set->size()); - for (const auto& it : *set) { - tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); - } + for (const auto& it : *set) { + tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); + } - Sort(tmp.begin(), tmp.end()); + Sort(tmp.begin(), tmp.end()); - Decoder.reserve(tmp.size() + 1); - Decoder.push_back(0); + Decoder.reserve(tmp.size() + 1); + Decoder.push_back(0); - for (const auto& it : tmp) { - Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); - ui32 endoff = Append(Pool, it.second); - Decoder.push_back(endoff); - } + for (const auto& it : tmp) { + Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); + ui32 endoff = Append(Pool, it.second); + Decoder.push_back(endoff); + } } - Pool.ShrinkToFit(); - Decoder.shrink_to_fit(); + Pool.ShrinkToFit(); + Decoder.shrink_to_fit(); - TBufferOutput bout; + TBufferOutput bout; - { - TVector<std::pair<TStringBuf, ui32>> tmp2; - tmp2.reserve(Decoder.size()); + { + TVector<std::pair<TStringBuf, ui32>> tmp2; + tmp2.reserve(Decoder.size()); - for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { - TStringBuf s = DoDecode(i); - tmp2.push_back(std::make_pair(s, i - 1)); + for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { + TStringBuf s = DoDecode(i); + tmp2.push_back(std::make_pair(s, i - 1)); Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed"); - } + } - Sort(tmp2.begin(), tmp2.end()); + Sort(tmp2.begin(), tmp2.end()); - { - TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); - for (const auto& it : tmp2) { + { + TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); + for (const auto& it : tmp2) { builder.Add(it.first.data(), it.first.size(), it.second); - } + } - builder.Save(bout); + builder.Save(bout); } } - Encoder.Init(TBlob::FromBuffer(bout.Buffer())); - } - - void TSolarCodec::Save(IOutputStream* out) const { - TBlob b = Encoder.Data(); - ::Save(out, (ui32)b.Size()); - out->Write(b.Data(), b.Size()); + Encoder.Init(TBlob::FromBuffer(bout.Buffer())); } - void TSolarCodec::Load(IInputStream* in) { - ui32 sz; - ::Load(in, sz); - TLengthLimitedInput lin(in, sz); - Encoder.Init(TBlob::FromStream(lin)); - Pool.Clear(); - Decoder.clear(); - - TVector<std::pair<ui32, TString>> tmp; - - ui32 poolsz = 0; - for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { - const TString& s = it.GetKey(); - tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); + void TSolarCodec::Save(IOutputStream* out) const { + TBlob b = Encoder.Data(); + ::Save(out, (ui32)b.Size()); + out->Write(b.Data(), b.Size()); + } + + void TSolarCodec::Load(IInputStream* in) { + ui32 sz; + ::Load(in, sz); + TLengthLimitedInput lin(in, sz); + Encoder.Init(TBlob::FromStream(lin)); + Pool.Clear(); + Decoder.clear(); + + TVector<std::pair<ui32, TString>> tmp; + + ui32 poolsz = 0; + for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { + const TString& s = it.GetKey(); + tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); poolsz += Max<ui32>(s.size(), 1); - } + } - Sort(tmp.begin(), tmp.end()); + Sort(tmp.begin(), tmp.end()); - Pool.Reserve(poolsz); - Decoder.reserve(tmp.size() + 1); - Decoder.push_back(0); + Pool.Reserve(poolsz); + Decoder.reserve(tmp.size() + 1); + Decoder.push_back(0); - for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { - Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); - Decoder.push_back(Append(Pool, tmp[i].second)); - } + for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { + Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); + Decoder.push_back(Append(Pool, tmp[i].second)); + } - Pool.ShrinkToFit(); - Decoder.shrink_to_fit(); + Pool.ShrinkToFit(); + Decoder.shrink_to_fit(); } } diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h index 7158ae7926..08fdf9d123 100644 --- a/library/cpp/codecs/solar_codec.h +++ b/library/cpp/codecs/solar_codec.h @@ -11,234 +11,234 @@ namespace NCodecs { // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. - struct TVarIntTraits { - static const size_t MAX_VARINT32_BYTES = 5; - - static void Write(ui32 value, TBuffer& b) { - while (value > 0x7F) { - b.Append(static_cast<ui8>(value) | 0x80); - value >>= 7; - } - b.Append(static_cast<ui8>(value) & 0x7F); - } - - static void Read(TStringBuf& r, ui32& value) { - ui32 result = 0; - for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) { - const ui32 b = static_cast<ui8>(r[0]); - r.Skip(1); - result |= static_cast<ui32>(b & 0x7F) << (7 * count); - if (!(b & 0x80)) { - value = result; - return; + struct TVarIntTraits { + static const size_t MAX_VARINT32_BYTES = 5; + + static void Write(ui32 value, TBuffer& b) { + while (value > 0x7F) { + b.Append(static_cast<ui8>(value) | 0x80); + value >>= 7; + } + b.Append(static_cast<ui8>(value) & 0x7F); + } + + static void Read(TStringBuf& r, ui32& value) { + ui32 result = 0; + for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) { + const ui32 b = static_cast<ui8>(r[0]); + r.Skip(1); + result |= static_cast<ui32>(b & 0x7F) << (7 * count); + if (!(b & 0x80)) { + value = result; + return; } else if (Y_UNLIKELY(r.empty())) { - break; - } + break; + } } - Y_ENSURE_EX(false, TCodecException() << "Bad data"); + Y_ENSURE_EX(false, TCodecException() << "Bad data"); } - }; + }; - struct TShortIntTraits { - static const size_t SHORTINT_SIZE_LIMIT = 0x8000; + struct TShortIntTraits { + static const size_t SHORTINT_SIZE_LIMIT = 0x8000; - Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) { - Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method"); - if (value >= 0x80) { - b.Append(static_cast<ui8>(value >> 8) | 0x80); - } - b.Append(static_cast<ui8>(value)); + Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) { + Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method"); + if (value >= 0x80) { + b.Append(static_cast<ui8>(value >> 8) | 0x80); + } + b.Append(static_cast<ui8>(value)); } - Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { - ui32 result = static_cast<ui8>(r[0]); + Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { + ui32 result = static_cast<ui8>(r[0]); r.Skip(1); - if (result >= 0x80) { + if (result >= 0x80) { Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data"); - result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); - r.Skip(1); - } - value = result; + result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); + r.Skip(1); + } + value = result; } - }; + }; - class TSolarCodec: public ICodec { - public: - static TStringBuf MyName8k() { + class TSolarCodec: public ICodec { + public: + static TStringBuf MyName8k() { return TStringBuf("solar-8k"); - } - static TStringBuf MyName16k() { + } + static TStringBuf MyName16k() { return TStringBuf("solar-16k"); - } - static TStringBuf MyName32k() { + } + static TStringBuf MyName32k() { return TStringBuf("solar-32k"); - } - static TStringBuf MyName64k() { + } + static TStringBuf MyName64k() { return TStringBuf("solar-64k"); - } - static TStringBuf MyName256k() { + } + static TStringBuf MyName256k() { return TStringBuf("solar-256k"); - } - static TStringBuf MyName() { + } + static TStringBuf MyName() { return TStringBuf("solar"); - } - static TStringBuf MyName8kAdapt() { + } + static TStringBuf MyName8kAdapt() { return TStringBuf("solar-8k-a"); - } - static TStringBuf MyName16kAdapt() { + } + static TStringBuf MyName16kAdapt() { return TStringBuf("solar-16k-a"); - } - static TStringBuf MyName32kAdapt() { + } + static TStringBuf MyName32kAdapt() { return TStringBuf("solar-32k-a"); - } - static TStringBuf MyName64kAdapt() { + } + static TStringBuf MyName64kAdapt() { return TStringBuf("solar-64k-a"); - } - static TStringBuf MyName256kAdapt() { + } + static TStringBuf MyName256kAdapt() { return TStringBuf("solar-256k-a"); - } - static TStringBuf MyNameShortInt() { + } + static TStringBuf MyNameShortInt() { return TStringBuf("solar-si"); - } - - explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) - : Settings(s) - , MaxEntries(maxentries) - , MaxIterations(maxiter) - { - MyTraits.NeedsTraining = true; - MyTraits.SizeOnDecodeMultiplier = 2; - MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; - } - - ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { - EncodeImpl<TVarIntTraits>(r, b); - return 0; - } - - void Decode(TStringBuf r, TBuffer& b) const override { - DecodeImpl<TVarIntTraits>(r, b); - } - - TString GetName() const override { + } + + explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : Settings(s) + , MaxEntries(maxentries) + , MaxIterations(maxiter) + { + MyTraits.NeedsTraining = true; + MyTraits.SizeOnDecodeMultiplier = 2; + MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + EncodeImpl<TVarIntTraits>(r, b); + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + DecodeImpl<TVarIntTraits>(r, b); + } + + TString GetName() const override { return ToString(MyName()); - } + } - protected: - void DoLearn(ISequenceReader&) override; - void Save(IOutputStream*) const override; - void Load(IInputStream*) override; + protected: + void DoLearn(ISequenceReader&) override; + void Save(IOutputStream*) const override; + void Load(IInputStream*) override; - Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const { - return TStringBuf(Pool.Data() + begoff, endoff - begoff); - } + Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const { + return TStringBuf(Pool.Data() + begoff, endoff - begoff); + } - Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const { - return SubStr(Decoder[num - 1], Decoder[num]); - } + Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const { + return SubStr(Decoder[num - 1], Decoder[num]); + } - template <class TTraits> - Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const { - b.Clear(); + template <class TTraits> + Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const { + b.Clear(); b.Reserve(r.size()); while (!r.empty()) { - size_t sz = 0; - ui32 val = (ui32)-1; - Encoder.FindLongestPrefix(r, &sz, &val); - TTraits::Write(val + 1, b); - r.Skip(Max<size_t>(sz, 1)); - } + size_t sz = 0; + ui32 val = (ui32)-1; + Encoder.FindLongestPrefix(r, &sz, &val); + TTraits::Write(val + 1, b); + r.Skip(Max<size_t>(sz, 1)); + } } - template <class TTraits> - Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { - b.Clear(); + template <class TTraits> + Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { + b.Clear(); b.Reserve(r.size()); - ui32 v = 0; + ui32 v = 0; while (!r.empty()) { - TTraits::Read(r, v); - TStringBuf s = DoDecode(v); + TTraits::Read(r, v); + TStringBuf s = DoDecode(v); b.Append(s.data(), s.size()); - } - } - - inline bool CanUseShortInt() const { - return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; - } - - private: - typedef TCompactTrie<char, ui32> TEncoder; - typedef TVector<ui32> TDecoder; - - TBuffer Pool; - TEncoder Encoder; - TDecoder Decoder; - - NGreedyDict::TBuildSettings Settings; - ui32 MaxEntries; - ui32 MaxIterations; - }; - - // Uses varints or shortints depending on the decoder size - class TAdaptiveSolarCodec: public TSolarCodec { - public: - explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) - : TSolarCodec(maxentries, maxiter, s) - { - } - - ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { - if (CanUseShortInt()) { - EncodeImpl<TShortIntTraits>(r, b); - } else { - EncodeImpl<TVarIntTraits>(r, b); - } - - return 0; - } - - void Decode(TStringBuf r, TBuffer& b) const override { - if (CanUseShortInt()) { - DecodeImpl<TShortIntTraits>(r, b); - } else { - DecodeImpl<TVarIntTraits>(r, b); - } - } - - TString GetName() const override { - if (CanUseShortInt()) { + } + } + + inline bool CanUseShortInt() const { + return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; + } + + private: + typedef TCompactTrie<char, ui32> TEncoder; + typedef TVector<ui32> TDecoder; + + TBuffer Pool; + TEncoder Encoder; + TDecoder Decoder; + + NGreedyDict::TBuildSettings Settings; + ui32 MaxEntries; + ui32 MaxIterations; + }; + + // Uses varints or shortints depending on the decoder size + class TAdaptiveSolarCodec: public TSolarCodec { + public: + explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : TSolarCodec(maxentries, maxiter, s) + { + } + + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + if (CanUseShortInt()) { + EncodeImpl<TShortIntTraits>(r, b); + } else { + EncodeImpl<TVarIntTraits>(r, b); + } + + return 0; + } + + void Decode(TStringBuf r, TBuffer& b) const override { + if (CanUseShortInt()) { + DecodeImpl<TShortIntTraits>(r, b); + } else { + DecodeImpl<TVarIntTraits>(r, b); + } + } + + TString GetName() const override { + if (CanUseShortInt()) { return ToString(MyNameShortInt()); - } else { + } else { return ToString(MyName()); - } + } } - }; + }; - class TSolarCodecShortInt: public TSolarCodec { - public: - explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) - : TSolarCodec(maxentries, maxiter, s) - { + class TSolarCodecShortInt: public TSolarCodec { + public: + explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) + : TSolarCodec(maxentries, maxiter, s) + { } - ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { - EncodeImpl<TShortIntTraits>(r, b); - return 0; - } + ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { + EncodeImpl<TShortIntTraits>(r, b); + return 0; + } - void Decode(TStringBuf r, TBuffer& b) const override { - DecodeImpl<TShortIntTraits>(r, b); - } + void Decode(TStringBuf r, TBuffer& b) const override { + DecodeImpl<TShortIntTraits>(r, b); + } - TString GetName() const override { + TString GetName() const override { return ToString(MyNameShortInt()); - } - - protected: - void Load(IInputStream* in) override { - TSolarCodec::Load(in); - Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data"); - } - }; + } + + protected: + void Load(IInputStream* in) override { + TSolarCodec::Load(in); + Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data"); + } + }; } diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h index d7533be4d5..ece4dfa529 100644 --- a/library/cpp/codecs/static/builder.h +++ b/library/cpp/codecs/static/builder.h @@ -19,7 +19,7 @@ namespace NCodecs { time_t Timestamp = TInstant::Now().TimeT(); TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision())); TString TrainingSetComment; // a human comment on the training data - TString TrainingSetResId; // sandbox resid of the training set + TString TrainingSetResId; // sandbox resid of the training set }; TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&); diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp index 5b750b717e..b0566a8c2e 100644 --- a/library/cpp/codecs/static/example/example.cpp +++ b/library/cpp/codecs/static/example/example.cpp @@ -5,10 +5,10 @@ #include <util/generic/yexception.h> extern "C" { -extern const ui8 codec_info_huff_20160707[]; -extern const ui32 codec_info_huff_20160707Size; -extern const ui8 codec_info_sa_huff_20160707[]; -extern const ui32 codec_info_sa_huff_20160707Size; +extern const ui8 codec_info_huff_20160707[]; +extern const ui32 codec_info_huff_20160707Size; +extern const ui8 codec_info_sa_huff_20160707[]; +extern const ui32 codec_info_sa_huff_20160707Size; }; namespace NStaticCodecExample { diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h index f9b3a7324b..41003fb187 100644 --- a/library/cpp/codecs/static/example/example.h +++ b/library/cpp/codecs/static/example/example.h @@ -4,11 +4,11 @@ #include <util/generic/buffer.h> namespace NStaticCodecExample { - enum EDictVersion : ui8 { - DV_NULL = 0, - DV_HUFF_20160707, - DV_SA_HUFF_20160707, - DV_COUNT + enum EDictVersion : ui8 { + DV_NULL = 0, + DV_HUFF_20160707, + DV_SA_HUFF_20160707, + DV_COUNT }; void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707); diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp index 44a07dd73a..97ddbd8364 100644 --- a/library/cpp/codecs/static/static.cpp +++ b/library/cpp/codecs/static/static.cpp @@ -69,8 +69,8 @@ namespace NCodecs { s << "sample mult: " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl; s << "orig.compress: " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl; s << "timestamp: " << ci.GetDebugInfo().GetTimestamp() << " (" - << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString() - << ")" << Endl; + << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString() + << ")" << Endl; s << "revision: " << ci.GetDebugInfo().GetRevisionInfo() << Endl; s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl; s << "training set resId: " << ci.GetDebugInfo().GetTrainingSetResId() << Endl; diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp index 9c8d568d82..3668a7583a 100644 --- a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp +++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp @@ -22,17 +22,17 @@ int main(int argc, char** argv) { opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt"); NCodecs::TStaticCodecInfo codec; - opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) { - codecFile = name; - codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll())); - codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); - }) - .Required() - .Help(".codec_info file with serialized static data for codec"); + opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) { + codecFile = name; + codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll())); + codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); + }) + .Required() + .Help(".codec_info file with serialized static data for codec"); - opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance"); + opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance"); - opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format"); + opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format"); opts.SetFreeArgsMin(0); opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files"); diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp index 45fdb5c5fe..073689737d 100644 --- a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp +++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp @@ -17,26 +17,26 @@ int main(int argc, char** argv) { opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt"); opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin"); - opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set"); + opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set"); - opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set"); + opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set"); - opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName); + opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName); - opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size"); + opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size"); - opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format"); + opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format"); - opts.AddLongOption("list-codecs").NoArgument().Handler0([]() { - Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl; - exit(0); - }) - .Optional() - .Help("list available codecs"); + opts.AddLongOption("list-codecs").NoArgument().Handler0([]() { + Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl; + exit(0); + }) + .Optional() + .Help("list available codecs"); - opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info + opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info - opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info + opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info opts.SetFreeArgsMin(0); opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files"); diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp index b47c279ed1..d9c4eb9e47 100644 --- a/library/cpp/codecs/static/ut/builder_ut.cpp +++ b/library/cpp/codecs/static/ut/builder_ut.cpp @@ -3,11 +3,11 @@ #include <library/cpp/codecs/static/static_codec_info.pb.h> #include <util/string/vector.h> -class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase { +class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase { UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest) - UNIT_TEST(TestBuild) + UNIT_TEST(TestBuild) UNIT_TEST_SUITE_END(); - + private: TVector<TString> PrepareData() { TVector<TString> data; diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp index 57e1e62887..315e1bf0b0 100644 --- a/library/cpp/codecs/static/ut/static_ut.cpp +++ b/library/cpp/codecs/static/ut/static_ut.cpp @@ -1,11 +1,11 @@ #include <library/cpp/testing/unittest/registar.h> #include <library/cpp/codecs/static/example/example.h> -class TStaticCodecUsageTest: public NUnitTest::TTestBase { +class TStaticCodecUsageTest: public NUnitTest::TTestBase { UNIT_TEST_SUITE(TStaticCodecUsageTest) - UNIT_TEST(TestUsage) + UNIT_TEST(TestUsage) UNIT_TEST_SUITE_END(); - + private: void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) { const TStringBuf letov = "Всё идёт по плану"; diff --git a/library/cpp/codecs/tls_cache.h b/library/cpp/codecs/tls_cache.h index 0184e4bb6c..7068ea333f 100644 --- a/library/cpp/codecs/tls_cache.h +++ b/library/cpp/codecs/tls_cache.h @@ -15,15 +15,15 @@ namespace NCodecs { } }; - template <class TItem, class TCleaner = TClear<TItem>> + template <class TItem, class TCleaner = TClear<TItem>> class TTlsCache { using TSelf = TTlsCache<TItem, TCleaner>; - struct TItemHolder: public TIntrusiveListItem<TItemHolder> { + struct TItemHolder: public TIntrusiveListItem<TItemHolder> { TItemHolder(TSelf& factory) : Factory(factory) - { - } + { + } void Release() { Factory.Release(*this); @@ -37,14 +37,14 @@ namespace NCodecs { public: explicit TItemGuard(TSelf& fact) : Holder(fact.Acquire()) - { - } + { + } - TItemGuard(TItemGuard&& other) noexcept { + TItemGuard(TItemGuard&& other) noexcept { *this = std::move(other); } - TItemGuard& operator=(TItemGuard&& other) noexcept { + TItemGuard& operator=(TItemGuard&& other) noexcept { if (&other != this) { std::swap(Holder, other.Holder); } diff --git a/library/cpp/codecs/ut/codecs_ut.cpp b/library/cpp/codecs/ut/codecs_ut.cpp index caf6089aef..36675f6b63 100644 --- a/library/cpp/codecs/ut/codecs_ut.cpp +++ b/library/cpp/codecs/ut/codecs_ut.cpp @@ -13,107 +13,107 @@ #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> namespace { - const char* TextValues[] = { - "! сентября газета", - "!(возмездие это)!", - "!(материнский капитал)", - "!(пермь березники)", - "!биография | !жизнь / + розинг | зворыгин & изобретение | телевидение | электронно лучевая трубка", - "!овсиенко николай павлович", - "!путин", - "\"i'm on you\" p. diddy тимати клип", - "\"билайн\" представит собственный планшет", - "\"в особо крупном размере\"", - "\"викиликс\" джулиан ассанж", - "\"вимм билль данн", - "\"газэнергосеть астрахань", - "\"газэнергосеть астрахань\"", - "\"домодедово\" ту-154", - "\"жилина\" \"спартак\" видео", - "\"зелёнsq шершнm\"", - "\"зелёного шершня\"", - "\"золотой граммофон\" марины яблоковой", - "\"золотой граммофон-2010\"", - "\"калинниковы\"", - "\"манчестер юнайтед\" (англия) \"валенсия\" (испания) 1:1 (0:1)", - "\"маркер\"", - "\"моника\" засыпает москву снегом", - "\"моника\" снегопад", - "\"о безопасности\",", - "\"памятку\" для пассажиров воздушных международных рейсов", - "\"петровский парк\" и \"ходынское поле\"", - "\"путинская\" трава", - "\"пятерочка\"купила \"копейку\"", - "\"пятёрочка\" и \"копейка\" объединились", - "\"реал\" \"осер\" 4:0", - "\"речь мутко\"", - "\"российский лес 2010\"", - "\"ростехинвентаризация федеральное бти\" рубцов", - "\"саня останется с нами\",", - "\"следопыт\" реалити шоу", - "\"слышишь\" молодые авторы", - "\"стадион\"", - "\"ходынское поле\" метро", - "\"хроники нарнии\"", - "\"чистая вода\"", - "\"школа деда мороза\"", - "# asus -1394", - "# сторонники wikileaks", - "#106#", - "#11", - "#8 какой цвет", - "#если клиент", - "$ 13,79", - "$ xnj ,s dct ,skb ljdjkmys !!!", - "$ в день", - "$ диск компьютера", - "$.ajax", - "$125 000", - "$курс", - "% в си", - "% влады", - "% годовых", - "% женщин и % мужчин в россии", - "% занятости персонала", - "% инфляции 2010", - "% инфляции в 2010 г.", - "% налога", - "% налогов в 2010г.", - "% общего количества", - "% от числа", - "% по налогу на прибыль организации", - "%24", - "%академия%", - "%комарова%татьяна", - "& в 1с", - "&& (+не существует | !такой проблемы)", - ">>>скачать | download c cs strikez.clan.su<<<", - ">hbq nbityrjd", - "< какой знак", - "< лицей | < техническая школа# < история#< лицей сегодня#< перечень профессий#< руководство лицея#< прием учащихся#< контакты#< схема проезда#< фотогалереяистория создания лицея и основные этапы путиулица купчинская дом 28", - "<<link>>", - "</storage>", - "<bfnkjy", - "<bktntd", - "<cr", - "<ddr3>", - "<e[ufknthcrbq abyfycjdsq", - "<fcctqys", - "<fhcf", - "<fhctkjyf he,by", - "<firbhbz", - "<fyr djphj;ltybt", - "<fyr vjcrds", - "<fyr резерв", - "<fyufkjh", - "<index>", - "<jkmifz jrhe;yfz rbtd", - "<kbpytws", - "<megafon> интернет", - "<thtpybrb gthvcrbq rhfq", - "<tkjxrf", - "<беларусь это мы", - "<бокс, версия ibf", + const char* TextValues[] = { + "! сентября газета", + "!(возмездие это)!", + "!(материнский капитал)", + "!(пермь березники)", + "!биография | !жизнь / + розинг | зворыгин & изобретение | телевидение | электронно лучевая трубка", + "!овсиенко николай павлович", + "!путин", + "\"i'm on you\" p. diddy тимати клип", + "\"билайн\" представит собственный планшет", + "\"в особо крупном размере\"", + "\"викиликс\" джулиан ассанж", + "\"вимм билль данн", + "\"газэнергосеть астрахань", + "\"газэнергосеть астрахань\"", + "\"домодедово\" ту-154", + "\"жилина\" \"спартак\" видео", + "\"зелёнsq шершнm\"", + "\"зелёного шершня\"", + "\"золотой граммофон\" марины яблоковой", + "\"золотой граммофон-2010\"", + "\"калинниковы\"", + "\"манчестер юнайтед\" (англия) \"валенсия\" (испания) 1:1 (0:1)", + "\"маркер\"", + "\"моника\" засыпает москву снегом", + "\"моника\" снегопад", + "\"о безопасности\",", + "\"памятку\" для пассажиров воздушных международных рейсов", + "\"петровский парк\" и \"ходынское поле\"", + "\"путинская\" трава", + "\"пятерочка\"купила \"копейку\"", + "\"пятёрочка\" и \"копейка\" объединились", + "\"реал\" \"осер\" 4:0", + "\"речь мутко\"", + "\"российский лес 2010\"", + "\"ростехинвентаризация федеральное бти\" рубцов", + "\"саня останется с нами\",", + "\"следопыт\" реалити шоу", + "\"слышишь\" молодые авторы", + "\"стадион\"", + "\"ходынское поле\" метро", + "\"хроники нарнии\"", + "\"чистая вода\"", + "\"школа деда мороза\"", + "# asus -1394", + "# сторонники wikileaks", + "#106#", + "#11", + "#8 какой цвет", + "#если клиент", + "$ 13,79", + "$ xnj ,s dct ,skb ljdjkmys !!!", + "$ в день", + "$ диск компьютера", + "$.ajax", + "$125 000", + "$курс", + "% в си", + "% влады", + "% годовых", + "% женщин и % мужчин в россии", + "% занятости персонала", + "% инфляции 2010", + "% инфляции в 2010 г.", + "% налога", + "% налогов в 2010г.", + "% общего количества", + "% от числа", + "% по налогу на прибыль организации", + "%24", + "%академия%", + "%комарова%татьяна", + "& в 1с", + "&& (+не существует | !такой проблемы)", + ">>>скачать | download c cs strikez.clan.su<<<", + ">hbq nbityrjd", + "< какой знак", + "< лицей | < техническая школа# < история#< лицей сегодня#< перечень профессий#< руководство лицея#< прием учащихся#< контакты#< схема проезда#< фотогалереяистория создания лицея и основные этапы путиулица купчинская дом 28", + "<<link>>", + "</storage>", + "<bfnkjy", + "<bktntd", + "<cr", + "<ddr3>", + "<e[ufknthcrbq abyfycjdsq", + "<fcctqys", + "<fhcf", + "<fhctkjyf he,by", + "<firbhbz", + "<fyr djphj;ltybt", + "<fyr vjcrds", + "<fyr резерв", + "<fyufkjh", + "<index>", + "<jkmifz jrhe;yfz rbtd", + "<kbpytws", + "<megafon> интернет", + "<thtpybrb gthvcrbq rhfq", + "<tkjxrf", + "<беларусь это мы", + "<бокс, версия ibf", "designer tree svc", "seriesg810", "doll makers", @@ -854,11 +854,11 @@ namespace { "resume maker", "lymphomatoid papulosis", "sez.com", - }; + }; } class TCodecsTest: public TTestBase { - UNIT_TEST_SUITE(TCodecsTest); + UNIT_TEST_SUITE(TCodecsTest); UNIT_TEST(TestPipeline) UNIT_TEST(TestDelta) UNIT_TEST(TestHuffman) @@ -869,14 +869,14 @@ class TCodecsTest: public TTestBase { UNIT_TEST(TestPFor) UNIT_TEST(TestRegistry) - UNIT_TEST_SUITE_END(); + UNIT_TEST_SUITE_END(); private: TString PrintError(TStringBuf learn, TStringBuf test, TStringBuf codec, ui32 i) { TString s; TStringOutput sout(s); - sout << codec << ": " << i << ", " - << "\n"; + sout << codec << ": " << i << ", " + << "\n"; sout << HexEncode(learn.data(), learn.size()); //NEscJ::EscapeJ<true>(learn, sout); sout << " != \n"; sout << HexEncode(test.data(), test.size()); //NEscJ::EscapeJ<true>(test, sout); @@ -1009,8 +1009,8 @@ private: AppendTo(d.back(), -1LL); AppendTo(d.back(), -1LL); - TestCodec<TDeltaCodec<ui64, true>, false>(d); - TestCodec<TDeltaCodec<ui64, false>, false>(d); + TestCodec<TDeltaCodec<ui64, true>, false>(d); + TestCodec<TDeltaCodec<ui64, false>, false>(d); } void TestPFor() { @@ -1050,7 +1050,7 @@ private: AppendTo(d.back(), -1LL); AppendTo(d.back(), -2LL); - TestCodec<TPForCodec<ui64>, false>(d); + TestCodec<TPForCodec<ui64>, false>(d); TestCodec<TPForCodec<ui64, true>, true>(d); } { @@ -1080,7 +1080,7 @@ private: AppendTo(d.back(), -1); AppendTo(d.back(), -2); - TestCodec<TPForCodec<ui32>, false>(d); + TestCodec<TPForCodec<ui32>, false>(d); TestCodec<TPForCodec<ui32, true>, false>(d); } { @@ -1326,7 +1326,7 @@ private: } TestCodec<TPipelineCodec, true>(learn, test, - new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec)); + new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec)); } { TVector<TBuffer> d; @@ -1338,7 +1338,7 @@ private: } TestCodec<TPipelineCodec, false>(d, TVector<TBuffer>(), - new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>)); + new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>)); } } diff --git a/library/cpp/codecs/ut/float_huffman_ut.cpp b/library/cpp/codecs/ut/float_huffman_ut.cpp index 3156fb1f46..dddff22173 100644 --- a/library/cpp/codecs/ut/float_huffman_ut.cpp +++ b/library/cpp/codecs/ut/float_huffman_ut.cpp @@ -60,7 +60,7 @@ Y_UNIT_TEST_SUITE(FloatHuffmanTest) { 0.517876, 0.145833, 0.372549, 0, 0.991667, 0.602125, 0.161979, 0, 0, 0, 0, 0.0255146, 0.947855, 0, 0, 0, 0, 0, 0, 0, 0, 0.847059, 0.679841, 0, 0.156863, 0, 0, 1, 0, 0, 0, 0, 0.969697, 0, 0, 0.564706, 0, 0, 0, 0, 0, 1, 0.0367282, 0.0395228, 0, 0, 0, - 0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0}; + 0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0}; static const size_t FactorCount = Y_ARRAY_SIZE(Factors); static const ui8 CodedFactors[] = { @@ -132,7 +132,7 @@ Y_UNIT_TEST_SUITE(FloatHuffmanTest) { 0x7F, 0x20, 0x1A, 0x81, 0x9A, 0xCA, 0xBF, 0xC8, 0x8D, 0x8D, 0xC2, 0x83, 0x82, 0xA7, 0x2C, 0x28, 0xC8, 0xFE, 0x08, 0xC2, 0x07, 0xC7, 0x27, 0x21, 0xE1, 0xBB, 0x3E, 0xC1, 0x59, 0x68, 0xAA, 0x78, 0xC8, 0x57, 0x5D, 0x60, 0x20, 0xC6, 0x41, 0x42, 0xE8, 0x3A, 0x38, 0xD8, 0x9B, 0xFF, 0xFF, 0xFF, - 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; static const size_t CodedSize = Y_ARRAY_SIZE(CodedFactors); static const TStringBuf CodedFactorsBuf(reinterpret_cast<const char*>(CodedFactors), CodedSize); @@ -228,7 +228,7 @@ Y_UNIT_TEST_SUITE(FloatHuffmanTest) { "MBD693f07+9+DQQEkIGAgIgPetzN5yEbAGxWpbCNxXK/0JGTKRz2KkIoR7aM"; UNIT_ASSERT_EXCEPTION( fh::Decode(Base64Decode(brokenBase64Encoded)), - yexception); + yexception); } Y_UNIT_TEST(TestDecompressEmpty) { diff --git a/library/cpp/codecs/ut/tls_cache_ut.cpp b/library/cpp/codecs/ut/tls_cache_ut.cpp index 8101af761f..361d41a02e 100644 --- a/library/cpp/codecs/ut/tls_cache_ut.cpp +++ b/library/cpp/codecs/ut/tls_cache_ut.cpp @@ -2,35 +2,35 @@ #include <library/cpp/codecs/tls_cache.h> Y_UNIT_TEST_SUITE(CodecsBufferFactoryTest){ - void AssignToBuffer(TBuffer & buf, TStringBuf val){ + void AssignToBuffer(TBuffer & buf, TStringBuf val){ buf.Assign(val.data(), val.size()); -} +} -TStringBuf AsStringBuf(const TBuffer& b) { - return TStringBuf(b.Data(), b.Size()); -} +TStringBuf AsStringBuf(const TBuffer& b) { + return TStringBuf(b.Data(), b.Size()); +} Y_UNIT_TEST(TestAcquireReleaseReuse) { - NCodecs::TBufferTlsCache factory; - // acquiring the first buffer - auto buf1 = factory.Item(); - AssignToBuffer(buf1.Get(), "Buffer_01"); - { - // acquiring the second buffer - auto buf2 = factory.Item(); - AssignToBuffer(buf2.Get(), "Buffer_02"); + NCodecs::TBufferTlsCache factory; + // acquiring the first buffer + auto buf1 = factory.Item(); + AssignToBuffer(buf1.Get(), "Buffer_01"); + { + // acquiring the second buffer + auto buf2 = factory.Item(); + AssignToBuffer(buf2.Get(), "Buffer_02"); } - // the first buffer should stay intact - UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01"); - { - // reacquiring the last released buffer - // expecting it zero sized but having the same memory - auto buf2 = factory.Item(); - UNIT_ASSERT_VALUES_EQUAL(buf2.Get().Size(), 0u); - buf2.Get().Resize(TStringBuf("Buffer_02").Size()); - UNIT_ASSERT_EQUAL(AsStringBuf(buf2.Get()), "Buffer_02"); - } - // when the factory dies we should see no leaks -} -} -; + // the first buffer should stay intact + UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01"); + { + // reacquiring the last released buffer + // expecting it zero sized but having the same memory + auto buf2 = factory.Item(); + UNIT_ASSERT_VALUES_EQUAL(buf2.Get().Size(), 0u); + buf2.Get().Resize(TStringBuf("Buffer_02").Size()); + UNIT_ASSERT_EQUAL(AsStringBuf(buf2.Get()), "Buffer_02"); + } + // when the factory dies we should see no leaks +} +} +; diff --git a/library/cpp/codecs/ya.make b/library/cpp/codecs/ya.make index 7e76fb0c9a..9f7a5b5de2 100644 --- a/library/cpp/codecs/ya.make +++ b/library/cpp/codecs/ya.make @@ -4,7 +4,7 @@ OWNER( g:base velavokr ) - + SRCS( tls_cache.cpp codecs.cpp diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp index c42a2879e6..6aa67abd62 100644 --- a/library/cpp/codecs/zstd_dict_codec.cpp +++ b/library/cpp/codecs/zstd_dict_codec.cpp @@ -28,8 +28,8 @@ namespace NCodecs { TPtrHolder(T* dict) : Ptr(dict) - { - } + { + } T* Get() { return Ptr; @@ -99,7 +99,7 @@ namespace NCodecs { TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)}; const size_t resSz = CheckSize(ZSTD_compress_usingCDict( ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()), - __LOCATION__); + __LOCATION__); if (resSz < rawSz) { outbuf.Resize(resSz + szSz); @@ -134,13 +134,13 @@ namespace NCodecs { outbuf.Resize(rawSz); memcpy(outbuf.data(), rawBeg, rawSz); } else { - // size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz); - // Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz); + // size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz); + // Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz); outbuf.Resize(datSz); TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)}; CheckSize(ZSTD_decompress_usingDDict( ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()), - __LOCATION__); + __LOCATION__); outbuf.Resize(datSz); } } @@ -206,8 +206,8 @@ namespace NCodecs { template <class T> static T* CheckPtr(T* t, TSourceLocation loc) { - Y_ENSURE_EX(t, TCodecException() << loc << " " - << "unexpected nullptr"); + Y_ENSURE_EX(t, TCodecException() << loc << " " + << "unexpected nullptr"); return t; } @@ -230,8 +230,8 @@ namespace NCodecs { MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar } - TZStdDictCodec::~TZStdDictCodec() { - } + TZStdDictCodec::~TZStdDictCodec() { + } TString TZStdDictCodec::GetName() const { return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel(); diff --git a/library/cpp/codecs/zstd_dict_codec.h b/library/cpp/codecs/zstd_dict_codec.h index 59c1ad6c60..70259989f6 100644 --- a/library/cpp/codecs/zstd_dict_codec.h +++ b/library/cpp/codecs/zstd_dict_codec.h @@ -5,34 +5,34 @@ #include <util/generic/ptr.h> namespace NCodecs { - // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655 + // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655 - class TZStdDictCodec: public ICodec { - class TImpl; - TIntrusivePtr<TImpl> Impl; + class TZStdDictCodec: public ICodec { + class TImpl; + TIntrusivePtr<TImpl> Impl; - public: - explicit TZStdDictCodec(ui32 comprLevel = 1); - ~TZStdDictCodec() override; + public: + explicit TZStdDictCodec(ui32 comprLevel = 1); + ~TZStdDictCodec() override; - static TStringBuf MyName() { - return "zstd08d"; - } + static TStringBuf MyName() { + return "zstd08d"; + } - TString GetName() const override; + TString GetName() const override; - ui8 Encode(TStringBuf in, TBuffer& out) const override; + ui8 Encode(TStringBuf in, TBuffer& out) const override; - void Decode(TStringBuf in, TBuffer& out) const override; + void Decode(TStringBuf in, TBuffer& out) const override; - static TVector<TString> ListCompressionNames(); - static int ParseCompressionName(TStringBuf); + static TVector<TString> ListCompressionNames(); + static int ParseCompressionName(TStringBuf); - protected: - void DoLearn(ISequenceReader& in) override; + protected: + void DoLearn(ISequenceReader& in) override; bool DoTryToLearn(ISequenceReader& in) final; - void Save(IOutputStream* out) const override; - void Load(IInputStream* in) override; - }; + void Save(IOutputStream* out) const override; + void Load(IInputStream* in) override; + }; } |