about summary refs log tree commit diff stats
path: root/library/cpp/codecs
diff options
context:
space:
mode:
author Anton Samokhvalov <pg83@yandex.ru> 2022-02-10 16:45:17 +0300
committer Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:45:17 +0300
commit d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (patch)
tree dd4bd3ca0f36b817e96812825ffaf10d645803f2 /library/cpp/codecs
parent 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (diff)
download ydb-d3a398281c6fd1d3672036cb2d63f842d2cb28c5.tar.gz
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/codecs')
-rw-r--r-- library/cpp/codecs/codecs.cpp 264
-rw-r--r-- library/cpp/codecs/codecs.h 370
-rw-r--r-- library/cpp/codecs/codecs_registry.cpp 24
-rw-r--r-- library/cpp/codecs/codecs_registry.h 12
-rw-r--r-- library/cpp/codecs/comptable_codec.cpp 6
-rw-r--r-- library/cpp/codecs/comptable_codec.h 48
-rw-r--r-- library/cpp/codecs/delta_codec.cpp 32
-rw-r--r-- library/cpp/codecs/delta_codec.h 206
-rw-r--r-- library/cpp/codecs/float_huffman.cpp 2
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_builder.cpp 190
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_builder.h 168
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_entry.cpp 126
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_entry.h 126
-rw-r--r-- library/cpp/codecs/greedy_dict/gd_stats.h 116
-rw-r--r-- library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp 8
-rw-r--r-- library/cpp/codecs/huffman_codec.cpp 870
-rw-r--r-- library/cpp/codecs/huffman_codec.h 38
-rw-r--r-- library/cpp/codecs/pfor_codec.cpp 32
-rw-r--r-- library/cpp/codecs/pfor_codec.h 276
-rw-r--r-- library/cpp/codecs/sample.h 8
-rw-r--r-- library/cpp/codecs/solar_codec.cpp 170
-rw-r--r-- library/cpp/codecs/solar_codec.h 370
-rw-r--r-- library/cpp/codecs/static/builder.h 2
-rw-r--r-- library/cpp/codecs/static/example/example.cpp 8
-rw-r--r-- library/cpp/codecs/static/example/example.h 10
-rw-r--r-- library/cpp/codecs/static/static.cpp 4
-rw-r--r-- library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp 18
-rw-r--r-- library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp 26
-rw-r--r-- library/cpp/codecs/static/ut/builder_ut.cpp 6
-rw-r--r-- library/cpp/codecs/static/ut/static_ut.cpp 6
-rw-r--r-- library/cpp/codecs/tls_cache.h 16
-rw-r--r-- library/cpp/codecs/ut/codecs_ut.cpp 224
-rw-r--r-- library/cpp/codecs/ut/float_huffman_ut.cpp 6
-rw-r--r-- library/cpp/codecs/ut/tls_cache_ut.cpp 54
-rw-r--r-- library/cpp/codecs/ya.make 2
-rw-r--r-- library/cpp/codecs/zstd_dict_codec.cpp 20
-rw-r--r-- library/cpp/codecs/zstd_dict_codec.h 40
37 files changed, 1952 insertions, 1952 deletions
diff --git a/library/cpp/codecs/codecs.cpp b/library/cpp/codecs/codecs.cpp
index bc60d10cf3..b17a3156d2 100644
--- a/library/cpp/codecs/codecs.cpp
+++ b/library/cpp/codecs/codecs.cpp
@@ -4,187 +4,187 @@
#include <util/stream/mem.h>
namespace NCodecs {
- void ICodec::Store(IOutputStream* out, TCodecPtr p) {
- if (!p.Get()) {
- ::Save(out, (ui16)0);
- return;
- }
-
- Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName());
- const TString& n = p->GetName();
+ void ICodec::Store(IOutputStream* out, TCodecPtr p) {
+ if (!p.Get()) {
+ ::Save(out, (ui16)0);
+ return;
+ }
+
+ Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName());
+ const TString& n = p->GetName();
Y_VERIFY(n.size() <= Max<ui16>());
::Save(out, (ui16)n.size());
out->Write(n.data(), n.size());
- p->Save(out);
+ p->Save(out);
}
- TCodecPtr ICodec::Restore(IInputStream* in) {
- ui16 l = 0;
- ::Load(in, l);
+ TCodecPtr ICodec::Restore(IInputStream* in) {
+ ui16 l = 0;
+ ::Load(in, l);
- if (!l) {
- return nullptr;
- }
+ if (!l) {
+ return nullptr;
+ }
- TString n;
- n.resize(l);
+ TString n;
+ n.resize(l);
- Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException());
+ Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException());
- TCodecPtr p = ICodec::GetInstance(n);
- p->Load(in);
- p->Trained = true;
- return p;
- }
+ TCodecPtr p = ICodec::GetInstance(n);
+ p->Load(in);
+ p->Trained = true;
+ return p;
+ }
- TCodecPtr ICodec::RestoreFromString(TStringBuf s) {
+ TCodecPtr ICodec::RestoreFromString(TStringBuf s) {
TMemoryInput minp{s.data(), s.size()};
- return Restore(&minp);
- }
+ return Restore(&minp);
+ }
- TString ICodec::GetNameSafe(TCodecPtr p) {
- return !p ? TString("none") : p->GetName();
- }
+ TString ICodec::GetNameSafe(TCodecPtr p) {
+ return !p ? TString("none") : p->GetName();
+ }
- ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const {
+ ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const {
size_t res = Traits().ApproximateSizeOnEncode(in.size());
- out.Reserve(res);
- out.Clear();
+ out.Reserve(res);
+ out.Clear();
- if (Pipeline.empty()) {
+ if (Pipeline.empty()) {
out.Append(in.data(), in.size());
- return 0;
- } else if (Pipeline.size() == 1) {
- return Pipeline.front()->Encode(in, out);
- }
+ return 0;
+ } else if (Pipeline.size() == 1) {
+ return Pipeline.front()->Encode(in, out);
+ }
- ui8 freelastbits = 0;
+ ui8 freelastbits = 0;
- auto buffer = TBufferTlsCache::TlsInstance().Item();
- TBuffer& tmp = buffer.Get();
- tmp.Reserve(res);
+ auto buffer = TBufferTlsCache::TlsInstance().Item();
+ TBuffer& tmp = buffer.Get();
+ tmp.Reserve(res);
- for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) {
- if (it != Pipeline.begin()) {
- tmp.Clear();
- tmp.Swap(out);
+ for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) {
+ if (it != Pipeline.begin()) {
+ tmp.Clear();
+ tmp.Swap(out);
in = TStringBuf{tmp.data(), tmp.size()};
- }
- freelastbits = (*it)->Encode(in, out);
- }
+ }
+ freelastbits = (*it)->Encode(in, out);
+ }
- return freelastbits;
+ return freelastbits;
}
- void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const {
+ void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const {
size_t res = Traits().ApproximateSizeOnDecode(in.size());
- out.Reserve(res);
- out.Clear();
+ out.Reserve(res);
+ out.Clear();
- if (Pipeline.empty()) {
+ if (Pipeline.empty()) {
out.Append(in.data(), in.size());
- return;
- } else if (Pipeline.size() == 1) {
- Pipeline.front()->Decode(in, out);
- return;
- }
+ return;
+ } else if (Pipeline.size() == 1) {
+ Pipeline.front()->Decode(in, out);
+ return;
+ }
- auto buffer = TBufferTlsCache::TlsInstance().Item();
+ auto buffer = TBufferTlsCache::TlsInstance().Item();
- TBuffer& tmp = buffer.Get();
- tmp.Reserve(res);
+ TBuffer& tmp = buffer.Get();
+ tmp.Reserve(res);
- for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) {
- if (it != Pipeline.rbegin()) {
- tmp.Clear();
- tmp.Swap(out);
+ for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) {
+ if (it != Pipeline.rbegin()) {
+ tmp.Clear();
+ tmp.Swap(out);
in = TStringBuf{tmp.data(), tmp.size()};
- }
- (*it)->Decode(in, out);
+ }
+ (*it)->Decode(in, out);
}
}
- void TPipelineCodec::Save(IOutputStream* out) const {
- for (const auto& it : Pipeline)
- it->Save(out);
+ void TPipelineCodec::Save(IOutputStream* out) const {
+ for (const auto& it : Pipeline)
+ it->Save(out);
}
- void TPipelineCodec::Load(IInputStream* in) {
- for (const auto& it : Pipeline) {
- it->Load(in);
- it->SetTrained(true);
- }
+ void TPipelineCodec::Load(IInputStream* in) {
+ for (const auto& it : Pipeline) {
+ it->Load(in);
+ it->SetTrained(true);
+ }
}
- void TPipelineCodec::SetTrained(bool t) {
- for (const auto& it : Pipeline) {
- it->SetTrained(t);
- }
+ void TPipelineCodec::SetTrained(bool t) {
+ for (const auto& it : Pipeline) {
+ it->SetTrained(t);
+ }
}
- TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) {
- if (!codec)
- return *this;
-
- TCodecTraits tr = codec->Traits();
-
- if (!MyName) {
- MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput;
- MyTraits.SizeOfInputElement = tr.SizeOfInputElement;
- } else {
- MyName.append(':');
- }
-
- MyName.append(codec->GetName());
- MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping;
- MyTraits.PaddingBit = tr.PaddingBit;
- MyTraits.NeedsTraining |= tr.NeedsTraining;
- MyTraits.Irreversible |= tr.Irreversible;
- MyTraits.SizeOnEncodeAddition = MyTraits.SizeOnEncodeAddition * tr.SizeOnEncodeMultiplier + tr.SizeOnEncodeAddition;
- MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier;
- MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier;
- MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize);
-
- Pipeline.push_back(codec);
- return *this;
+ TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) {
+ if (!codec)
+ return *this;
+
+ TCodecTraits tr = codec->Traits();
+
+ if (!MyName) {
+ MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput;
+ MyTraits.SizeOfInputElement = tr.SizeOfInputElement;
+ } else {
+ MyName.append(':');
+ }
+
+ MyName.append(codec->GetName());
+ MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping;
+ MyTraits.PaddingBit = tr.PaddingBit;
+ MyTraits.NeedsTraining |= tr.NeedsTraining;
+ MyTraits.Irreversible |= tr.Irreversible;
+ MyTraits.SizeOnEncodeAddition = MyTraits.SizeOnEncodeAddition * tr.SizeOnEncodeMultiplier + tr.SizeOnEncodeAddition;
+ MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier;
+ MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier;
+ MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize);
+
+ Pipeline.push_back(codec);
+ return *this;
}
- void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) {
- if (!Traits().NeedsTraining) {
- return;
- }
+ void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) {
+ if (!Traits().NeedsTraining) {
+ return;
+ }
- if (Pipeline.size() == 1) {
- Pipeline.back()->Learn(in);
- return;
- }
+ if (Pipeline.size() == 1) {
+ Pipeline.back()->Learn(in);
+ return;
+ }
- TVector<TBuffer> trainingInput;
+ TVector<TBuffer> trainingInput;
- TStringBuf r;
- while (in.NextRegion(r)) {
+ TStringBuf r;
+ while (in.NextRegion(r)) {
trainingInput.emplace_back(r.data(), r.size());
}
-
- TBuffer buff;
- for (const auto& it : Pipeline) {
- it->LearnX(trainingInput.begin(), trainingInput.end(), sampleSizeMult);
-
- for (auto& bit : trainingInput) {
- buff.Clear();
+
+ TBuffer buff;
+ for (const auto& it : Pipeline) {
+ it->LearnX(trainingInput.begin(), trainingInput.end(), sampleSizeMult);
+
+ for (auto& bit : trainingInput) {
+ buff.Clear();
it->Encode(TStringBuf{bit.data(), bit.size()}, buff);
- buff.Swap(bit);
- }
- }
+ buff.Swap(bit);
+ }
+ }
}
- bool TPipelineCodec::AlreadyTrained() const {
- for (const auto& it : Pipeline) {
- if (!it->AlreadyTrained())
- return false;
- }
-
- return true;
+ bool TPipelineCodec::AlreadyTrained() const {
+ for (const auto& it : Pipeline) {
+ if (!it->AlreadyTrained())
+ return false;
+ }
+
+ return true;
}
}
diff --git a/library/cpp/codecs/codecs.h b/library/cpp/codecs/codecs.h
index 08ea9beb44..cc5e72b285 100644
--- a/library/cpp/codecs/codecs.h
+++ b/library/cpp/codecs/codecs.h
@@ -16,244 +16,244 @@
#include <util/ysaveload.h>
namespace NCodecs {
- class TCodecException: public TWithBackTrace<yexception> {};
+ class TCodecException: public TWithBackTrace<yexception> {};
- class ICodec;
+ class ICodec;
- using TCodecPtr = TIntrusivePtr<ICodec>;
- using TCodecConstPtr = TIntrusiveConstPtr<ICodec>;
+ using TCodecPtr = TIntrusivePtr<ICodec>;
+ using TCodecConstPtr = TIntrusiveConstPtr<ICodec>;
- struct TCodecTraits {
- ui32 RecommendedSampleSize = 0;
- ui16 SizeOfInputElement = 1;
- ui8 SizeOnEncodeMultiplier = 1;
- ui8 SizeOnEncodeAddition = 0;
- ui8 SizeOnDecodeMultiplier = 1;
+ struct TCodecTraits {
+ ui32 RecommendedSampleSize = 0;
+ ui16 SizeOfInputElement = 1;
+ ui8 SizeOnEncodeMultiplier = 1;
+ ui8 SizeOnEncodeAddition = 0;
+ ui8 SizeOnDecodeMultiplier = 1;
- bool NeedsTraining = false;
- bool PreservesPrefixGrouping = false;
- bool Irreversible = false;
- bool PaddingBit = 0;
- bool AssumesStructuredInput = false;
+ bool NeedsTraining = false;
+ bool PreservesPrefixGrouping = false;
+ bool Irreversible = false;
+ bool PaddingBit = 0;
+ bool AssumesStructuredInput = false;
- size_t ApproximateSizeOnEncode(size_t sz) const {
- return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition;
- }
+ size_t ApproximateSizeOnEncode(size_t sz) const {
+ return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition;
+ }
- size_t ApproximateSizeOnDecode(size_t sz) const {
- return sz * SizeOnDecodeMultiplier;
- }
- };
+ size_t ApproximateSizeOnDecode(size_t sz) const {
+ return sz * SizeOnDecodeMultiplier;
+ }
+ };
- class ICodec: public TAtomicRefCount<ICodec> {
- protected:
- bool Trained = false;
- TCodecTraits MyTraits;
+ class ICodec: public TAtomicRefCount<ICodec> {
+ protected:
+ bool Trained = false;
+ TCodecTraits MyTraits;
- public:
- TCodecTraits Traits() const {
- return MyTraits;
- }
+ public:
+ TCodecTraits Traits() const {
+ return MyTraits;
+ }
- // the name of the codec (or its variant) to be used in the codec registry
- virtual TString GetName() const = 0;
+ // the name of the codec (or its variant) to be used in the codec registry
+ virtual TString GetName() const = 0;
- virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0;
+ virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0;
virtual ui8 Encode(const TBuffer& input, TBuffer& output) const {
return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output);
}
- virtual void Decode(TStringBuf, TBuffer&) const = 0;
+ virtual void Decode(TStringBuf, TBuffer&) const = 0;
virtual void Decode(const TBuffer& input, TBuffer& output) const {
Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output);
}
- virtual ~ICodec() = default;
+ virtual ~ICodec() = default;
- virtual bool AlreadyTrained() const {
- return !Traits().NeedsTraining || Trained;
- }
- virtual void SetTrained(bool t) {
- Trained = t;
- }
+ virtual bool AlreadyTrained() const {
+ return !Traits().NeedsTraining || Trained;
+ }
+ virtual void SetTrained(bool t) {
+ Trained = t;
+ }
bool TryToLearn(ISequenceReader& r) {
Trained = DoTryToLearn(r);
return Trained;
}
- void Learn(ISequenceReader& r) {
- LearnX(r, 1);
- }
+ void Learn(ISequenceReader& r) {
+ LearnX(r, 1);
+ }
- template <class TIter>
- void Learn(TIter beg, TIter end) {
- Learn(beg, end, IterToStringBuf<TIter>);
- }
+ template <class TIter>
+ void Learn(TIter beg, TIter end) {
+ Learn(beg, end, IterToStringBuf<TIter>);
+ }
- template <class TIter, class TGetter>
- void Learn(TIter beg, TIter end, TGetter getter) {
- auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter);
- TSimpleSequenceReader<TBuffer> reader{sample};
- Learn(reader);
- }
+ template <class TIter, class TGetter>
+ void Learn(TIter beg, TIter end, TGetter getter) {
+ auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter);
+ TSimpleSequenceReader<TBuffer> reader{sample};
+ Learn(reader);
+ }
- static TCodecPtr GetInstance(TStringBuf name);
+ static TCodecPtr GetInstance(TStringBuf name);
- static TVector<TString> GetCodecsList();
+ static TVector<TString> GetCodecsList();
- static TString GetNameSafe(TCodecPtr p);
+ static TString GetNameSafe(TCodecPtr p);
- static void Store(IOutputStream* out, TCodecPtr p);
- static TCodecPtr Restore(IInputStream* in);
- static TCodecPtr RestoreFromString(TStringBuf);
+ static void Store(IOutputStream* out, TCodecPtr p);
+ static TCodecPtr Restore(IInputStream* in);
+ static TCodecPtr RestoreFromString(TStringBuf);
- protected:
- virtual void DoLearn(ISequenceReader&) = 0;
+ protected:
+ virtual void DoLearn(ISequenceReader&) = 0;
virtual bool DoTryToLearn(ISequenceReader& r) {
DoLearn(r);
return true;
}
- // so the pipeline codec will know to adjust the sample for the subcodecs
- virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) {
- DoLearn(r);
- }
-
- virtual void Save(IOutputStream*) const {
- }
- virtual void Load(IInputStream*) {
- }
- friend class TPipelineCodec;
-
- public:
- // so the pipeline codec will know to adjust the sample for the subcodecs
- void LearnX(ISequenceReader& r, double sampleSizeMult) {
- DoLearnX(r, sampleSizeMult);
- Trained = true;
- }
-
- template <class TIter>
- void LearnX(TIter beg, TIter end, double sampleSizeMult) {
- auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult);
- TSimpleSequenceReader<TBuffer> reader{sample};
- LearnX(reader, sampleSizeMult);
- }
- };
-
- class TBasicTrivialCodec: public ICodec {
- public:
- ui8 Encode(TStringBuf in, TBuffer& out) const override {
+ // so the pipeline codec will know to adjust the sample for the subcodecs
+ virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) {
+ DoLearn(r);
+ }
+
+ virtual void Save(IOutputStream*) const {
+ }
+ virtual void Load(IInputStream*) {
+ }
+ friend class TPipelineCodec;
+
+ public:
+ // so the pipeline codec will know to adjust the sample for the subcodecs
+ void LearnX(ISequenceReader& r, double sampleSizeMult) {
+ DoLearnX(r, sampleSizeMult);
+ Trained = true;
+ }
+
+ template <class TIter>
+ void LearnX(TIter beg, TIter end, double sampleSizeMult) {
+ auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult);
+ TSimpleSequenceReader<TBuffer> reader{sample};
+ LearnX(reader, sampleSizeMult);
+ }
+ };
+
+ class TBasicTrivialCodec: public ICodec {
+ public:
+ ui8 Encode(TStringBuf in, TBuffer& out) const override {
out.Assign(in.data(), in.size());
- return 0;
- }
+ return 0;
+ }
- void Decode(TStringBuf in, TBuffer& out) const override {
- Encode(in, out);
- }
+ void Decode(TStringBuf in, TBuffer& out) const override {
+ Encode(in, out);
+ }
- protected:
- void DoLearn(ISequenceReader&) override {
- }
- };
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
- class TTrivialCodec: public TBasicTrivialCodec {
- public:
- TTrivialCodec() {
- MyTraits.PreservesPrefixGrouping = true;
- }
+ class TTrivialCodec: public TBasicTrivialCodec {
+ public:
+ TTrivialCodec() {
+ MyTraits.PreservesPrefixGrouping = true;
+ }
- static TStringBuf MyName() {
- return "trivial";
- }
+ static TStringBuf MyName() {
+ return "trivial";
+ }
- TString GetName() const override {
+ TString GetName() const override {
return ToString(MyName());
- }
- };
+ }
+ };
- class TTrivialTrainableCodec: public TBasicTrivialCodec {
- public:
- TTrivialTrainableCodec() {
- MyTraits.PreservesPrefixGrouping = true;
- MyTraits.NeedsTraining = true;
- }
+ class TTrivialTrainableCodec: public TBasicTrivialCodec {
+ public:
+ TTrivialTrainableCodec() {
+ MyTraits.PreservesPrefixGrouping = true;
+ MyTraits.NeedsTraining = true;
+ }
- static TStringBuf MyName() {
- return "trivial-trainable";
- }
+ static TStringBuf MyName() {
+ return "trivial-trainable";
+ }
- TString GetName() const override {
+ TString GetName() const override {
return ToString(MyName());
- }
- };
-
- class TNullCodec: public ICodec {
- public:
- TNullCodec() {
- MyTraits.Irreversible = true;
- MyTraits.SizeOnDecodeMultiplier = 0;
- MyTraits.SizeOnEncodeMultiplier = 0;
- }
-
- TString GetName() const override {
- return "null";
- }
-
- ui8 Encode(TStringBuf, TBuffer& out) const override {
- out.Clear();
- return 0;
- }
-
- void Decode(TStringBuf, TBuffer& out) const override {
- out.Clear();
- }
-
- protected:
- void DoLearn(ISequenceReader&) override {
- }
- };
-
- class TPipelineCodec: public ICodec {
- typedef TVector<TCodecPtr> TPipeline;
-
- TPipeline Pipeline;
- TString MyName;
-
- public:
- explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) {
- MyTraits.PreservesPrefixGrouping = true;
- AddCodec(c0);
- AddCodec(c1);
- AddCodec(c2);
- AddCodec(c3);
- }
-
- TString GetName() const override {
- return MyName;
- }
-
- ui8 Encode(TStringBuf in, TBuffer& out) const override;
- void Decode(TStringBuf in, TBuffer& out) const override;
-
- public:
- /*
+ }
+ };
+
+ class TNullCodec: public ICodec {
+ public:
+ TNullCodec() {
+ MyTraits.Irreversible = true;
+ MyTraits.SizeOnDecodeMultiplier = 0;
+ MyTraits.SizeOnEncodeMultiplier = 0;
+ }
+
+ TString GetName() const override {
+ return "null";
+ }
+
+ ui8 Encode(TStringBuf, TBuffer& out) const override {
+ out.Clear();
+ return 0;
+ }
+
+ void Decode(TStringBuf, TBuffer& out) const override {
+ out.Clear();
+ }
+
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
+
+ class TPipelineCodec: public ICodec {
+ typedef TVector<TCodecPtr> TPipeline;
+
+ TPipeline Pipeline;
+ TString MyName;
+
+ public:
+ explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) {
+ MyTraits.PreservesPrefixGrouping = true;
+ AddCodec(c0);
+ AddCodec(c1);
+ AddCodec(c2);
+ AddCodec(c3);
+ }
+
+ TString GetName() const override {
+ return MyName;
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& out) const override;
+ void Decode(TStringBuf in, TBuffer& out) const override;
+
+ public:
+ /*
* Add codecs in the following order:
* uncompressed -> codec0 | codec1 | ... | codecN -> compressed
*/
- TPipelineCodec& AddCodec(TCodecPtr codec);
+ TPipelineCodec& AddCodec(TCodecPtr codec);
- bool AlreadyTrained() const override;
- void SetTrained(bool t) override;
+ bool AlreadyTrained() const override;
+ void SetTrained(bool t) override;
- protected:
- void DoLearn(ISequenceReader& in) override {
- DoLearnX(in, 1);
- }
+ protected:
+ void DoLearn(ISequenceReader& in) override {
+ DoLearnX(in, 1);
+ }
- void DoLearnX(ISequenceReader& in, double sampleSizeMult) override;
- void Save(IOutputStream* out) const override;
- void Load(IInputStream* in) override;
- };
+ void DoLearnX(ISequenceReader& in, double sampleSizeMult) override;
+ void Save(IOutputStream* out) const override;
+ void Load(IInputStream* in) override;
+ };
}
diff --git a/library/cpp/codecs/codecs_registry.cpp b/library/cpp/codecs/codecs_registry.cpp
index c8941ec337..17d07062ab 100644
--- a/library/cpp/codecs/codecs_registry.cpp
+++ b/library/cpp/codecs/codecs_registry.cpp
@@ -42,7 +42,7 @@ namespace NCodecs {
} else {
TPipelineCodec* pipe = new TPipelineCodec;
- do {
+ do {
TStringBuf v = name.NextTok(':');
pipe->AddCodec(GetCodec(v));
} while (name);
@@ -64,7 +64,7 @@ namespace NCodecs {
return vs;
}
- struct TSolarCodecFactory : ICodecFactory {
+ struct TSolarCodecFactory : ICodecFactory {
TCodecPtr MakeCodec(TStringBuf name) const override {
if (TSolarCodec::MyNameShortInt() == name) {
return new TSolarCodecShortInt();
@@ -79,7 +79,7 @@ namespace NCodecs {
}
}
- template <class TCodecCls>
+ template <class TCodecCls>
TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const {
if (TStringBuf("-8k") == type) {
return new TCodecCls(1 << 13);
@@ -117,7 +117,7 @@ namespace NCodecs {
}
};
- struct TZStdDictCodecFactory : ICodecFactory {
+ struct TZStdDictCodecFactory : ICodecFactory {
TCodecPtr MakeCodec(TStringBuf name) const override {
return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name));
}
@@ -127,7 +127,7 @@ namespace NCodecs {
}
};
- struct TCompTableCodecFactory : ICodecFactory {
+ struct TCompTableCodecFactory : ICodecFactory {
TCodecPtr MakeCodec(TStringBuf name) const override {
if (TCompTableCodec::MyNameHQ() == name) {
return new TCompTableCodec(TCompTableCodec::Q_HIGH);
@@ -147,11 +147,11 @@ namespace NCodecs {
}
};
- struct TBlockCodec : ICodec {
+ struct TBlockCodec : ICodec {
const NBlockCodecs::ICodec* Codec;
TBlockCodec(TStringBuf name)
- : Codec(NBlockCodecs::Codec(name))
+ : Codec(NBlockCodecs::Codec(name))
{
}
@@ -174,11 +174,11 @@ namespace NCodecs {
}
};
- struct TBlockCodecsFactory : ICodecFactory {
+ struct TBlockCodecsFactory : ICodecFactory {
using TRegistry = THashMap<TString, TCodecPtr>;
TRegistry Registry;
- TBlockCodecsFactory() {
+ TBlockCodecsFactory() {
for (TStringBuf codec : NBlockCodecs::ListAllCodecs()) {
Register(codec);
}
@@ -205,12 +205,12 @@ namespace NCodecs {
}
};
- TCodecRegistry::TCodecRegistry() {
+ TCodecRegistry::TCodecRegistry() {
RegisterFactory(new TInstanceFactory<TTrivialCodec>);
RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>);
RegisterFactory(new TInstanceFactory<THuffmanCodec>);
- RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>);
- RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>);
+ RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>);
+ RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>);
RegisterFactory(new TSolarCodecFactory);
RegisterFactory(new TZStdDictCodecFactory);
RegisterFactory(new TCompTableCodecFactory);
diff --git a/library/cpp/codecs/codecs_registry.h b/library/cpp/codecs/codecs_registry.h
index abd4a38cc5..53710310d5 100644
--- a/library/cpp/codecs/codecs_registry.h
+++ b/library/cpp/codecs/codecs_registry.h
@@ -4,13 +4,13 @@
#include <util/string/cast.h>
namespace NCodecs {
- struct TNoCodecException : TCodecException {
- TNoCodecException(TStringBuf name) {
+ struct TNoCodecException : TCodecException {
+ TNoCodecException(TStringBuf name) {
(*this) << "unknown codec: " << name;
}
};
- struct ICodecFactory : TAtomicRefCount<ICodecFactory> {
+ struct ICodecFactory : TAtomicRefCount<ICodecFactory> {
virtual ~ICodecFactory() = default;
virtual TCodecPtr MakeCodec(TStringBuf name) const = 0;
virtual TVector<TString> ListNames() const = 0;
@@ -19,8 +19,8 @@ namespace NCodecs {
typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr;
namespace NPrivate {
- template <typename TCodec>
- struct TInstanceFactory : ICodecFactory {
+ template <typename TCodec>
+ struct TInstanceFactory : ICodecFactory {
TCodecPtr MakeCodec(TStringBuf) const override {
return new TCodec;
}
@@ -52,7 +52,7 @@ namespace NCodecs {
void RegisterCodecFactory(TCodecFactoryPtr fact);
- template <typename TCodec>
+ template <typename TCodec>
void RegisterCodec() {
RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>());
}
diff --git a/library/cpp/codecs/comptable_codec.cpp b/library/cpp/codecs/comptable_codec.cpp
index cf747121ba..476b8ada80 100644
--- a/library/cpp/codecs/comptable_codec.cpp
+++ b/library/cpp/codecs/comptable_codec.cpp
@@ -4,12 +4,12 @@
#include <util/string/cast.h>
namespace NCodecs {
- class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> {
+ class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> {
public:
TImpl(EQuality q)
: Quality(q)
- {
- }
+ {
+ }
void Init() {
Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table});
diff --git a/library/cpp/codecs/comptable_codec.h b/library/cpp/codecs/comptable_codec.h
index d0f4361780..7ba4f4c543 100644
--- a/library/cpp/codecs/comptable_codec.h
+++ b/library/cpp/codecs/comptable_codec.h
@@ -5,36 +5,36 @@
#include <util/generic/ptr.h>
namespace NCodecs {
- class TCompTableCodec: public ICodec {
- class TImpl;
- TIntrusivePtr<TImpl> Impl;
+ class TCompTableCodec: public ICodec {
+ class TImpl;
+ TIntrusivePtr<TImpl> Impl;
- public:
- enum EQuality {
- Q_LOW = 0,
- Q_HIGH = 1
- };
+ public:
+ enum EQuality {
+ Q_LOW = 0,
+ Q_HIGH = 1
+ };
- explicit TCompTableCodec(EQuality q = Q_HIGH);
- ~TCompTableCodec() override;
+ explicit TCompTableCodec(EQuality q = Q_HIGH);
+ ~TCompTableCodec() override;
- static TStringBuf MyNameHQ() {
- return "comptable-hq";
- }
- static TStringBuf MyNameLQ() {
- return "comptable-lq";
- }
+ static TStringBuf MyNameHQ() {
+ return "comptable-hq";
+ }
+ static TStringBuf MyNameLQ() {
+ return "comptable-lq";
+ }
- TString GetName() const override;
+ TString GetName() const override;
- ui8 Encode(TStringBuf in, TBuffer& out) const override;
+ ui8 Encode(TStringBuf in, TBuffer& out) const override;
- void Decode(TStringBuf in, TBuffer& out) const override;
+ void Decode(TStringBuf in, TBuffer& out) const override;
- protected:
- void DoLearn(ISequenceReader& in) override;
- void Save(IOutputStream* out) const override;
- void Load(IInputStream* in) override;
- };
+ protected:
+ void DoLearn(ISequenceReader& in) override;
+ void Save(IOutputStream* out) const override;
+ void Load(IInputStream* in) override;
+ };
}
diff --git a/library/cpp/codecs/delta_codec.cpp b/library/cpp/codecs/delta_codec.cpp
index b9ed146dcb..61606d6f6f 100644
--- a/library/cpp/codecs/delta_codec.cpp
+++ b/library/cpp/codecs/delta_codec.cpp
@@ -1,21 +1,21 @@
#include "delta_codec.h"
namespace NCodecs {
- template <>
- TStringBuf TDeltaCodec<ui64, true>::MyName() {
- return "delta64-unsigned";
- }
- template <>
- TStringBuf TDeltaCodec<ui32, true>::MyName() {
- return "delta32-unsigned";
- }
- template <>
- TStringBuf TDeltaCodec<ui64, false>::MyName() {
- return "delta64-signed";
- }
- template <>
- TStringBuf TDeltaCodec<ui32, false>::MyName() {
- return "delta32-signed";
- }
+ template <>
+ TStringBuf TDeltaCodec<ui64, true>::MyName() {
+ return "delta64-unsigned";
+ }
+ template <>
+ TStringBuf TDeltaCodec<ui32, true>::MyName() {
+ return "delta32-unsigned";
+ }
+ template <>
+ TStringBuf TDeltaCodec<ui64, false>::MyName() {
+ return "delta64-signed";
+ }
+ template <>
+ TStringBuf TDeltaCodec<ui32, false>::MyName() {
+ return "delta32-signed";
+ }
}
diff --git a/library/cpp/codecs/delta_codec.h b/library/cpp/codecs/delta_codec.h
index 4e5dbb8f75..21325825e6 100644
--- a/library/cpp/codecs/delta_codec.h
+++ b/library/cpp/codecs/delta_codec.h
@@ -8,136 +8,136 @@
#include <util/string/cast.h>
namespace NCodecs {
- template <typename T = ui64, bool UnsignedDelta = true>
- class TDeltaCodec: public ICodec {
- static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value");
-
- public:
- using TUnsigned = std::make_unsigned_t<T>;
- using TSigned = std::make_signed_t<T>;
- using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>;
-
- private:
- const TDelta MinDelta{Min<TDelta>()};
- const TDelta MaxDelta{Max<TDelta>() - 1};
- const TDelta InvalidDelta{MaxDelta + 1};
-
- Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) {
- return a + b;
- }
-
- Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) {
- return a - b;
- }
-
- public:
- struct TDecoder {
- const TDelta InvalidDelta{Max<TDelta>()};
-
- T Last = 0;
- T Result = 0;
-
- bool First = true;
- bool Invalid = false;
-
- Y_FORCE_INLINE bool Decode(TDelta t) {
- if (Y_UNLIKELY(First)) {
- First = false;
- Result = Last = t;
- return true;
- }
-
- if (Y_UNLIKELY(Invalid)) {
- Invalid = false;
- Last = 0;
- Result = t;
- return true;
- }
-
- Result = (Last += t);
- Invalid = t == InvalidDelta;
-
- return !Invalid;
+ template <typename T = ui64, bool UnsignedDelta = true>
+ class TDeltaCodec: public ICodec {
+ static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value");
+
+ public:
+ using TUnsigned = std::make_unsigned_t<T>;
+ using TSigned = std::make_signed_t<T>;
+ using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>;
+
+ private:
+ const TDelta MinDelta{Min<TDelta>()};
+ const TDelta MaxDelta{Max<TDelta>() - 1};
+ const TDelta InvalidDelta{MaxDelta + 1};
+
+ Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) {
+ return a + b;
+ }
+
+ Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) {
+ return a - b;
+ }
+
+ public:
+ struct TDecoder {
+ const TDelta InvalidDelta{Max<TDelta>()};
+
+ T Last = 0;
+ T Result = 0;
+
+ bool First = true;
+ bool Invalid = false;
+
+ Y_FORCE_INLINE bool Decode(TDelta t) {
+ if (Y_UNLIKELY(First)) {
+ First = false;
+ Result = Last = t;
+ return true;
+ }
+
+ if (Y_UNLIKELY(Invalid)) {
+ Invalid = false;
+ Last = 0;
+ Result = t;
+ return true;
+ }
+
+ Result = (Last += t);
+ Invalid = t == InvalidDelta;
+
+ return !Invalid;
}
- };
+ };
- public:
- static TStringBuf MyName();
+ public:
+ static TStringBuf MyName();
- TDeltaCodec() {
- MyTraits.SizeOfInputElement = sizeof(T);
- MyTraits.AssumesStructuredInput = true;
+ TDeltaCodec() {
+ MyTraits.SizeOfInputElement = sizeof(T);
+ MyTraits.AssumesStructuredInput = true;
}
- TString GetName() const override {
+ TString GetName() const override {
return ToString(MyName());
- }
+ }
- template <class TItem>
- static void AppendTo(TBuffer& b, TItem t) {
- b.Append((char*)&t, sizeof(t));
- }
+ template <class TItem>
+ static void AppendTo(TBuffer& b, TItem t) {
+ b.Append((char*)&t, sizeof(t));
+ }
- ui8 Encode(TStringBuf s, TBuffer& b) const override {
- b.Clear();
+ ui8 Encode(TStringBuf s, TBuffer& b) const override {
+ b.Clear();
if (s.empty()) {
- return 0;
- }
+ return 0;
+ }
b.Reserve(s.size());
TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)};
const T* it = tin.begin();
- TDelta last = *(it++);
- AppendTo(b, last);
+ TDelta last = *(it++);
+ AppendTo(b, last);
- TDelta maxt = SubSafe(MaxDelta, last);
- TDelta mint = AddSafe(MinDelta, last);
+ TDelta maxt = SubSafe(MaxDelta, last);
+ TDelta mint = AddSafe(MinDelta, last);
for (; it != tin.end(); ++it) {
- TDelta t = *it;
-
- if (Y_LIKELY((t >= mint) & (t <= maxt))) {
- AppendTo(b, t - last);
- last = t;
- maxt = SubSafe(MaxDelta, last);
- mint = AddSafe(MinDelta, last);
- } else {
- // delta overflow
- AppendTo(b, InvalidDelta);
- AppendTo(b, t);
- last = 0;
- maxt = MaxDelta;
- mint = MinDelta;
- }
- }
-
- return 0;
- }
-
- void Decode(TStringBuf s, TBuffer& b) const override {
- b.Clear();
+ TDelta t = *it;
+
+ if (Y_LIKELY((t >= mint) & (t <= maxt))) {
+ AppendTo(b, t - last);
+ last = t;
+ maxt = SubSafe(MaxDelta, last);
+ mint = AddSafe(MinDelta, last);
+ } else {
+ // delta overflow
+ AppendTo(b, InvalidDelta);
+ AppendTo(b, t);
+ last = 0;
+ maxt = MaxDelta;
+ mint = MinDelta;
+ }
+ }
+
+ return 0;
+ }
+
+ void Decode(TStringBuf s, TBuffer& b) const override {
+ b.Clear();
if (s.empty()) {
- return;
+ return;
}
b.Reserve(s.size());
TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)};
- TDecoder dec;
+ TDecoder dec;
for (const T* it = tin.begin(); it != tin.end(); ++it) {
- T tmp;
- memcpy(&tmp, it, sizeof(tmp));
- if (dec.Decode(tmp)) {
- AppendTo(b, dec.Result);
- }
+ T tmp;
+ memcpy(&tmp, it, sizeof(tmp));
+ if (dec.Decode(tmp)) {
+ AppendTo(b, dec.Result);
+ }
}
}
- protected:
- void DoLearn(ISequenceReader&) override {
- }
- };
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
}
diff --git a/library/cpp/codecs/float_huffman.cpp b/library/cpp/codecs/float_huffman.cpp
index a95ca5b41d..c4a8bd228f 100644
--- a/library/cpp/codecs/float_huffman.cpp
+++ b/library/cpp/codecs/float_huffman.cpp
@@ -55,7 +55,7 @@ namespace NCodecs::NFloatHuff {
{0x3c000000, 0x12, 5, 24}, // [0.0078125, 0.03125), 29 bits, prefix [01001]
{0x3b000000, 0x26, 6, 34}, // [0.001953125, end of range), 40 bits, prefix [011001]
{0x00000000, 0x16, 5, 32}, // whole range, 37 bits, prefix [01101]
- };
+ };
[[noreturn]] Y_NO_INLINE void ThrowInvalidOffset(size_t size, size_t byteOffset) {
ythrow yexception() <<
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp
index 802c721753..561bfbca01 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp
@@ -9,134 +9,134 @@
#include <util/system/rusage.h>
namespace NGreedyDict {
- void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) {
- if (!Current) {
+ void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) {
+ if (!Current) {
Current = MakeHolder<TEntrySet>();
- Current->InitWithAlpha();
- }
+ Current->InitWithAlpha();
+ }
- TEntrySet& set = *Current;
+ TEntrySet& set = *Current;
- for (auto& it : set)
- it.Count = 0;
+ for (auto& it : set)
+ it.Count = 0;
- CompoundCounts = nullptr;
- CompoundCountsPool.Clear();
+ CompoundCounts = nullptr;
+ CompoundCountsPool.Clear();
- if (!final) {
+ if (!final) {
CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool);
- CompoundCounts->reserve(maxcand);
- }
+ CompoundCounts->reserve(maxcand);
+ }
- Shuffle(Input.begin(), Input.end(), Rng);
+ Shuffle(Input.begin(), Input.end(), Rng);
- for (auto str : Input) {
- if (!final && CompoundCounts->size() > maxcand)
- break;
+ for (auto str : Input) {
+ if (!final && CompoundCounts->size() > maxcand)
+ break;
- i32 prev = -1;
+ i32 prev = -1;
- while (!!str) {
- TEntry* e = set.FindPrefix(str);
- ui32 num = e->Number;
+ while (!!str) {
+ TEntry* e = set.FindPrefix(str);
+ ui32 num = e->Number;
- e->Count += 1;
- if (!final && prev >= 0) {
- (*CompoundCounts)[Compose(prev, num)] += 1;
- }
+ e->Count += 1;
+ if (!final && prev >= 0) {
+ (*CompoundCounts)[Compose(prev, num)] += 1;
+ }
- prev = num;
- ++set.TotalCount;
+ prev = num;
+ ++set.TotalCount;
}
- }
+ }
- Current->SetModelP();
+ Current->SetModelP();
}
- ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
- TAutoPtr<TEntrySet> newset = new TEntrySet;
- newset->InitWithAlpha();
- maxent -= newset->size();
-
- ui32 additions = 0;
- ui32 deletions = 0;
-
- {
- const TEntrySet& set = *Current;
-
- Candidates.clear();
- const ui32 total = set.TotalCount;
- const float minpval = Settings.MinPValue;
- const EEntryStatTest test = Settings.StatTest;
- const EEntryScore score = Settings.Score;
- const ui32 mincnt = Settings.MinAbsCount;
-
- for (const auto& it : set) {
- const TEntry& e = it;
- float modelp = e.ModelP;
- ui32 cnt = e.Count;
-
- if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval)
- Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number));
- }
-
- if (!!CompoundCounts) {
- for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) {
- const TEntry& prev = set.Get(Prev(it->first));
- const TEntry& next = set.Get(Next(it->first));
- float modelp = ModelP(prev.Count, next.Count, total);
- ui32 cnt = it->second;
- if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
- Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
- }
+ ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
+ TAutoPtr<TEntrySet> newset = new TEntrySet;
+ newset->InitWithAlpha();
+ maxent -= newset->size();
+
+ ui32 additions = 0;
+ ui32 deletions = 0;
+
+ {
+ const TEntrySet& set = *Current;
+
+ Candidates.clear();
+ const ui32 total = set.TotalCount;
+ const float minpval = Settings.MinPValue;
+ const EEntryStatTest test = Settings.StatTest;
+ const EEntryScore score = Settings.Score;
+ const ui32 mincnt = Settings.MinAbsCount;
+
+ for (const auto& it : set) {
+ const TEntry& e = it;
+ float modelp = e.ModelP;
+ ui32 cnt = e.Count;
+
+ if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+ Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number));
+ }
+
+ if (!!CompoundCounts) {
+ for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) {
+ const TEntry& prev = set.Get(Prev(it->first));
+ const TEntry& next = set.Get(Next(it->first));
+ float modelp = ModelP(prev.Count, next.Count, total);
+ ui32 cnt = it->second;
+ if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
+ Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
+ }
}
- Sort(Candidates.begin(), Candidates.end());
+ Sort(Candidates.begin(), Candidates.end());
- if (Candidates.size() > maxent)
- Candidates.resize(maxent);
+ if (Candidates.size() > maxent)
+ Candidates.resize(maxent);
- for (const auto& candidate : Candidates) {
- if (IsCompound(candidate.second)) {
- additions++;
- newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str);
- } else {
- newset->Add(set.Get(candidate.second).Str);
- }
+ for (const auto& candidate : Candidates) {
+ if (IsCompound(candidate.second)) {
+ additions++;
+ newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str);
+ } else {
+ newset->Add(set.Get(candidate.second).Str);
+ }
}
-
- deletions = set.size() - (newset->size() - additions);
+
+ deletions = set.size() - (newset->size() - additions);
}
- Current = newset;
- Current->BuildHierarchy();
- return deletions + additions;
+ Current = newset;
+ Current->BuildHierarchy();
+ return deletions + additions;
}
- ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
- size_t totalsz = 0;
- for (auto it : Input)
+ ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
+ size_t totalsz = 0;
+ for (auto it : Input)
totalsz += it.size();
- while (maxiters) {
- maxiters--;
+ while (maxiters) {
+ maxiters--;
- RebuildCounts(maxentries * Settings.GrowLimit, false);
+ RebuildCounts(maxentries * Settings.GrowLimit, false);
- if (Settings.Verbose) {
- TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size());
+ if (Settings.Verbose) {
+ TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size());
Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;
- }
+ }
+
+ ui32 diff = BuildNextGeneration(maxentries);
- ui32 diff = BuildNextGeneration(maxentries);
-
- if (Current->size() == maxentries && diff < mindiff)
- break;
+ if (Current->size() == maxentries && diff < mindiff)
+ break;
}
- RebuildCounts(0, true);
- Current->SetScores(Settings.Score);
- return maxiters;
+ RebuildCounts(0, true);
+ Current->SetScores(Settings.Score);
+ return maxiters;
}
}
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h
index ab0057e1ca..b8e9a5e37b 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.h
+++ b/library/cpp/codecs/greedy_dict/gd_builder.h
@@ -6,89 +6,89 @@
#include <util/random/fast.h>
namespace NGreedyDict {
- struct TBuildSettings {
- EEntryStatTest StatTest = EST_SIMPLE_NORM;
- EEntryScore Score = ES_LEN_SIMPLE;
-
- float MinPValue = 0.75;
- ui32 MinAbsCount = 10;
- ui32 GrowLimit = 10; // times of maxentries
- bool Verbose = false;
- };
-
- class TDictBuilder {
- using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>;
- using TCandidate = std::pair<float, ui64>;
- using TCandidates = TVector<TCandidate>;
-
- private:
- TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb};
- TStringBufs Input;
-
- THolder<TEntrySet> Current;
-
- TMemoryPool CompoundCountsPool;
- THolder<TCompoundCounts> CompoundCounts;
-
- TCandidates Candidates;
-
- TBuildSettings Settings;
-
- public:
- TDictBuilder(const TBuildSettings& s = TBuildSettings())
- : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance())
- , Settings(s)
- {
- }
-
- void SetInput(const TStringBufs& in) {
- Input = in;
- }
-
- const TBuildSettings& GetSettings() const {
- return Settings;
- }
-
- TBuildSettings& GetSettings() {
- return Settings;
- }
-
- void SetSettings(const TBuildSettings& s) {
- Settings = s;
- }
-
- TEntrySet& EntrySet() {
- return *Current;
- }
-
- const TEntrySet& EntrySet() const {
- return *Current;
- }
-
- THolder<TEntrySet> ReleaseEntrySet() {
- return std::move(Current);
- }
-
- ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10);
-
- public:
- void RebuildCounts(ui32 maxcand, bool final);
- ui32 /*diff size*/ BuildNextGeneration(ui32 maxent);
-
- static bool IsCompound(ui64 ent) {
- return ent & 0xFFFFFFFF00000000ULL;
- }
-
- static ui32 Next(ui64 ent) {
- return ent;
- }
- static ui32 Prev(ui64 ent) {
- return (ent >> 32) - 1;
- }
-
- static ui64 Compose(ui32 prev, ui32 next) {
- return ((prev + 1ULL) << 32) | next;
- }
- };
+ struct TBuildSettings {
+ EEntryStatTest StatTest = EST_SIMPLE_NORM;
+ EEntryScore Score = ES_LEN_SIMPLE;
+
+ float MinPValue = 0.75;
+ ui32 MinAbsCount = 10;
+ ui32 GrowLimit = 10; // times of maxentries
+ bool Verbose = false;
+ };
+
+ class TDictBuilder {
+ using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>;
+ using TCandidate = std::pair<float, ui64>;
+ using TCandidates = TVector<TCandidate>;
+
+ private:
+ TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb};
+ TStringBufs Input;
+
+ THolder<TEntrySet> Current;
+
+ TMemoryPool CompoundCountsPool;
+ THolder<TCompoundCounts> CompoundCounts;
+
+ TCandidates Candidates;
+
+ TBuildSettings Settings;
+
+ public:
+ TDictBuilder(const TBuildSettings& s = TBuildSettings())
+ : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance())
+ , Settings(s)
+ {
+ }
+
+ void SetInput(const TStringBufs& in) {
+ Input = in;
+ }
+
+ const TBuildSettings& GetSettings() const {
+ return Settings;
+ }
+
+ TBuildSettings& GetSettings() {
+ return Settings;
+ }
+
+ void SetSettings(const TBuildSettings& s) {
+ Settings = s;
+ }
+
+ TEntrySet& EntrySet() {
+ return *Current;
+ }
+
+ const TEntrySet& EntrySet() const {
+ return *Current;
+ }
+
+ THolder<TEntrySet> ReleaseEntrySet() {
+ return std::move(Current);
+ }
+
+ ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10);
+
+ public:
+ void RebuildCounts(ui32 maxcand, bool final);
+ ui32 /*diff size*/ BuildNextGeneration(ui32 maxent);
+
+ static bool IsCompound(ui64 ent) {
+ return ent & 0xFFFFFFFF00000000ULL;
+ }
+
+ static ui32 Next(ui64 ent) {
+ return ent;
+ }
+ static ui32 Prev(ui64 ent) {
+ return (ent >> 32) - 1;
+ }
+
+ static ui64 Compose(ui32 prev, ui32 next) {
+ return ((prev + 1ULL) << 32) | next;
+ }
+ };
}
diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp
index 0603a9fca8..2c315c7f7c 100644
--- a/library/cpp/codecs/greedy_dict/gd_entry.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp
@@ -5,94 +5,94 @@
#include <util/generic/singleton.h>
namespace NGreedyDict {
- class TAlphas {
- char Memory[512];
-
- public:
- TStringBufs Alphas;
-
- TAlphas() {
- for (ui32 i = 0; i < 256; ++i) {
- Memory[2 * i] = (char)i;
- Memory[2 * i + 1] = 0;
-
- Alphas.push_back(TStringBuf(&Memory[2 * i], 1));
- }
- }
- };
-
- void TEntrySet::InitWithAlpha() {
- Pool.ClearKeepFirstChunk();
- const TStringBufs& a = Singleton<TAlphas>()->Alphas;
- for (auto it : a) {
- Add(it);
+ class TAlphas {
+ char Memory[512];
+
+ public:
+ TStringBufs Alphas;
+
+ TAlphas() {
+ for (ui32 i = 0; i < 256; ++i) {
+ Memory[2 * i] = (char)i;
+ Memory[2 * i + 1] = 0;
+
+ Alphas.push_back(TStringBuf(&Memory[2 * i], 1));
+ }
}
- BuildHierarchy();
+ };
+
+ void TEntrySet::InitWithAlpha() {
+ Pool.ClearKeepFirstChunk();
+ const TStringBufs& a = Singleton<TAlphas>()->Alphas;
+ for (auto it : a) {
+ Add(it);
+ }
+ BuildHierarchy();
}
- void TEntrySet::BuildHierarchy() {
- Sort(begin(), end(), TEntry::StrLess);
+ void TEntrySet::BuildHierarchy() {
+ Sort(begin(), end(), TEntry::StrLess);
- TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED);
+ TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED);
- for (iterator it = begin(); it != end(); ++it) {
- it->Number = (it - begin());
- TStringBuf suff = it->Str;
- size_t len = 0;
- ui32 val = 0;
+ for (iterator it = begin(); it != end(); ++it) {
+ it->Number = (it - begin());
+ TStringBuf suff = it->Str;
+ size_t len = 0;
+ ui32 val = 0;
if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) {
- it->NearestPrefix = val;
- }
+ it->NearestPrefix = val;
+ }
builder.Add(suff.data(), suff.size(), it->Number);
}
- TBufferOutput bout;
- builder.Save(bout);
- Trie.Init(TBlob::FromBuffer(bout.Buffer()));
+ TBufferOutput bout;
+ builder.Save(bout);
+ Trie.Init(TBlob::FromBuffer(bout.Buffer()));
}
- TEntry* TEntrySet::FindPrefix(TStringBuf& str) {
- size_t len = 0;
- ui32 off = 0;
+ TEntry* TEntrySet::FindPrefix(TStringBuf& str) {
+ size_t len = 0;
+ ui32 off = 0;
- if (!Trie.FindLongestPrefix(str, &len, &off)) {
- return nullptr;
- }
+ if (!Trie.FindLongestPrefix(str, &len, &off)) {
+ return nullptr;
+ }
- str.Skip(len);
- return &Get(off);
+ str.Skip(len);
+ return &Get(off);
}
- void TEntrySet::SetModelP() {
- for (iterator it = begin(); it != end(); ++it) {
- TEntry& e = *it;
+ void TEntrySet::SetModelP() {
+ for (iterator it = begin(); it != end(); ++it) {
+ TEntry& e = *it;
- if (!e.HasPrefix()) {
- e.ModelP = 0;
- continue;
- }
+ if (!e.HasPrefix()) {
+ e.ModelP = 0;
+ continue;
+ }
- TStringBuf suff = e.Str;
- const TEntry& p = Get(e.NearestPrefix);
- suff.Skip(p.Len());
+ TStringBuf suff = e.Str;
+ const TEntry& p = Get(e.NearestPrefix);
+ suff.Skip(p.Len());
- float modelp = float(p.Count + e.Count) / TotalCount;
+ float modelp = float(p.Count + e.Count) / TotalCount;
- while (!!suff) {
- TEntry* pp = FindPrefix(suff);
- modelp *= float(pp->Count + e.Count) / TotalCount;
- }
+ while (!!suff) {
+ TEntry* pp = FindPrefix(suff);
+ modelp *= float(pp->Count + e.Count) / TotalCount;
+ }
- e.ModelP = modelp;
+ e.ModelP = modelp;
}
}
- void TEntrySet::SetScores(EEntryScore s) {
- for (auto& it : *this) {
- it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount);
- }
+ void TEntrySet::SetScores(EEntryScore s) {
+ for (auto& it : *this) {
+ it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount);
+ }
}
}
diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h
index e123c66b4a..18b5be0e15 100644
--- a/library/cpp/codecs/greedy_dict/gd_entry.h
+++ b/library/cpp/codecs/greedy_dict/gd_entry.h
@@ -11,93 +11,93 @@
#include <util/memory/pool.h>
namespace NGreedyDict {
- using TStringBufs = TVector<TStringBuf>;
+ using TStringBufs = TVector<TStringBuf>;
- struct TEntry {
- static const i32 NoPrefix = -1;
+ struct TEntry {
+ static const i32 NoPrefix = -1;
- TStringBuf Str;
+ TStringBuf Str;
- i32 NearestPrefix = NoPrefix;
- ui32 Count = 0;
- ui32 Number = 0;
- float ModelP = 0;
- float Score = 0;
+ i32 NearestPrefix = NoPrefix;
+ ui32 Count = 0;
+ ui32 Number = 0;
+ float ModelP = 0;
+ float Score = 0;
- TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0)
- : Str(b)
- , Count(cnt)
- {
- }
+ TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0)
+ : Str(b)
+ , Count(cnt)
+ {
+ }
- bool HasPrefix() const {
- return NearestPrefix != NoPrefix;
- }
- ui32 Len() const {
+ bool HasPrefix() const {
+ return NearestPrefix != NoPrefix;
+ }
+ ui32 Len() const {
return Str.size();
- }
+ }
- static bool StrLess(const TEntry& a, const TEntry& b) {
- return a.Str < b.Str;
- }
- static bool NumberLess(const TEntry& a, const TEntry& b) {
- return a.Number < b.Number;
- }
- static bool ScoreMore(const TEntry& a, const TEntry& b) {
- return a.Score > b.Score;
- }
- };
+ static bool StrLess(const TEntry& a, const TEntry& b) {
+ return a.Str < b.Str;
+ }
+ static bool NumberLess(const TEntry& a, const TEntry& b) {
+ return a.Number < b.Number;
+ }
+ static bool ScoreMore(const TEntry& a, const TEntry& b) {
+ return a.Score > b.Score;
+ }
+ };
- class TEntrySet: public TVector<TEntry>, TNonCopyable {
- TMemoryPool Pool{8112};
- TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie;
+ class TEntrySet: public TVector<TEntry>, TNonCopyable {
+ TMemoryPool Pool{8112};
+ TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie;
- public:
- ui32 TotalCount = 0;
+ public:
+ ui32 TotalCount = 0;
- void InitWithAlpha();
+ void InitWithAlpha();
- void Add(TStringBuf a) {
+ void Add(TStringBuf a) {
push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size()));
- }
+ }
- void Add(TStringBuf a, TStringBuf b) {
+ void Add(TStringBuf a, TStringBuf b) {
size_t sz = a.size() + b.size();
- char* p = (char*)Pool.Allocate(sz);
+ char* p = (char*)Pool.Allocate(sz);
memcpy(p, a.data(), a.size());
memcpy(p + a.size(), b.data(), b.size());
- push_back(TStringBuf(p, sz));
- }
+ push_back(TStringBuf(p, sz));
+ }
- TEntry& Get(ui32 idx) {
- return (*this)[idx];
- }
+ TEntry& Get(ui32 idx) {
+ return (*this)[idx];
+ }
- const TEntry& Get(ui32 idx) const {
- return (*this)[idx];
- }
+ const TEntry& Get(ui32 idx) const {
+ return (*this)[idx];
+ }
- void BuildHierarchy();
+ void BuildHierarchy();
- // longest prefix
- TEntry* FindPrefix(TStringBuf& str);
+ // longest prefix
+ TEntry* FindPrefix(TStringBuf& str);
- const TEntry* FindPrefix(TStringBuf& str) const {
- return ((TEntrySet*)this)->FindPrefix(str);
- }
+ const TEntry* FindPrefix(TStringBuf& str) const {
+ return ((TEntrySet*)this)->FindPrefix(str);
+ }
- const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) {
- if (!e.HasPrefix())
- return nullptr;
+ const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) {
+ if (!e.HasPrefix())
+ return nullptr;
- const TEntry& p = Get(e.NearestPrefix);
- suff = e.Str;
+ const TEntry& p = Get(e.NearestPrefix);
+ suff = e.Str;
suff.Skip(p.Str.size());
- return &p;
- }
+ return &p;
+ }
- void SetModelP();
- void SetScores(EEntryScore);
- };
+ void SetModelP();
+ void SetScores(EEntryScore);
+ };
}
diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h
index 90f46a0fb9..b63c4c38d2 100644
--- a/library/cpp/codecs/greedy_dict/gd_stats.h
+++ b/library/cpp/codecs/greedy_dict/gd_stats.h
@@ -1,78 +1,78 @@
#pragma once
-#include <util/generic/ymath.h>
+#include <util/generic/ymath.h>
#include <util/generic/algorithm.h>
#include <util/generic/yexception.h>
namespace NGreedyDict {
- enum EEntryScore {
- ES_COUNT,
- ES_LEN_COUNT,
- ES_SIMPLE,
- ES_LEN_SIMPLE,
- ES_SOLAR
- };
+ enum EEntryScore {
+ ES_COUNT,
+ ES_LEN_COUNT,
+ ES_SIMPLE,
+ ES_LEN_SIMPLE,
+ ES_SOLAR
+ };
- enum EEntryStatTest {
- EST_NONE = 0,
- EST_SIMPLE_NORM = 2
- };
+ enum EEntryStatTest {
+ EST_NONE = 0,
+ EST_SIMPLE_NORM = 2
+ };
- inline float ModelP(ui32 countA, ui32 countB, ui32 total) {
- return float(countA) * countB / total / total;
- }
+ inline float ModelP(ui32 countA, ui32 countB, ui32 total) {
+ return float(countA) * countB / total / total;
+ }
- // P (ab | dependent)
- inline float SimpleTest(float modelp, ui32 countAB, ui32 total) {
- float realp = float(countAB) / total;
- return modelp >= realp ? 0 : (realp - modelp);
- }
+ // P (ab | dependent)
+ inline float SimpleTest(float modelp, ui32 countAB, ui32 total) {
+ float realp = float(countAB) / total;
+ return modelp >= realp ? 0 : (realp - modelp);
+ }
- inline float SolarTest(float modelp, ui32 countAB, ui32 total) {
- float realp = float(countAB) / total;
- return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1));
- }
+ inline float SolarTest(float modelp, ui32 countAB, ui32 total) {
+ float realp = float(countAB) / total;
+ return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1));
+ }
- // P (ab | dependent) / P (ab)
- inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) {
- float realp = float(countAB) / total;
- return modelp >= realp ? 0 : (realp - modelp) / realp;
- }
+ // P (ab | dependent) / P (ab)
+ inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) {
+ float realp = float(countAB) / total;
+ return modelp >= realp ? 0 : (realp - modelp) / realp;
+ }
- inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) {
- if (!total) {
- return 0;
- }
- switch (test) {
- case EST_NONE:
- return 1;
- case EST_SIMPLE_NORM:
- return SimpleTestNorm(modelp, countAB, total);
- }
- Y_FAIL("no way!");
+ inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) {
+ if (!total) {
+ return 0;
+ }
+ switch (test) {
+ case EST_NONE:
+ return 1;
+ case EST_SIMPLE_NORM:
+ return SimpleTestNorm(modelp, countAB, total);
+ }
+ Y_FAIL("no way!");
return 0;
}
- inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) {
- if (!total) {
- return 0;
- }
- ui32 m = 1;
- switch (score) {
- case ES_LEN_COUNT:
- m = len;
+ inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) {
+ if (!total) {
+ return 0;
+ }
+ ui32 m = 1;
+ switch (score) {
+ case ES_LEN_COUNT:
+ m = len;
[[fallthrough]];
- case ES_COUNT:
- return m * count;
- case ES_LEN_SIMPLE:
- m = len;
+ case ES_COUNT:
+ return m * count;
+ case ES_LEN_SIMPLE:
+ m = len;
[[fallthrough]];
- case ES_SIMPLE:
- return m * SimpleTest(modelp, count, total);
- case ES_SOLAR:
- return SolarTest(modelp, count, total);
- }
- Y_FAIL("no way!");
+ case ES_SIMPLE:
+ return m * SimpleTest(modelp, count, total);
+ case ES_SOLAR:
+ return SolarTest(modelp, count, total);
+ }
+ Y_FAIL("no way!");
return 0;
}
diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp
index e33976d333..679089a11b 100644
--- a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp
+++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp
@@ -6,11 +6,11 @@
#include <util/generic/ymath.h>
class TGreedyDictTest: public TTestBase {
- UNIT_TEST_SUITE(TGreedyDictTest);
+ UNIT_TEST_SUITE(TGreedyDictTest);
UNIT_TEST(TestEntrySet)
UNIT_TEST(TestBuilder0)
UNIT_TEST(TestBuilder)
- UNIT_TEST_SUITE_END();
+ UNIT_TEST_SUITE_END();
void TestEntrySet() {
using namespace NGreedyDict;
@@ -120,7 +120,7 @@ class TGreedyDictTest: public TTestBase {
}
void FillData(NGreedyDict::TStringBufs& data) {
- static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", 
"https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", 
"http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", 
"http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"};
+ static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", 
"https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", 
"http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", 
"http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"};
data.clear();
data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls));
}
@@ -128,7 +128,7 @@ class TGreedyDictTest: public TTestBase {
typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict;
TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s,
- TDict& res) {
+ TDict& res) {
using namespace NGreedyDict;
TStringBufs data;
diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp
index c8b126ccd0..650fe7cdfd 100644
--- a/library/cpp/codecs/huffman_codec.cpp
+++ b/library/cpp/codecs/huffman_codec.cpp
@@ -9,584 +9,584 @@
#include <util/string/printf.h>
namespace NCodecs {
- template <typename T>
- struct TCanonicalCmp {
- bool operator()(const T& a, const T& b) const {
- if (a.CodeLength == b.CodeLength) {
- return a.Char < b.Char;
- } else {
- return a.CodeLength < b.CodeLength;
- }
- }
- };
-
- template <typename T>
- struct TByCharCmp {
- bool operator()(const T& a, const T& b) const {
+ template <typename T>
+ struct TCanonicalCmp {
+ bool operator()(const T& a, const T& b) const {
+ if (a.CodeLength == b.CodeLength) {
+ return a.Char < b.Char;
+ } else {
+ return a.CodeLength < b.CodeLength;
+ }
+ }
+ };
+
+ template <typename T>
+ struct TByCharCmp {
+ bool operator()(const T& a, const T& b) const {
return a.Char < b.Char;
}
- };
+ };
- struct TTreeEntry {
- static const ui32 InvalidBranch = (ui32)-1;
+ struct TTreeEntry {
+ static const ui32 InvalidBranch = (ui32)-1;
- ui64 Freq = 0;
- ui32 Branches[2]{InvalidBranch, InvalidBranch};
+ ui64 Freq = 0;
+ ui32 Branches[2]{InvalidBranch, InvalidBranch};
- ui32 CodeLength = 0;
- ui8 Char = 0;
- bool Invalid = false;
+ ui32 CodeLength = 0;
+ ui8 Char = 0;
+ bool Invalid = false;
- TTreeEntry() = default;
+ TTreeEntry() = default;
- static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) {
- return a.Freq < b.Freq;
- }
+ static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) {
+ return a.Freq < b.Freq;
+ }
- static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) {
- return a.Freq > b.Freq;
- }
- };
+ static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) {
+ return a.Freq > b.Freq;
+ }
+ };
- using TCodeTree = TVector<TTreeEntry>;
+ using TCodeTree = TVector<TTreeEntry>;
- void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) {
- tree.reserve(255 * 256 / 2); // worst case - balanced tree
+ void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) {
+ tree.reserve(255 * 256 / 2); // worst case - balanced tree
- for (ui32 i = 0; i < 256; ++i) {
- tree.emplace_back();
- tree.back().Char = i;
- tree.back().Freq = freqs[i];
- }
+ for (ui32 i = 0; i < 256; ++i) {
+ tree.emplace_back();
+ tree.back().Char = i;
+ tree.back().Freq = freqs[i];
+ }
- StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq);
+ StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq);
}
- void InitTree(TCodeTree& tree, ISequenceReader* in) {
- using namespace NPrivate;
- ui64 freqs[256];
- Zero(freqs);
+ void InitTree(TCodeTree& tree, ISequenceReader* in) {
+ using namespace NPrivate;
+ ui64 freqs[256];
+ Zero(freqs);
- TStringBuf r;
- while (in->NextRegion(r)) {
+ TStringBuf r;
+ while (in->NextRegion(r)) {
for (ui64 i = 0; i < r.size(); ++i)
- ++freqs[(ui8)r[i]];
- }
+ ++freqs[(ui8)r[i]];
+ }
- InitTreeByFreqs(tree, freqs);
+ InitTreeByFreqs(tree, freqs);
}
- void CalculateCodeLengths(TCodeTree& tree) {
- Y_ENSURE(tree.size() == 256, " ");
- const ui32 firstbranch = tree.size();
+ void CalculateCodeLengths(TCodeTree& tree) {
+ Y_ENSURE(tree.size() == 256, " ");
+ const ui32 firstbranch = tree.size();
- ui32 curleaf = 0;
- ui32 curbranch = firstbranch;
+ ui32 curleaf = 0;
+ ui32 curbranch = firstbranch;
- // building code tree. two priority queues are combined in one.
- while (firstbranch - curleaf + tree.size() - curbranch >= 2) {
- TTreeEntry e;
+ // building code tree. two priority queues are combined in one.
+ while (firstbranch - curleaf + tree.size() - curbranch >= 2) {
+ TTreeEntry e;
- for (auto& branche : e.Branches) {
- ui32 br;
+ for (auto& branche : e.Branches) {
+ ui32 br;
- if (curleaf >= firstbranch)
- br = curbranch++;
- else if (curbranch >= tree.size())
- br = curleaf++;
- else if (tree[curleaf].Freq < tree[curbranch].Freq)
- br = curleaf++;
- else
- br = curbranch++;
+ if (curleaf >= firstbranch)
+ br = curbranch++;
+ else if (curbranch >= tree.size())
+ br = curleaf++;
+ else if (tree[curleaf].Freq < tree[curbranch].Freq)
+ br = curleaf++;
+ else
+ br = curbranch++;
- Y_ENSURE(br < tree.size(), " ");
- branche = br;
- e.Freq += tree[br].Freq;
- }
+ Y_ENSURE(br < tree.size(), " ");
+ branche = br;
+ e.Freq += tree[br].Freq;
+ }
- tree.push_back(e);
- PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev);
+ tree.push_back(e);
+ PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev);
}
- // computing code lengths
- for (ui64 i = tree.size() - 1; i >= firstbranch; --i) {
- TTreeEntry e = tree[i];
+ // computing code lengths
+ for (ui64 i = tree.size() - 1; i >= firstbranch; --i) {
+ TTreeEntry e = tree[i];
- for (auto branche : e.Branches)
- tree[branche].CodeLength = e.CodeLength + 1;
- }
+ for (auto branche : e.Branches)
+ tree[branche].CodeLength = e.CodeLength + 1;
+ }
+
+ // chopping off the branches
+ tree.resize(firstbranch);
- // chopping off the branches
- tree.resize(firstbranch);
+ Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>());
- Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>());
+ // simplification: we are stripping codes longer than 64 bits
+ while (!tree.empty() && tree.back().CodeLength > 64)
+ tree.pop_back();
- // simplification: we are stripping codes longer than 64 bits
- while (!tree.empty() && tree.back().CodeLength > 64)
- tree.pop_back();
+ // will not compress
+ if (tree.empty())
+ return;
- // will not compress
- if (tree.empty())
- return;
+ // special invalid code word
+ tree.back().Invalid = true;
+ }
- // special invalid code word
- tree.back().Invalid = true;
- }
+ struct TEncoderEntry {
+ ui64 Code = 0;
- struct TEncoderEntry {
- ui64 Code = 0;
+ ui8 CodeLength = 0;
+ ui8 Char = 0;
+ ui8 Invalid = true;
- ui8 CodeLength = 0;
- ui8 Char = 0;
- ui8 Invalid = true;
+ explicit TEncoderEntry(TTreeEntry e)
+ : CodeLength(e.CodeLength)
+ , Char(e.Char)
+ , Invalid(e.Invalid)
+ {
+ }
- explicit TEncoderEntry(TTreeEntry e)
- : CodeLength(e.CodeLength)
- , Char(e.Char)
- , Invalid(e.Invalid)
- {
- }
+ TEncoderEntry() = default;
+ };
- TEncoderEntry() = default;
- };
+ struct TEncoderTable {
+ TEncoderEntry Entries[256];
- struct TEncoderTable {
- TEncoderEntry Entries[256];
+ void Save(IOutputStream* out) const {
+ ui16 nval = 0;
- void Save(IOutputStream* out) const {
- ui16 nval = 0;
+ for (auto entrie : Entries)
+ nval += !entrie.Invalid;
+
+ ::Save(out, nval);
+
+ for (auto entrie : Entries) {
+ if (!entrie.Invalid) {
+ ::Save(out, entrie.Char);
+ ::Save(out, entrie.CodeLength);
+ }
+ }
+ }
- for (auto entrie : Entries)
- nval += !entrie.Invalid;
+ void Load(IInputStream* in) {
+ ui16 nval = 0;
+ ::Load(in, nval);
- ::Save(out, nval);
+ for (ui32 i = 0; i < 256; ++i)
+ Entries[i].Char = i;
- for (auto entrie : Entries) {
- if (!entrie.Invalid) {
- ::Save(out, entrie.Char);
- ::Save(out, entrie.CodeLength);
- }
+ for (ui32 i = 0; i < nval; ++i) {
+ ui8 ch = 0;
+ ui8 len = 0;
+ ::Load(in, ch);
+ ::Load(in, len);
+ Entries[ch].CodeLength = len;
+ Entries[ch].Invalid = false;
}
}
+ };
+
+ struct TDecoderEntry {
+ ui32 NextTable : 10;
+ ui32 Char : 8;
+ ui32 Invalid : 1;
+ ui32 Bad : 1;
+
+ TDecoderEntry()
+ : NextTable()
+ , Char()
+ , Invalid()
+ , Bad()
+ {
+ }
+ };
+
+ struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> {
+ ui64 Length = 0;
+ ui64 BaseCode = 0;
- void Load(IInputStream* in) {
- ui16 nval = 0;
- ::Load(in, nval);
-
- for (ui32 i = 0; i < 256; ++i)
- Entries[i].Char = i;
-
- for (ui32 i = 0; i < nval; ++i) {
- ui8 ch = 0;
- ui8 len = 0;
- ::Load(in, ch);
- ::Load(in, len);
- Entries[ch].CodeLength = len;
- Entries[ch].Invalid = false;
- }
+ TDecoderEntry Entries[256];
+
+ TDecoderTable() {
+ Zero(Entries);
}
- };
-
- struct TDecoderEntry {
- ui32 NextTable : 10;
- ui32 Char : 8;
- ui32 Invalid : 1;
- ui32 Bad : 1;
-
- TDecoderEntry()
- : NextTable()
- , Char()
- , Invalid()
- , Bad()
- {
- }
- };
-
- struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> {
- ui64 Length = 0;
- ui64 BaseCode = 0;
-
- TDecoderEntry Entries[256];
-
- TDecoderTable() {
- Zero(Entries);
- }
- };
-
- const int CACHE_BITS_COUNT = 16;
- class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> {
- TEncoderTable Encoder;
- TDecoderTable Decoder[256];
-
- TEncoderEntry Invalid;
-
- ui32 SubTablesNum;
-
- class THuffmanCache {
- struct TCacheEntry {
- int EndOffset : 24;
- int BitsLeft : 8;
- };
- TVector<char> DecodeCache;
- TVector<TCacheEntry> CacheEntries;
- const TImpl& Original;
-
- public:
- THuffmanCache(const THuffmanCodec::TImpl& encoder);
-
- void Decode(NBitIO::TBitInput& in, TBuffer& out) const;
+ };
+
+ const int CACHE_BITS_COUNT = 16;
+ class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> {
+ TEncoderTable Encoder;
+ TDecoderTable Decoder[256];
+
+ TEncoderEntry Invalid;
+
+ ui32 SubTablesNum;
+
+ class THuffmanCache {
+ struct TCacheEntry {
+ int EndOffset : 24;
+ int BitsLeft : 8;
+ };
+ TVector<char> DecodeCache;
+ TVector<TCacheEntry> CacheEntries;
+ const TImpl& Original;
+
+ public:
+ THuffmanCache(const THuffmanCodec::TImpl& encoder);
+
+ void Decode(NBitIO::TBitInput& in, TBuffer& out) const;
};
- THolder<THuffmanCache> Cache;
+ THolder<THuffmanCache> Cache;
- public:
- TImpl()
- : SubTablesNum(1)
- {
- Invalid.CodeLength = 255;
- }
+ public:
+ TImpl()
+ : SubTablesNum(1)
+ {
+ Invalid.CodeLength = 255;
+ }
- ui8 Encode(TStringBuf in, TBuffer& out) const {
- out.Clear();
+ ui8 Encode(TStringBuf in, TBuffer& out) const {
+ out.Clear();
if (in.empty()) {
- return 0;
- }
+ return 0;
+ }
out.Reserve(in.size() * 2);
- {
- NBitIO::TBitOutputVector<TBuffer> bout(&out);
- TStringBuf tin = in;
+ {
+ NBitIO::TBitOutputVector<TBuffer> bout(&out);
+ TStringBuf tin = in;
- // data is under compression
- bout.Write(1, 1);
+ // data is under compression
+ bout.Write(1, 1);
- for (auto t : tin) {
- const TEncoderEntry& ce = Encoder.Entries[(ui8)t];
+ for (auto t : tin) {
+ const TEncoderEntry& ce = Encoder.Entries[(ui8)t];
- bout.Write(ce.Code, ce.CodeLength);
+ bout.Write(ce.Code, ce.CodeLength);
- if (ce.Invalid) {
- bout.Write(t, 8);
- }
- }
+ if (ce.Invalid) {
+ bout.Write(t, 8);
+ }
+ }
- // in canonical huffman coding there cannot be a code having no 0 in the suffix
- // and shorter than 8 bits.
- bout.Write((ui64)-1, bout.GetByteReminder());
- return bout.GetByteReminder();
+ // in canonical huffman coding there cannot be a code having no 0 in the suffix
+ // and shorter than 8 bits.
+ bout.Write((ui64)-1, bout.GetByteReminder());
+ return bout.GetByteReminder();
}
}
- void Decode(TStringBuf in, TBuffer& out) const {
- out.Clear();
+ void Decode(TStringBuf in, TBuffer& out) const {
+ out.Clear();
if (in.empty()) {
- return;
- }
+ return;
+ }
- NBitIO::TBitInput bin(in);
- ui64 f = 0;
- bin.ReadK<1>(f);
+ NBitIO::TBitInput bin(in);
+ ui64 f = 0;
+ bin.ReadK<1>(f);
- // if data is uncompressed
- if (!f) {
- in.Skip(1);
+ // if data is uncompressed
+ if (!f) {
+ in.Skip(1);
out.Append(in.data(), in.size());
- } else {
+ } else {
out.Reserve(in.size() * 8);
- if (Cache.Get()) {
- Cache->Decode(bin, out);
- } else {
- while (ReadNextChar(bin, out)) {
- }
+ if (Cache.Get()) {
+ Cache->Decode(bin, out);
+ } else {
+ while (ReadNextChar(bin, out)) {
+ }
}
}
}
- Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const {
- const TDecoderTable* table = Decoder;
- TDecoderEntry e;
+ Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const {
+ const TDecoderTable* table = Decoder;
+ TDecoderEntry e;
- int bitsRead = 0;
- while (true) {
- ui64 code = 0;
+ int bitsRead = 0;
+ while (true) {
+ ui64 code = 0;
- if (Y_UNLIKELY(!bin.Read(code, table->Length)))
- return 0;
- bitsRead += table->Length;
+ if (Y_UNLIKELY(!bin.Read(code, table->Length)))
+ return 0;
+ bitsRead += table->Length;
- if (Y_UNLIKELY(code < table->BaseCode))
- return 0;
+ if (Y_UNLIKELY(code < table->BaseCode))
+ return 0;
- code -= table->BaseCode;
+ code -= table->BaseCode;
- if (Y_UNLIKELY(code > 255))
- return 0;
+ if (Y_UNLIKELY(code > 255))
+ return 0;
- e = table->Entries[code];
+ e = table->Entries[code];
- if (Y_UNLIKELY(e.Bad))
- return 0;
+ if (Y_UNLIKELY(e.Bad))
+ return 0;
- if (e.NextTable) {
- table = Decoder + e.NextTable;
+ if (e.NextTable) {
+ table = Decoder + e.NextTable;
} else {
- if (e.Invalid) {
- code = 0;
- bin.ReadK<8>(code);
- bitsRead += 8;
- out.Append((ui8)code);
- } else {
- out.Append((ui8)e.Char);
- }
-
- return bitsRead;
+ if (e.Invalid) {
+ code = 0;
+ bin.ReadK<8>(code);
+ bitsRead += 8;
+ out.Append((ui8)code);
+ } else {
+ out.Append((ui8)e.Char);
+ }
+
+ return bitsRead;
}
- }
+ }
- Y_ENSURE(false, " could not decode input");
- return 0;
+ Y_ENSURE(false, " could not decode input");
+ return 0;
}
- void GenerateEncoder(TCodeTree& tree) {
- const ui64 sz = tree.size();
+ void GenerateEncoder(TCodeTree& tree) {
+ const ui64 sz = tree.size();
- TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]);
+ TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]);
- for (ui32 i = 1; i < sz; ++i) {
- const TTreeEntry& te = tree[i];
- TEncoderEntry& e = Encoder.Entries[te.Char];
- e = TEncoderEntry(te);
+ for (ui32 i = 1; i < sz; ++i) {
+ const TTreeEntry& te = tree[i];
+ TEncoderEntry& e = Encoder.Entries[te.Char];
+ e = TEncoderEntry(te);
- e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
- lastcode = e;
+ e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
+ lastcode = e;
- e.Code = ReverseBits(e.Code, e.CodeLength);
+ e.Code = ReverseBits(e.Code, e.CodeLength);
+
+ if (e.Invalid)
+ Invalid = e;
+ }
- if (e.Invalid)
- Invalid = e;
- }
+ for (auto& e : Encoder.Entries) {
+ if (e.Invalid)
+ e = Invalid;
- for (auto& e : Encoder.Entries) {
- if (e.Invalid)
- e = Invalid;
-
- Y_ENSURE(e.CodeLength, " ");
- }
+ Y_ENSURE(e.CodeLength, " ");
+ }
}
- void RegenerateEncoder() {
- for (auto& entrie : Encoder.Entries) {
- if (entrie.Invalid)
- entrie.CodeLength = Invalid.CodeLength;
- }
+ void RegenerateEncoder() {
+ for (auto& entrie : Encoder.Entries) {
+ if (entrie.Invalid)
+ entrie.CodeLength = Invalid.CodeLength;
+ }
- Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>());
+ Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>());
- TEncoderEntry lastcode = Encoder.Entries[0];
+ TEncoderEntry lastcode = Encoder.Entries[0];
- for (ui32 i = 1; i < 256; ++i) {
- TEncoderEntry& e = Encoder.Entries[i];
- e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
- lastcode = e;
+ for (ui32 i = 1; i < 256; ++i) {
+ TEncoderEntry& e = Encoder.Entries[i];
+ e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
+ lastcode = e;
- e.Code = ReverseBits(e.Code, e.CodeLength);
- }
+ e.Code = ReverseBits(e.Code, e.CodeLength);
+ }
- for (auto& entrie : Encoder.Entries) {
- if (entrie.Invalid) {
- Invalid = entrie;
- break;
- }
- }
+ for (auto& entrie : Encoder.Entries) {
+ if (entrie.Invalid) {
+ Invalid = entrie;
+ break;
+ }
+ }
- Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>());
+ Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>());
- for (auto& entrie : Encoder.Entries) {
- if (entrie.Invalid)
- entrie = Invalid;
+ for (auto& entrie : Encoder.Entries) {
+ if (entrie.Invalid)
+ entrie = Invalid;
}
}
- void BuildDecoder() {
- TEncoderTable enc = Encoder;
- Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>());
-
- TEncoderEntry& e1 = enc.Entries[0];
- Decoder[0].BaseCode = e1.Code;
- Decoder[0].Length = e1.CodeLength;
-
- for (auto e2 : enc.Entries) {
- SetEntry(Decoder, e2.Code, e2.CodeLength, e2);
- }
- Cache.Reset(new THuffmanCache(*this));
+ void BuildDecoder() {
+ TEncoderTable enc = Encoder;
+ Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>());
+
+ TEncoderEntry& e1 = enc.Entries[0];
+ Decoder[0].BaseCode = e1.Code;
+ Decoder[0].Length = e1.CodeLength;
+
+ for (auto e2 : enc.Entries) {
+ SetEntry(Decoder, e2.Code, e2.CodeLength, e2);
+ }
+ Cache.Reset(new THuffmanCache(*this));
}
- void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) {
- Y_ENSURE(len >= t->Length, len << " < " << t->Length);
+ void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) {
+ Y_ENSURE(len >= t->Length, len << " < " << t->Length);
- ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode;
- TDecoderEntry& d = t->Entries[idx];
+ ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode;
+ TDecoderEntry& d = t->Entries[idx];
- if (len == t->Length) {
- Y_ENSURE(!d.NextTable, " ");
+ if (len == t->Length) {
+ Y_ENSURE(!d.NextTable, " ");
- d.Char = e.Char;
- d.Invalid = e.Invalid;
- return;
- }
+ d.Char = e.Char;
+ d.Invalid = e.Invalid;
+ return;
+ }
- if (!d.NextTable) {
- Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " ");
- d.NextTable = SubTablesNum++;
- TDecoderTable* nt = Decoder + d.NextTable;
- nt->Length = Min<ui64>(8, len - t->Length);
- nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length);
- }
+ if (!d.NextTable) {
+ Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " ");
+ d.NextTable = SubTablesNum++;
+ TDecoderTable* nt = Decoder + d.NextTable;
+ nt->Length = Min<ui64>(8, len - t->Length);
+ nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length);
+ }
- SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e);
+ SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e);
}
- void Learn(ISequenceReader* in) {
- {
- TCodeTree tree;
- InitTree(tree, in);
- CalculateCodeLengths(tree);
- Y_ENSURE(!tree.empty(), " ");
- GenerateEncoder(tree);
- }
- BuildDecoder();
+ void Learn(ISequenceReader* in) {
+ {
+ TCodeTree tree;
+ InitTree(tree, in);
+ CalculateCodeLengths(tree);
+ Y_ENSURE(!tree.empty(), " ");
+ GenerateEncoder(tree);
+ }
+ BuildDecoder();
}
void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {
TCodeTree tree;
- ui64 freqsArray[256];
- Zero(freqsArray);
+ ui64 freqsArray[256];
+ Zero(freqsArray);
- for (const auto& freq : freqs)
- freqsArray[static_cast<ui8>(freq.first)] += freq.second;
+ for (const auto& freq : freqs)
+ freqsArray[static_cast<ui8>(freq.first)] += freq.second;
- InitTreeByFreqs(tree, freqsArray);
- CalculateCodeLengths(tree);
+ InitTreeByFreqs(tree, freqsArray);
+ CalculateCodeLengths(tree);
- Y_ENSURE(!tree.empty(), " ");
+ Y_ENSURE(!tree.empty(), " ");
- GenerateEncoder(tree);
- BuildDecoder();
- }
+ GenerateEncoder(tree);
+ BuildDecoder();
+ }
- void Save(IOutputStream* out) {
- ::Save(out, Invalid.CodeLength);
- Encoder.Save(out);
- }
+ void Save(IOutputStream* out) {
+ ::Save(out, Invalid.CodeLength);
+ Encoder.Save(out);
+ }
- void Load(IInputStream* in) {
- ::Load(in, Invalid.CodeLength);
- Encoder.Load(in);
- RegenerateEncoder();
- BuildDecoder();
- }
- };
+ void Load(IInputStream* in) {
+ ::Load(in, Invalid.CodeLength);
+ Encoder.Load(in);
+ RegenerateEncoder();
+ BuildDecoder();
+ }
+ };
- THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec)
- : Original(codec)
- {
- CacheEntries.resize(1 << CACHE_BITS_COUNT);
+ THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec)
+ : Original(codec)
+ {
+ CacheEntries.resize(1 << CACHE_BITS_COUNT);
DecodeCache.reserve(CacheEntries.size() * 2);
- char buffer[2];
- TBuffer decoded;
+ char buffer[2];
+ TBuffer decoded;
for (size_t i = 0; i < CacheEntries.size(); i++) {
- buffer[1] = i >> 8;
- buffer[0] = i;
- NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer));
- int totalBits = 0;
- while (true) {
- decoded.Resize(0);
- int bits = codec.ReadNextChar(bin, decoded);
- if (totalBits + bits > 16 || !bits) {
- TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits};
- CacheEntries[i] = e;
- break;
- }
-
- for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) {
- DecodeCache.push_back(*it);
- }
- totalBits += bits;
+ buffer[1] = i >> 8;
+ buffer[0] = i;
+ NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer));
+ int totalBits = 0;
+ while (true) {
+ decoded.Resize(0);
+ int bits = codec.ReadNextChar(bin, decoded);
+ if (totalBits + bits > 16 || !bits) {
+ TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits};
+ CacheEntries[i] = e;
+ break;
+ }
+
+ for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) {
+ DecodeCache.push_back(*it);
+ }
+ totalBits += bits;
}
}
- DecodeCache.push_back(0);
- CacheEntries.shrink_to_fit();
- DecodeCache.shrink_to_fit();
+ DecodeCache.push_back(0);
+ CacheEntries.shrink_to_fit();
+ DecodeCache.shrink_to_fit();
}
- void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const {
- int bits = 0;
- ui64 code = 0;
- while (!bin.Eof()) {
- ui64 f = 0;
- const int toRead = 16 - bits;
- if (toRead > 0 && bin.Read(f, toRead)) {
- code = (code >> (16 - bits)) | (f << bits);
- code &= 0xFFFF;
- TCacheEntry entry = CacheEntries[code];
- int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0;
- out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]);
- bits = entry.BitsLeft;
- } else { // should never happen until there are exceptions or unaligned input
- bin.Back(bits);
- if (!Original.ReadNextChar(bin, out))
- break;
-
- code = 0;
- bits = 0;
- }
+ void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const {
+ int bits = 0;
+ ui64 code = 0;
+ while (!bin.Eof()) {
+ ui64 f = 0;
+ const int toRead = 16 - bits;
+ if (toRead > 0 && bin.Read(f, toRead)) {
+ code = (code >> (16 - bits)) | (f << bits);
+ code &= 0xFFFF;
+ TCacheEntry entry = CacheEntries[code];
+ int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0;
+ out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]);
+ bits = entry.BitsLeft;
+ } else { // should never happen until there are exceptions or unaligned input
+ bin.Back(bits);
+ if (!Original.ReadNextChar(bin, out))
+ break;
+
+ code = 0;
+ bits = 0;
+ }
}
}
- THuffmanCodec::THuffmanCodec()
- : Impl(new TImpl)
- {
- MyTraits.NeedsTraining = true;
- MyTraits.PreservesPrefixGrouping = true;
- MyTraits.PaddingBit = 1;
- MyTraits.SizeOnEncodeMultiplier = 2;
- MyTraits.SizeOnDecodeMultiplier = 8;
- MyTraits.RecommendedSampleSize = 1 << 21;
- }
+ THuffmanCodec::THuffmanCodec()
+ : Impl(new TImpl)
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.PreservesPrefixGrouping = true;
+ MyTraits.PaddingBit = 1;
+ MyTraits.SizeOnEncodeMultiplier = 2;
+ MyTraits.SizeOnDecodeMultiplier = 8;
+ MyTraits.RecommendedSampleSize = 1 << 21;
+ }
- THuffmanCodec::~THuffmanCodec() = default;
+ THuffmanCodec::~THuffmanCodec() = default;
- ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const {
- if (Y_UNLIKELY(!Trained))
- ythrow TCodecException() << " not trained";
+ ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const {
+ if (Y_UNLIKELY(!Trained))
+ ythrow TCodecException() << " not trained";
- return Impl->Encode(in, bbb);
- }
+ return Impl->Encode(in, bbb);
+ }
- void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const {
- Impl->Decode(in, bbb);
- }
+ void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const {
+ Impl->Decode(in, bbb);
+ }
- void THuffmanCodec::Save(IOutputStream* out) const {
- Impl->Save(out);
- }
+ void THuffmanCodec::Save(IOutputStream* out) const {
+ Impl->Save(out);
+ }
- void THuffmanCodec::Load(IInputStream* in) {
- Impl->Load(in);
- }
+ void THuffmanCodec::Load(IInputStream* in) {
+ Impl->Load(in);
+ }
- void THuffmanCodec::DoLearn(ISequenceReader& in) {
- Impl->Learn(&in);
- }
+ void THuffmanCodec::DoLearn(ISequenceReader& in) {
+ Impl->Learn(&in);
+ }
void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {
- Impl->LearnByFreqs(freqs);
- Trained = true;
- }
+ Impl->LearnByFreqs(freqs);
+ Trained = true;
+ }
}
diff --git a/library/cpp/codecs/huffman_codec.h b/library/cpp/codecs/huffman_codec.h
index 24f8397694..559545b90d 100644
--- a/library/cpp/codecs/huffman_codec.h
+++ b/library/cpp/codecs/huffman_codec.h
@@ -6,34 +6,34 @@
#include <util/string/cast.h>
namespace NCodecs {
- // for types greater than char, pipeline with TFreqCodec.
+ // for types greater than char, pipeline with TFreqCodec.
- class THuffmanCodec: public ICodec {
- class TImpl;
- TIntrusivePtr<TImpl> Impl;
+ class THuffmanCodec: public ICodec {
+ class TImpl;
+ TIntrusivePtr<TImpl> Impl;
- public:
- THuffmanCodec();
- ~THuffmanCodec() override;
+ public:
+ THuffmanCodec();
+ ~THuffmanCodec() override;
- static TStringBuf MyName() {
- return "huffman";
- }
+ static TStringBuf MyName() {
+ return "huffman";
+ }
- TString GetName() const override {
+ TString GetName() const override {
return ToString(MyName());
- }
+ }
- ui8 Encode(TStringBuf in, TBuffer& bbb) const override;
+ ui8 Encode(TStringBuf in, TBuffer& bbb) const override;
- void Decode(TStringBuf in, TBuffer& bbb) const override;
+ void Decode(TStringBuf in, TBuffer& bbb) const override;
void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs);
- protected:
- void DoLearn(ISequenceReader& in) override;
- void Save(IOutputStream* out) const override;
- void Load(IInputStream* in) override;
- };
+ protected:
+ void DoLearn(ISequenceReader& in) override;
+ void Save(IOutputStream* out) const override;
+ void Load(IInputStream* in) override;
+ };
}
diff --git a/library/cpp/codecs/pfor_codec.cpp b/library/cpp/codecs/pfor_codec.cpp
index d5dbc5a7fa..f6b3b0920b 100644
--- a/library/cpp/codecs/pfor_codec.cpp
+++ b/library/cpp/codecs/pfor_codec.cpp
@@ -1,22 +1,22 @@
#include "pfor_codec.h"
namespace NCodecs {
- template <>
- TStringBuf TPForCodec<ui64, true>::MyName() {
- return "pfor-delta64-sorted";
- }
- template <>
- TStringBuf TPForCodec<ui32, true>::MyName() {
- return "pfor-delta32-sorted";
- }
+ template <>
+ TStringBuf TPForCodec<ui64, true>::MyName() {
+ return "pfor-delta64-sorted";
+ }
+ template <>
+ TStringBuf TPForCodec<ui32, true>::MyName() {
+ return "pfor-delta32-sorted";
+ }
- template <>
- TStringBuf TPForCodec<ui64, false>::MyName() {
- return "pfor-ui64";
- }
- template <>
- TStringBuf TPForCodec<ui32, false>::MyName() {
- return "pfor-ui32";
- }
+ template <>
+ TStringBuf TPForCodec<ui64, false>::MyName() {
+ return "pfor-ui64";
+ }
+ template <>
+ TStringBuf TPForCodec<ui32, false>::MyName() {
+ return "pfor-ui32";
+ }
}
diff --git a/library/cpp/codecs/pfor_codec.h b/library/cpp/codecs/pfor_codec.h
index a1f2bf9f9a..d7d4bb8bf4 100644
--- a/library/cpp/codecs/pfor_codec.h
+++ b/library/cpp/codecs/pfor_codec.h
@@ -10,202 +10,202 @@
#include <util/string/cast.h>
namespace NCodecs {
- template <typename T, bool WithDelta = false>
- class TPForCodec: public ICodec {
- using TUnsigned = std::make_unsigned_t<T>;
- typedef TDeltaCodec<TUnsigned> TDCodec;
+ template <typename T, bool WithDelta = false>
+ class TPForCodec: public ICodec {
+ using TUnsigned = std::make_unsigned_t<T>;
+ typedef TDeltaCodec<TUnsigned> TDCodec;
- typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue;
- static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value");
+ typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue;
+ static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value");
- static const ui64 BitsInT = sizeof(TUnsigned) * 8;
+ static const ui64 BitsInT = sizeof(TUnsigned) * 8;
- TDCodec DeltaCodec;
+ TDCodec DeltaCodec;
- public:
- static TStringBuf MyName();
+ public:
+ static TStringBuf MyName();
- TPForCodec() {
- MyTraits.AssumesStructuredInput = true;
- MyTraits.SizeOfInputElement = sizeof(T);
- MyTraits.SizeOnDecodeMultiplier = sizeof(T);
- }
+ TPForCodec() {
+ MyTraits.AssumesStructuredInput = true;
+ MyTraits.SizeOfInputElement = sizeof(T);
+ MyTraits.SizeOnDecodeMultiplier = sizeof(T);
+ }
- TString GetName() const override {
+ TString GetName() const override {
return ToString(MyName());
- }
+ }
- ui8 Encode(TStringBuf s, TBuffer& b) const override {
- b.Clear();
+ ui8 Encode(TStringBuf s, TBuffer& b) const override {
+ b.Clear();
if (s.empty()) {
- return 0;
- }
+ return 0;
+ }
b.Reserve(2 * s.size() + b.Size());
- if (WithDelta) {
- auto buffer = TBufferTlsCache::TlsInstance().Item();
- TBuffer& db = buffer.Get();
- db.Clear();
+ if (WithDelta) {
+ auto buffer = TBufferTlsCache::TlsInstance().Item();
+ TBuffer& db = buffer.Get();
+ db.Clear();
db.Reserve(2 * s.size());
- DeltaCodec.Encode(s, db);
+ DeltaCodec.Encode(s, db);
s = TStringBuf{db.data(), db.size()};
- }
+ }
TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)};
const ui64 sz = tin.size();
- ui64 bitcounts[BitsInT + 1];
- Zero(bitcounts);
+ ui64 bitcounts[BitsInT + 1];
+ Zero(bitcounts);
- ui32 zeros = 0;
+ ui32 zeros = 0;
for (const TValue* it = tin.begin(); it != tin.end(); ++it) {
- TUnsigned v = 1 + (TUnsigned)*it;
- ui64 l = MostSignificantBit(v) + 1;
- ++bitcounts[l];
-
- if (!v) {
- ++zeros;
- }
- }
-
- // cumulative bit counts
- for (ui64 i = 0; i < BitsInT; ++i) {
- bitcounts[i + 1] += bitcounts[i];
+ TUnsigned v = 1 + (TUnsigned)*it;
+ ui64 l = MostSignificantBit(v) + 1;
+ ++bitcounts[l];
+
+ if (!v) {
+ ++zeros;
+ }
+ }
+
+ // cumulative bit counts
+ for (ui64 i = 0; i < BitsInT; ++i) {
+ bitcounts[i + 1] += bitcounts[i];
}
- bool hasexceptions = zeros;
- ui64 optimalbits = BitsInT;
+ bool hasexceptions = zeros;
+ ui64 optimalbits = BitsInT;
- {
- ui64 excsize = 0;
- ui64 minsize = sz * BitsInT;
+ {
+ ui64 excsize = 0;
+ ui64 minsize = sz * BitsInT;
- for (ui64 current = BitsInT; current; --current) {
- ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6);
+ for (ui64 current = BitsInT; current; --current) {
+ ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6);
- excsize += current * bitcounts[current];
+ excsize += current * bitcounts[current];
- if (size < minsize) {
- minsize = size;
- optimalbits = current;
- hasexceptions = zeros || sz - bitcounts[current];
- }
+ if (size < minsize) {
+ minsize = size;
+ optimalbits = current;
+ hasexceptions = zeros || sz - bitcounts[current];
+ }
}
}
- if (!optimalbits || BitsInT == optimalbits) {
- b.Append((ui8)-1);
+ if (!optimalbits || BitsInT == optimalbits) {
+ b.Append((ui8)-1);
b.Append(s.data(), s.size());
- return 0;
- } else {
- NBitIO::TBitOutputVector<TBuffer> bout(&b);
- bout.Write(0, 1);
- bout.Write(hasexceptions, 1);
- bout.Write(optimalbits, 6);
+ return 0;
+ } else {
+ NBitIO::TBitOutputVector<TBuffer> bout(&b);
+ bout.Write(0, 1);
+ bout.Write(hasexceptions, 1);
+ bout.Write(optimalbits, 6);
for (const TValue* it = tin.begin(); it != tin.end(); ++it) {
- TUnsigned word = 1 + (TUnsigned)*it;
- ui64 len = MostSignificantBit(word) + 1;
- if (len > optimalbits || !word) {
- Y_ENSURE(hasexceptions, " ");
- bout.Write(0, optimalbits);
- bout.Write(len, 6);
- bout.Write(word, len);
- } else {
- bout.Write(word, optimalbits);
- }
+ TUnsigned word = 1 + (TUnsigned)*it;
+ ui64 len = MostSignificantBit(word) + 1;
+ if (len > optimalbits || !word) {
+ Y_ENSURE(hasexceptions, " ");
+ bout.Write(0, optimalbits);
+ bout.Write(len, 6);
+ bout.Write(word, len);
+ } else {
+ bout.Write(word, optimalbits);
+ }
}
- return bout.GetByteReminder();
- } // the rest of the last byte is zero padded. BitsInT is always > 7.
+ return bout.GetByteReminder();
+ } // the rest of the last byte is zero padded. BitsInT is always > 7.
}
- void Decode(TStringBuf s, TBuffer& b) const override {
- b.Clear();
+ void Decode(TStringBuf s, TBuffer& b) const override {
+ b.Clear();
if (s.empty()) {
- return;
- }
+ return;
+ }
b.Reserve(s.size() * sizeof(T) + b.Size());
- ui64 isplain = 0;
- ui64 hasexceptions = 0;
- ui64 bits = 0;
-
- NBitIO::TBitInput bin(s);
- bin.ReadK<1>(isplain);
- bin.ReadK<1>(hasexceptions);
- bin.ReadK<6>(bits);
-
- if (Y_UNLIKELY(isplain)) {
- s.Skip(1);
-
- if (WithDelta) {
- DeltaCodec.Decode(s, b);
- } else {
+ ui64 isplain = 0;
+ ui64 hasexceptions = 0;
+ ui64 bits = 0;
+
+ NBitIO::TBitInput bin(s);
+ bin.ReadK<1>(isplain);
+ bin.ReadK<1>(hasexceptions);
+ bin.ReadK<6>(bits);
+
+ if (Y_UNLIKELY(isplain)) {
+ s.Skip(1);
+
+ if (WithDelta) {
+ DeltaCodec.Decode(s, b);
+ } else {
b.Append(s.data(), s.size());
- }
+ }
} else {
- typename TDCodec::TDecoder decoder;
+ typename TDCodec::TDecoder decoder;
- if (hasexceptions) {
- ui64 word = 0;
- while (bin.Read(word, bits)) {
- if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) {
- --word;
+ if (hasexceptions) {
+ ui64 word = 0;
+ while (bin.Read(word, bits)) {
+ if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) {
+ --word;
- TValue t = word;
+ TValue t = word;
- if (WithDelta) {
- if (decoder.Decode(t)) {
- TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)};
+ if (WithDelta) {
+ if (decoder.Decode(t)) {
+ TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)};
b.Append(r.data(), r.size());
- }
- } else {
- TStringBuf r{(char*)&t, sizeof(t)};
+ }
+ } else {
+ TStringBuf r{(char*)&t, sizeof(t)};
b.Append(r.data(), r.size());
}
}
}
- } else {
- ui64 word = 0;
- T outarr[256 / sizeof(T)];
- ui32 cnt = 0;
- while (true) {
- ui64 v = bin.Read(word, bits);
-
- if ((!v) | (!word))
- break;
-
- --word;
- TValue t = word;
-
- if (WithDelta) {
- if (decoder.Decode(t)) {
- outarr[cnt++] = decoder.Result;
- }
- } else {
- outarr[cnt++] = t;
+ } else {
+ ui64 word = 0;
+ T outarr[256 / sizeof(T)];
+ ui32 cnt = 0;
+ while (true) {
+ ui64 v = bin.Read(word, bits);
+
+ if ((!v) | (!word))
+ break;
+
+ --word;
+ TValue t = word;
+
+ if (WithDelta) {
+ if (decoder.Decode(t)) {
+ outarr[cnt++] = decoder.Result;
+ }
+ } else {
+ outarr[cnt++] = t;
+ }
+
+ if (cnt == Y_ARRAY_SIZE(outarr)) {
+ b.Append((const char*)outarr, sizeof(outarr));
+ cnt = 0;
}
-
- if (cnt == Y_ARRAY_SIZE(outarr)) {
- b.Append((const char*)outarr, sizeof(outarr));
- cnt = 0;
- }
}
- if (cnt) {
- b.Append((const char*)outarr, cnt * sizeof(T));
+ if (cnt) {
+ b.Append((const char*)outarr, cnt * sizeof(T));
}
}
}
}
- protected:
- void DoLearn(ISequenceReader&) override {
- }
- };
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
}
diff --git a/library/cpp/codecs/sample.h b/library/cpp/codecs/sample.h
index 5d3ab57f78..15f03afcc5 100644
--- a/library/cpp/codecs/sample.h
+++ b/library/cpp/codecs/sample.h
@@ -24,20 +24,20 @@ namespace NCodecs {
}
template <class TIter>
- TStringBuf IterToStringBuf(TIter iter) {
+ TStringBuf IterToStringBuf(TIter iter) {
return ValueToStringBuf(*iter);
}
template <class TItem>
- class TSimpleSequenceReader: public ISequenceReader {
+ class TSimpleSequenceReader: public ISequenceReader {
const TVector<TItem>& Items;
size_t Idx = 0;
public:
TSimpleSequenceReader(const TVector<TItem>& items)
: Items(items)
- {
- }
+ {
+ }
bool NextRegion(TStringBuf& s) override {
if (Idx >= Items.size()) {
diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp
index 6c08b9e7bd..d0692fe2a4 100644
--- a/library/cpp/codecs/solar_codec.cpp
+++ b/library/cpp/codecs/solar_codec.cpp
@@ -9,125 +9,125 @@
#include <util/ysaveload.h>
namespace NCodecs {
- static inline ui32 Append(TBuffer& pool, TStringBuf data) {
+ static inline ui32 Append(TBuffer& pool, TStringBuf data) {
pool.Append(data.data(), data.size());
- return pool.Size();
- }
+ return pool.Size();
+ }
+
+ void TSolarCodec::DoLearn(ISequenceReader& r) {
+ using namespace NGreedyDict;
- void TSolarCodec::DoLearn(ISequenceReader& r) {
- using namespace NGreedyDict;
+ Decoder.clear();
+ Pool.Clear();
- Decoder.clear();
- Pool.Clear();
+ THolder<TEntrySet> set;
- THolder<TEntrySet> set;
+ {
+ TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance());
+ TStringBufs bufs;
- {
- TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance());
- TStringBufs bufs;
+ TStringBuf m;
+ while (r.NextRegion(m)) {
+ bufs.push_back(pool.AppendString(m));
+ }
- TStringBuf m;
- while (r.NextRegion(m)) {
- bufs.push_back(pool.AppendString(m));
- }
+ {
+ TDictBuilder b(Settings);
+ b.SetInput(bufs);
+ b.Build(MaxEntries, MaxIterations);
- {
- TDictBuilder b(Settings);
- b.SetInput(bufs);
- b.Build(MaxEntries, MaxIterations);
-
- set = b.ReleaseEntrySet();
- }
+ set = b.ReleaseEntrySet();
+ }
}
- set->SetScores(ES_LEN_COUNT);
-
+ set->SetScores(ES_LEN_COUNT);
+
{
- TVector<std::pair<float, TStringBuf>> tmp;
- tmp.reserve(set->size());
+ TVector<std::pair<float, TStringBuf>> tmp;
+ tmp.reserve(set->size());
- for (const auto& it : *set) {
- tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));
- }
+ for (const auto& it : *set) {
+ tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));
+ }
- Sort(tmp.begin(), tmp.end());
+ Sort(tmp.begin(), tmp.end());
- Decoder.reserve(tmp.size() + 1);
- Decoder.push_back(0);
+ Decoder.reserve(tmp.size() + 1);
+ Decoder.push_back(0);
- for (const auto& it : tmp) {
- Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed");
- ui32 endoff = Append(Pool, it.second);
- Decoder.push_back(endoff);
- }
+ for (const auto& it : tmp) {
+ Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed");
+ ui32 endoff = Append(Pool, it.second);
+ Decoder.push_back(endoff);
+ }
}
- Pool.ShrinkToFit();
- Decoder.shrink_to_fit();
+ Pool.ShrinkToFit();
+ Decoder.shrink_to_fit();
- TBufferOutput bout;
+ TBufferOutput bout;
- {
- TVector<std::pair<TStringBuf, ui32>> tmp2;
- tmp2.reserve(Decoder.size());
+ {
+ TVector<std::pair<TStringBuf, ui32>> tmp2;
+ tmp2.reserve(Decoder.size());
- for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) {
- TStringBuf s = DoDecode(i);
- tmp2.push_back(std::make_pair(s, i - 1));
+ for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) {
+ TStringBuf s = DoDecode(i);
+ tmp2.push_back(std::make_pair(s, i - 1));
Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed");
- }
+ }
- Sort(tmp2.begin(), tmp2.end());
+ Sort(tmp2.begin(), tmp2.end());
- {
- TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED);
- for (const auto& it : tmp2) {
+ {
+ TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED);
+ for (const auto& it : tmp2) {
builder.Add(it.first.data(), it.first.size(), it.second);
- }
+ }
- builder.Save(bout);
+ builder.Save(bout);
}
}
- Encoder.Init(TBlob::FromBuffer(bout.Buffer()));
+ Encoder.Init(TBlob::FromBuffer(bout.Buffer()));
+ }
+
+ void TSolarCodec::Save(IOutputStream* out) const {
+ TBlob b = Encoder.Data();
+ ::Save(out, (ui32)b.Size());
+ out->Write(b.Data(), b.Size());
}
- void TSolarCodec::Save(IOutputStream* out) const {
- TBlob b = Encoder.Data();
- ::Save(out, (ui32)b.Size());
- out->Write(b.Data(), b.Size());
- }
-
- void TSolarCodec::Load(IInputStream* in) {
- ui32 sz;
- ::Load(in, sz);
- TLengthLimitedInput lin(in, sz);
- Encoder.Init(TBlob::FromStream(lin));
- Pool.Clear();
- Decoder.clear();
-
- TVector<std::pair<ui32, TString>> tmp;
-
- ui32 poolsz = 0;
- for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) {
- const TString& s = it.GetKey();
- tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s));
+ void TSolarCodec::Load(IInputStream* in) {
+ ui32 sz;
+ ::Load(in, sz);
+ TLengthLimitedInput lin(in, sz);
+ Encoder.Init(TBlob::FromStream(lin));
+ Pool.Clear();
+ Decoder.clear();
+
+ TVector<std::pair<ui32, TString>> tmp;
+
+ ui32 poolsz = 0;
+ for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) {
+ const TString& s = it.GetKey();
+ tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s));
poolsz += Max<ui32>(s.size(), 1);
- }
+ }
- Sort(tmp.begin(), tmp.end());
+ Sort(tmp.begin(), tmp.end());
- Pool.Reserve(poolsz);
- Decoder.reserve(tmp.size() + 1);
- Decoder.push_back(0);
+ Pool.Reserve(poolsz);
+ Decoder.reserve(tmp.size() + 1);
+ Decoder.push_back(0);
- for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) {
- Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first);
- Decoder.push_back(Append(Pool, tmp[i].second));
- }
+ for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) {
+ Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first);
+ Decoder.push_back(Append(Pool, tmp[i].second));
+ }
- Pool.ShrinkToFit();
- Decoder.shrink_to_fit();
+ Pool.ShrinkToFit();
+ Decoder.shrink_to_fit();
}
}
diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h
index 08fdf9d123..7158ae7926 100644
--- a/library/cpp/codecs/solar_codec.h
+++ b/library/cpp/codecs/solar_codec.h
@@ -11,234 +11,234 @@ namespace NCodecs {
// TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы.
// TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе.
- struct TVarIntTraits {
- static const size_t MAX_VARINT32_BYTES = 5;
-
- static void Write(ui32 value, TBuffer& b) {
- while (value > 0x7F) {
- b.Append(static_cast<ui8>(value) | 0x80);
- value >>= 7;
- }
- b.Append(static_cast<ui8>(value) & 0x7F);
- }
-
- static void Read(TStringBuf& r, ui32& value) {
- ui32 result = 0;
- for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) {
- const ui32 b = static_cast<ui8>(r[0]);
- r.Skip(1);
- result |= static_cast<ui32>(b & 0x7F) << (7 * count);
- if (!(b & 0x80)) {
- value = result;
- return;
+ struct TVarIntTraits {
+ static const size_t MAX_VARINT32_BYTES = 5;
+
+ static void Write(ui32 value, TBuffer& b) {
+ while (value > 0x7F) {
+ b.Append(static_cast<ui8>(value) | 0x80);
+ value >>= 7;
+ }
+ b.Append(static_cast<ui8>(value) & 0x7F);
+ }
+
+ static void Read(TStringBuf& r, ui32& value) {
+ ui32 result = 0;
+ for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) {
+ const ui32 b = static_cast<ui8>(r[0]);
+ r.Skip(1);
+ result |= static_cast<ui32>(b & 0x7F) << (7 * count);
+ if (!(b & 0x80)) {
+ value = result;
+ return;
} else if (Y_UNLIKELY(r.empty())) {
- break;
- }
+ break;
+ }
}
- Y_ENSURE_EX(false, TCodecException() << "Bad data");
+ Y_ENSURE_EX(false, TCodecException() << "Bad data");
}
- };
+ };
- struct TShortIntTraits {
- static const size_t SHORTINT_SIZE_LIMIT = 0x8000;
+ struct TShortIntTraits {
+ static const size_t SHORTINT_SIZE_LIMIT = 0x8000;
- Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) {
- Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method");
- if (value >= 0x80) {
- b.Append(static_cast<ui8>(value >> 8) | 0x80);
- }
- b.Append(static_cast<ui8>(value));
+ Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) {
+ Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method");
+ if (value >= 0x80) {
+ b.Append(static_cast<ui8>(value >> 8) | 0x80);
+ }
+ b.Append(static_cast<ui8>(value));
}
- Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {
- ui32 result = static_cast<ui8>(r[0]);
+ Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {
+ ui32 result = static_cast<ui8>(r[0]);
r.Skip(1);
- if (result >= 0x80) {
+ if (result >= 0x80) {
Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");
- result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]);
- r.Skip(1);
- }
- value = result;
+ result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]);
+ r.Skip(1);
+ }
+ value = result;
}
- };
+ };
- class TSolarCodec: public ICodec {
- public:
- static TStringBuf MyName8k() {
+ class TSolarCodec: public ICodec {
+ public:
+ static TStringBuf MyName8k() {
return TStringBuf("solar-8k");
- }
- static TStringBuf MyName16k() {
+ }
+ static TStringBuf MyName16k() {
return TStringBuf("solar-16k");
- }
- static TStringBuf MyName32k() {
+ }
+ static TStringBuf MyName32k() {
return TStringBuf("solar-32k");
- }
- static TStringBuf MyName64k() {
+ }
+ static TStringBuf MyName64k() {
return TStringBuf("solar-64k");
- }
- static TStringBuf MyName256k() {
+ }
+ static TStringBuf MyName256k() {
return TStringBuf("solar-256k");
- }
- static TStringBuf MyName() {
+ }
+ static TStringBuf MyName() {
return TStringBuf("solar");
- }
- static TStringBuf MyName8kAdapt() {
+ }
+ static TStringBuf MyName8kAdapt() {
return TStringBuf("solar-8k-a");
- }
- static TStringBuf MyName16kAdapt() {
+ }
+ static TStringBuf MyName16kAdapt() {
return TStringBuf("solar-16k-a");
- }
- static TStringBuf MyName32kAdapt() {
+ }
+ static TStringBuf MyName32kAdapt() {
return TStringBuf("solar-32k-a");
- }
- static TStringBuf MyName64kAdapt() {
+ }
+ static TStringBuf MyName64kAdapt() {
return TStringBuf("solar-64k-a");
- }
- static TStringBuf MyName256kAdapt() {
+ }
+ static TStringBuf MyName256kAdapt() {
return TStringBuf("solar-256k-a");
- }
- static TStringBuf MyNameShortInt() {
+ }
+ static TStringBuf MyNameShortInt() {
return TStringBuf("solar-si");
- }
-
- explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
- : Settings(s)
- , MaxEntries(maxentries)
- , MaxIterations(maxiter)
- {
- MyTraits.NeedsTraining = true;
- MyTraits.SizeOnDecodeMultiplier = 2;
- MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;
- }
-
- ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
- EncodeImpl<TVarIntTraits>(r, b);
- return 0;
- }
-
- void Decode(TStringBuf r, TBuffer& b) const override {
- DecodeImpl<TVarIntTraits>(r, b);
- }
-
- TString GetName() const override {
+ }
+
+ explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+ : Settings(s)
+ , MaxEntries(maxentries)
+ , MaxIterations(maxiter)
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.SizeOnDecodeMultiplier = 2;
+ MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;
+ }
+
+ ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+ EncodeImpl<TVarIntTraits>(r, b);
+ return 0;
+ }
+
+ void Decode(TStringBuf r, TBuffer& b) const override {
+ DecodeImpl<TVarIntTraits>(r, b);
+ }
+
+ TString GetName() const override {
return ToString(MyName());
- }
+ }
- protected:
- void DoLearn(ISequenceReader&) override;
- void Save(IOutputStream*) const override;
- void Load(IInputStream*) override;
+ protected:
+ void DoLearn(ISequenceReader&) override;
+ void Save(IOutputStream*) const override;
+ void Load(IInputStream*) override;
- Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const {
- return TStringBuf(Pool.Data() + begoff, endoff - begoff);
- }
+ Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const {
+ return TStringBuf(Pool.Data() + begoff, endoff - begoff);
+ }
- Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const {
- return SubStr(Decoder[num - 1], Decoder[num]);
- }
+ Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const {
+ return SubStr(Decoder[num - 1], Decoder[num]);
+ }
- template <class TTraits>
- Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const {
- b.Clear();
+ template <class TTraits>
+ Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const {
+ b.Clear();
b.Reserve(r.size());
while (!r.empty()) {
- size_t sz = 0;
- ui32 val = (ui32)-1;
- Encoder.FindLongestPrefix(r, &sz, &val);
- TTraits::Write(val + 1, b);
- r.Skip(Max<size_t>(sz, 1));
- }
+ size_t sz = 0;
+ ui32 val = (ui32)-1;
+ Encoder.FindLongestPrefix(r, &sz, &val);
+ TTraits::Write(val + 1, b);
+ r.Skip(Max<size_t>(sz, 1));
+ }
}
- template <class TTraits>
- Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {
- b.Clear();
+ template <class TTraits>
+ Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {
+ b.Clear();
b.Reserve(r.size());
- ui32 v = 0;
+ ui32 v = 0;
while (!r.empty()) {
- TTraits::Read(r, v);
- TStringBuf s = DoDecode(v);
+ TTraits::Read(r, v);
+ TStringBuf s = DoDecode(v);
b.Append(s.data(), s.size());
- }
- }
-
- inline bool CanUseShortInt() const {
- return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;
- }
-
- private:
- typedef TCompactTrie<char, ui32> TEncoder;
- typedef TVector<ui32> TDecoder;
-
- TBuffer Pool;
- TEncoder Encoder;
- TDecoder Decoder;
-
- NGreedyDict::TBuildSettings Settings;
- ui32 MaxEntries;
- ui32 MaxIterations;
- };
-
- // Uses varints or shortints depending on the decoder size
- class TAdaptiveSolarCodec: public TSolarCodec {
- public:
- explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
- : TSolarCodec(maxentries, maxiter, s)
- {
- }
-
- ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
- if (CanUseShortInt()) {
- EncodeImpl<TShortIntTraits>(r, b);
- } else {
- EncodeImpl<TVarIntTraits>(r, b);
- }
-
- return 0;
- }
-
- void Decode(TStringBuf r, TBuffer& b) const override {
- if (CanUseShortInt()) {
- DecodeImpl<TShortIntTraits>(r, b);
- } else {
- DecodeImpl<TVarIntTraits>(r, b);
- }
- }
-
- TString GetName() const override {
- if (CanUseShortInt()) {
+ }
+ }
+
+ inline bool CanUseShortInt() const {
+ return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;
+ }
+
+ private:
+ typedef TCompactTrie<char, ui32> TEncoder;
+ typedef TVector<ui32> TDecoder;
+
+ TBuffer Pool;
+ TEncoder Encoder;
+ TDecoder Decoder;
+
+ NGreedyDict::TBuildSettings Settings;
+ ui32 MaxEntries;
+ ui32 MaxIterations;
+ };
+
+ // Uses varints or shortints depending on the decoder size
+ class TAdaptiveSolarCodec: public TSolarCodec {
+ public:
+ explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+ : TSolarCodec(maxentries, maxiter, s)
+ {
+ }
+
+ ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+ if (CanUseShortInt()) {
+ EncodeImpl<TShortIntTraits>(r, b);
+ } else {
+ EncodeImpl<TVarIntTraits>(r, b);
+ }
+
+ return 0;
+ }
+
+ void Decode(TStringBuf r, TBuffer& b) const override {
+ if (CanUseShortInt()) {
+ DecodeImpl<TShortIntTraits>(r, b);
+ } else {
+ DecodeImpl<TVarIntTraits>(r, b);
+ }
+ }
+
+ TString GetName() const override {
+ if (CanUseShortInt()) {
return ToString(MyNameShortInt());
- } else {
+ } else {
return ToString(MyName());
- }
+ }
}
- };
+ };
- class TSolarCodecShortInt: public TSolarCodec {
- public:
- explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
- : TSolarCodec(maxentries, maxiter, s)
- {
+ class TSolarCodecShortInt: public TSolarCodec {
+ public:
+ explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
+ : TSolarCodec(maxentries, maxiter, s)
+ {
}
- ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
- EncodeImpl<TShortIntTraits>(r, b);
- return 0;
- }
+ ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
+ EncodeImpl<TShortIntTraits>(r, b);
+ return 0;
+ }
- void Decode(TStringBuf r, TBuffer& b) const override {
- DecodeImpl<TShortIntTraits>(r, b);
- }
+ void Decode(TStringBuf r, TBuffer& b) const override {
+ DecodeImpl<TShortIntTraits>(r, b);
+ }
- TString GetName() const override {
+ TString GetName() const override {
return ToString(MyNameShortInt());
- }
-
- protected:
- void Load(IInputStream* in) override {
- TSolarCodec::Load(in);
- Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data");
- }
- };
+ }
+
+ protected:
+ void Load(IInputStream* in) override {
+ TSolarCodec::Load(in);
+ Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data");
+ }
+ };
}
diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h
index ece4dfa529..d7533be4d5 100644
--- a/library/cpp/codecs/static/builder.h
+++ b/library/cpp/codecs/static/builder.h
@@ -19,7 +19,7 @@ namespace NCodecs {
time_t Timestamp = TInstant::Now().TimeT();
TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision()));
TString TrainingSetComment; // a human comment on the training data
- TString TrainingSetResId; // sandbox resid of the training set
+ TString TrainingSetResId; // sandbox resid of the training set
};
TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&);
diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp
index b0566a8c2e..5b750b717e 100644
--- a/library/cpp/codecs/static/example/example.cpp
+++ b/library/cpp/codecs/static/example/example.cpp
@@ -5,10 +5,10 @@
#include <util/generic/yexception.h>
extern "C" {
-extern const ui8 codec_info_huff_20160707[];
-extern const ui32 codec_info_huff_20160707Size;
-extern const ui8 codec_info_sa_huff_20160707[];
-extern const ui32 codec_info_sa_huff_20160707Size;
+extern const ui8 codec_info_huff_20160707[];
+extern const ui32 codec_info_huff_20160707Size;
+extern const ui8 codec_info_sa_huff_20160707[];
+extern const ui32 codec_info_sa_huff_20160707Size;
};
namespace NStaticCodecExample {
diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h
index 41003fb187..f9b3a7324b 100644
--- a/library/cpp/codecs/static/example/example.h
+++ b/library/cpp/codecs/static/example/example.h
@@ -4,11 +4,11 @@
#include <util/generic/buffer.h>
namespace NStaticCodecExample {
- enum EDictVersion : ui8 {
- DV_NULL = 0,
- DV_HUFF_20160707,
- DV_SA_HUFF_20160707,
- DV_COUNT
+ enum EDictVersion : ui8 {
+ DV_NULL = 0,
+ DV_HUFF_20160707,
+ DV_SA_HUFF_20160707,
+ DV_COUNT
};
void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707);
diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp
index 97ddbd8364..44a07dd73a 100644
--- a/library/cpp/codecs/static/static.cpp
+++ b/library/cpp/codecs/static/static.cpp
@@ -69,8 +69,8 @@ namespace NCodecs {
s << "sample mult: " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl;
s << "orig.compress: " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl;
s << "timestamp: " << ci.GetDebugInfo().GetTimestamp() << " ("
- << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString()
- << ")" << Endl;
+ << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString()
+ << ")" << Endl;
s << "revision: " << ci.GetDebugInfo().GetRevisionInfo() << Endl;
s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl;
s << "training set resId: " << ci.GetDebugInfo().GetTrainingSetResId() << Endl;
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
index 3668a7583a..9c8d568d82 100644
--- a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
+++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
@@ -22,17 +22,17 @@ int main(int argc, char** argv) {
opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");
NCodecs::TStaticCodecInfo codec;
- opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) {
- codecFile = name;
- codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll()));
- codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
- })
- .Required()
- .Help(".codec_info file with serialized static data for codec");
+ opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) {
+ codecFile = name;
+ codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll()));
+ codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
+ })
+ .Required()
+ .Help(".codec_info file with serialized static data for codec");
- opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance");
+ opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance");
- opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format");
+ opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format");
opts.SetFreeArgsMin(0);
opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
index 073689737d..45fdb5c5fe 100644
--- a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
+++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
@@ -17,26 +17,26 @@ int main(int argc, char** argv) {
opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");
opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin");
- opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set");
+ opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set");
- opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set");
+ opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set");
- opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName);
+ opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName);
- opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size");
+ opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size");
- opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format");
+ opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format");
- opts.AddLongOption("list-codecs").NoArgument().Handler0([]() {
- Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;
- exit(0);
- })
- .Optional()
- .Help("list available codecs");
+ opts.AddLongOption("list-codecs").NoArgument().Handler0([]() {
+ Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;
+ exit(0);
+ })
+ .Optional()
+ .Help("list available codecs");
- opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info
+ opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info
- opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info
+ opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info
opts.SetFreeArgsMin(0);
opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files");
diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp
index d9c4eb9e47..b47c279ed1 100644
--- a/library/cpp/codecs/static/ut/builder_ut.cpp
+++ b/library/cpp/codecs/static/ut/builder_ut.cpp
@@ -3,11 +3,11 @@
#include <library/cpp/codecs/static/static_codec_info.pb.h>
#include <util/string/vector.h>
-class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase {
+class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase {
UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest)
- UNIT_TEST(TestBuild)
+ UNIT_TEST(TestBuild)
UNIT_TEST_SUITE_END();
-
+
private:
TVector<TString> PrepareData() {
TVector<TString> data;
diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp
index 315e1bf0b0..57e1e62887 100644
--- a/library/cpp/codecs/static/ut/static_ut.cpp
+++ b/library/cpp/codecs/static/ut/static_ut.cpp
@@ -1,11 +1,11 @@
#include <library/cpp/testing/unittest/registar.h>
#include <library/cpp/codecs/static/example/example.h>
-class TStaticCodecUsageTest: public NUnitTest::TTestBase {
+class TStaticCodecUsageTest: public NUnitTest::TTestBase {
UNIT_TEST_SUITE(TStaticCodecUsageTest)
- UNIT_TEST(TestUsage)
+ UNIT_TEST(TestUsage)
UNIT_TEST_SUITE_END();
-
+
private:
void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) {
const TStringBuf letov = "Всё идёт по плану";
diff --git a/library/cpp/codecs/tls_cache.h b/library/cpp/codecs/tls_cache.h
index 7068ea333f..0184e4bb6c 100644
--- a/library/cpp/codecs/tls_cache.h
+++ b/library/cpp/codecs/tls_cache.h
@@ -15,15 +15,15 @@ namespace NCodecs {
}
};
- template <class TItem, class TCleaner = TClear<TItem>>
+ template <class TItem, class TCleaner = TClear<TItem>>
class TTlsCache {
using TSelf = TTlsCache<TItem, TCleaner>;
- struct TItemHolder: public TIntrusiveListItem<TItemHolder> {
+ struct TItemHolder: public TIntrusiveListItem<TItemHolder> {
TItemHolder(TSelf& factory)
: Factory(factory)
- {
- }
+ {
+ }
void Release() {
Factory.Release(*this);
@@ -37,14 +37,14 @@ namespace NCodecs {
public:
explicit TItemGuard(TSelf& fact)
: Holder(fact.Acquire())
- {
- }
+ {
+ }
- TItemGuard(TItemGuard&& other) noexcept {
+ TItemGuard(TItemGuard&& other) noexcept {
*this = std::move(other);
}
- TItemGuard& operator=(TItemGuard&& other) noexcept {
+ TItemGuard& operator=(TItemGuard&& other) noexcept {
if (&other != this) {
std::swap(Holder, other.Holder);
}
diff --git a/library/cpp/codecs/ut/codecs_ut.cpp b/library/cpp/codecs/ut/codecs_ut.cpp
index 36675f6b63..caf6089aef 100644
--- a/library/cpp/codecs/ut/codecs_ut.cpp
+++ b/library/cpp/codecs/ut/codecs_ut.cpp
@@ -13,107 +13,107 @@
#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
namespace {
- const char* TextValues[] = {
- "! сентября газета",
- "!(возмездие это)!",
- "!(материнский капитал)",
- "!(пермь березники)",
- "!биография | !жизнь / + розинг | зворыгин & изобретение | телевидение | электронно лучевая трубка",
- "!овсиенко николай павлович",
- "!путин",
- "\"i'm on you\" p. diddy тимати клип",
- "\"билайн\" представит собственный планшет",
- "\"в особо крупном размере\"",
- "\"викиликс\" джулиан ассанж",
- "\"вимм билль данн",
- "\"газэнергосеть астрахань",
- "\"газэнергосеть астрахань\"",
- "\"домодедово\" ту-154",
- "\"жилина\" \"спартак\" видео",
- "\"зелёнsq шершнm\"",
- "\"зелёного шершня\"",
- "\"золотой граммофон\" марины яблоковой",
- "\"золотой граммофон-2010\"",
- "\"калинниковы\"",
- "\"манчестер юнайтед\" (англия) \"валенсия\" (испания) 1:1 (0:1)",
- "\"маркер\"",
- "\"моника\" засыпает москву снегом",
- "\"моника\" снегопад",
- "\"о безопасности\",",
- "\"памятку\" для пассажиров воздушных международных рейсов",
- "\"петровский парк\" и \"ходынское поле\"",
- "\"путинская\" трава",
- "\"пятерочка\"купила \"копейку\"",
- "\"пятёрочка\" и \"копейка\" объединились",
- "\"реал\" \"осер\" 4:0",
- "\"речь мутко\"",
- "\"российский лес 2010\"",
- "\"ростехинвентаризация федеральное бти\" рубцов",
- "\"саня останется с нами\",",
- "\"следопыт\" реалити шоу",
- "\"слышишь\" молодые авторы",
- "\"стадион\"",
- "\"ходынское поле\" метро",
- "\"хроники нарнии\"",
- "\"чистая вода\"",
- "\"школа деда мороза\"",
- "# asus -1394",
- "# сторонники wikileaks",
- "#106#",
- "#11",
- "#8 какой цвет",
- "#если клиент",
- "$ 13,79",
- "$ xnj ,s dct ,skb ljdjkmys !!!",
- "$ в день",
- "$ диск компьютера",
- "$.ajax",
- "$125 000",
- "$курс",
- "% в си",
- "% влады",
- "% годовых",
- "% женщин и % мужчин в россии",
- "% занятости персонала",
- "% инфляции 2010",
- "% инфляции в 2010 г.",
- "% налога",
- "% налогов в 2010г.",
- "% общего количества",
- "% от числа",
- "% по налогу на прибыль организации",
- "%24",
- "%академия%",
- "%комарова%татьяна",
- "& в 1с",
- "&& (+не существует | !такой проблемы)",
- "&gt;&gt;&gt;скачать | download c cs strikez.clan.su&lt;&lt;&lt;",
- "&gt;hbq nbityrjd",
- "&lt; какой знак",
- "&lt; лицей | &lt; техническая школа# &lt; история#&lt; лицей сегодня#&lt; перечень профессий#&lt; руководство лицея#&lt; прием учащихся#&lt; контакты#&lt; схема проезда#&lt; фотогалереяистория создания лицея и основные этапы путиулица купчинская дом 28",
- "&lt;&lt;link&gt;&gt;",
- "&lt;/storage&gt;",
- "&lt;bfnkjy",
- "&lt;bktntd",
- "&lt;cr",
- "&lt;ddr3&gt;",
- "&lt;e[ufknthcrbq abyfycjdsq",
- "&lt;fcctqys",
- "&lt;fhcf",
- "&lt;fhctkjyf he,by",
- "&lt;firbhbz",
- "&lt;fyr djphj;ltybt",
- "&lt;fyr vjcrds",
- "&lt;fyr резерв",
- "&lt;fyufkjh",
- "&lt;index&gt;",
- "&lt;jkmifz jrhe;yfz rbtd",
- "&lt;kbpytws",
- "&lt;megafon&gt; интернет",
- "&lt;thtpybrb gthvcrbq rhfq",
- "&lt;tkjxrf",
- "&lt;беларусь это мы",
- "&lt;бокс, версия ibf",
+ const char* TextValues[] = {
+ "! сентября газета",
+ "!(возмездие это)!",
+ "!(материнский капитал)",
+ "!(пермь березники)",
+ "!биография | !жизнь / + розинг | зворыгин & изобретение | телевидение | электронно лучевая трубка",
+ "!овсиенко николай павлович",
+ "!путин",
+ "\"i'm on you\" p. diddy тимати клип",
+ "\"билайн\" представит собственный планшет",
+ "\"в особо крупном размере\"",
+ "\"викиликс\" джулиан ассанж",
+ "\"вимм билль данн",
+ "\"газэнергосеть астрахань",
+ "\"газэнергосеть астрахань\"",
+ "\"домодедово\" ту-154",
+ "\"жилина\" \"спартак\" видео",
+ "\"зелёнsq шершнm\"",
+ "\"зелёного шершня\"",
+ "\"золотой граммофон\" марины яблоковой",
+ "\"золотой граммофон-2010\"",
+ "\"калинниковы\"",
+ "\"манчестер юнайтед\" (англия) \"валенсия\" (испания) 1:1 (0:1)",
+ "\"маркер\"",
+ "\"моника\" засыпает москву снегом",
+ "\"моника\" снегопад",
+ "\"о безопасности\",",
+ "\"памятку\" для пассажиров воздушных международных рейсов",
+ "\"петровский парк\" и \"ходынское поле\"",
+ "\"путинская\" трава",
+ "\"пятерочка\"купила \"копейку\"",
+ "\"пятёрочка\" и \"копейка\" объединились",
+ "\"реал\" \"осер\" 4:0",
+ "\"речь мутко\"",
+ "\"российский лес 2010\"",
+ "\"ростехинвентаризация федеральное бти\" рубцов",
+ "\"саня останется с нами\",",
+ "\"следопыт\" реалити шоу",
+ "\"слышишь\" молодые авторы",
+ "\"стадион\"",
+ "\"ходынское поле\" метро",
+ "\"хроники нарнии\"",
+ "\"чистая вода\"",
+ "\"школа деда мороза\"",
+ "# asus -1394",
+ "# сторонники wikileaks",
+ "#106#",
+ "#11",
+ "#8 какой цвет",
+ "#если клиент",
+ "$ 13,79",
+ "$ xnj ,s dct ,skb ljdjkmys !!!",
+ "$ в день",
+ "$ диск компьютера",
+ "$.ajax",
+ "$125 000",
+ "$курс",
+ "% в си",
+ "% влады",
+ "% годовых",
+ "% женщин и % мужчин в россии",
+ "% занятости персонала",
+ "% инфляции 2010",
+ "% инфляции в 2010 г.",
+ "% налога",
+ "% налогов в 2010г.",
+ "% общего количества",
+ "% от числа",
+ "% по налогу на прибыль организации",
+ "%24",
+ "%академия%",
+ "%комарова%татьяна",
+ "& в 1с",
+ "&& (+не существует | !такой проблемы)",
+ "&gt;&gt;&gt;скачать | download c cs strikez.clan.su&lt;&lt;&lt;",
+ "&gt;hbq nbityrjd",
+ "&lt; какой знак",
+ "&lt; лицей | &lt; техническая школа# &lt; история#&lt; лицей сегодня#&lt; перечень профессий#&lt; руководство лицея#&lt; прием учащихся#&lt; контакты#&lt; схема проезда#&lt; фотогалереяистория создания лицея и основные этапы путиулица купчинская дом 28",
+ "&lt;&lt;link&gt;&gt;",
+ "&lt;/storage&gt;",
+ "&lt;bfnkjy",
+ "&lt;bktntd",
+ "&lt;cr",
+ "&lt;ddr3&gt;",
+ "&lt;e[ufknthcrbq abyfycjdsq",
+ "&lt;fcctqys",
+ "&lt;fhcf",
+ "&lt;fhctkjyf he,by",
+ "&lt;firbhbz",
+ "&lt;fyr djphj;ltybt",
+ "&lt;fyr vjcrds",
+ "&lt;fyr резерв",
+ "&lt;fyufkjh",
+ "&lt;index&gt;",
+ "&lt;jkmifz jrhe;yfz rbtd",
+ "&lt;kbpytws",
+ "&lt;megafon&gt; интернет",
+ "&lt;thtpybrb gthvcrbq rhfq",
+ "&lt;tkjxrf",
+ "&lt;беларусь это мы",
+ "&lt;бокс, версия ibf",
"designer tree svc",
"seriesg810",
"doll makers",
@@ -854,11 +854,11 @@ namespace {
"resume maker",
"lymphomatoid papulosis",
"sez.com",
- };
+ };
}
class TCodecsTest: public TTestBase {
- UNIT_TEST_SUITE(TCodecsTest);
+ UNIT_TEST_SUITE(TCodecsTest);
UNIT_TEST(TestPipeline)
UNIT_TEST(TestDelta)
UNIT_TEST(TestHuffman)
@@ -869,14 +869,14 @@ class TCodecsTest: public TTestBase {
UNIT_TEST(TestPFor)
UNIT_TEST(TestRegistry)
- UNIT_TEST_SUITE_END();
+ UNIT_TEST_SUITE_END();
private:
TString PrintError(TStringBuf learn, TStringBuf test, TStringBuf codec, ui32 i) {
TString s;
TStringOutput sout(s);
- sout << codec << ": " << i << ", "
- << "\n";
+ sout << codec << ": " << i << ", "
+ << "\n";
sout << HexEncode(learn.data(), learn.size()); //NEscJ::EscapeJ<true>(learn, sout);
sout << " != \n";
sout << HexEncode(test.data(), test.size()); //NEscJ::EscapeJ<true>(test, sout);
@@ -1009,8 +1009,8 @@ private:
AppendTo(d.back(), -1LL);
AppendTo(d.back(), -1LL);
- TestCodec<TDeltaCodec<ui64, true>, false>(d);
- TestCodec<TDeltaCodec<ui64, false>, false>(d);
+ TestCodec<TDeltaCodec<ui64, true>, false>(d);
+ TestCodec<TDeltaCodec<ui64, false>, false>(d);
}
void TestPFor() {
@@ -1050,7 +1050,7 @@ private:
AppendTo(d.back(), -1LL);
AppendTo(d.back(), -2LL);
- TestCodec<TPForCodec<ui64>, false>(d);
+ TestCodec<TPForCodec<ui64>, false>(d);
TestCodec<TPForCodec<ui64, true>, true>(d);
}
{
@@ -1080,7 +1080,7 @@ private:
AppendTo(d.back(), -1);
AppendTo(d.back(), -2);
- TestCodec<TPForCodec<ui32>, false>(d);
+ TestCodec<TPForCodec<ui32>, false>(d);
TestCodec<TPForCodec<ui32, true>, false>(d);
}
{
@@ -1326,7 +1326,7 @@ private:
}
TestCodec<TPipelineCodec, true>(learn, test,
- new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec));
+ new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec));
}
{
TVector<TBuffer> d;
@@ -1338,7 +1338,7 @@ private:
}
TestCodec<TPipelineCodec, false>(d, TVector<TBuffer>(),
- new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>));
+ new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>));
}
}
diff --git a/library/cpp/codecs/ut/float_huffman_ut.cpp b/library/cpp/codecs/ut/float_huffman_ut.cpp
index dddff22173..3156fb1f46 100644
--- a/library/cpp/codecs/ut/float_huffman_ut.cpp
+++ b/library/cpp/codecs/ut/float_huffman_ut.cpp
@@ -60,7 +60,7 @@ Y_UNIT_TEST_SUITE(FloatHuffmanTest) {
0.517876, 0.145833, 0.372549, 0, 0.991667, 0.602125, 0.161979, 0, 0, 0, 0, 0.0255146,
0.947855, 0, 0, 0, 0, 0, 0, 0, 0, 0.847059, 0.679841, 0, 0.156863, 0, 0, 1, 0, 0,
0, 0, 0.969697, 0, 0, 0.564706, 0, 0, 0, 0, 0, 1, 0.0367282, 0.0395228, 0, 0, 0,
- 0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0};
+ 0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0};
static const size_t FactorCount = Y_ARRAY_SIZE(Factors);
static const ui8 CodedFactors[] = {
@@ -132,7 +132,7 @@ Y_UNIT_TEST_SUITE(FloatHuffmanTest) {
0x7F, 0x20, 0x1A, 0x81, 0x9A, 0xCA, 0xBF, 0xC8, 0x8D, 0x8D, 0xC2, 0x83, 0x82, 0xA7, 0x2C, 0x28,
0xC8, 0xFE, 0x08, 0xC2, 0x07, 0xC7, 0x27, 0x21, 0xE1, 0xBB, 0x3E, 0xC1, 0x59, 0x68, 0xAA, 0x78,
0xC8, 0x57, 0x5D, 0x60, 0x20, 0xC6, 0x41, 0x42, 0xE8, 0x3A, 0x38, 0xD8, 0x9B, 0xFF, 0xFF, 0xFF,
- 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+ 0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
static const size_t CodedSize = Y_ARRAY_SIZE(CodedFactors);
static const TStringBuf CodedFactorsBuf(reinterpret_cast<const char*>(CodedFactors), CodedSize);
@@ -228,7 +228,7 @@ Y_UNIT_TEST_SUITE(FloatHuffmanTest) {
"MBD693f07+9+DQQEkIGAgIgPetzN5yEbAGxWpbCNxXK/0JGTKRz2KkIoR7aM";
UNIT_ASSERT_EXCEPTION(
fh::Decode(Base64Decode(brokenBase64Encoded)),
- yexception);
+ yexception);
}
Y_UNIT_TEST(TestDecompressEmpty) {
diff --git a/library/cpp/codecs/ut/tls_cache_ut.cpp b/library/cpp/codecs/ut/tls_cache_ut.cpp
index 361d41a02e..8101af761f 100644
--- a/library/cpp/codecs/ut/tls_cache_ut.cpp
+++ b/library/cpp/codecs/ut/tls_cache_ut.cpp
@@ -2,35 +2,35 @@
#include <library/cpp/codecs/tls_cache.h>
Y_UNIT_TEST_SUITE(CodecsBufferFactoryTest){
- void AssignToBuffer(TBuffer & buf, TStringBuf val){
+ void AssignToBuffer(TBuffer & buf, TStringBuf val){
buf.Assign(val.data(), val.size());
-}
+}
-TStringBuf AsStringBuf(const TBuffer& b) {
- return TStringBuf(b.Data(), b.Size());
-}
+TStringBuf AsStringBuf(const TBuffer& b) {
+ return TStringBuf(b.Data(), b.Size());
+}
Y_UNIT_TEST(TestAcquireReleaseReuse) {
- NCodecs::TBufferTlsCache factory;
- // acquiring the first buffer
- auto buf1 = factory.Item();
- AssignToBuffer(buf1.Get(), "Buffer_01");
- {
- // acquiring the second buffer
- auto buf2 = factory.Item();
- AssignToBuffer(buf2.Get(), "Buffer_02");
+ NCodecs::TBufferTlsCache factory;
+ // acquiring the first buffer
+ auto buf1 = factory.Item();
+ AssignToBuffer(buf1.Get(), "Buffer_01");
+ {
+ // acquiring the second buffer
+ auto buf2 = factory.Item();
+ AssignToBuffer(buf2.Get(), "Buffer_02");
}
- // the first buffer should stay intact
- UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01");
- {
- // reacquiring the last released buffer
- // expecting it zero sized but having the same memory
- auto buf2 = factory.Item();
- UNIT_ASSERT_VALUES_EQUAL(buf2.Get().Size(), 0u);
- buf2.Get().Resize(TStringBuf("Buffer_02").Size());
- UNIT_ASSERT_EQUAL(AsStringBuf(buf2.Get()), "Buffer_02");
- }
- // when the factory dies we should see no leaks
-}
-}
-;
+ // the first buffer should stay intact
+ UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01");
+ {
+ // reacquiring the last released buffer
+ // expecting it zero sized but having the same memory
+ auto buf2 = factory.Item();
+ UNIT_ASSERT_VALUES_EQUAL(buf2.Get().Size(), 0u);
+ buf2.Get().Resize(TStringBuf("Buffer_02").Size());
+ UNIT_ASSERT_EQUAL(AsStringBuf(buf2.Get()), "Buffer_02");
+ }
+ // when the factory dies we should see no leaks
+}
+}
+;
diff --git a/library/cpp/codecs/ya.make b/library/cpp/codecs/ya.make
index 9f7a5b5de2..7e76fb0c9a 100644
--- a/library/cpp/codecs/ya.make
+++ b/library/cpp/codecs/ya.make
@@ -4,7 +4,7 @@ OWNER(
g:base
velavokr
)
-
+
SRCS(
tls_cache.cpp
codecs.cpp
diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp
index 6aa67abd62..c42a2879e6 100644
--- a/library/cpp/codecs/zstd_dict_codec.cpp
+++ b/library/cpp/codecs/zstd_dict_codec.cpp
@@ -28,8 +28,8 @@ namespace NCodecs {
TPtrHolder(T* dict)
: Ptr(dict)
- {
- }
+ {
+ }
T* Get() {
return Ptr;
@@ -99,7 +99,7 @@ namespace NCodecs {
TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)};
const size_t resSz = CheckSize(ZSTD_compress_usingCDict(
ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()),
- __LOCATION__);
+ __LOCATION__);
if (resSz < rawSz) {
outbuf.Resize(resSz + szSz);
@@ -134,13 +134,13 @@ namespace NCodecs {
outbuf.Resize(rawSz);
memcpy(outbuf.data(), rawBeg, rawSz);
} else {
- // size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz);
- // Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz);
+ // size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz);
+ // Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz);
outbuf.Resize(datSz);
TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)};
CheckSize(ZSTD_decompress_usingDDict(
ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()),
- __LOCATION__);
+ __LOCATION__);
outbuf.Resize(datSz);
}
}
@@ -206,8 +206,8 @@ namespace NCodecs {
template <class T>
static T* CheckPtr(T* t, TSourceLocation loc) {
- Y_ENSURE_EX(t, TCodecException() << loc << " "
- << "unexpected nullptr");
+ Y_ENSURE_EX(t, TCodecException() << loc << " "
+ << "unexpected nullptr");
return t;
}
@@ -230,8 +230,8 @@ namespace NCodecs {
MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar
}
- TZStdDictCodec::~TZStdDictCodec() {
- }
+ TZStdDictCodec::~TZStdDictCodec() {
+ }
TString TZStdDictCodec::GetName() const {
return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel();
diff --git a/library/cpp/codecs/zstd_dict_codec.h b/library/cpp/codecs/zstd_dict_codec.h
index 70259989f6..59c1ad6c60 100644
--- a/library/cpp/codecs/zstd_dict_codec.h
+++ b/library/cpp/codecs/zstd_dict_codec.h
@@ -5,34 +5,34 @@
#include <util/generic/ptr.h>
namespace NCodecs {
- // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655
+ // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655
- class TZStdDictCodec: public ICodec {
- class TImpl;
- TIntrusivePtr<TImpl> Impl;
+ class TZStdDictCodec: public ICodec {
+ class TImpl;
+ TIntrusivePtr<TImpl> Impl;
- public:
- explicit TZStdDictCodec(ui32 comprLevel = 1);
- ~TZStdDictCodec() override;
+ public:
+ explicit TZStdDictCodec(ui32 comprLevel = 1);
+ ~TZStdDictCodec() override;
- static TStringBuf MyName() {
- return "zstd08d";
- }
+ static TStringBuf MyName() {
+ return "zstd08d";
+ }
- TString GetName() const override;
+ TString GetName() const override;
- ui8 Encode(TStringBuf in, TBuffer& out) const override;
+ ui8 Encode(TStringBuf in, TBuffer& out) const override;
- void Decode(TStringBuf in, TBuffer& out) const override;
+ void Decode(TStringBuf in, TBuffer& out) const override;
- static TVector<TString> ListCompressionNames();
- static int ParseCompressionName(TStringBuf);
+ static TVector<TString> ListCompressionNames();
+ static int ParseCompressionName(TStringBuf);
- protected:
- void DoLearn(ISequenceReader& in) override;
+ protected:
+ void DoLearn(ISequenceReader& in) override;
bool DoTryToLearn(ISequenceReader& in) final;
- void Save(IOutputStream* out) const override;
- void Load(IInputStream* in) override;
- };
+ void Save(IOutputStream* out) const override;
+ void Load(IInputStream* in) override;
+ };
}