aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs
diff options
context:
space:
mode:
authorRuslan Kovalev <ruslan.a.kovalev@gmail.com>2022-02-10 16:46:44 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:46:44 +0300
commit59e19371de37995fcb36beb16cd6ec030af960bc (patch)
treefa68e36093ebff8b805462e9e6d331fe9d348214 /library/cpp/codecs
parent89db6fe2fe2c32d2a832ddfeb04e8d078e301084 (diff)
downloadydb-59e19371de37995fcb36beb16cd6ec030af960bc.tar.gz
Restoring authorship annotation for Ruslan Kovalev <ruslan.a.kovalev@gmail.com>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs')
-rw-r--r--library/cpp/codecs/README.md8
-rw-r--r--library/cpp/codecs/codecs.cpp102
-rw-r--r--library/cpp/codecs/codecs.h138
-rw-r--r--library/cpp/codecs/codecs_registry.cpp332
-rw-r--r--library/cpp/codecs/codecs_registry.h94
-rw-r--r--library/cpp/codecs/comptable_codec.cpp190
-rw-r--r--library/cpp/codecs/comptable_codec.h32
-rw-r--r--library/cpp/codecs/delta_codec.cpp10
-rw-r--r--library/cpp/codecs/delta_codec.h80
-rw-r--r--library/cpp/codecs/float_huffman.h2
-rw-r--r--library/cpp/codecs/greedy_dict/gd_builder.cpp94
-rw-r--r--library/cpp/codecs/greedy_dict/gd_builder.h60
-rw-r--r--library/cpp/codecs/greedy_dict/gd_entry.cpp74
-rw-r--r--library/cpp/codecs/greedy_dict/gd_entry.h66
-rw-r--r--library/cpp/codecs/greedy_dict/gd_stats.h38
-rw-r--r--library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp538
-rw-r--r--library/cpp/codecs/greedy_dict/ut/ya.make2
-rw-r--r--library/cpp/codecs/greedy_dict/ya.make20
-rw-r--r--library/cpp/codecs/huffman_codec.cpp266
-rw-r--r--library/cpp/codecs/huffman_codec.h32
-rw-r--r--library/cpp/codecs/pfor_codec.cpp12
-rw-r--r--library/cpp/codecs/pfor_codec.h124
-rw-r--r--library/cpp/codecs/sample.h156
-rw-r--r--library/cpp/codecs/solar_codec.cpp88
-rw-r--r--library/cpp/codecs/solar_codec.h48
-rw-r--r--library/cpp/codecs/static/builder.cpp68
-rw-r--r--library/cpp/codecs/static/builder.h42
-rw-r--r--library/cpp/codecs/static/common.h56
-rw-r--r--library/cpp/codecs/static/example/example.cpp76
-rw-r--r--library/cpp/codecs/static/example/example.h24
-rw-r--r--library/cpp/codecs/static/example/ya.make36
-rw-r--r--library/cpp/codecs/static/static.cpp160
-rw-r--r--library/cpp/codecs/static/static.h52
-rw-r--r--library/cpp/codecs/static/static_codec_info.proto34
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.cpp118
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.h140
-rw-r--r--library/cpp/codecs/static/tools/common/ya.make26
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp114
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/ya.make22
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp122
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/ya.make22
-rw-r--r--library/cpp/codecs/static/tools/tests/static_codec_tools.py30
-rw-r--r--library/cpp/codecs/static/tools/tests/ya.make28
-rw-r--r--library/cpp/codecs/static/tools/ya.make10
-rw-r--r--library/cpp/codecs/static/ut/builder_ut.cpp94
-rw-r--r--library/cpp/codecs/static/ut/static_ut.cpp38
-rw-r--r--library/cpp/codecs/static/ut/ya.make24
-rw-r--r--library/cpp/codecs/static/ya.make30
-rw-r--r--library/cpp/codecs/tls_cache.cpp8
-rw-r--r--library/cpp/codecs/tls_cache.h184
-rw-r--r--library/cpp/codecs/ut/codecs_ut.cpp714
-rw-r--r--library/cpp/codecs/ut/tls_cache_ut.cpp8
-rw-r--r--library/cpp/codecs/ut/ya.make2
-rw-r--r--library/cpp/codecs/ya.make34
-rw-r--r--library/cpp/codecs/zstd_dict_codec.cpp440
-rw-r--r--library/cpp/codecs/zstd_dict_codec.h34
58 files changed, 2702 insertions, 2702 deletions
diff --git a/library/cpp/codecs/README.md b/library/cpp/codecs/README.md
index 42646ccd97..26fa96da59 100644
--- a/library/cpp/codecs/README.md
+++ b/library/cpp/codecs/README.md
@@ -1,8 +1,8 @@
This is a library of compression algorithms with a unified interface and serialization.
See also library/cpp/codecs/static, which implements support for statically compiled dictionaries.
-
+
All algorithms have a common `ICodec` interface (described in codecs.h).
-
+
The `ICodec` interface has the following methods:\
    `virtual ui8 ICodec::Encode (TMemoryRegion, TBuffer&) const;`\
            - Input: a memory region. Output: the filled buffer; the return value is the number of free (unused) bits in the last byte, if it was not filled completely.\
@@ -27,9 +27,9 @@ The `ICodec` interface has the following methods:\
                    For example, it allows you to save information about which combination of codecs was in use (see below).\
    `virtual void Learn(ISequenceReader*);`\
            - The interface for training codecs that rely on information about the data distribution.
-
+
In addition, the library has a number of utilities that allow a more flexible use of it.
-
+
In the `ICodec` class the following methods are available:\
    `static TCodecPtr GetInstance(const TString& name);`\
            - Creation of a codec instance by a symbolic name\
diff --git a/library/cpp/codecs/codecs.cpp b/library/cpp/codecs/codecs.cpp
index b17a3156d2..d2265dd9f9 100644
--- a/library/cpp/codecs/codecs.cpp
+++ b/library/cpp/codecs/codecs.cpp
@@ -1,69 +1,69 @@
-#include "codecs.h"
-#include "tls_cache.h"
-
-#include <util/stream/mem.h>
-
-namespace NCodecs {
+#include "codecs.h"
+#include "tls_cache.h"
+
+#include <util/stream/mem.h>
+
+namespace NCodecs {
void ICodec::Store(IOutputStream* out, TCodecPtr p) {
if (!p.Get()) {
::Save(out, (ui16)0);
return;
}
-
+
Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName());
const TString& n = p->GetName();
Y_VERIFY(n.size() <= Max<ui16>());
::Save(out, (ui16)n.size());
out->Write(n.data(), n.size());
p->Save(out);
- }
-
+ }
+
TCodecPtr ICodec::Restore(IInputStream* in) {
ui16 l = 0;
::Load(in, l);
-
+
if (!l) {
return nullptr;
}
-
+
TString n;
n.resize(l);
-
+
Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException());
-
+
TCodecPtr p = ICodec::GetInstance(n);
p->Load(in);
p->Trained = true;
return p;
}
-
+
// Restores a codec from its serialized form held in memory.
// Returns whatever the stream-based Restore() yields for the same bytes.
TCodecPtr ICodec::RestoreFromString(TStringBuf s) {
    // Expose the serialized bytes as an input stream and reuse Restore().
    TMemoryInput serialized{s.data(), s.size()};
    return Restore(&serialized);
}
-
+
// Null-tolerant name lookup: a null codec pointer is reported as "none",
// otherwise the codec's own GetName() is returned.
TString ICodec::GetNameSafe(TCodecPtr p) {
    if (!p) {
        return TString("none");
    }
    return p->GetName();
}
-
+
ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const {
size_t res = Traits().ApproximateSizeOnEncode(in.size());
out.Reserve(res);
out.Clear();
-
+
if (Pipeline.empty()) {
out.Append(in.data(), in.size());
return 0;
} else if (Pipeline.size() == 1) {
return Pipeline.front()->Encode(in, out);
}
-
+
ui8 freelastbits = 0;
-
+
auto buffer = TBufferTlsCache::TlsInstance().Item();
TBuffer& tmp = buffer.Get();
tmp.Reserve(res);
-
+
for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) {
if (it != Pipeline.begin()) {
tmp.Clear();
@@ -72,15 +72,15 @@ namespace NCodecs {
}
freelastbits = (*it)->Encode(in, out);
}
-
+
return freelastbits;
- }
-
+ }
+
void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const {
size_t res = Traits().ApproximateSizeOnDecode(in.size());
out.Reserve(res);
out.Clear();
-
+
if (Pipeline.empty()) {
out.Append(in.data(), in.size());
return;
@@ -88,12 +88,12 @@ namespace NCodecs {
Pipeline.front()->Decode(in, out);
return;
}
-
+
auto buffer = TBufferTlsCache::TlsInstance().Item();
-
+
TBuffer& tmp = buffer.Get();
tmp.Reserve(res);
-
+
for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) {
if (it != Pipeline.rbegin()) {
tmp.Clear();
@@ -101,40 +101,40 @@ namespace NCodecs {
in = TStringBuf{tmp.data(), tmp.size()};
}
(*it)->Decode(in, out);
- }
- }
-
+ }
+ }
+
void TPipelineCodec::Save(IOutputStream* out) const {
for (const auto& it : Pipeline)
it->Save(out);
- }
-
+ }
+
void TPipelineCodec::Load(IInputStream* in) {
for (const auto& it : Pipeline) {
it->Load(in);
it->SetTrained(true);
}
- }
-
+ }
+
void TPipelineCodec::SetTrained(bool t) {
for (const auto& it : Pipeline) {
it->SetTrained(t);
}
- }
-
+ }
+
TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) {
if (!codec)
return *this;
-
+
TCodecTraits tr = codec->Traits();
-
+
if (!MyName) {
MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput;
MyTraits.SizeOfInputElement = tr.SizeOfInputElement;
} else {
MyName.append(':');
}
-
+
MyName.append(codec->GetName());
MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping;
MyTraits.PaddingBit = tr.PaddingBit;
@@ -144,27 +144,27 @@ namespace NCodecs {
MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier;
MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier;
MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize);
-
+
Pipeline.push_back(codec);
return *this;
- }
-
+ }
+
void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) {
if (!Traits().NeedsTraining) {
return;
}
-
+
if (Pipeline.size() == 1) {
Pipeline.back()->Learn(in);
return;
}
-
+
TVector<TBuffer> trainingInput;
-
+
TStringBuf r;
while (in.NextRegion(r)) {
trainingInput.emplace_back(r.data(), r.size());
- }
+ }
TBuffer buff;
for (const auto& it : Pipeline) {
@@ -176,8 +176,8 @@ namespace NCodecs {
buff.Swap(bit);
}
}
- }
-
+ }
+
bool TPipelineCodec::AlreadyTrained() const {
for (const auto& it : Pipeline) {
if (!it->AlreadyTrained())
@@ -185,6 +185,6 @@ namespace NCodecs {
}
return true;
- }
-
-}
+ }
+
+}
diff --git a/library/cpp/codecs/codecs.h b/library/cpp/codecs/codecs.h
index cc5e72b285..aa7c24b4c6 100644
--- a/library/cpp/codecs/codecs.h
+++ b/library/cpp/codecs/codecs.h
@@ -1,63 +1,63 @@
-#pragma once
-
-#include "sample.h"
-
-#include <util/generic/bt_exception.h>
-#include <util/generic/hash.h>
-#include <util/generic/ptr.h>
-#include <util/generic/singleton.h>
-
-#include <util/stream/input.h>
-#include <util/stream/output.h>
-
+#pragma once
+
+#include "sample.h"
+
+#include <util/generic/bt_exception.h>
+#include <util/generic/hash.h>
+#include <util/generic/ptr.h>
+#include <util/generic/singleton.h>
+
+#include <util/stream/input.h>
+#include <util/stream/output.h>
+
#include <util/string/cast.h>
-#include <util/string/vector.h>
-#include <util/system/tls.h>
-#include <util/ysaveload.h>
-
-namespace NCodecs {
+#include <util/string/vector.h>
+#include <util/system/tls.h>
+#include <util/ysaveload.h>
+
+namespace NCodecs {
class TCodecException: public TWithBackTrace<yexception> {};
-
+
class ICodec;
-
+
using TCodecPtr = TIntrusivePtr<ICodec>;
using TCodecConstPtr = TIntrusiveConstPtr<ICodec>;
-
+
struct TCodecTraits {
ui32 RecommendedSampleSize = 0;
ui16 SizeOfInputElement = 1;
ui8 SizeOnEncodeMultiplier = 1;
ui8 SizeOnEncodeAddition = 0;
ui8 SizeOnDecodeMultiplier = 1;
-
+
bool NeedsTraining = false;
bool PreservesPrefixGrouping = false;
bool Irreversible = false;
bool PaddingBit = 0;
bool AssumesStructuredInput = false;
-
+
// Rough size of the encoded output for an input of size `sz`:
// the input size scaled by SizeOnEncodeMultiplier plus the constant SizeOnEncodeAddition.
size_t ApproximateSizeOnEncode(size_t sz) const {
    const size_t scaled = sz * SizeOnEncodeMultiplier;
    return scaled + SizeOnEncodeAddition;
}
-
+
// Rough size of the decoded output for an encoded input of size `sz`:
// the input size scaled by SizeOnDecodeMultiplier.
size_t ApproximateSizeOnDecode(size_t sz) const {
    const size_t estimate = sz * SizeOnDecodeMultiplier;
    return estimate;
}
};
-
+
class ICodec: public TAtomicRefCount<ICodec> {
protected:
bool Trained = false;
TCodecTraits MyTraits;
-
+
public:
TCodecTraits Traits() const {
return MyTraits;
}
-
+
// the name of the codec (or its variant) to be used in the codec registry
virtual TString GetName() const = 0;
-
+
virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0;
virtual ui8 Encode(const TBuffer& input, TBuffer& output) const {
return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output);
@@ -66,16 +66,16 @@ namespace NCodecs {
virtual void Decode(const TBuffer& input, TBuffer& output) const {
Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output);
}
-
+
virtual ~ICodec() = default;
-
+
// A codec is ready for use when it either needs no training at all
// or its Trained flag has been set.
virtual bool AlreadyTrained() const {
    if (!Traits().NeedsTraining) {
        return true;
    }
    return Trained;
}
// Overwrites the Trained flag with `t` without running any training.
// Virtual so that derived codecs can customize it (TPipelineCodec overrides it).
virtual void SetTrained(bool t) {
Trained = t;
}
-
+
bool TryToLearn(ISequenceReader& r) {
Trained = DoTryToLearn(r);
return Trained;
@@ -84,32 +84,32 @@ namespace NCodecs {
void Learn(ISequenceReader& r) {
LearnX(r, 1);
}
-
+
// Trains the codec on the elements of [beg, end).
// Forwards to the getter-taking overload, using IterToStringBuf<TIter> as the getter.
template <class TIter>
void Learn(TIter beg, TIter end) {
Learn(beg, end, IterToStringBuf<TIter>);
}
-
+
template <class TIter, class TGetter>
void Learn(TIter beg, TIter end, TGetter getter) {
auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter);
TSimpleSequenceReader<TBuffer> reader{sample};
Learn(reader);
}
-
+
static TCodecPtr GetInstance(TStringBuf name);
-
+
static TVector<TString> GetCodecsList();
-
+
static TString GetNameSafe(TCodecPtr p);
-
+
static void Store(IOutputStream* out, TCodecPtr p);
static TCodecPtr Restore(IInputStream* in);
static TCodecPtr RestoreFromString(TStringBuf);
-
+
protected:
virtual void DoLearn(ISequenceReader&) = 0;
-
+
virtual bool DoTryToLearn(ISequenceReader& r) {
DoLearn(r);
return true;
@@ -119,20 +119,20 @@ namespace NCodecs {
virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) {
DoLearn(r);
}
-
+
virtual void Save(IOutputStream*) const {
}
virtual void Load(IInputStream*) {
}
friend class TPipelineCodec;
-
+
public:
// Public so that the pipeline codec can adjust the sample size for its subcodecs
// through `sampleSizeMult`.
// Runs DoLearnX() and then marks the codec as trained; the flag is set only after
// DoLearnX() has returned.
void LearnX(ISequenceReader& r, double sampleSizeMult) {
DoLearnX(r, sampleSizeMult);
Trained = true;
}
-
+
template <class TIter>
void LearnX(TIter beg, TIter end, double sampleSizeMult) {
auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult);
@@ -140,54 +140,54 @@ namespace NCodecs {
LearnX(reader, sampleSizeMult);
}
};
-
+
class TBasicTrivialCodec: public ICodec {
public:
// Identity encoding: replaces the contents of `out` with a copy of `in`.
// Always returns 0 (no free bits in the last byte).
ui8 Encode(TStringBuf in, TBuffer& out) const override {
out.Assign(in.data(), in.size());
return 0;
}
-
+
// Decoding delegates to Encode(), discarding its return value.
void Decode(TStringBuf in, TBuffer& out) const override {
Encode(in, out);
}
-
+
protected:
void DoLearn(ISequenceReader&) override {
}
};
-
+
class TTrivialCodec: public TBasicTrivialCodec {
public:
TTrivialCodec() {
MyTraits.PreservesPrefixGrouping = true;
}
-
+
static TStringBuf MyName() {
return "trivial";
}
-
+
TString GetName() const override {
return ToString(MyName());
}
};
-
+
class TTrivialTrainableCodec: public TBasicTrivialCodec {
public:
TTrivialTrainableCodec() {
MyTraits.PreservesPrefixGrouping = true;
MyTraits.NeedsTraining = true;
}
-
+
static TStringBuf MyName() {
return "trivial-trainable";
}
-
+
TString GetName() const override {
return ToString(MyName());
}
};
-
+
class TNullCodec: public ICodec {
public:
TNullCodec() {
@@ -195,31 +195,31 @@ namespace NCodecs {
MyTraits.SizeOnDecodeMultiplier = 0;
MyTraits.SizeOnEncodeMultiplier = 0;
}
-
+
TString GetName() const override {
return "null";
}
-
+
ui8 Encode(TStringBuf, TBuffer& out) const override {
out.Clear();
return 0;
}
-
+
void Decode(TStringBuf, TBuffer& out) const override {
out.Clear();
}
-
+
protected:
void DoLearn(ISequenceReader&) override {
}
};
-
+
class TPipelineCodec: public ICodec {
typedef TVector<TCodecPtr> TPipeline;
-
+
TPipeline Pipeline;
TString MyName;
-
+
public:
explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) {
MyTraits.PreservesPrefixGrouping = true;
@@ -228,32 +228,32 @@ namespace NCodecs {
AddCodec(c2);
AddCodec(c3);
}
-
+
TString GetName() const override {
return MyName;
}
-
+
ui8 Encode(TStringBuf in, TBuffer& out) const override;
void Decode(TStringBuf in, TBuffer& out) const override;
-
+
public:
/*
- * Add codecs in the following order:
- * uncompressed -> codec0 | codec1 | ... | codecN -> compressed
- */
+ * Add codecs in the following order:
+ * uncompressed -> codec0 | codec1 | ... | codecN -> compressed
+ */
TPipelineCodec& AddCodec(TCodecPtr codec);
-
+
bool AlreadyTrained() const override;
void SetTrained(bool t) override;
-
+
protected:
void DoLearn(ISequenceReader& in) override {
DoLearnX(in, 1);
}
-
+
void DoLearnX(ISequenceReader& in, double sampleSizeMult) override;
void Save(IOutputStream* out) const override;
void Load(IInputStream* in) override;
};
-
-}
+
+}
diff --git a/library/cpp/codecs/codecs_registry.cpp b/library/cpp/codecs/codecs_registry.cpp
index 17d07062ab..7ccfd07a8a 100644
--- a/library/cpp/codecs/codecs_registry.cpp
+++ b/library/cpp/codecs/codecs_registry.cpp
@@ -1,104 +1,104 @@
-#include "codecs_registry.h"
-#include "delta_codec.h"
-#include "huffman_codec.h"
-#include "pfor_codec.h"
-#include "solar_codec.h"
-#include "comptable_codec.h"
-#include "zstd_dict_codec.h"
-
+#include "codecs_registry.h"
+#include "delta_codec.h"
+#include "huffman_codec.h"
+#include "pfor_codec.h"
+#include "solar_codec.h"
+#include "comptable_codec.h"
+#include "zstd_dict_codec.h"
+
#include <library/cpp/blockcodecs/codecs.h>
-
-#include <util/string/builder.h>
+
+#include <util/string/builder.h>
#include <util/string/cast.h>
-
-namespace NCodecs {
- TCodecPtr ICodec::GetInstance(TStringBuf name) {
+
+namespace NCodecs {
+ TCodecPtr ICodec::GetInstance(TStringBuf name) {
return Singleton<NPrivate::TCodecRegistry>()->GetCodec(name);
- }
-
+ }
+
TVector<TString> ICodec::GetCodecsList() {
return Singleton<NPrivate::TCodecRegistry>()->GetCodecsList();
- }
-
- namespace NPrivate {
- void TCodecRegistry::RegisterFactory(TFactoryPtr fac) {
+ }
+
+ namespace NPrivate {
+ void TCodecRegistry::RegisterFactory(TFactoryPtr fac) {
TVector<TString> names = fac->ListNames();
- for (const auto& name : names) {
+ for (const auto& name : names) {
Y_VERIFY(!Registry.contains(name), "already has %s", name.data());
- Registry[name] = fac;
- }
+ Registry[name] = fac;
+ }
}
- TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const {
- using namespace NPrivate;
-
- if (!name || "none" == name) {
- return nullptr;
- }
-
- if (TStringBuf::npos == name.find(':')) {
+ TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const {
+ using namespace NPrivate;
+
+ if (!name || "none" == name) {
+ return nullptr;
+ }
+
+ if (TStringBuf::npos == name.find(':')) {
Y_ENSURE_EX(Registry.contains(name), TNoCodecException(name));
- return Registry.find(name)->second->MakeCodec(name);
- } else {
- TPipelineCodec* pipe = new TPipelineCodec;
-
+ return Registry.find(name)->second->MakeCodec(name);
+ } else {
+ TPipelineCodec* pipe = new TPipelineCodec;
+
do {
- TStringBuf v = name.NextTok(':');
- pipe->AddCodec(GetCodec(v));
- } while (name);
-
- return pipe;
- }
- }
-
+ TStringBuf v = name.NextTok(':');
+ pipe->AddCodec(GetCodec(v));
+ } while (name);
+
+ return pipe;
+ }
+ }
+
TVector<TString> TCodecRegistry::GetCodecsList() const {
- using namespace NPrivate;
+ using namespace NPrivate;
TVector<TString> vs;
- vs.push_back("none");
-
- for (const auto& it : Registry) {
- vs.push_back(it.first);
- }
-
- Sort(vs.begin(), vs.end());
- return vs;
- }
-
+ vs.push_back("none");
+
+ for (const auto& it : Registry) {
+ vs.push_back(it.first);
+ }
+
+ Sort(vs.begin(), vs.end());
+ return vs;
+ }
+
struct TSolarCodecFactory : ICodecFactory {
- TCodecPtr MakeCodec(TStringBuf name) const override {
- if (TSolarCodec::MyNameShortInt() == name) {
- return new TSolarCodecShortInt();
- }
- if (TSolarCodec::MyName() == name) {
- return new TSolarCodec();
- }
+ TCodecPtr MakeCodec(TStringBuf name) const override {
+ if (TSolarCodec::MyNameShortInt() == name) {
+ return new TSolarCodecShortInt();
+ }
+ if (TSolarCodec::MyName() == name) {
+ return new TSolarCodec();
+ }
if (name.EndsWith(TStringBuf("-a"))) {
return MakeCodecImpl<TAdaptiveSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size()).Chop(2));
- } else {
+ } else {
return MakeCodecImpl<TSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size()));
- }
- }
-
+ }
+ }
+
template <class TCodecCls>
- TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const {
+ TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const {
if (TStringBuf("-8k") == type) {
- return new TCodecCls(1 << 13);
- }
+ return new TCodecCls(1 << 13);
+ }
if (TStringBuf("-16k") == type) {
- return new TCodecCls(1 << 14);
- }
+ return new TCodecCls(1 << 14);
+ }
if (TStringBuf("-32k") == type) {
- return new TCodecCls(1 << 15);
- }
+ return new TCodecCls(1 << 15);
+ }
if (TStringBuf("-64k") == type) {
- return new TCodecCls(1 << 16);
- }
+ return new TCodecCls(1 << 16);
+ }
if (TStringBuf("-256k") == type) {
- return new TCodecCls(1 << 18);
- }
- ythrow TNoCodecException(name);
- }
-
+ return new TCodecCls(1 << 18);
+ }
+ ythrow TNoCodecException(name);
+ }
+
TVector<TString> ListNames() const override {
TVector<TString> vs;
vs.push_back(ToString(TSolarCodec::MyName()));
@@ -113,114 +113,114 @@ namespace NCodecs {
vs.push_back(ToString(TSolarCodec::MyName64kAdapt()));
vs.push_back(ToString(TSolarCodec::MyName256kAdapt()));
vs.push_back(ToString(TSolarCodec::MyNameShortInt()));
- return vs;
- }
- };
-
+ return vs;
+ }
+ };
+
struct TZStdDictCodecFactory : ICodecFactory {
- TCodecPtr MakeCodec(TStringBuf name) const override {
- return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name));
- }
-
+ TCodecPtr MakeCodec(TStringBuf name) const override {
+ return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name));
+ }
+
TVector<TString> ListNames() const override {
- return TZStdDictCodec::ListCompressionNames();
- }
- };
-
+ return TZStdDictCodec::ListCompressionNames();
+ }
+ };
+
struct TCompTableCodecFactory : ICodecFactory {
- TCodecPtr MakeCodec(TStringBuf name) const override {
- if (TCompTableCodec::MyNameHQ() == name) {
- return new TCompTableCodec(TCompTableCodec::Q_HIGH);
- } else if (TCompTableCodec::MyNameLQ() == name) {
- return new TCompTableCodec(TCompTableCodec::Q_LOW);
- } else {
- Y_ENSURE_EX(false, TNoCodecException(name));
- return nullptr;
- }
- }
-
+ TCodecPtr MakeCodec(TStringBuf name) const override {
+ if (TCompTableCodec::MyNameHQ() == name) {
+ return new TCompTableCodec(TCompTableCodec::Q_HIGH);
+ } else if (TCompTableCodec::MyNameLQ() == name) {
+ return new TCompTableCodec(TCompTableCodec::Q_LOW);
+ } else {
+ Y_ENSURE_EX(false, TNoCodecException(name));
+ return nullptr;
+ }
+ }
+
TVector<TString> ListNames() const override {
TVector<TString> vs;
vs.push_back(ToString(TCompTableCodec::MyNameHQ()));
vs.push_back(ToString(TCompTableCodec::MyNameLQ()));
- return vs;
- }
- };
-
+ return vs;
+ }
+ };
+
struct TBlockCodec : ICodec {
- const NBlockCodecs::ICodec* Codec;
-
- TBlockCodec(TStringBuf name)
+ const NBlockCodecs::ICodec* Codec;
+
+ TBlockCodec(TStringBuf name)
: Codec(NBlockCodecs::Codec(name))
- {
- }
-
+ {
+ }
+
TString GetName() const override {
return ToString(Codec->Name());
- }
-
- ui8 Encode(TStringBuf r, TBuffer& b) const override {
- Codec->Encode(r, b);
- return 0;
- }
-
- void Decode(TStringBuf r, TBuffer& b) const override {
- // TODO: throws exception that is not TCodecException
- Codec->Decode(r, b);
- }
-
- protected:
- void DoLearn(ISequenceReader&) override {
- }
- };
-
+ }
+
+ ui8 Encode(TStringBuf r, TBuffer& b) const override {
+ Codec->Encode(r, b);
+ return 0;
+ }
+
+ void Decode(TStringBuf r, TBuffer& b) const override {
+ // TODO: throws exception that is not TCodecException
+ Codec->Decode(r, b);
+ }
+
+ protected:
+ void DoLearn(ISequenceReader&) override {
+ }
+ };
+
struct TBlockCodecsFactory : ICodecFactory {
using TRegistry = THashMap<TString, TCodecPtr>;
- TRegistry Registry;
-
+ TRegistry Registry;
+
TBlockCodecsFactory() {
for (TStringBuf codec : NBlockCodecs::ListAllCodecs()) {
Register(codec);
}
- }
-
- void Register(TStringBuf name) {
- TCodecPtr p = Registry[name] = new TBlockCodec(name);
- Registry[p->GetName()] = p;
- }
-
- TCodecPtr MakeCodec(TStringBuf name) const override {
+ }
+
+ void Register(TStringBuf name) {
+ TCodecPtr p = Registry[name] = new TBlockCodec(name);
+ Registry[p->GetName()] = p;
+ }
+
+ TCodecPtr MakeCodec(TStringBuf name) const override {
if (!Registry.contains(name)) {
- ythrow TNoCodecException(name);
- }
- return Registry.find(name)->second;
- }
-
+ ythrow TNoCodecException(name);
+ }
+ return Registry.find(name)->second;
+ }
+
TVector<TString> ListNames() const override {
TVector<TString> res;
- for (const auto& it : Registry) {
- res.push_back(it.first);
- }
- return res;
- }
- };
-
+ for (const auto& it : Registry) {
+ res.push_back(it.first);
+ }
+ return res;
+ }
+ };
+
TCodecRegistry::TCodecRegistry() {
- RegisterFactory(new TInstanceFactory<TTrivialCodec>);
- RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>);
- RegisterFactory(new TInstanceFactory<THuffmanCodec>);
+ RegisterFactory(new TInstanceFactory<TTrivialCodec>);
+ RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>);
+ RegisterFactory(new TInstanceFactory<THuffmanCodec>);
RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>);
RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>);
- RegisterFactory(new TSolarCodecFactory);
- RegisterFactory(new TZStdDictCodecFactory);
- RegisterFactory(new TCompTableCodecFactory);
- RegisterFactory(new TBlockCodecsFactory);
- }
-
- }
-
- void RegisterCodecFactory(TCodecFactoryPtr fact) {
- Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact);
- }
-
-}
+ RegisterFactory(new TSolarCodecFactory);
+ RegisterFactory(new TZStdDictCodecFactory);
+ RegisterFactory(new TCompTableCodecFactory);
+ RegisterFactory(new TBlockCodecsFactory);
+ }
+
+ }
+
+ void RegisterCodecFactory(TCodecFactoryPtr fact) {
+ Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact);
+ }
+
+}
diff --git a/library/cpp/codecs/codecs_registry.h b/library/cpp/codecs/codecs_registry.h
index 53710310d5..31170afd62 100644
--- a/library/cpp/codecs/codecs_registry.h
+++ b/library/cpp/codecs/codecs_registry.h
@@ -1,60 +1,60 @@
-#pragma once
-
-#include "codecs.h"
+#pragma once
+
+#include "codecs.h"
#include <util/string/cast.h>
-
-namespace NCodecs {
+
+namespace NCodecs {
struct TNoCodecException : TCodecException {
TNoCodecException(TStringBuf name) {
- (*this) << "unknown codec: " << name;
- }
- };
-
+ (*this) << "unknown codec: " << name;
+ }
+ };
+
struct ICodecFactory : TAtomicRefCount<ICodecFactory> {
- virtual ~ICodecFactory() = default;
- virtual TCodecPtr MakeCodec(TStringBuf name) const = 0;
+ virtual ~ICodecFactory() = default;
+ virtual TCodecPtr MakeCodec(TStringBuf name) const = 0;
virtual TVector<TString> ListNames() const = 0;
- };
-
- typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr;
-
- namespace NPrivate {
+ };
+
+ typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr;
+
+ namespace NPrivate {
template <typename TCodec>
struct TInstanceFactory : ICodecFactory {
- TCodecPtr MakeCodec(TStringBuf) const override {
- return new TCodec;
- }
-
+ TCodecPtr MakeCodec(TStringBuf) const override {
+ return new TCodec;
+ }
+
TVector<TString> ListNames() const override {
TVector<TString> vs;
vs.push_back(ToString(TCodec::MyName()));
- return vs;
- }
- };
-
- class TCodecRegistry {
+ return vs;
+ }
+ };
+
+ class TCodecRegistry {
using TRegistry = THashMap<TString, TIntrusivePtr<ICodecFactory>>;
- TRegistry Registry;
-
- public:
- using TFactoryPtr = TIntrusivePtr<ICodecFactory>;
-
- TCodecRegistry();
-
- void RegisterFactory(TFactoryPtr fac);
-
- TCodecPtr GetCodec(TStringBuf name) const;
-
+ TRegistry Registry;
+
+ public:
+ using TFactoryPtr = TIntrusivePtr<ICodecFactory>;
+
+ TCodecRegistry();
+
+ void RegisterFactory(TFactoryPtr fac);
+
+ TCodecPtr GetCodec(TStringBuf name) const;
+
TVector<TString> GetCodecsList() const;
- };
-
- }
-
- void RegisterCodecFactory(TCodecFactoryPtr fact);
-
+ };
+
+ }
+
+ void RegisterCodecFactory(TCodecFactoryPtr fact);
+
template <typename TCodec>
- void RegisterCodec() {
- RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>());
- }
-
-}
+ void RegisterCodec() {
+ RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>());
+ }
+
+}
diff --git a/library/cpp/codecs/comptable_codec.cpp b/library/cpp/codecs/comptable_codec.cpp
index 476b8ada80..1eca4354c6 100644
--- a/library/cpp/codecs/comptable_codec.cpp
+++ b/library/cpp/codecs/comptable_codec.cpp
@@ -1,108 +1,108 @@
-#include "comptable_codec.h"
-
+#include "comptable_codec.h"
+
#include <library/cpp/comptable/comptable.h>
#include <util/string/cast.h>
-
-namespace NCodecs {
+
+namespace NCodecs {
class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> {
- public:
- TImpl(EQuality q)
- : Quality(q)
+ public:
+ TImpl(EQuality q)
+ : Quality(q)
{
}
-
- void Init() {
- Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table});
- Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table});
- }
-
- ui8 Encode(TStringBuf in, TBuffer& out) const {
- out.Clear();
- if (!in) {
- return 0;
- }
-
+
+ void Init() {
+ Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table});
+ Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table});
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& out) const {
+ out.Clear();
+ if (!in) {
+ return 0;
+ }
+
TVector<char> result;
- Compressor->Compress(in, &result);
- out.Assign(&result[0], result.size());
- return 0;
- }
-
- void Decode(TStringBuf in, TBuffer& out) const {
- out.Clear();
- if (!in) {
- return;
- }
-
+ Compressor->Compress(in, &result);
+ out.Assign(&result[0], result.size());
+ return 0;
+ }
+
+ void Decode(TStringBuf in, TBuffer& out) const {
+ out.Clear();
+ if (!in) {
+ return;
+ }
+
TVector<char> result;
- Decompressor->Decompress(in, &result);
- out.Assign(&result[0], result.size());
- }
-
- void DoLearn(ISequenceReader& in) {
- NCompTable::TDataSampler sampler;
- TStringBuf region;
- while (in.NextRegion(region)) {
- if (!region) {
- continue;
- }
-
- sampler.AddStat(region);
- }
-
- sampler.BuildTable(Table);
- Init();
- }
-
+ Decompressor->Decompress(in, &result);
+ out.Assign(&result[0], result.size());
+ }
+
+ void DoLearn(ISequenceReader& in) {
+ NCompTable::TDataSampler sampler;
+ TStringBuf region;
+ while (in.NextRegion(region)) {
+ if (!region) {
+ continue;
+ }
+
+ sampler.AddStat(region);
+ }
+
+ sampler.BuildTable(Table);
+ Init();
+ }
+
void Save(IOutputStream* out) const {
- ::Save(out, Table);
- }
-
+ ::Save(out, Table);
+ }
+
void Load(IInputStream* in) {
- ::Load(in, Table);
- Init();
- }
-
- NCompTable::TCompressorTable Table;
- THolder<NCompTable::TChunkCompressor> Compressor;
- THolder<NCompTable::TChunkDecompressor> Decompressor;
- const EQuality Quality;
- static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5);
- };
-
- TCompTableCodec::TCompTableCodec(EQuality q)
- : Impl(new TImpl{q})
- {
- MyTraits.NeedsTraining = true;
- MyTraits.SizeOnEncodeMultiplier = 2;
- MyTraits.SizeOnDecodeMultiplier = 10;
- MyTraits.RecommendedSampleSize = TImpl::SampleSize;
- }
-
- TCompTableCodec::~TCompTableCodec() = default;
-
+ ::Load(in, Table);
+ Init();
+ }
+
+ NCompTable::TCompressorTable Table;
+ THolder<NCompTable::TChunkCompressor> Compressor;
+ THolder<NCompTable::TChunkDecompressor> Decompressor;
+ const EQuality Quality;
+ static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5);
+ };
+
+ TCompTableCodec::TCompTableCodec(EQuality q)
+ : Impl(new TImpl{q})
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.SizeOnEncodeMultiplier = 2;
+ MyTraits.SizeOnDecodeMultiplier = 10;
+ MyTraits.RecommendedSampleSize = TImpl::SampleSize;
+ }
+
+ TCompTableCodec::~TCompTableCodec() = default;
+
TString TCompTableCodec::GetName() const {
return ToString(Impl->Quality ? MyNameHQ() : MyNameLQ());
- }
-
- ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const {
- return Impl->Encode(in, out);
- }
-
- void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const {
- Impl->Decode(in, out);
- }
-
- void TCompTableCodec::DoLearn(ISequenceReader& in) {
- Impl->DoLearn(in);
- }
-
+ }
+
+ ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const {
+ return Impl->Encode(in, out);
+ }
+
+ void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const {
+ Impl->Decode(in, out);
+ }
+
+ void TCompTableCodec::DoLearn(ISequenceReader& in) {
+ Impl->DoLearn(in);
+ }
+
void TCompTableCodec::Save(IOutputStream* out) const {
- Impl->Save(out);
- }
-
+ Impl->Save(out);
+ }
+
void TCompTableCodec::Load(IInputStream* in) {
- Impl->Load(in);
- }
-
-}
+ Impl->Load(in);
+ }
+
+}
diff --git a/library/cpp/codecs/comptable_codec.h b/library/cpp/codecs/comptable_codec.h
index 7ba4f4c543..1a10c8241e 100644
--- a/library/cpp/codecs/comptable_codec.h
+++ b/library/cpp/codecs/comptable_codec.h
@@ -1,40 +1,40 @@
-#pragma once
-
-#include "codecs.h"
-
-#include <util/generic/ptr.h>
-
-namespace NCodecs {
+#pragma once
+
+#include "codecs.h"
+
+#include <util/generic/ptr.h>
+
+namespace NCodecs {
class TCompTableCodec: public ICodec {
class TImpl;
TIntrusivePtr<TImpl> Impl;
-
+
public:
enum EQuality {
Q_LOW = 0,
Q_HIGH = 1
};
-
+
explicit TCompTableCodec(EQuality q = Q_HIGH);
~TCompTableCodec() override;
-
+
static TStringBuf MyNameHQ() {
return "comptable-hq";
}
static TStringBuf MyNameLQ() {
return "comptable-lq";
}
-
+
TString GetName() const override;
-
+
ui8 Encode(TStringBuf in, TBuffer& out) const override;
-
+
void Decode(TStringBuf in, TBuffer& out) const override;
-
+
protected:
void DoLearn(ISequenceReader& in) override;
void Save(IOutputStream* out) const override;
void Load(IInputStream* in) override;
};
-
-}
+
+}
diff --git a/library/cpp/codecs/delta_codec.cpp b/library/cpp/codecs/delta_codec.cpp
index 61606d6f6f..28d6b6e3bb 100644
--- a/library/cpp/codecs/delta_codec.cpp
+++ b/library/cpp/codecs/delta_codec.cpp
@@ -1,6 +1,6 @@
-#include "delta_codec.h"
-
-namespace NCodecs {
+#include "delta_codec.h"
+
+namespace NCodecs {
template <>
TStringBuf TDeltaCodec<ui64, true>::MyName() {
return "delta64-unsigned";
@@ -17,5 +17,5 @@ namespace NCodecs {
TStringBuf TDeltaCodec<ui32, false>::MyName() {
return "delta32-signed";
}
-
-}
+
+}
diff --git a/library/cpp/codecs/delta_codec.h b/library/cpp/codecs/delta_codec.h
index 21325825e6..7398b3ae80 100644
--- a/library/cpp/codecs/delta_codec.h
+++ b/library/cpp/codecs/delta_codec.h
@@ -1,102 +1,102 @@
-#pragma once
-
-#include "codecs.h"
-
+#pragma once
+
+#include "codecs.h"
+
#include <util/generic/array_ref.h>
-#include <util/generic/typetraits.h>
+#include <util/generic/typetraits.h>
#include <util/generic/bitops.h>
#include <util/string/cast.h>
-
-namespace NCodecs {
+
+namespace NCodecs {
template <typename T = ui64, bool UnsignedDelta = true>
class TDeltaCodec: public ICodec {
static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value");
-
+
public:
using TUnsigned = std::make_unsigned_t<T>;
using TSigned = std::make_signed_t<T>;
using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>;
-
+
private:
const TDelta MinDelta{Min<TDelta>()};
const TDelta MaxDelta{Max<TDelta>() - 1};
const TDelta InvalidDelta{MaxDelta + 1};
-
+
Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) {
return a + b;
}
-
+
Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) {
return a - b;
}
-
+
public:
struct TDecoder {
const TDelta InvalidDelta{Max<TDelta>()};
-
+
T Last = 0;
T Result = 0;
-
+
bool First = true;
bool Invalid = false;
-
+
Y_FORCE_INLINE bool Decode(TDelta t) {
if (Y_UNLIKELY(First)) {
First = false;
Result = Last = t;
return true;
}
-
+
if (Y_UNLIKELY(Invalid)) {
Invalid = false;
Last = 0;
Result = t;
return true;
}
-
+
Result = (Last += t);
Invalid = t == InvalidDelta;
return !Invalid;
- }
+ }
};
-
+
public:
static TStringBuf MyName();
-
+
TDeltaCodec() {
MyTraits.SizeOfInputElement = sizeof(T);
MyTraits.AssumesStructuredInput = true;
- }
-
+ }
+
TString GetName() const override {
return ToString(MyName());
}
-
+
template <class TItem>
static void AppendTo(TBuffer& b, TItem t) {
b.Append((char*)&t, sizeof(t));
}
-
+
ui8 Encode(TStringBuf s, TBuffer& b) const override {
b.Clear();
if (s.empty()) {
return 0;
}
-
+
b.Reserve(s.size());
TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)};
-
+
const T* it = tin.begin();
TDelta last = *(it++);
AppendTo(b, last);
-
+
TDelta maxt = SubSafe(MaxDelta, last);
TDelta mint = AddSafe(MinDelta, last);
-
+
for (; it != tin.end(); ++it) {
TDelta t = *it;
-
+
if (Y_LIKELY((t >= mint) & (t <= maxt))) {
AppendTo(b, t - last);
last = t;
@@ -111,33 +111,33 @@ namespace NCodecs {
mint = MinDelta;
}
}
-
+
return 0;
}
-
+
void Decode(TStringBuf s, TBuffer& b) const override {
b.Clear();
if (s.empty()) {
return;
- }
-
+ }
+
b.Reserve(s.size());
TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)};
-
+
TDecoder dec;
-
+
for (const T* it = tin.begin(); it != tin.end(); ++it) {
T tmp;
memcpy(&tmp, it, sizeof(tmp));
if (dec.Decode(tmp)) {
AppendTo(b, dec.Result);
}
- }
- }
-
+ }
+ }
+
protected:
void DoLearn(ISequenceReader&) override {
}
};
-
-}
+
+}
diff --git a/library/cpp/codecs/float_huffman.h b/library/cpp/codecs/float_huffman.h
index 786a8eae1d..f03fc240ce 100644
--- a/library/cpp/codecs/float_huffman.h
+++ b/library/cpp/codecs/float_huffman.h
@@ -5,7 +5,7 @@
#include <util/generic/strbuf.h>
#include <array>
-
+
namespace NCodecs::NFloatHuff {
TString Encode(TArrayRef<const float> factors);
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp
index 561bfbca01..2fb46029bf 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp
@@ -1,85 +1,85 @@
-#include "gd_builder.h"
-
+#include "gd_builder.h"
+
#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
-#include <util/generic/algorithm.h>
-
-#include <util/random/shuffle.h>
+#include <util/generic/algorithm.h>
+
+#include <util/random/shuffle.h>
#include <util/stream/output.h>
-#include <util/string/printf.h>
-#include <util/system/rusage.h>
-
-namespace NGreedyDict {
+#include <util/string/printf.h>
+#include <util/system/rusage.h>
+
+namespace NGreedyDict {
void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) {
if (!Current) {
Current = MakeHolder<TEntrySet>();
Current->InitWithAlpha();
}
-
+
TEntrySet& set = *Current;
-
+
for (auto& it : set)
it.Count = 0;
-
+
CompoundCounts = nullptr;
CompoundCountsPool.Clear();
-
+
if (!final) {
CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool);
CompoundCounts->reserve(maxcand);
}
-
+
Shuffle(Input.begin(), Input.end(), Rng);
-
+
for (auto str : Input) {
if (!final && CompoundCounts->size() > maxcand)
break;
-
+
i32 prev = -1;
-
+
while (!!str) {
TEntry* e = set.FindPrefix(str);
ui32 num = e->Number;
-
+
e->Count += 1;
if (!final && prev >= 0) {
(*CompoundCounts)[Compose(prev, num)] += 1;
}
-
+
prev = num;
++set.TotalCount;
- }
+ }
}
-
+
Current->SetModelP();
- }
-
+ }
+
ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {
TAutoPtr<TEntrySet> newset = new TEntrySet;
newset->InitWithAlpha();
maxent -= newset->size();
-
+
ui32 additions = 0;
ui32 deletions = 0;
-
+
{
const TEntrySet& set = *Current;
-
+
Candidates.clear();
const ui32 total = set.TotalCount;
const float minpval = Settings.MinPValue;
const EEntryStatTest test = Settings.StatTest;
const EEntryScore score = Settings.Score;
const ui32 mincnt = Settings.MinAbsCount;
-
+
for (const auto& it : set) {
const TEntry& e = it;
float modelp = e.ModelP;
ui32 cnt = e.Count;
-
+
if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval)
Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number));
}
-
+
if (!!CompoundCounts) {
for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) {
const TEntry& prev = set.Get(Prev(it->first));
@@ -89,13 +89,13 @@ namespace NGreedyDict {
if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)
Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));
}
- }
-
+ }
+
Sort(Candidates.begin(), Candidates.end());
-
+
if (Candidates.size() > maxent)
Candidates.resize(maxent);
-
+
for (const auto& candidate : Candidates) {
if (IsCompound(candidate.second)) {
additions++;
@@ -103,40 +103,40 @@ namespace NGreedyDict {
} else {
newset->Add(set.Get(candidate.second).Str);
}
- }
+ }
deletions = set.size() - (newset->size() - additions);
- }
-
+ }
+
Current = newset;
Current->BuildHierarchy();
return deletions + additions;
- }
-
+ }
+
ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {
size_t totalsz = 0;
for (auto it : Input)
totalsz += it.size();
-
+
while (maxiters) {
maxiters--;
-
+
RebuildCounts(maxentries * Settings.GrowLimit, false);
-
+
if (Settings.Verbose) {
TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size());
Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;
}
-
+
ui32 diff = BuildNextGeneration(maxentries);
if (Current->size() == maxentries && diff < mindiff)
break;
- }
-
+ }
+
RebuildCounts(0, true);
Current->SetScores(Settings.Score);
return maxiters;
- }
-
-}
+ }
+
+}
diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h
index b8e9a5e37b..7f3cea88cb 100644
--- a/library/cpp/codecs/greedy_dict/gd_builder.h
+++ b/library/cpp/codecs/greedy_dict/gd_builder.h
@@ -1,94 +1,94 @@
-#pragma once
-
-#include "gd_entry.h"
-
-#include <util/generic/hash.h>
-#include <util/random/fast.h>
-
-namespace NGreedyDict {
+#pragma once
+
+#include "gd_entry.h"
+
+#include <util/generic/hash.h>
+#include <util/random/fast.h>
+
+namespace NGreedyDict {
struct TBuildSettings {
EEntryStatTest StatTest = EST_SIMPLE_NORM;
EEntryScore Score = ES_LEN_SIMPLE;
-
+
float MinPValue = 0.75;
ui32 MinAbsCount = 10;
ui32 GrowLimit = 10; // times of maxentries
bool Verbose = false;
};
-
+
class TDictBuilder {
using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>;
using TCandidate = std::pair<float, ui64>;
using TCandidates = TVector<TCandidate>;
-
+
private:
TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb};
TStringBufs Input;
-
+
THolder<TEntrySet> Current;
-
+
TMemoryPool CompoundCountsPool;
THolder<TCompoundCounts> CompoundCounts;
-
+
TCandidates Candidates;
-
+
TBuildSettings Settings;
-
+
public:
TDictBuilder(const TBuildSettings& s = TBuildSettings())
: CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance())
, Settings(s)
{
}
-
+
void SetInput(const TStringBufs& in) {
Input = in;
}
-
+
const TBuildSettings& GetSettings() const {
return Settings;
}
-
+
TBuildSettings& GetSettings() {
return Settings;
}
-
+
void SetSettings(const TBuildSettings& s) {
Settings = s;
}
-
+
TEntrySet& EntrySet() {
return *Current;
}
-
+
const TEntrySet& EntrySet() const {
return *Current;
}
-
+
THolder<TEntrySet> ReleaseEntrySet() {
return std::move(Current);
}
-
+
ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10);
-
+
public:
void RebuildCounts(ui32 maxcand, bool final);
ui32 /*diff size*/ BuildNextGeneration(ui32 maxent);
-
+
static bool IsCompound(ui64 ent) {
return ent & 0xFFFFFFFF00000000ULL;
}
-
+
static ui32 Next(ui64 ent) {
return ent;
}
static ui32 Prev(ui64 ent) {
return (ent >> 32) - 1;
}
-
+
static ui64 Compose(ui32 prev, ui32 next) {
return ((prev + 1ULL) << 32) | next;
}
};
-
-}
+
+}
diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp
index 2c315c7f7c..f23a754976 100644
--- a/library/cpp/codecs/greedy_dict/gd_entry.cpp
+++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp
@@ -1,98 +1,98 @@
-#include "gd_entry.h"
-#include "gd_stats.h"
-
-#include <util/generic/algorithm.h>
-#include <util/generic/singleton.h>
-
-namespace NGreedyDict {
+#include "gd_entry.h"
+#include "gd_stats.h"
+
+#include <util/generic/algorithm.h>
+#include <util/generic/singleton.h>
+
+namespace NGreedyDict {
class TAlphas {
char Memory[512];
-
+
public:
TStringBufs Alphas;
-
+
TAlphas() {
for (ui32 i = 0; i < 256; ++i) {
Memory[2 * i] = (char)i;
Memory[2 * i + 1] = 0;
-
+
Alphas.push_back(TStringBuf(&Memory[2 * i], 1));
}
}
};
-
+
void TEntrySet::InitWithAlpha() {
Pool.ClearKeepFirstChunk();
const TStringBufs& a = Singleton<TAlphas>()->Alphas;
for (auto it : a) {
Add(it);
- }
+ }
BuildHierarchy();
- }
-
+ }
+
void TEntrySet::BuildHierarchy() {
Sort(begin(), end(), TEntry::StrLess);
-
+
TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED);
-
+
for (iterator it = begin(); it != end(); ++it) {
it->Number = (it - begin());
TStringBuf suff = it->Str;
size_t len = 0;
ui32 val = 0;
-
+
if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) {
it->NearestPrefix = val;
}
-
+
builder.Add(suff.data(), suff.size(), it->Number);
- }
-
+ }
+
TBufferOutput bout;
builder.Save(bout);
Trie.Init(TBlob::FromBuffer(bout.Buffer()));
- }
-
+ }
+
TEntry* TEntrySet::FindPrefix(TStringBuf& str) {
size_t len = 0;
ui32 off = 0;
-
+
if (!Trie.FindLongestPrefix(str, &len, &off)) {
return nullptr;
}
-
+
str.Skip(len);
return &Get(off);
- }
-
+ }
+
void TEntrySet::SetModelP() {
for (iterator it = begin(); it != end(); ++it) {
TEntry& e = *it;
-
+
if (!e.HasPrefix()) {
e.ModelP = 0;
continue;
}
-
+
TStringBuf suff = e.Str;
const TEntry& p = Get(e.NearestPrefix);
suff.Skip(p.Len());
-
+
float modelp = float(p.Count + e.Count) / TotalCount;
-
+
while (!!suff) {
TEntry* pp = FindPrefix(suff);
modelp *= float(pp->Count + e.Count) / TotalCount;
}
-
+
e.ModelP = modelp;
- }
- }
-
+ }
+ }
+
void TEntrySet::SetScores(EEntryScore s) {
for (auto& it : *this) {
it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount);
}
- }
-
-}
+ }
+
+}
diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h
index 18b5be0e15..0362fd9f99 100644
--- a/library/cpp/codecs/greedy_dict/gd_entry.h
+++ b/library/cpp/codecs/greedy_dict/gd_entry.h
@@ -1,42 +1,42 @@
-#pragma once
-
-#include "gd_stats.h"
-
+#pragma once
+
+#include "gd_stats.h"
+
#include <library/cpp/containers/comptrie/comptrie.h>
-
-#include <util/generic/ptr.h>
-#include <util/generic/strbuf.h>
-#include <util/generic/vector.h>
-
-#include <util/memory/pool.h>
-
-namespace NGreedyDict {
+
+#include <util/generic/ptr.h>
+#include <util/generic/strbuf.h>
+#include <util/generic/vector.h>
+
+#include <util/memory/pool.h>
+
+namespace NGreedyDict {
using TStringBufs = TVector<TStringBuf>;
-
+
struct TEntry {
static const i32 NoPrefix = -1;
-
+
TStringBuf Str;
-
+
i32 NearestPrefix = NoPrefix;
ui32 Count = 0;
ui32 Number = 0;
float ModelP = 0;
float Score = 0;
-
+
TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0)
: Str(b)
, Count(cnt)
{
}
-
+
bool HasPrefix() const {
return NearestPrefix != NoPrefix;
}
ui32 Len() const {
return Str.size();
}
-
+
static bool StrLess(const TEntry& a, const TEntry& b) {
return a.Str < b.Str;
}
@@ -47,20 +47,20 @@ namespace NGreedyDict {
return a.Score > b.Score;
}
};
-
+
class TEntrySet: public TVector<TEntry>, TNonCopyable {
TMemoryPool Pool{8112};
TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie;
-
+
public:
ui32 TotalCount = 0;
-
+
void InitWithAlpha();
-
+
void Add(TStringBuf a) {
push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size()));
}
-
+
void Add(TStringBuf a, TStringBuf b) {
size_t sz = a.size() + b.size();
char* p = (char*)Pool.Allocate(sz);
@@ -68,36 +68,36 @@ namespace NGreedyDict {
memcpy(p + a.size(), b.data(), b.size());
push_back(TStringBuf(p, sz));
}
-
+
TEntry& Get(ui32 idx) {
return (*this)[idx];
}
-
+
const TEntry& Get(ui32 idx) const {
return (*this)[idx];
}
-
+
void BuildHierarchy();
-
+
// longest prefix
TEntry* FindPrefix(TStringBuf& str);
-
+
const TEntry* FindPrefix(TStringBuf& str) const {
return ((TEntrySet*)this)->FindPrefix(str);
}
-
+
const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) {
if (!e.HasPrefix())
return nullptr;
-
+
const TEntry& p = Get(e.NearestPrefix);
suff = e.Str;
suff.Skip(p.Str.size());
return &p;
}
-
+
void SetModelP();
void SetScores(EEntryScore);
};
-
-}
+
+}
diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h
index b63c4c38d2..3c209fc67d 100644
--- a/library/cpp/codecs/greedy_dict/gd_stats.h
+++ b/library/cpp/codecs/greedy_dict/gd_stats.h
@@ -1,10 +1,10 @@
-#pragma once
-
+#pragma once
+
#include <util/generic/ymath.h>
-#include <util/generic/algorithm.h>
-#include <util/generic/yexception.h>
-
-namespace NGreedyDict {
+#include <util/generic/algorithm.h>
+#include <util/generic/yexception.h>
+
+namespace NGreedyDict {
enum EEntryScore {
ES_COUNT,
ES_LEN_COUNT,
@@ -12,33 +12,33 @@ namespace NGreedyDict {
ES_LEN_SIMPLE,
ES_SOLAR
};
-
+
enum EEntryStatTest {
EST_NONE = 0,
EST_SIMPLE_NORM = 2
};
-
+
inline float ModelP(ui32 countA, ui32 countB, ui32 total) {
return float(countA) * countB / total / total;
}
-
+
// P (ab | dependent)
inline float SimpleTest(float modelp, ui32 countAB, ui32 total) {
float realp = float(countAB) / total;
return modelp >= realp ? 0 : (realp - modelp);
}
-
+
inline float SolarTest(float modelp, ui32 countAB, ui32 total) {
float realp = float(countAB) / total;
return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1));
}
-
+
// P (ab | dependent) / P (ab)
inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) {
float realp = float(countAB) / total;
return modelp >= realp ? 0 : (realp - modelp) / realp;
}
-
+
inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) {
if (!total) {
return 0;
@@ -50,9 +50,9 @@ namespace NGreedyDict {
return SimpleTestNorm(modelp, countAB, total);
}
Y_FAIL("no way!");
- return 0;
- }
-
+ return 0;
+ }
+
inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) {
if (!total) {
return 0;
@@ -73,7 +73,7 @@ namespace NGreedyDict {
return SolarTest(modelp, count, total);
}
Y_FAIL("no way!");
- return 0;
- }
-
-}
+ return 0;
+ }
+
+}
diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp
index 679089a11b..60ab9f7c30 100644
--- a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp
+++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp
@@ -1,282 +1,282 @@
-#include "gd_builder.h"
-
+#include "gd_builder.h"
+
#include <library/cpp/testing/unittest/registar.h>
#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
-#include <util/string/printf.h>
+#include <util/string/printf.h>
#include <util/generic/ymath.h>
-
-class TGreedyDictTest: public TTestBase {
+
+class TGreedyDictTest: public TTestBase {
UNIT_TEST_SUITE(TGreedyDictTest);
- UNIT_TEST(TestEntrySet)
- UNIT_TEST(TestBuilder0)
- UNIT_TEST(TestBuilder)
+ UNIT_TEST(TestEntrySet)
+ UNIT_TEST(TestBuilder0)
+ UNIT_TEST(TestBuilder)
UNIT_TEST_SUITE_END();
-
- void TestEntrySet() {
- using namespace NGreedyDict;
-
- {
- TEntrySet d;
-
- d.InitWithAlpha();
-
- for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) {
- UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix));
- UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin()));
- }
-
- UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u);
- TStringBuf s = "aaabbb";
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
- UNIT_ASSERT_VALUES_EQUAL(s, "aabbb");
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
- UNIT_ASSERT_VALUES_EQUAL(s, "abbb");
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
- UNIT_ASSERT_VALUES_EQUAL(s, "bbb");
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
- UNIT_ASSERT_VALUES_EQUAL(s, "bb");
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
- UNIT_ASSERT_VALUES_EQUAL(s, "b");
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
- UNIT_ASSERT_VALUES_EQUAL(s, "");
- s = TStringBuf("", 1);
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1));
- UNIT_ASSERT_VALUES_EQUAL(s, "");
- s = "\xFF";
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF");
- UNIT_ASSERT_VALUES_EQUAL(s, "");
- }
- {
- TEntrySet d;
- d.Add("a");
- d.Add("b");
- d.Add("b", "a");
- d.BuildHierarchy();
-
- UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u);
-
- TStringBuf s = "bab";
- UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba");
- UNIT_ASSERT_VALUES_EQUAL(s, "b");
- }
- {
- TEntrySet d;
-
- d.Add("a");
- d.Add("aa");
- d.Add("aaa");
- d.Add("aab");
- d.Add("b");
- d.Add("ba");
-
- d.BuildHierarchy();
-
- UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u);
- {
- TStringBuf s = "aaaaa";
- const TEntry* e = d.FindPrefix(s);
- UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa");
- UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u);
- UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1);
- UNIT_ASSERT_VALUES_EQUAL(s, "aa");
- }
-
- {
- TStringBuf s = "a";
- const TEntry* e = d.FindPrefix(s);
- UNIT_ASSERT_VALUES_EQUAL(e->Str, "a");
- UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u);
- UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1);
- UNIT_ASSERT_VALUES_EQUAL(s, "");
- }
-
- {
- TStringBuf s = "bab";
- const TEntry* e = d.FindPrefix(s);
- UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba");
- UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u);
- UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4);
- UNIT_ASSERT_VALUES_EQUAL(s, "b");
- }
-
- {
- TStringBuf s = "bba";
- const TEntry* e = d.FindPrefix(s);
- UNIT_ASSERT_VALUES_EQUAL(e->Str, "b");
- UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u);
- UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1);
- UNIT_ASSERT_VALUES_EQUAL(s, "ba");
- }
- }
- }
-
- void TestBuilder0() {
- using namespace NGreedyDict;
- ui32 a = 1, b = 11;
- ui64 ab = TDictBuilder::Compose(a, b);
- UNIT_ASSERT(TDictBuilder::IsCompound(ab));
- UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a);
- UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b);
- }
-
- void FillData(NGreedyDict::TStringBufs& data) {
+
+ void TestEntrySet() {
+ using namespace NGreedyDict;
+
+ {
+ TEntrySet d;
+
+ d.InitWithAlpha();
+
+ for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) {
+ UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix));
+ UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin()));
+ }
+
+ UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u);
+ TStringBuf s = "aaabbb";
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
+ UNIT_ASSERT_VALUES_EQUAL(s, "aabbb");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
+ UNIT_ASSERT_VALUES_EQUAL(s, "abbb");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");
+ UNIT_ASSERT_VALUES_EQUAL(s, "bbb");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
+ UNIT_ASSERT_VALUES_EQUAL(s, "bb");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
+ UNIT_ASSERT_VALUES_EQUAL(s, "b");
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");
+ UNIT_ASSERT_VALUES_EQUAL(s, "");
+ s = TStringBuf("", 1);
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1));
+ UNIT_ASSERT_VALUES_EQUAL(s, "");
+ s = "\xFF";
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF");
+ UNIT_ASSERT_VALUES_EQUAL(s, "");
+ }
+ {
+ TEntrySet d;
+ d.Add("a");
+ d.Add("b");
+ d.Add("b", "a");
+ d.BuildHierarchy();
+
+ UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u);
+
+ TStringBuf s = "bab";
+ UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba");
+ UNIT_ASSERT_VALUES_EQUAL(s, "b");
+ }
+ {
+ TEntrySet d;
+
+ d.Add("a");
+ d.Add("aa");
+ d.Add("aaa");
+ d.Add("aab");
+ d.Add("b");
+ d.Add("ba");
+
+ d.BuildHierarchy();
+
+ UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u);
+ {
+ TStringBuf s = "aaaaa";
+ const TEntry* e = d.FindPrefix(s);
+ UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa");
+ UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u);
+ UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1);
+ UNIT_ASSERT_VALUES_EQUAL(s, "aa");
+ }
+
+ {
+ TStringBuf s = "a";
+ const TEntry* e = d.FindPrefix(s);
+ UNIT_ASSERT_VALUES_EQUAL(e->Str, "a");
+ UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u);
+ UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1);
+ UNIT_ASSERT_VALUES_EQUAL(s, "");
+ }
+
+ {
+ TStringBuf s = "bab";
+ const TEntry* e = d.FindPrefix(s);
+ UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba");
+ UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u);
+ UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4);
+ UNIT_ASSERT_VALUES_EQUAL(s, "b");
+ }
+
+ {
+ TStringBuf s = "bba";
+ const TEntry* e = d.FindPrefix(s);
+ UNIT_ASSERT_VALUES_EQUAL(e->Str, "b");
+ UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u);
+ UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1);
+ UNIT_ASSERT_VALUES_EQUAL(s, "ba");
+ }
+ }
+ }
+
+ void TestBuilder0() {
+ using namespace NGreedyDict;
+ ui32 a = 1, b = 11;
+ ui64 ab = TDictBuilder::Compose(a, b);
+ UNIT_ASSERT(TDictBuilder::IsCompound(ab));
+ UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a);
+ UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b);
+ }
+
+ void FillData(NGreedyDict::TStringBufs& data) {
static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", 
"https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", 
"http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", 
"http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"};
- data.clear();
+ data.clear();
data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls));
- }
-
+ }
+
typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict;
-
- TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s,
+
+ TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s,
TDict& res) {
- using namespace NGreedyDict;
-
- TStringBufs data;
- FillData(data);
-
- TDictBuilder b(s);
- b.SetInput(data);
- b.Build(256 + 128);
-
- TEntrySet& set = b.EntrySet();
-
+ using namespace NGreedyDict;
+
+ TStringBufs data;
+ FillData(data);
+
+ TDictBuilder b(s);
+ b.SetInput(data);
+ b.Build(256 + 128);
+
+ TEntrySet& set = b.EntrySet();
+
for (const auto& it : set) {
if (it.Score) {
res[it.Str] = it;
- }
- }
-
- return b.ReleaseEntrySet();
- }
-
- void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) {
- TDict::const_iterator it = dict.find(entry);
- UNIT_ASSERT_C(it != dict.end(), entry);
- UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry);
- UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry);
- UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry);
- }
-
- void TestBuilder() {
- TAutoPtr<NGreedyDict::TEntrySet> set;
+ }
+ }
+
+ return b.ReleaseEntrySet();
+ }
+
+ void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) {
+ TDict::const_iterator it = dict.find(entry);
+ UNIT_ASSERT_C(it != dict.end(), entry);
+ UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry);
+ UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry);
+ UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry);
+ }
+
+ void TestBuilder() {
+ TAutoPtr<NGreedyDict::TEntrySet> set;
THashMap<TStringBuf, NGreedyDict::TEntry> res;
- NGreedyDict::TBuildSettings s;
- set = DoTestBuilder(s, res);
-
- UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u);
- UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u);
-
- DoAssertEntry("%", 37, -1, 0.00375193, res);
- DoAssertEntry("%7", 38, 37, 0.00513299, res);
- DoAssertEntry("&", 39, -1, 0.00794527, res);
- DoAssertEntry("+", 44, -1, 0.000441404, res);
- DoAssertEntry(",", 45, -1, 0.000441404, res);
- DoAssertEntry("-", 46, -1, 0.0417126, res);
- DoAssertEntry(".", 47, -1, 0.0196425, res);
- DoAssertEntry(".com/", 48, 47, 0.0374482, res);
- DoAssertEntry(".html", 49, 47, 0.0496577, res);
- DoAssertEntry(".html?", 50, 49, 0.0153908, res);
- DoAssertEntry(".php", 51, 47, 0.0123585, res);
- DoAssertEntry(".ru/", 52, 47, 0.0150027, res);
- DoAssertEntry("/", 53, -1, 0.0452439, res);
- DoAssertEntry("/index", 54, 53, 0.0158905, res);
- DoAssertEntry("0", 55, -1, 0.00816597, res);
- DoAssertEntry("1", 56, -1, 0.0167733, res);
- DoAssertEntry("10", 57, 56, 0.00530474, res);
- DoAssertEntry("2", 58, -1, 0.0101523, res);
- DoAssertEntry("20", 59, 58, 0.00674234, res);
- DoAssertEntry("3", 60, -1, 0.01258, res);
- DoAssertEntry("32", 61, 60, 0.00490697, res);
- DoAssertEntry("4", 62, -1, 0.00993158, res);
- DoAssertEntry("5", 63, -1, 0.00617965, res);
- DoAssertEntry("6", 64, -1, 0.00971088, res);
- DoAssertEntry("7", 65, -1, 0.0101523, res);
- DoAssertEntry("8", 66, -1, 0.00728316, res);
- DoAssertEntry("9", 67, -1, 0.00728316, res);
- DoAssertEntry(":", 68, -1, 0.000662106, res);
- DoAssertEntry(";", 69, -1, 0.000882807, res);
- DoAssertEntry("=", 71, -1, 0.01258, res);
- DoAssertEntry("?", 73, -1, 0.00397263, res);
- DoAssertEntry("A", 75, -1, 0.00264842, res);
- DoAssertEntry("B", 76, -1, 0.00220702, res);
- DoAssertEntry("C", 77, -1, 0.00353123, res);
- DoAssertEntry("D", 78, -1, 0.00375193, res);
- DoAssertEntry("E", 79, -1, 0.00286912, res);
- DoAssertEntry("F", 80, -1, 0.00110351, res);
- DoAssertEntry("G", 81, -1, 0.00110351, res);
- DoAssertEntry("H", 82, -1, 0.000220702, res);
- DoAssertEntry("I", 83, -1, 0.00198632, res);
- DoAssertEntry("K", 85, -1, 0.000441404, res);
- DoAssertEntry("L", 86, -1, 0.00198632, res);
- DoAssertEntry("M", 87, -1, 0.00154491, res);
- DoAssertEntry("N", 88, -1, 0.00154491, res);
- DoAssertEntry("O", 89, -1, 0.00132421, res);
- DoAssertEntry("P", 90, -1, 0.00308983, res);
- DoAssertEntry("R", 92, -1, 0.000662106, res);
- DoAssertEntry("S", 93, -1, 0.00264842, res);
- DoAssertEntry("T", 94, -1, 0.00110351, res);
- DoAssertEntry("U", 95, -1, 0.000220702, res);
- DoAssertEntry("V", 96, -1, 0.000441404, res);
- DoAssertEntry("W", 97, -1, 0.000441404, res);
- DoAssertEntry("X", 98, -1, 0.000220702, res);
- DoAssertEntry("Y", 99, -1, 0.000220702, res);
- DoAssertEntry("_", 105, -1, 0.00904877, res);
- DoAssertEntry("a", 107, -1, 0.0505407, res);
- DoAssertEntry("an", 108, 107, 0.018273, res);
- DoAssertEntry("ar", 109, 107, 0.0169385, res);
- DoAssertEntry("b", 110, -1, 0.0156698, res);
- DoAssertEntry("c", 111, -1, 0.018539, res);
- DoAssertEntry("cat", 112, 111, 0.00846732, res);
- DoAssertEntry("ch", 113, 111, 0.00644872, res);
- DoAssertEntry("com", 114, 111, 0.00724235, res);
- DoAssertEntry("ct", 115, 111, 0.00605729, res);
- DoAssertEntry("d", 116, -1, 0.020746, res);
- DoAssertEntry("di", 117, 116, 0.00730659, res);
- DoAssertEntry("e", 118, -1, 0.0624586, res);
- DoAssertEntry("en", 119, 118, 0.0108999, res);
- DoAssertEntry("ent", 120, 119, 0.00616002, res);
- DoAssertEntry("f", 121, -1, 0.00860737, res);
- DoAssertEntry("fi", 122, 121, 0.00423196, res);
- DoAssertEntry("g", 123, -1, 0.0180975, res);
- DoAssertEntry("go", 124, 123, 0.00601862, res);
- DoAssertEntry("h", 125, -1, 0.010373, res);
- DoAssertEntry("ho", 126, 125, 0.00570298, res);
- DoAssertEntry("http://", 127, 125, 0.0494372, res);
- DoAssertEntry("http://www.", 128, 127, 0.0849702, res);
- DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res);
- DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res);
- DoAssertEntry("i", 131, -1, 0.0258221, res);
- DoAssertEntry("id=", 132, 131, 0.00725369, res);
- DoAssertEntry("im", 133, 131, 0.00373318, res);
- DoAssertEntry("in", 134, 131, 0.013625, res);
- DoAssertEntry("ing", 135, 134, 0.00795491, res);
- DoAssertEntry("ion", 136, 131, 0.00796149, res);
- DoAssertEntry("it", 137, 131, 0.00953416, res);
- DoAssertEntry("j", 138, -1, 0.00132421, res);
- DoAssertEntry("k", 139, -1, 0.0134628, res);
- DoAssertEntry("l", 140, -1, 0.0381814, res);
- DoAssertEntry("m", 141, -1, 0.0174354, res);
- DoAssertEntry("mer", 142, 141, 0.00711846, res);
- DoAssertEntry("n", 143, -1, 0.0132421, res);
- DoAssertEntry("o", 144, -1, 0.0302362, res);
- DoAssertEntry("on", 145, 144, 0.00802271, res);
- DoAssertEntry("ou", 146, 144, 0.00414545, res);
- DoAssertEntry("p", 147, -1, 0.0225116, res);
- DoAssertEntry("port", 148, 147, 0.0123532, res);
- DoAssertEntry("q", 149, -1, 0.00176561, res);
- DoAssertEntry("r", 150, -1, 0.0401677, res);
- DoAssertEntry("ran", 151, 150, 0.00686918, res);
- DoAssertEntry("s", 152, -1, 0.0487751, res);
- DoAssertEntry("sho", 153, 152, 0.0113876, res);
- DoAssertEntry("t", 154, -1, 0.0379607, res);
- DoAssertEntry("u", 155, -1, 0.0211874, res);
- DoAssertEntry("v", 156, -1, 0.00595895, res);
- DoAssertEntry("vi", 157, 156, 0.00480673, res);
- DoAssertEntry("w", 158, -1, 0.00816597, res);
- DoAssertEntry("x", 159, -1, 0.00375193, res);
- DoAssertEntry("y", 160, -1, 0.0130214, res);
- DoAssertEntry("z", 161, -1, 0.00353123, res);
- }
-};
-
-UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest);
+ NGreedyDict::TBuildSettings s;
+ set = DoTestBuilder(s, res);
+
+ UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u);
+ UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u);
+
+ DoAssertEntry("%", 37, -1, 0.00375193, res);
+ DoAssertEntry("%7", 38, 37, 0.00513299, res);
+ DoAssertEntry("&", 39, -1, 0.00794527, res);
+ DoAssertEntry("+", 44, -1, 0.000441404, res);
+ DoAssertEntry(",", 45, -1, 0.000441404, res);
+ DoAssertEntry("-", 46, -1, 0.0417126, res);
+ DoAssertEntry(".", 47, -1, 0.0196425, res);
+ DoAssertEntry(".com/", 48, 47, 0.0374482, res);
+ DoAssertEntry(".html", 49, 47, 0.0496577, res);
+ DoAssertEntry(".html?", 50, 49, 0.0153908, res);
+ DoAssertEntry(".php", 51, 47, 0.0123585, res);
+ DoAssertEntry(".ru/", 52, 47, 0.0150027, res);
+ DoAssertEntry("/", 53, -1, 0.0452439, res);
+ DoAssertEntry("/index", 54, 53, 0.0158905, res);
+ DoAssertEntry("0", 55, -1, 0.00816597, res);
+ DoAssertEntry("1", 56, -1, 0.0167733, res);
+ DoAssertEntry("10", 57, 56, 0.00530474, res);
+ DoAssertEntry("2", 58, -1, 0.0101523, res);
+ DoAssertEntry("20", 59, 58, 0.00674234, res);
+ DoAssertEntry("3", 60, -1, 0.01258, res);
+ DoAssertEntry("32", 61, 60, 0.00490697, res);
+ DoAssertEntry("4", 62, -1, 0.00993158, res);
+ DoAssertEntry("5", 63, -1, 0.00617965, res);
+ DoAssertEntry("6", 64, -1, 0.00971088, res);
+ DoAssertEntry("7", 65, -1, 0.0101523, res);
+ DoAssertEntry("8", 66, -1, 0.00728316, res);
+ DoAssertEntry("9", 67, -1, 0.00728316, res);
+ DoAssertEntry(":", 68, -1, 0.000662106, res);
+ DoAssertEntry(";", 69, -1, 0.000882807, res);
+ DoAssertEntry("=", 71, -1, 0.01258, res);
+ DoAssertEntry("?", 73, -1, 0.00397263, res);
+ DoAssertEntry("A", 75, -1, 0.00264842, res);
+ DoAssertEntry("B", 76, -1, 0.00220702, res);
+ DoAssertEntry("C", 77, -1, 0.00353123, res);
+ DoAssertEntry("D", 78, -1, 0.00375193, res);
+ DoAssertEntry("E", 79, -1, 0.00286912, res);
+ DoAssertEntry("F", 80, -1, 0.00110351, res);
+ DoAssertEntry("G", 81, -1, 0.00110351, res);
+ DoAssertEntry("H", 82, -1, 0.000220702, res);
+ DoAssertEntry("I", 83, -1, 0.00198632, res);
+ DoAssertEntry("K", 85, -1, 0.000441404, res);
+ DoAssertEntry("L", 86, -1, 0.00198632, res);
+ DoAssertEntry("M", 87, -1, 0.00154491, res);
+ DoAssertEntry("N", 88, -1, 0.00154491, res);
+ DoAssertEntry("O", 89, -1, 0.00132421, res);
+ DoAssertEntry("P", 90, -1, 0.00308983, res);
+ DoAssertEntry("R", 92, -1, 0.000662106, res);
+ DoAssertEntry("S", 93, -1, 0.00264842, res);
+ DoAssertEntry("T", 94, -1, 0.00110351, res);
+ DoAssertEntry("U", 95, -1, 0.000220702, res);
+ DoAssertEntry("V", 96, -1, 0.000441404, res);
+ DoAssertEntry("W", 97, -1, 0.000441404, res);
+ DoAssertEntry("X", 98, -1, 0.000220702, res);
+ DoAssertEntry("Y", 99, -1, 0.000220702, res);
+ DoAssertEntry("_", 105, -1, 0.00904877, res);
+ DoAssertEntry("a", 107, -1, 0.0505407, res);
+ DoAssertEntry("an", 108, 107, 0.018273, res);
+ DoAssertEntry("ar", 109, 107, 0.0169385, res);
+ DoAssertEntry("b", 110, -1, 0.0156698, res);
+ DoAssertEntry("c", 111, -1, 0.018539, res);
+ DoAssertEntry("cat", 112, 111, 0.00846732, res);
+ DoAssertEntry("ch", 113, 111, 0.00644872, res);
+ DoAssertEntry("com", 114, 111, 0.00724235, res);
+ DoAssertEntry("ct", 115, 111, 0.00605729, res);
+ DoAssertEntry("d", 116, -1, 0.020746, res);
+ DoAssertEntry("di", 117, 116, 0.00730659, res);
+ DoAssertEntry("e", 118, -1, 0.0624586, res);
+ DoAssertEntry("en", 119, 118, 0.0108999, res);
+ DoAssertEntry("ent", 120, 119, 0.00616002, res);
+ DoAssertEntry("f", 121, -1, 0.00860737, res);
+ DoAssertEntry("fi", 122, 121, 0.00423196, res);
+ DoAssertEntry("g", 123, -1, 0.0180975, res);
+ DoAssertEntry("go", 124, 123, 0.00601862, res);
+ DoAssertEntry("h", 125, -1, 0.010373, res);
+ DoAssertEntry("ho", 126, 125, 0.00570298, res);
+ DoAssertEntry("http://", 127, 125, 0.0494372, res);
+ DoAssertEntry("http://www.", 128, 127, 0.0849702, res);
+ DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res);
+ DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res);
+ DoAssertEntry("i", 131, -1, 0.0258221, res);
+ DoAssertEntry("id=", 132, 131, 0.00725369, res);
+ DoAssertEntry("im", 133, 131, 0.00373318, res);
+ DoAssertEntry("in", 134, 131, 0.013625, res);
+ DoAssertEntry("ing", 135, 134, 0.00795491, res);
+ DoAssertEntry("ion", 136, 131, 0.00796149, res);
+ DoAssertEntry("it", 137, 131, 0.00953416, res);
+ DoAssertEntry("j", 138, -1, 0.00132421, res);
+ DoAssertEntry("k", 139, -1, 0.0134628, res);
+ DoAssertEntry("l", 140, -1, 0.0381814, res);
+ DoAssertEntry("m", 141, -1, 0.0174354, res);
+ DoAssertEntry("mer", 142, 141, 0.00711846, res);
+ DoAssertEntry("n", 143, -1, 0.0132421, res);
+ DoAssertEntry("o", 144, -1, 0.0302362, res);
+ DoAssertEntry("on", 145, 144, 0.00802271, res);
+ DoAssertEntry("ou", 146, 144, 0.00414545, res);
+ DoAssertEntry("p", 147, -1, 0.0225116, res);
+ DoAssertEntry("port", 148, 147, 0.0123532, res);
+ DoAssertEntry("q", 149, -1, 0.00176561, res);
+ DoAssertEntry("r", 150, -1, 0.0401677, res);
+ DoAssertEntry("ran", 151, 150, 0.00686918, res);
+ DoAssertEntry("s", 152, -1, 0.0487751, res);
+ DoAssertEntry("sho", 153, 152, 0.0113876, res);
+ DoAssertEntry("t", 154, -1, 0.0379607, res);
+ DoAssertEntry("u", 155, -1, 0.0211874, res);
+ DoAssertEntry("v", 156, -1, 0.00595895, res);
+ DoAssertEntry("vi", 157, 156, 0.00480673, res);
+ DoAssertEntry("w", 158, -1, 0.00816597, res);
+ DoAssertEntry("x", 159, -1, 0.00375193, res);
+ DoAssertEntry("y", 160, -1, 0.0130214, res);
+ DoAssertEntry("z", 161, -1, 0.00353123, res);
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest);
diff --git a/library/cpp/codecs/greedy_dict/ut/ya.make b/library/cpp/codecs/greedy_dict/ut/ya.make
index bd67d1a452..e5d597a083 100644
--- a/library/cpp/codecs/greedy_dict/ut/ya.make
+++ b/library/cpp/codecs/greedy_dict/ut/ya.make
@@ -1,5 +1,5 @@
UNITTEST_FOR(library/cpp/codecs/greedy_dict)
-
+
OWNER(velavokr)
SRCS(
diff --git a/library/cpp/codecs/greedy_dict/ya.make b/library/cpp/codecs/greedy_dict/ya.make
index 2a57224f7e..6904a354de 100644
--- a/library/cpp/codecs/greedy_dict/ya.make
+++ b/library/cpp/codecs/greedy_dict/ya.make
@@ -1,15 +1,15 @@
OWNER(velavokr)
-LIBRARY()
-
-SRCS(
- gd_builder.cpp
- gd_entry.cpp
-)
-
-PEERDIR(
+LIBRARY()
+
+SRCS(
+ gd_builder.cpp
+ gd_entry.cpp
+)
+
+PEERDIR(
library/cpp/containers/comptrie
library/cpp/string_utils/relaxed_escaper
-)
-
+)
+
END()
diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp
index 650fe7cdfd..391662fb0d 100644
--- a/library/cpp/codecs/huffman_codec.cpp
+++ b/library/cpp/codecs/huffman_codec.cpp
@@ -1,14 +1,14 @@
-#include "huffman_codec.h"
+#include "huffman_codec.h"
#include <library/cpp/bit_io/bitinput.h>
#include <library/cpp/bit_io/bitoutput.h>
-
-#include <util/generic/algorithm.h>
+
+#include <util/generic/algorithm.h>
#include <util/generic/bitops.h>
-#include <util/stream/buffer.h>
-#include <util/stream/length.h>
-#include <util/string/printf.h>
-
-namespace NCodecs {
+#include <util/stream/buffer.h>
+#include <util/stream/length.h>
+#include <util/string/printf.h>
+
+namespace NCodecs {
template <typename T>
struct TCanonicalCmp {
bool operator()(const T& a, const T& b) const {
@@ -19,40 +19,40 @@ namespace NCodecs {
}
}
};
-
+
template <typename T>
struct TByCharCmp {
bool operator()(const T& a, const T& b) const {
- return a.Char < b.Char;
- }
+ return a.Char < b.Char;
+ }
};
-
+
struct TTreeEntry {
static const ui32 InvalidBranch = (ui32)-1;
-
+
ui64 Freq = 0;
ui32 Branches[2]{InvalidBranch, InvalidBranch};
-
+
ui32 CodeLength = 0;
ui8 Char = 0;
bool Invalid = false;
-
+
TTreeEntry() = default;
-
+
static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) {
return a.Freq < b.Freq;
}
-
+
static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) {
return a.Freq > b.Freq;
}
};
-
+
using TCodeTree = TVector<TTreeEntry>;
-
+
void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) {
tree.reserve(255 * 256 / 2); // worst case - balanced tree
-
+
for (ui32 i = 0; i < 256; ++i) {
tree.emplace_back();
tree.back().Char = i;
@@ -72,24 +72,24 @@ namespace NCodecs {
for (ui64 i = 0; i < r.size(); ++i)
++freqs[(ui8)r[i]];
}
-
+
InitTreeByFreqs(tree, freqs);
- }
-
+ }
+
void CalculateCodeLengths(TCodeTree& tree) {
Y_ENSURE(tree.size() == 256, " ");
const ui32 firstbranch = tree.size();
-
+
ui32 curleaf = 0;
ui32 curbranch = firstbranch;
-
+
// building code tree. two priority queues are combined in one.
while (firstbranch - curleaf + tree.size() - curbranch >= 2) {
TTreeEntry e;
-
+
for (auto& branche : e.Branches) {
ui32 br;
-
+
if (curleaf >= firstbranch)
br = curbranch++;
else if (curbranch >= tree.size())
@@ -98,84 +98,84 @@ namespace NCodecs {
br = curleaf++;
else
br = curbranch++;
-
+
Y_ENSURE(br < tree.size(), " ");
branche = br;
e.Freq += tree[br].Freq;
}
-
+
tree.push_back(e);
PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev);
- }
-
+ }
+
// computing code lengths
for (ui64 i = tree.size() - 1; i >= firstbranch; --i) {
TTreeEntry e = tree[i];
-
+
for (auto branche : e.Branches)
tree[branche].CodeLength = e.CodeLength + 1;
}
-
+
// chopping off the branches
tree.resize(firstbranch);
-
+
Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>());
-
+
// simplification: we are stripping codes longer than 64 bits
while (!tree.empty() && tree.back().CodeLength > 64)
tree.pop_back();
-
+
// will not compress
if (tree.empty())
return;
-
+
// special invalid code word
tree.back().Invalid = true;
}
-
+
struct TEncoderEntry {
ui64 Code = 0;
-
+
ui8 CodeLength = 0;
ui8 Char = 0;
ui8 Invalid = true;
-
+
explicit TEncoderEntry(TTreeEntry e)
: CodeLength(e.CodeLength)
, Char(e.Char)
, Invalid(e.Invalid)
{
}
-
+
TEncoderEntry() = default;
};
-
+
struct TEncoderTable {
TEncoderEntry Entries[256];
-
+
void Save(IOutputStream* out) const {
ui16 nval = 0;
-
+
for (auto entrie : Entries)
nval += !entrie.Invalid;
-
+
::Save(out, nval);
-
+
for (auto entrie : Entries) {
if (!entrie.Invalid) {
::Save(out, entrie.Char);
::Save(out, entrie.CodeLength);
}
- }
- }
-
+ }
+ }
+
void Load(IInputStream* in) {
ui16 nval = 0;
::Load(in, nval);
-
+
for (ui32 i = 0; i < 256; ++i)
Entries[i].Char = i;
-
+
for (ui32 i = 0; i < nval; ++i) {
ui8 ch = 0;
ui8 len = 0;
@@ -184,15 +184,15 @@ namespace NCodecs {
Entries[ch].CodeLength = len;
Entries[ch].Invalid = false;
}
- }
+ }
};
-
+
struct TDecoderEntry {
ui32 NextTable : 10;
ui32 Char : 8;
ui32 Invalid : 1;
ui32 Bad : 1;
-
+
TDecoderEntry()
: NextTable()
, Char()
@@ -201,27 +201,27 @@ namespace NCodecs {
{
}
};
-
+
struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> {
ui64 Length = 0;
ui64 BaseCode = 0;
-
+
TDecoderEntry Entries[256];
-
+
TDecoderTable() {
Zero(Entries);
}
};
-
+
const int CACHE_BITS_COUNT = 16;
class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> {
TEncoderTable Encoder;
TDecoderTable Decoder[256];
-
+
TEncoderEntry Invalid;
-
+
ui32 SubTablesNum;
-
+
class THuffmanCache {
struct TCacheEntry {
int EndOffset : 24;
@@ -230,7 +230,7 @@ namespace NCodecs {
TVector<char> DecodeCache;
TVector<TCacheEntry> CacheEntries;
const TImpl& Original;
-
+
public:
THuffmanCache(const THuffmanCodec::TImpl& encoder);
@@ -252,51 +252,51 @@ namespace NCodecs {
if (in.empty()) {
return 0;
}
-
+
out.Reserve(in.size() * 2);
-
+
{
NBitIO::TBitOutputVector<TBuffer> bout(&out);
TStringBuf tin = in;
-
+
// data is under compression
bout.Write(1, 1);
-
+
for (auto t : tin) {
const TEncoderEntry& ce = Encoder.Entries[(ui8)t];
-
+
bout.Write(ce.Code, ce.CodeLength);
-
+
if (ce.Invalid) {
bout.Write(t, 8);
}
}
-
+
// in canonical huffman coding there cannot be a code having no 0 in the suffix
// and shorter than 8 bits.
bout.Write((ui64)-1, bout.GetByteReminder());
return bout.GetByteReminder();
- }
- }
-
+ }
+ }
+
void Decode(TStringBuf in, TBuffer& out) const {
out.Clear();
-
+
if (in.empty()) {
return;
}
-
+
NBitIO::TBitInput bin(in);
ui64 f = 0;
bin.ReadK<1>(f);
-
+
// if data is uncompressed
if (!f) {
in.Skip(1);
out.Append(in.data(), in.size());
} else {
out.Reserve(in.size() * 8);
-
+
if (Cache.Get()) {
Cache->Decode(bin, out);
} else {
@@ -304,36 +304,36 @@ namespace NCodecs {
}
}
}
- }
-
+ }
+
Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const {
const TDecoderTable* table = Decoder;
TDecoderEntry e;
-
+
int bitsRead = 0;
while (true) {
ui64 code = 0;
-
+
if (Y_UNLIKELY(!bin.Read(code, table->Length)))
return 0;
bitsRead += table->Length;
-
+
if (Y_UNLIKELY(code < table->BaseCode))
return 0;
-
+
code -= table->BaseCode;
-
+
if (Y_UNLIKELY(code > 255))
return 0;
-
+
e = table->Entries[code];
-
+
if (Y_UNLIKELY(e.Bad))
return 0;
-
+
if (e.NextTable) {
table = Decoder + e.NextTable;
- } else {
+ } else {
if (e.Invalid) {
code = 0;
bin.ReadK<8>(code);
@@ -344,77 +344,77 @@ namespace NCodecs {
}
return bitsRead;
- }
+ }
}
-
+
Y_ENSURE(false, " could not decode input");
return 0;
- }
-
+ }
+
void GenerateEncoder(TCodeTree& tree) {
const ui64 sz = tree.size();
-
+
TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]);
-
+
for (ui32 i = 1; i < sz; ++i) {
const TTreeEntry& te = tree[i];
TEncoderEntry& e = Encoder.Entries[te.Char];
e = TEncoderEntry(te);
-
+
e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
lastcode = e;
-
+
e.Code = ReverseBits(e.Code, e.CodeLength);
-
+
if (e.Invalid)
Invalid = e;
}
-
+
for (auto& e : Encoder.Entries) {
if (e.Invalid)
e = Invalid;
Y_ENSURE(e.CodeLength, " ");
}
- }
-
+ }
+
void RegenerateEncoder() {
for (auto& entrie : Encoder.Entries) {
if (entrie.Invalid)
entrie.CodeLength = Invalid.CodeLength;
}
-
+
Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>());
-
+
TEncoderEntry lastcode = Encoder.Entries[0];
-
+
for (ui32 i = 1; i < 256; ++i) {
TEncoderEntry& e = Encoder.Entries[i];
e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
lastcode = e;
-
+
e.Code = ReverseBits(e.Code, e.CodeLength);
}
-
+
for (auto& entrie : Encoder.Entries) {
if (entrie.Invalid) {
Invalid = entrie;
break;
}
}
-
+
Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>());
-
+
for (auto& entrie : Encoder.Entries) {
if (entrie.Invalid)
entrie = Invalid;
- }
- }
-
+ }
+ }
+
void BuildDecoder() {
TEncoderTable enc = Encoder;
Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>());
-
+
TEncoderEntry& e1 = enc.Entries[0];
Decoder[0].BaseCode = e1.Code;
Decoder[0].Length = e1.CodeLength;
@@ -423,22 +423,22 @@ namespace NCodecs {
SetEntry(Decoder, e2.Code, e2.CodeLength, e2);
}
Cache.Reset(new THuffmanCache(*this));
- }
-
+ }
+
void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) {
Y_ENSURE(len >= t->Length, len << " < " << t->Length);
-
+
ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode;
TDecoderEntry& d = t->Entries[idx];
-
+
if (len == t->Length) {
Y_ENSURE(!d.NextTable, " ");
-
+
d.Char = e.Char;
d.Invalid = e.Invalid;
return;
}
-
+
if (!d.NextTable) {
Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " ");
d.NextTable = SubTablesNum++;
@@ -446,10 +446,10 @@ namespace NCodecs {
nt->Length = Min<ui64>(8, len - t->Length);
nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length);
}
-
+
SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e);
- }
-
+ }
+
void Learn(ISequenceReader* in) {
{
TCodeTree tree;
@@ -459,11 +459,11 @@ namespace NCodecs {
GenerateEncoder(tree);
}
BuildDecoder();
- }
-
+ }
+
void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {
- TCodeTree tree;
-
+ TCodeTree tree;
+
ui64 freqsArray[256];
Zero(freqsArray);
@@ -491,7 +491,7 @@ namespace NCodecs {
BuildDecoder();
}
};
-
+
THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec)
: Original(codec)
{
@@ -512,7 +512,7 @@ namespace NCodecs {
CacheEntries[i] = e;
break;
}
-
+
for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) {
DecodeCache.push_back(*it);
}
@@ -558,32 +558,32 @@ namespace NCodecs {
MyTraits.SizeOnDecodeMultiplier = 8;
MyTraits.RecommendedSampleSize = 1 << 21;
}
-
+
THuffmanCodec::~THuffmanCodec() = default;
-
+
ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const {
if (Y_UNLIKELY(!Trained))
ythrow TCodecException() << " not trained";
-
+
return Impl->Encode(in, bbb);
}
-
+
void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const {
Impl->Decode(in, bbb);
}
-
+
void THuffmanCodec::Save(IOutputStream* out) const {
Impl->Save(out);
}
-
+
void THuffmanCodec::Load(IInputStream* in) {
Impl->Load(in);
}
-
+
void THuffmanCodec::DoLearn(ISequenceReader& in) {
Impl->Learn(&in);
}
-
+
void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {
Impl->LearnByFreqs(freqs);
Trained = true;
diff --git a/library/cpp/codecs/huffman_codec.h b/library/cpp/codecs/huffman_codec.h
index 559545b90d..1c00a80637 100644
--- a/library/cpp/codecs/huffman_codec.h
+++ b/library/cpp/codecs/huffman_codec.h
@@ -1,33 +1,33 @@
-#pragma once
-
-#include "codecs.h"
-
-#include <util/generic/ptr.h>
+#pragma once
+
+#include "codecs.h"
+
+#include <util/generic/ptr.h>
#include <util/string/cast.h>
-
-namespace NCodecs {
+
+namespace NCodecs {
// for types greater than char, pipeline with TFreqCodec.
-
+
class THuffmanCodec: public ICodec {
class TImpl;
TIntrusivePtr<TImpl> Impl;
-
+
public:
THuffmanCodec();
~THuffmanCodec() override;
-
+
static TStringBuf MyName() {
return "huffman";
}
-
+
TString GetName() const override {
return ToString(MyName());
}
-
+
ui8 Encode(TStringBuf in, TBuffer& bbb) const override;
-
+
void Decode(TStringBuf in, TBuffer& bbb) const override;
-
+
void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs);
protected:
@@ -35,5 +35,5 @@ namespace NCodecs {
void Save(IOutputStream* out) const override;
void Load(IInputStream* in) override;
};
-
-}
+
+}
diff --git a/library/cpp/codecs/pfor_codec.cpp b/library/cpp/codecs/pfor_codec.cpp
index f6b3b0920b..3b51c99afa 100644
--- a/library/cpp/codecs/pfor_codec.cpp
+++ b/library/cpp/codecs/pfor_codec.cpp
@@ -1,6 +1,6 @@
-#include "pfor_codec.h"
-
-namespace NCodecs {
+#include "pfor_codec.h"
+
+namespace NCodecs {
template <>
TStringBuf TPForCodec<ui64, true>::MyName() {
return "pfor-delta64-sorted";
@@ -9,7 +9,7 @@ namespace NCodecs {
TStringBuf TPForCodec<ui32, true>::MyName() {
return "pfor-delta32-sorted";
}
-
+
template <>
TStringBuf TPForCodec<ui64, false>::MyName() {
return "pfor-ui64";
@@ -18,5 +18,5 @@ namespace NCodecs {
TStringBuf TPForCodec<ui32, false>::MyName() {
return "pfor-ui32";
}
-
-}
+
+}
diff --git a/library/cpp/codecs/pfor_codec.h b/library/cpp/codecs/pfor_codec.h
index d7d4bb8bf4..b0207512ac 100644
--- a/library/cpp/codecs/pfor_codec.h
+++ b/library/cpp/codecs/pfor_codec.h
@@ -1,48 +1,48 @@
-#pragma once
-
-#include "codecs.h"
-
-#include "delta_codec.h"
-#include "tls_cache.h"
-
+#pragma once
+
+#include "codecs.h"
+
+#include "delta_codec.h"
+#include "tls_cache.h"
+
#include <library/cpp/bit_io/bitinput.h>
#include <library/cpp/bit_io/bitoutput.h>
#include <util/string/cast.h>
-
-namespace NCodecs {
+
+namespace NCodecs {
template <typename T, bool WithDelta = false>
class TPForCodec: public ICodec {
using TUnsigned = std::make_unsigned_t<T>;
typedef TDeltaCodec<TUnsigned> TDCodec;
-
+
typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue;
static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value");
-
+
static const ui64 BitsInT = sizeof(TUnsigned) * 8;
-
+
TDCodec DeltaCodec;
-
+
public:
static TStringBuf MyName();
-
+
TPForCodec() {
MyTraits.AssumesStructuredInput = true;
MyTraits.SizeOfInputElement = sizeof(T);
MyTraits.SizeOnDecodeMultiplier = sizeof(T);
}
-
+
TString GetName() const override {
return ToString(MyName());
}
-
+
ui8 Encode(TStringBuf s, TBuffer& b) const override {
b.Clear();
if (s.empty()) {
return 0;
}
-
+
b.Reserve(2 * s.size() + b.Size());
-
+
if (WithDelta) {
auto buffer = TBufferTlsCache::TlsInstance().Item();
TBuffer& db = buffer.Get();
@@ -51,50 +51,50 @@ namespace NCodecs {
DeltaCodec.Encode(s, db);
s = TStringBuf{db.data(), db.size()};
}
-
+
TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)};
-
+
const ui64 sz = tin.size();
ui64 bitcounts[BitsInT + 1];
Zero(bitcounts);
-
+
ui32 zeros = 0;
-
+
for (const TValue* it = tin.begin(); it != tin.end(); ++it) {
TUnsigned v = 1 + (TUnsigned)*it;
ui64 l = MostSignificantBit(v) + 1;
++bitcounts[l];
-
+
if (!v) {
++zeros;
}
}
-
+
// cumulative bit counts
for (ui64 i = 0; i < BitsInT; ++i) {
bitcounts[i + 1] += bitcounts[i];
- }
-
+ }
+
bool hasexceptions = zeros;
ui64 optimalbits = BitsInT;
-
+
{
ui64 excsize = 0;
ui64 minsize = sz * BitsInT;
-
+
for (ui64 current = BitsInT; current; --current) {
ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6);
-
+
excsize += current * bitcounts[current];
-
+
if (size < minsize) {
minsize = size;
optimalbits = current;
hasexceptions = zeros || sz - bitcounts[current];
}
- }
- }
-
+ }
+ }
+
if (!optimalbits || BitsInT == optimalbits) {
b.Append((ui8)-1);
b.Append(s.data(), s.size());
@@ -104,7 +104,7 @@ namespace NCodecs {
bout.Write(0, 1);
bout.Write(hasexceptions, 1);
bout.Write(optimalbits, 6);
-
+
for (const TValue* it = tin.begin(); it != tin.end(); ++it) {
TUnsigned word = 1 + (TUnsigned)*it;
ui64 len = MostSignificantBit(word) + 1;
@@ -116,29 +116,29 @@ namespace NCodecs {
} else {
bout.Write(word, optimalbits);
}
- }
-
+ }
+
return bout.GetByteReminder();
} // the rest of the last byte is zero padded. BitsInT is always > 7.
- }
-
+ }
+
void Decode(TStringBuf s, TBuffer& b) const override {
b.Clear();
if (s.empty()) {
return;
}
-
+
b.Reserve(s.size() * sizeof(T) + b.Size());
-
+
ui64 isplain = 0;
ui64 hasexceptions = 0;
ui64 bits = 0;
-
+
NBitIO::TBitInput bin(s);
bin.ReadK<1>(isplain);
bin.ReadK<1>(hasexceptions);
bin.ReadK<6>(bits);
-
+
if (Y_UNLIKELY(isplain)) {
s.Skip(1);
@@ -147,17 +147,17 @@ namespace NCodecs {
} else {
b.Append(s.data(), s.size());
}
- } else {
+ } else {
typename TDCodec::TDecoder decoder;
-
+
if (hasexceptions) {
ui64 word = 0;
while (bin.Read(word, bits)) {
if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) {
--word;
-
+
TValue t = word;
-
+
if (WithDelta) {
if (decoder.Decode(t)) {
TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)};
@@ -166,46 +166,46 @@ namespace NCodecs {
} else {
TStringBuf r{(char*)&t, sizeof(t)};
b.Append(r.data(), r.size());
- }
- }
- }
+ }
+ }
+ }
} else {
ui64 word = 0;
T outarr[256 / sizeof(T)];
ui32 cnt = 0;
while (true) {
ui64 v = bin.Read(word, bits);
-
+
if ((!v) | (!word))
break;
-
+
--word;
TValue t = word;
-
+
if (WithDelta) {
if (decoder.Decode(t)) {
outarr[cnt++] = decoder.Result;
}
} else {
outarr[cnt++] = t;
- }
+ }
if (cnt == Y_ARRAY_SIZE(outarr)) {
b.Append((const char*)outarr, sizeof(outarr));
cnt = 0;
}
- }
-
+ }
+
if (cnt) {
b.Append((const char*)outarr, cnt * sizeof(T));
- }
- }
- }
- }
-
+ }
+ }
+ }
+ }
+
protected:
void DoLearn(ISequenceReader&) override {
}
};
-
-}
+
+}
diff --git a/library/cpp/codecs/sample.h b/library/cpp/codecs/sample.h
index 15f03afcc5..bce37e6a2c 100644
--- a/library/cpp/codecs/sample.h
+++ b/library/cpp/codecs/sample.h
@@ -1,89 +1,89 @@
-#pragma once
-
+#pragma once
+
#include <library/cpp/deprecated/accessors/accessors.h>
-
-#include <util/generic/buffer.h>
-#include <util/generic/vector.h>
-#include <util/random/fast.h>
-#include <util/random/shuffle.h>
-
-#include <functional>
-#include <type_traits>
-
-namespace NCodecs {
- class ISequenceReader {
- public:
- virtual bool NextRegion(TStringBuf& s) = 0;
-
- virtual ~ISequenceReader() = default;
- };
-
- template <class TValue>
- TStringBuf ValueToStringBuf(TValue&& t) {
- return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)};
- }
-
- template <class TIter>
+
+#include <util/generic/buffer.h>
+#include <util/generic/vector.h>
+#include <util/random/fast.h>
+#include <util/random/shuffle.h>
+
+#include <functional>
+#include <type_traits>
+
+namespace NCodecs {
+ class ISequenceReader {
+ public:
+ virtual bool NextRegion(TStringBuf& s) = 0;
+
+ virtual ~ISequenceReader() = default;
+ };
+
+ template <class TValue>
+ TStringBuf ValueToStringBuf(TValue&& t) {
+ return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)};
+ }
+
+ template <class TIter>
TStringBuf IterToStringBuf(TIter iter) {
- return ValueToStringBuf(*iter);
- }
-
- template <class TItem>
+ return ValueToStringBuf(*iter);
+ }
+
+ template <class TItem>
class TSimpleSequenceReader: public ISequenceReader {
const TVector<TItem>& Items;
- size_t Idx = 0;
-
- public:
+ size_t Idx = 0;
+
+ public:
TSimpleSequenceReader(const TVector<TItem>& items)
- : Items(items)
+ : Items(items)
{
}
-
- bool NextRegion(TStringBuf& s) override {
- if (Idx >= Items.size()) {
- return false;
- }
-
- s = ValueToStringBuf(Items[Idx++]);
- return true;
- }
- };
-
- template <class TIter, class TGetter>
- size_t GetInputSize(TIter begin, TIter end, TGetter getter) {
- size_t totalBytes = 0;
- for (TIter iter = begin; iter != end; ++iter) {
- totalBytes += getter(iter).size();
- }
- return totalBytes;
- }
-
- template <class TIter>
- size_t GetInputSize(TIter begin, TIter end) {
- return GetInputSize(begin, end, IterToStringBuf<TIter>);
- }
-
- template <class TIter, class TGetter>
+
+ bool NextRegion(TStringBuf& s) override {
+ if (Idx >= Items.size()) {
+ return false;
+ }
+
+ s = ValueToStringBuf(Items[Idx++]);
+ return true;
+ }
+ };
+
+ template <class TIter, class TGetter>
+ size_t GetInputSize(TIter begin, TIter end, TGetter getter) {
+ size_t totalBytes = 0;
+ for (TIter iter = begin; iter != end; ++iter) {
+ totalBytes += getter(iter).size();
+ }
+ return totalBytes;
+ }
+
+ template <class TIter>
+ size_t GetInputSize(TIter begin, TIter end) {
+ return GetInputSize(begin, end, IterToStringBuf<TIter>);
+ }
+
+ template <class TIter, class TGetter>
TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes, TGetter getter) {
- TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce};
-
- size_t totalBytes = GetInputSize(begin, end, getter);
- double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes);
-
+ TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce};
+
+ size_t totalBytes = GetInputSize(begin, end, getter);
+ double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes);
+
TVector<TBuffer> result;
- for (TIter iter = begin; iter != end; ++iter) {
- if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) {
- TStringBuf reg = getter(iter);
+ for (TIter iter = begin; iter != end; ++iter) {
+ if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) {
+ TStringBuf reg = getter(iter);
result.emplace_back(reg.data(), reg.size());
- }
- }
- Shuffle(result.begin(), result.end(), rng);
- return result;
- }
-
- template <class TIter>
+ }
+ }
+ Shuffle(result.begin(), result.end(), rng);
+ return result;
+ }
+
+ template <class TIter>
TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes) {
- return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>);
- }
-
-}
+ return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>);
+ }
+
+}
diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp
index d0692fe2a4..088bdead19 100644
--- a/library/cpp/codecs/solar_codec.cpp
+++ b/library/cpp/codecs/solar_codec.cpp
@@ -1,36 +1,36 @@
-#include "solar_codec.h"
-
+#include "solar_codec.h"
+
#include <library/cpp/codecs/greedy_dict/gd_builder.h>
-
+
#include <library/cpp/containers/comptrie/comptrie_builder.h>
#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
-#include <util/stream/length.h>
-#include <util/string/printf.h>
-#include <util/ysaveload.h>
-
-namespace NCodecs {
+#include <util/stream/length.h>
+#include <util/string/printf.h>
+#include <util/ysaveload.h>
+
+namespace NCodecs {
static inline ui32 Append(TBuffer& pool, TStringBuf data) {
pool.Append(data.data(), data.size());
return pool.Size();
}
-
+
void TSolarCodec::DoLearn(ISequenceReader& r) {
using namespace NGreedyDict;
-
+
Decoder.clear();
Pool.Clear();
-
+
THolder<TEntrySet> set;
-
+
{
TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance());
TStringBufs bufs;
-
+
TStringBuf m;
while (r.NextRegion(m)) {
bufs.push_back(pool.AppendString(m));
}
-
+
{
TDictBuilder b(Settings);
b.SetInput(bufs);
@@ -38,66 +38,66 @@ namespace NCodecs {
set = b.ReleaseEntrySet();
}
- }
-
+ }
+
set->SetScores(ES_LEN_COUNT);
- {
+ {
TVector<std::pair<float, TStringBuf>> tmp;
tmp.reserve(set->size());
-
+
for (const auto& it : *set) {
tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));
}
-
+
Sort(tmp.begin(), tmp.end());
-
+
Decoder.reserve(tmp.size() + 1);
Decoder.push_back(0);
-
+
for (const auto& it : tmp) {
Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed");
ui32 endoff = Append(Pool, it.second);
Decoder.push_back(endoff);
}
- }
-
+ }
+
Pool.ShrinkToFit();
Decoder.shrink_to_fit();
-
+
TBufferOutput bout;
-
+
{
TVector<std::pair<TStringBuf, ui32>> tmp2;
tmp2.reserve(Decoder.size());
-
+
for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) {
TStringBuf s = DoDecode(i);
tmp2.push_back(std::make_pair(s, i - 1));
Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed");
}
-
+
Sort(tmp2.begin(), tmp2.end());
-
+
{
TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED);
for (const auto& it : tmp2) {
builder.Add(it.first.data(), it.first.size(), it.second);
}
-
+
builder.Save(bout);
- }
- }
-
+ }
+ }
+
Encoder.Init(TBlob::FromBuffer(bout.Buffer()));
- }
-
+ }
+
void TSolarCodec::Save(IOutputStream* out) const {
TBlob b = Encoder.Data();
::Save(out, (ui32)b.Size());
out->Write(b.Data(), b.Size());
}
-
+
void TSolarCodec::Load(IInputStream* in) {
ui32 sz;
::Load(in, sz);
@@ -105,29 +105,29 @@ namespace NCodecs {
Encoder.Init(TBlob::FromStream(lin));
Pool.Clear();
Decoder.clear();
-
+
TVector<std::pair<ui32, TString>> tmp;
-
+
ui32 poolsz = 0;
for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) {
const TString& s = it.GetKey();
tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s));
poolsz += Max<ui32>(s.size(), 1);
}
-
+
Sort(tmp.begin(), tmp.end());
-
+
Pool.Reserve(poolsz);
Decoder.reserve(tmp.size() + 1);
Decoder.push_back(0);
-
+
for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) {
Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first);
Decoder.push_back(Append(Pool, tmp[i].second));
}
-
+
Pool.ShrinkToFit();
Decoder.shrink_to_fit();
- }
-
-}
+ }
+
+}
diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h
index 7158ae7926..e6c0b891ad 100644
--- a/library/cpp/codecs/solar_codec.h
+++ b/library/cpp/codecs/solar_codec.h
@@ -1,16 +1,16 @@
-#pragma once
-
-#include "codecs.h"
+#pragma once
+
+#include "codecs.h"
#include <library/cpp/containers/comptrie/comptrie_trie.h>
#include <library/cpp/codecs/greedy_dict/gd_builder.h>
-
+
#include <util/string/cast.h>
-#include <util/string/escape.h>
-
-namespace NCodecs {
- // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы.
- // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе.
-
+#include <util/string/escape.h>
+
+namespace NCodecs {
+ // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы.
+ // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе.
+
struct TVarIntTraits {
static const size_t MAX_VARINT32_BYTES = 5;
@@ -52,7 +52,7 @@ namespace NCodecs {
Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {
ui32 result = static_cast<ui8>(r[0]);
- r.Skip(1);
+ r.Skip(1);
if (result >= 0x80) {
Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");
result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]);
@@ -100,7 +100,7 @@ namespace NCodecs {
static TStringBuf MyNameShortInt() {
return TStringBuf("solar-si");
}
-
+
explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())
: Settings(s)
, MaxEntries(maxentries)
@@ -110,7 +110,7 @@ namespace NCodecs {
MyTraits.SizeOnDecodeMultiplier = 2;
MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;
}
-
+
ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
EncodeImpl<TVarIntTraits>(r, b);
return 0;
@@ -148,8 +148,8 @@ namespace NCodecs {
TTraits::Write(val + 1, b);
r.Skip(Max<size_t>(sz, 1));
}
- }
-
+ }
+
template <class TTraits>
Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {
b.Clear();
@@ -160,25 +160,25 @@ namespace NCodecs {
TStringBuf s = DoDecode(v);
b.Append(s.data(), s.size());
}
- }
-
+ }
+
inline bool CanUseShortInt() const {
return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;
}
-
+
private:
typedef TCompactTrie<char, ui32> TEncoder;
typedef TVector<ui32> TDecoder;
-
+
TBuffer Pool;
TEncoder Encoder;
TDecoder Decoder;
-
+
NGreedyDict::TBuildSettings Settings;
ui32 MaxEntries;
ui32 MaxIterations;
};
-
+
// Uses varints or shortints depending on the decoder size
class TAdaptiveSolarCodec: public TSolarCodec {
public:
@@ -186,7 +186,7 @@ namespace NCodecs {
: TSolarCodec(maxentries, maxiter, s)
{
}
-
+
ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {
if (CanUseShortInt()) {
EncodeImpl<TShortIntTraits>(r, b);
@@ -225,7 +225,7 @@ namespace NCodecs {
EncodeImpl<TShortIntTraits>(r, b);
return 0;
}
-
+
void Decode(TStringBuf r, TBuffer& b) const override {
DecodeImpl<TShortIntTraits>(r, b);
}
@@ -241,4 +241,4 @@ namespace NCodecs {
}
};
-}
+}
diff --git a/library/cpp/codecs/static/builder.cpp b/library/cpp/codecs/static/builder.cpp
index 93e34a3edb..083f0fc6f6 100644
--- a/library/cpp/codecs/static/builder.cpp
+++ b/library/cpp/codecs/static/builder.cpp
@@ -1,39 +1,39 @@
-#include "builder.h"
-#include "common.h"
-
+#include "builder.h"
+#include "common.h"
+
#include <library/cpp/codecs/static/static_codec_info.pb.h>
-
+
#include <library/cpp/codecs/codecs.h>
-
-#include <util/generic/yexception.h>
-#include <util/string/subst.h>
-
-namespace NCodecs {
+
+#include <util/generic/yexception.h>
+#include <util/string/subst.h>
+
+namespace NCodecs {
TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo& info) {
- TStaticCodecInfo result;
- TCodecPtr codec = ICodec::GetInstance(info.CodecName);
- Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed");
-
- codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier);
- {
- TStringOutput sout{*result.MutableStoredCodec()};
- ICodec::Store(&sout, codec);
- }
-
- auto& debugInfo = *result.MutableDebugInfo();
- debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec()));
- debugInfo.SetCodecName(info.CodecName);
- debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier);
- debugInfo.SetTimestamp(info.Timestamp);
- debugInfo.SetRevisionInfo(info.RevisionInfo);
- debugInfo.SetTrainingSetComment(info.TrainingSetComment);
- debugInfo.SetTrainingSetResId(info.TrainingSetResId);
- return result;
- }
-
+ TStaticCodecInfo result;
+ TCodecPtr codec = ICodec::GetInstance(info.CodecName);
+ Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed");
+
+ codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier);
+ {
+ TStringOutput sout{*result.MutableStoredCodec()};
+ ICodec::Store(&sout, codec);
+ }
+
+ auto& debugInfo = *result.MutableDebugInfo();
+ debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec()));
+ debugInfo.SetCodecName(info.CodecName);
+ debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier);
+ debugInfo.SetTimestamp(info.Timestamp);
+ debugInfo.SetRevisionInfo(info.RevisionInfo);
+ debugInfo.SetTrainingSetComment(info.TrainingSetComment);
+ debugInfo.SetTrainingSetResId(info.TrainingSetResId);
+ return result;
+ }
+
TString GetStandardFileName(const TStaticCodecInfo& info) {
TString cName = info.GetDebugInfo().GetCodecName();
- SubstGlobal(cName, ':', '.');
- return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info";
- }
-}
+ SubstGlobal(cName, ':', '.');
+ return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info";
+ }
+}
diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h
index d7533be4d5..234ad42dff 100644
--- a/library/cpp/codecs/static/builder.h
+++ b/library/cpp/codecs/static/builder.h
@@ -1,29 +1,29 @@
-#pragma once
-
-#include "static.h"
-
+#pragma once
+
+#include "static.h"
+
#include <library/cpp/svnversion/svnversion.h>
-
-#include <util/datetime/base.h>
+
+#include <util/datetime/base.h>
#include <util/generic/string.h>
-#include <util/generic/vector.h>
-#include <util/string/builder.h>
-
-namespace NCodecs {
- struct TCodecBuildInfo {
- // optimal values from SEARCH-1655
+#include <util/generic/vector.h>
+#include <util/string/builder.h>
+
+namespace NCodecs {
+ struct TCodecBuildInfo {
+ // optimal values from SEARCH-1655
TString CodecName = "solar-8k-a:zstd08d-1";
- float SampleSizeMultiplier = 1;
-
- // debug info:
- time_t Timestamp = TInstant::Now().TimeT();
+ float SampleSizeMultiplier = 1;
+
+ // debug info:
+ time_t Timestamp = TInstant::Now().TimeT();
TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision()));
TString TrainingSetComment; // a human comment on the training data
TString TrainingSetResId; // sandbox resid of the training set
- };
-
+ };
+
TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&);
-
+
TString GetStandardFileName(const TStaticCodecInfo&);
-
-}
+
+}
diff --git a/library/cpp/codecs/static/common.h b/library/cpp/codecs/static/common.h
index 211de2a27d..84b0349d82 100644
--- a/library/cpp/codecs/static/common.h
+++ b/library/cpp/codecs/static/common.h
@@ -1,32 +1,32 @@
-#pragma once
-
-#include <util/string/hex.h>
-#include <util/digest/city.h>
-#include <util/system/byteorder.h>
-
-namespace NCodecs {
- template <class T>
- ui64 DataSignature(const T& t) {
- static_assert(!std::is_scalar<T>::value, "no scalars");
+#pragma once
+
+#include <util/string/hex.h>
+#include <util/digest/city.h>
+#include <util/system/byteorder.h>
+
+namespace NCodecs {
+ template <class T>
+ ui64 DataSignature(const T& t) {
+ static_assert(!std::is_scalar<T>::value, "no scalars");
return CityHash64(t.data(), t.size());
- }
-
- template <class T>
+ }
+
+ template <class T>
TString HexWriteScalar(T t) {
- static_assert(std::is_scalar<T>::value, "scalars only");
- t = LittleToBig(t);
+ static_assert(std::is_scalar<T>::value, "scalars only");
+ t = LittleToBig(t);
TString res = HexEncode(&t, sizeof(t));
- res.to_lower();
- return res;
- }
-
- template <class T>
- T HexReadScalar(TStringBuf s) {
- static_assert(std::is_scalar<T>::value, "scalars only");
- T t = 0;
+ res.to_lower();
+ return res;
+ }
+
+ template <class T>
+ T HexReadScalar(TStringBuf s) {
+ static_assert(std::is_scalar<T>::value, "scalars only");
+ T t = 0;
HexDecode(s.data(), Min(s.size(), sizeof(T)), &t);
- t = BigToLittle(t);
- return t;
- }
-
-}
+ t = BigToLittle(t);
+ return t;
+ }
+
+}
diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp
index 5b750b717e..0c50a1a5be 100644
--- a/library/cpp/codecs/static/example/example.cpp
+++ b/library/cpp/codecs/static/example/example.cpp
@@ -1,43 +1,43 @@
-#include "example.h"
-
+#include "example.h"
+
#include <library/cpp/codecs/static/static.h>
-
-#include <util/generic/yexception.h>
-
-extern "C" {
+
+#include <util/generic/yexception.h>
+
+extern "C" {
extern const ui8 codec_info_huff_20160707[];
extern const ui32 codec_info_huff_20160707Size;
extern const ui8 codec_info_sa_huff_20160707[];
extern const ui32 codec_info_sa_huff_20160707Size;
-};
-
-namespace NStaticCodecExample {
- static const NCodecs::TCodecConstPtr CODECS[] = {
- nullptr,
- NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size),
- NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size),
- };
-
- static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size");
-
- void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) {
- Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv);
- out.Clear();
- if (!in) {
- return;
- }
- CODECS[dv]->Encode(in, out);
- out.Append((char)dv);
- }
-
- void Decode(TBuffer& out, TStringBuf in) {
- out.Clear();
- if (!in) {
- return;
- }
- EDictVersion dv = (EDictVersion)in.back();
- Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv);
- in.Chop(1);
- CODECS[dv]->Decode(in, out);
- }
-}
+};
+
+namespace NStaticCodecExample {
+ static const NCodecs::TCodecConstPtr CODECS[] = {
+ nullptr,
+ NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size),
+ NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size),
+ };
+
+ static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size");
+
+ void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) {
+ Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv);
+ out.Clear();
+ if (!in) {
+ return;
+ }
+ CODECS[dv]->Encode(in, out);
+ out.Append((char)dv);
+ }
+
+ void Decode(TBuffer& out, TStringBuf in) {
+ out.Clear();
+ if (!in) {
+ return;
+ }
+ EDictVersion dv = (EDictVersion)in.back();
+ Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv);
+ in.Chop(1);
+ CODECS[dv]->Decode(in, out);
+ }
+}
diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h
index f9b3a7324b..070ca90f02 100644
--- a/library/cpp/codecs/static/example/example.h
+++ b/library/cpp/codecs/static/example/example.h
@@ -1,17 +1,17 @@
-#pragma once
-
-#include <util/generic/strbuf.h>
-#include <util/generic/buffer.h>
-
-namespace NStaticCodecExample {
+#pragma once
+
+#include <util/generic/strbuf.h>
+#include <util/generic/buffer.h>
+
+namespace NStaticCodecExample {
enum EDictVersion : ui8 {
DV_NULL = 0,
DV_HUFF_20160707,
DV_SA_HUFF_20160707,
DV_COUNT
- };
-
- void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707);
-
- void Decode(TBuffer&, TStringBuf);
-}
+ };
+
+ void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707);
+
+ void Decode(TBuffer&, TStringBuf);
+}
diff --git a/library/cpp/codecs/static/example/ya.make b/library/cpp/codecs/static/example/ya.make
index ca6c5fd900..85dc222624 100644
--- a/library/cpp/codecs/static/example/ya.make
+++ b/library/cpp/codecs/static/example/ya.make
@@ -1,24 +1,24 @@
-LIBRARY()
-
-OWNER(velavokr)
-
-SRCS(
- GLOBAL example.cpp
-)
-
-PEERDIR(
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ GLOBAL example.cpp
+)
+
+PEERDIR(
library/cpp/codecs
library/cpp/codecs/static
-)
-
-ARCHIVE_ASM(
+)
+
+ARCHIVE_ASM(
"solar-8k-a.huffman.1467494385.codec_info"
NAME codec_info_sa_huff_20160707
-)
-
-ARCHIVE_ASM(
+)
+
+ARCHIVE_ASM(
"huffman.1467494385.codec_info"
NAME codec_info_huff_20160707
-)
-
-END()
+)
+
+END()
diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp
index 44a07dd73a..d2c99a15ee 100644
--- a/library/cpp/codecs/static/static.cpp
+++ b/library/cpp/codecs/static/static.cpp
@@ -1,98 +1,98 @@
-#include "static.h"
-#include "common.h"
-
+#include "static.h"
+#include "common.h"
+
#include <library/cpp/codecs/static/static_codec_info.pb.h>
#include <library/cpp/archive/yarchive.h>
-
-#include <util/draft/datetime.h>
-
-#include <util/string/builder.h>
-#include <util/stream/buffer.h>
-#include <util/stream/mem.h>
-#include <util/string/hex.h>
-#include <util/ysaveload.h>
-
-namespace NCodecs {
+
+#include <util/draft/datetime.h>
+
+#include <util/string/builder.h>
+#include <util/stream/buffer.h>
+#include <util/stream/mem.h>
+#include <util/string/hex.h>
+#include <util/ysaveload.h>
+
+namespace NCodecs {
static constexpr TStringBuf STATIC_CODEC_INFO_MAGIC = "CodecInf";
-
- static TStringBuf GetStaticCodecInfoMagic() {
+
+ static TStringBuf GetStaticCodecInfoMagic() {
return STATIC_CODEC_INFO_MAGIC;
- }
-
+ }
+
void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo& info) {
- TBufferOutput bout;
+ TBufferOutput bout;
info.SerializeToArcadiaStream(&bout);
- ui64 hash = DataSignature(bout.Buffer());
- out.Write(GetStaticCodecInfoMagic());
- ::Save(&out, hash);
- ::Save(&out, bout.Buffer());
- }
-
+ ui64 hash = DataSignature(bout.Buffer());
+ out.Write(GetStaticCodecInfoMagic());
+ ::Save(&out, hash);
+ ::Save(&out, bout.Buffer());
+ }
+
TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in) {
- {
- TBuffer magic;
+ {
+ TBuffer magic;
magic.Resize(GetStaticCodecInfoMagic().size());
Y_ENSURE_EX(in.Read(magic.Data(), GetStaticCodecInfoMagic().size()) == GetStaticCodecInfoMagic().size(),
- TCodecException() << "bad codec info");
+ TCodecException() << "bad codec info");
Y_ENSURE_EX(TStringBuf(magic.data(), magic.size()) == GetStaticCodecInfoMagic(),
- TCodecException() << "bad codec info");
- }
-
- ui64 hash;
- ::Load(&in, hash);
- TBuffer info;
- ::Load(&in, info);
- Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info");
-
- TStaticCodecInfo result;
+ TCodecException() << "bad codec info");
+ }
+
+ ui64 hash;
+ ::Load(&in, hash);
+ TBuffer info;
+ ::Load(&in, info);
+ Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info");
+
+ TStaticCodecInfo result;
Y_ENSURE_EX(result.ParseFromArray(info.data(), info.size()), TCodecException() << "bad codec info");
-
- return result;
- }
-
+
+ return result;
+ }
+
TString SaveCodecInfoToString(const TStaticCodecInfo& info) {
- TStringStream s;
- SaveCodecInfoToStream(s, info);
- return s.Str();
- }
-
- TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) {
+ TStringStream s;
+ SaveCodecInfoToStream(s, info);
+ return s.Str();
+ }
+
+ TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) {
TMemoryInput m{data.data(), data.size()};
- return LoadCodecInfoFromStream(m);
- }
-
+ return LoadCodecInfoFromStream(m);
+ }
+
TString FormatCodecInfo(const TStaticCodecInfo& ci) {
- TStringBuilder s;
- s << "codec name: " << ci.GetDebugInfo().GetCodecName() << Endl;
- s << "codec hash: " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl;
- s << "dict size: " << ci.GetStoredCodec().Size() << Endl;
- s << "sample mult: " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl;
- s << "orig.compress: " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl;
- s << "timestamp: " << ci.GetDebugInfo().GetTimestamp() << " ("
+ TStringBuilder s;
+ s << "codec name: " << ci.GetDebugInfo().GetCodecName() << Endl;
+ s << "codec hash: " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl;
+ s << "dict size: " << ci.GetStoredCodec().Size() << Endl;
+ s << "sample mult: " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl;
+ s << "orig.compress: " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl;
+ s << "timestamp: " << ci.GetDebugInfo().GetTimestamp() << " ("
<< NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString()
<< ")" << Endl;
- s << "revision: " << ci.GetDebugInfo().GetRevisionInfo() << Endl;
- s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl;
- s << "training set resId: " << ci.GetDebugInfo().GetTrainingSetResId() << Endl;
- return s;
- }
-
+ s << "revision: " << ci.GetDebugInfo().GetRevisionInfo() << Endl;
+ s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl;
+ s << "training set resId: " << ci.GetDebugInfo().GetTrainingSetResId() << Endl;
+ return s;
+ }
+
TString LoadStringFromArchive(const ui8* begin, size_t size) {
- TArchiveReader ar(TBlob::NoCopy(begin, size));
- Y_VERIFY(ar.Count() == 1, "invalid number of entries");
- auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0));
+ TArchiveReader ar(TBlob::NoCopy(begin, size));
+ Y_VERIFY(ar.Count() == 1, "invalid number of entries");
+ auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0));
return TString{blob.AsCharPtr(), blob.Size()};
- }
-
- TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) {
- return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec());
- }
-
- TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) {
- const auto& data = LoadStringFromArchive(begin, size);
- const auto& info = LoadCodecInfoFromString(data);
- const auto& codec = RestoreCodecFromCodecInfo(info);
- Y_ENSURE_EX(codec, TCodecException() << "null codec");
- return codec;
- }
-}
+ }
+
+ TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) {
+ return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec());
+ }
+
+ TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) {
+ const auto& data = LoadStringFromArchive(begin, size);
+ const auto& info = LoadCodecInfoFromString(data);
+ const auto& codec = RestoreCodecFromCodecInfo(info);
+ Y_ENSURE_EX(codec, TCodecException() << "null codec");
+ return codec;
+ }
+}
diff --git a/library/cpp/codecs/static/static.h b/library/cpp/codecs/static/static.h
index c1eaed2a74..efa9c60c22 100644
--- a/library/cpp/codecs/static/static.h
+++ b/library/cpp/codecs/static/static.h
@@ -1,34 +1,34 @@
-#pragma once
-
+#pragma once
+
#include <library/cpp/codecs/codecs.h>
-
-#include <util/generic/strbuf.h>
+
+#include <util/generic/strbuf.h>
#include <util/generic/string.h>
#include <util/stream/output.h>
-
-namespace NCodecs {
- class TStaticCodecInfo;
-
- // load
-
- TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&);
-
- TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data);
-
+
+namespace NCodecs {
+ class TStaticCodecInfo;
+
+ // load
+
+ TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&);
+
+ TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data);
+
TString LoadStringFromArchive(const ui8* begin, size_t size);
-
- TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size);
-
- // save
-
+
+ TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size);
+
+ // save
+
TString SaveCodecInfoToString(const TStaticCodecInfo&);
-
+
void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo&);
-
- // misc
-
+
+ // misc
+
TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in);
-
+
TString FormatCodecInfo(const TStaticCodecInfo&);
-
-}
+
+}
diff --git a/library/cpp/codecs/static/static_codec_info.proto b/library/cpp/codecs/static/static_codec_info.proto
index 362abb4dad..178459784b 100644
--- a/library/cpp/codecs/static/static_codec_info.proto
+++ b/library/cpp/codecs/static/static_codec_info.proto
@@ -1,17 +1,17 @@
-package NCodecs;
-
-message TStaticCodecInfo {
- message TDebugInfo {
- optional string CodecName = 1; // the exact codec variant name
- optional uint64 Timestamp = 2; // when the codec was built
- optional string RevisionInfo = 3; // the arcadia revision info
- optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression
- optional float Compression = 5; // the compression on the training set ((raw_size - coded_size) / raw_size)
- optional string TrainingSetComment = 6; // a human readable description of the training set
- optional string TrainingSetResId = 7; // the training set sandbox resource id
- optional uint64 StoredCodecHash = 8; // cityhash64(data)
- }
-
- optional bytes StoredCodec = 1; // the data of the codec
- optional TDebugInfo DebugInfo = 2; // misc debug info which could be useful in finding whereabouts later
-}
+package NCodecs;
+
+message TStaticCodecInfo {
+ message TDebugInfo {
+ optional string CodecName = 1; // the exact codec variant name
+ optional uint64 Timestamp = 2; // when the codec was built
+ optional string RevisionInfo = 3; // the arcadia revision info
+ optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression
+ optional float Compression = 5; // the compression on the training set ((raw_size - coded_size) / raw_size)
+ optional string TrainingSetComment = 6; // a human readable description of the training set
+ optional string TrainingSetResId = 7; // the training set sandbox resource id
+ optional uint64 StoredCodecHash = 8; // cityhash64(data)
+ }
+
+ optional bytes StoredCodec = 1; // the data of the codec
+ optional TDebugInfo DebugInfo = 2; // misc debug info which could be useful in finding whereabouts later
+}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp
index fe77691280..cea40506e1 100644
--- a/library/cpp/codecs/static/tools/common/ct_common.cpp
+++ b/library/cpp/codecs/static/tools/common/ct_common.cpp
@@ -1,74 +1,74 @@
-#include "ct_common.h"
-
+#include "ct_common.h"
+
#include <library/cpp/codecs/codecs.h>
#include <library/cpp/codecs/static/static_codec_info.pb.h>
#include <library/cpp/string_utils/base64/base64.h>
-
+
#include <util/stream/output.h>
-#include <util/string/builder.h>
-#include <util/system/hp_timer.h>
-
-namespace NCodecs {
+#include <util/string/builder.h>
+#include <util/system/hp_timer.h>
+
+namespace NCodecs {
TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const {
- TStringBuilder s;
- s << "raw size/item: " << RawSizePerRecord() << Endl;
- s << "enc.size/item: " << EncSizePerRecord() << Endl;
- if (checkMode) {
- s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl;
- }
- s << "enc time us/item: " << EncTimePerRecordUS() << Endl;
- s << "dec time us/item: " << DecTimePerRecordUS() << Endl;
- s << "dict size: " << info.GetStoredCodec().Size() << Endl;
- s << "compression: " << AsPercent(Compression()) << " %" << Endl;
- if (checkMode) {
- s << "orig.compression: " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl;
- }
- return s;
- }
-
+ TStringBuilder s;
+ s << "raw size/item: " << RawSizePerRecord() << Endl;
+ s << "enc.size/item: " << EncSizePerRecord() << Endl;
+ if (checkMode) {
+ s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl;
+ }
+ s << "enc time us/item: " << EncTimePerRecordUS() << Endl;
+ s << "dec time us/item: " << DecTimePerRecordUS() << Endl;
+ s << "dict size: " << info.GetStoredCodec().Size() << Endl;
+ s << "compression: " << AsPercent(Compression()) << " %" << Endl;
+ if (checkMode) {
+ s << "orig.compression: " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl;
+ }
+ return s;
+ }
+
TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) {
- TComprStats stats;
-
- TBuffer encodeBuffer;
- TBuffer decodeBuffer;
- for (const auto& data : input) {
- encodeBuffer.Clear();
- decodeBuffer.Clear();
-
- stats.Records += 1;
+ TComprStats stats;
+
+ TBuffer encodeBuffer;
+ TBuffer decodeBuffer;
+ for (const auto& data : input) {
+ encodeBuffer.Clear();
+ decodeBuffer.Clear();
+
+ stats.Records += 1;
stats.RawSize += data.size();
-
- THPTimer timer;
- c.Encode(data, encodeBuffer);
+
+ THPTimer timer;
+ c.Encode(data, encodeBuffer);
stats.EncSize += encodeBuffer.size();
- stats.EncSeconds += timer.PassedReset();
-
+ stats.EncSeconds += timer.PassedReset();
+
c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer);
- stats.DecSeconds += timer.PassedReset();
+ stats.DecSeconds += timer.PassedReset();
Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records);
- }
-
- return stats;
- }
-
+ }
+
+ return stats;
+ }
+
void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) {
TStringBuf bin(blob.AsCharPtr(), blob.Size());
- TStringBuf line;
+ TStringBuf line;
TString buffer;
- while (bin.ReadLine(line)) {
- if (DSF_BASE64_LF == fmt) {
- Base64Decode(line, buffer);
- line = buffer;
- }
- if (!line) {
- continue;
- }
+ while (bin.ReadLine(line)) {
+ if (DSF_BASE64_LF == fmt) {
+ Base64Decode(line, buffer);
+ line = buffer;
+ }
+ if (!line) {
+ continue;
+ }
result.emplace_back(line.data(), line.size());
- }
- }
-
+ }
+ }
+
TBlob GetInputBlob(const TString& dataFile) {
- return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin);
- }
-
-}
+ return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin);
+ }
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h
index 9d3dcbda93..de531b27e6 100644
--- a/library/cpp/codecs/static/tools/common/ct_common.h
+++ b/library/cpp/codecs/static/tools/common/ct_common.h
@@ -1,75 +1,75 @@
-#pragma once
-
+#pragma once
+
#include <util/generic/string.h>
-#include <util/generic/vector.h>
-#include <util/memory/blob.h>
-#include <cmath>
-
-namespace NCodecs {
- class TStaticCodecInfo;
- class ICodec;
-
- struct TComprStats {
- double EncSeconds = 0;
- double DecSeconds = 0;
- size_t Records = 0;
- size_t RawSize = 0;
- size_t EncSize = 0;
-
- static double Round(double n, size_t decPlaces = 2) {
- double p = pow(10, decPlaces);
- return round(n * p) / p;
- }
-
- static double AsPercent(double n) {
- return Round(n * 100);
- }
-
- static double AsMicroSecond(double s) {
- return s * 1000000;
- }
-
- double PerRecord(double n) const {
- return Round((double)(Records ? n / Records : 0));
- }
-
- double Compression() const {
- return ((double)RawSize - (double)EncSize) / RawSize;
- }
-
- double EncTimePerRecordUS() const {
- return PerRecord(AsMicroSecond(EncSeconds));
- }
-
- double DecTimePerRecordUS() const {
- return PerRecord(AsMicroSecond(DecSeconds));
- }
-
- double RawSizePerRecord() const {
- return PerRecord(RawSize);
- }
-
- double EncSizePerRecord() const {
- return PerRecord(EncSize);
- }
-
- double OldEncSizePerRecord(double compr) const {
- return PerRecord((1 - compr) * RawSize);
- }
-
+#include <util/generic/vector.h>
+#include <util/memory/blob.h>
+#include <cmath>
+
+namespace NCodecs {
+ class TStaticCodecInfo;
+ class ICodec;
+
+ struct TComprStats {
+ double EncSeconds = 0;
+ double DecSeconds = 0;
+ size_t Records = 0;
+ size_t RawSize = 0;
+ size_t EncSize = 0;
+
+ static double Round(double n, size_t decPlaces = 2) {
+ double p = pow(10, decPlaces);
+ return round(n * p) / p;
+ }
+
+ static double AsPercent(double n) {
+ return Round(n * 100);
+ }
+
+ static double AsMicroSecond(double s) {
+ return s * 1000000;
+ }
+
+ double PerRecord(double n) const {
+ return Round((double)(Records ? n / Records : 0));
+ }
+
+ double Compression() const {
+ return ((double)RawSize - (double)EncSize) / RawSize;
+ }
+
+ double EncTimePerRecordUS() const {
+ return PerRecord(AsMicroSecond(EncSeconds));
+ }
+
+ double DecTimePerRecordUS() const {
+ return PerRecord(AsMicroSecond(DecSeconds));
+ }
+
+ double RawSizePerRecord() const {
+ return PerRecord(RawSize);
+ }
+
+ double EncSizePerRecord() const {
+ return PerRecord(EncSize);
+ }
+
+ double OldEncSizePerRecord(double compr) const {
+ return PerRecord((1 - compr) * RawSize);
+ }
+
TString Format(const TStaticCodecInfo&, bool checkMode) const;
- };
-
+ };
+
TComprStats TestCodec(const ICodec&, const TVector<TString>& data);
-
- enum EDataStreamFormat {
- DSF_NONE,
- DSF_PLAIN_LF /* "plain" */,
- DSF_BASE64_LF /* "base64" */,
- };
-
+
+ enum EDataStreamFormat {
+ DSF_NONE,
+ DSF_PLAIN_LF /* "plain" */,
+ DSF_BASE64_LF /* "base64" */,
+ };
+
void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&);
-
+
TBlob GetInputBlob(const TString& dataFile);
-
-}
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make
index d624222dad..5f575a2f28 100644
--- a/library/cpp/codecs/static/tools/common/ya.make
+++ b/library/cpp/codecs/static/tools/common/ya.make
@@ -1,19 +1,19 @@
-LIBRARY()
-
+LIBRARY()
+
OWNER(velavokr)
-
-SRCS(
- ct_common.cpp
-)
-
-PEERDIR(
+
+SRCS(
+ ct_common.cpp
+)
+
+PEERDIR(
library/cpp/codecs
library/cpp/codecs/static
library/cpp/getopt/small
library/cpp/string_utils/base64
- util/draft
-)
-
+ util/draft
+)
+
GENERATE_ENUM_SERIALIZATION(ct_common.h)
-
-END()
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README
index 723a68300b..c66703227d 100644
--- a/library/cpp/codecs/static/tools/static_codec_checker/README
+++ b/library/cpp/codecs/static/tools/static_codec_checker/README
@@ -1,4 +1,4 @@
This is a viewer for generated codec and utility for verification of the compression quality on a new data.
-
+
Usage:
-static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt
+static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
index 9c8d568d82..5ae901d8f8 100644
--- a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
+++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
@@ -3,25 +3,25 @@
#include <library/cpp/codecs/static/static_codec_info.pb.h>
#include <library/cpp/codecs/codecs.h>
#include <library/cpp/getopt/small/last_getopt.h>
-
-#include <util/digest/city.h>
-#include <util/generic/yexception.h>
-#include <util/stream/file.h>
-#include <util/stream/buffer.h>
-#include <util/stream/format.h>
-#include <util/string/builder.h>
-
-int main(int argc, char** argv) {
- NCodecs::TCodecPtr codecPtr;
- NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
+
+#include <util/digest/city.h>
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/stream/buffer.h>
+#include <util/stream/format.h>
+#include <util/string/builder.h>
+
+int main(int argc, char** argv) {
+ NCodecs::TCodecPtr codecPtr;
+ NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
TString codecFile;
- bool testCompression = false;
-
- auto opts = NLastGetopt::TOpts::Default();
- opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator.");
- opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");
- NCodecs::TStaticCodecInfo codec;
-
+ bool testCompression = false;
+
+ auto opts = NLastGetopt::TOpts::Default();
+ opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator.");
+ opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");
+ NCodecs::TStaticCodecInfo codec;
+
opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) {
codecFile = name;
codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll()));
@@ -29,45 +29,45 @@ int main(int argc, char** argv) {
})
.Required()
.Help(".codec_info file with serialized static data for codec");
-
+
opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance");
-
+
opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format");
-
- opts.SetFreeArgsMin(0);
- opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");
-
- NLastGetopt::TOptsParseResult res(&opts, argc, argv);
-
- Cout << codecFile << Endl;
- Cout << NCodecs::FormatCodecInfo(codec) << Endl;
-
- if (testCompression) {
- if (NCodecs::DSF_NONE == fmt) {
- Cerr << "Specify format (-f|--format) for testing set input" << Endl;
- exit(1);
- }
-
- Cout << "Reading testing set data ... " << Flush;
-
+
+ opts.SetFreeArgsMin(0);
+ opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");
+
+ NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+
+ Cout << codecFile << Endl;
+ Cout << NCodecs::FormatCodecInfo(codec) << Endl;
+
+ if (testCompression) {
+ if (NCodecs::DSF_NONE == fmt) {
+ Cerr << "Specify format (-f|--format) for testing set input" << Endl;
+ exit(1);
+ }
+
+ Cout << "Reading testing set data ... " << Flush;
+
TVector<TString> allData;
- for (const auto& freeArg : res.GetFreeArgs()) {
- NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
- }
-
- if (!res.GetFreeArgs()) {
- NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
- }
-
- Cout << "Done" << Endl << Endl;
-
- Cout << "records: " << allData.size() << Endl;
- Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
-
- Cout << "Testing compression ... " << Flush;
- auto stats = NCodecs::TestCodec(*codecPtr, allData);
- Cout << "Done" << Endl << Endl;
-
- Cout << stats.Format(codec, true) << Endl;
- }
-}
+ for (const auto& freeArg : res.GetFreeArgs()) {
+ NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
+ }
+
+ if (!res.GetFreeArgs()) {
+ NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
+ }
+
+ Cout << "Done" << Endl << Endl;
+
+ Cout << "records: " << allData.size() << Endl;
+ Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
+
+ Cout << "Testing compression ... " << Flush;
+ auto stats = NCodecs::TestCodec(*codecPtr, allData);
+ Cout << "Done" << Endl << Endl;
+
+ Cout << stats.Format(codec, true) << Endl;
+ }
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
index 90e06ca448..86b73dff6c 100644
--- a/library/cpp/codecs/static/tools/static_codec_checker/ya.make
+++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
@@ -1,16 +1,16 @@
-PROGRAM()
-
+PROGRAM()
+
OWNER(velavokr)
-
-SRCS(
- static_codec_checker.cpp
-)
-
-PEERDIR(
+
+SRCS(
+ static_codec_checker.cpp
+)
+
+PEERDIR(
library/cpp/codecs
library/cpp/codecs/static
library/cpp/codecs/static/tools/common
library/cpp/getopt/small
-)
-
-END()
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README
index e6bb52b959..f0fffd745a 100644
--- a/library/cpp/codecs/static/tools/static_codec_generator/README
+++ b/library/cpp/codecs/static/tools/static_codec_generator/README
@@ -1,4 +1,4 @@
This is a utility for reproducible teaching of a codec. And also for saving it into a file with a unique name for a static compilation as a resource.
-
+
Usage:
-static_codec_generator -t -m 'the training data description' -f plain samples.txt
+static_codec_generator -t -m 'the training data description' -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
index 45fdb5c5fe..b37a0f686d 100644
--- a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
+++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
@@ -2,81 +2,81 @@
#include <library/cpp/codecs/static/static_codec_info.pb.h>
#include <library/cpp/codecs/static/builder.h>
#include <library/cpp/codecs/codecs.h>
-
+
#include <library/cpp/getopt/small/last_getopt.h>
-
-#include <util/generic/yexception.h>
-#include <util/stream/file.h>
-#include <util/string/builder.h>
-
-int main(int argc, char** argv) {
- NCodecs::TCodecBuildInfo info;
- NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
-
- auto opts = NLastGetopt::TOpts::Default();
- opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");
- opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin");
-
+
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/string/builder.h>
+
+int main(int argc, char** argv) {
+ NCodecs::TCodecBuildInfo info;
+ NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
+
+ auto opts = NLastGetopt::TOpts::Default();
+ opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");
+ opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin");
+
opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set");
-
+
opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set");
-
+
opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName);
-
+
opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size");
-
+
opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format");
-
+
opts.AddLongOption("list-codecs").NoArgument().Handler0([]() {
Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;
exit(0);
})
.Optional()
.Help("list available codecs");
-
+
opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info
-
+
opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info
-
- opts.SetFreeArgsMin(0);
- opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files");
-
- NLastGetopt::TOptsParseResult res(&opts, argc, argv);
-
- Cout << "Reading training set data ... " << Flush;
+
+ opts.SetFreeArgsMin(0);
+ opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files");
+
+ NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+
+ Cout << "Reading training set data ... " << Flush;
TVector<TString> allData;
- for (const auto& freeArg : res.GetFreeArgs()) {
- NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
- }
-
- if (!res.GetFreeArgs()) {
- NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
- }
- Cout << "Done" << Endl << Endl;
-
- Cout << "records: " << allData.size() << Endl;
- Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
-
- Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush;
- auto codec = NCodecs::BuildStaticCodec(allData, info);
- Cout << "Done" << Endl;
-
+ for (const auto& freeArg : res.GetFreeArgs()) {
+ NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
+ }
+
+ if (!res.GetFreeArgs()) {
+ NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
+ }
+ Cout << "Done" << Endl << Endl;
+
+ Cout << "records: " << allData.size() << Endl;
+ Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
+
+ Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush;
+ auto codec = NCodecs::BuildStaticCodec(allData, info);
+ Cout << "Done" << Endl;
+
TString codecName = NCodecs::GetStandardFileName(codec);
- NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
-
- Cout << "Testing compression ... " << Flush;
- auto stats = NCodecs::TestCodec(*codecPtr, allData);
- Cout << "Done" << Endl << Endl;
-
- codec.MutableDebugInfo()->SetCompression(stats.Compression());
-
- Cout << stats.Format(codec, false) << Endl;
-
- Cout << "Saving as " << codecName << " ... " << Flush;
- {
+ NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
+
+ Cout << "Testing compression ... " << Flush;
+ auto stats = NCodecs::TestCodec(*codecPtr, allData);
+ Cout << "Done" << Endl << Endl;
+
+ codec.MutableDebugInfo()->SetCompression(stats.Compression());
+
+ Cout << stats.Format(codec, false) << Endl;
+
+ Cout << "Saving as " << codecName << " ... " << Flush;
+ {
TUnbufferedFileOutput fout{codecName};
- NCodecs::SaveCodecInfoToStream(fout, codec);
- fout.Finish();
- }
- Cout << "Done" << Endl << Endl;
-}
+ NCodecs::SaveCodecInfoToStream(fout, codec);
+ fout.Finish();
+ }
+ Cout << "Done" << Endl << Endl;
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
index efbc440dd1..21750dde49 100644
--- a/library/cpp/codecs/static/tools/static_codec_generator/ya.make
+++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
@@ -1,17 +1,17 @@
-PROGRAM()
-
+PROGRAM()
+
OWNER(velavokr)
-
-SRCS(
- static_codec_generator.cpp
-)
-
-PEERDIR(
+
+SRCS(
+ static_codec_generator.cpp
+)
+
+PEERDIR(
library/cpp/codecs
library/cpp/codecs/static
library/cpp/codecs/static/tools/common
library/cpp/digest/md5
library/cpp/getopt/small
-)
-
-END()
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
index db4140e370..a5baa262f7 100644
--- a/library/cpp/codecs/static/tools/tests/static_codec_tools.py
+++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
@@ -1,18 +1,18 @@
-#!/usr/bin/env python
-
-import yatest.common as tt
-import os.path as op
-
-def test_static_codec_tools():
+#!/usr/bin/env python
+
+import yatest.common as tt
+import os.path as op
+
+def test_static_codec_tools():
tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")]
- + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1",
- "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"],
- timeout=60)
- assert(op.exists("solar-8k-a.huffman.1467494385.codec_info"))
+ + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1",
+ "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"],
+ timeout=60)
+ assert(op.exists("solar-8k-a.huffman.1467494385.codec_info"))
tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"),
- args=["-c", "solar-8k-a.huffman.1467494385.codec_info"],
- timeout=60)
+ args=["-c", "solar-8k-a.huffman.1467494385.codec_info"],
+ timeout=60)
tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")]
- + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"],
- timeout=60)
- return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info")
+ + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"],
+ timeout=60)
+ return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info")
diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make
index c5324eaf53..5555d90cae 100644
--- a/library/cpp/codecs/static/tools/tests/ya.make
+++ b/library/cpp/codecs/static/tools/tests/ya.make
@@ -1,20 +1,20 @@
PY2TEST()
-
-OWNER(velavokr)
-
-TEST_SRCS(static_codec_tools.py)
-
-DATA(sbr://143310406)
-
-TIMEOUT(4200)
-
+
+OWNER(velavokr)
+
+TEST_SRCS(static_codec_tools.py)
+
+DATA(sbr://143310406)
+
+TIMEOUT(4200)
+
TAG(ya:not_autocheck)
-DEPENDS(
+DEPENDS(
library/cpp/codecs/static/tools/static_codec_checker
library/cpp/codecs/static/tools/static_codec_generator
-)
-
-
+)
+
+
-END()
+END()
diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make
index dd3e8437aa..ab72769153 100644
--- a/library/cpp/codecs/static/tools/ya.make
+++ b/library/cpp/codecs/static/tools/ya.make
@@ -1,5 +1,5 @@
-RECURSE(
- common
- static_codec_generator
- static_codec_checker
-)
+RECURSE(
+ common
+ static_codec_generator
+ static_codec_checker
+)
diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp
index b47c279ed1..48d5c98d5d 100644
--- a/library/cpp/codecs/static/ut/builder_ut.cpp
+++ b/library/cpp/codecs/static/ut/builder_ut.cpp
@@ -1,57 +1,57 @@
#include <library/cpp/testing/unittest/registar.h>
#include <library/cpp/codecs/static/builder.h>
#include <library/cpp/codecs/static/static_codec_info.pb.h>
-#include <util/string/vector.h>
-
+#include <util/string/vector.h>
+
class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase {
- UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest)
+ UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest)
UNIT_TEST(TestBuild)
- UNIT_TEST_SUITE_END();
+ UNIT_TEST_SUITE_END();
-private:
+private:
TVector<TString> PrepareData() {
TVector<TString> data;
- for (ui32 i = 'a'; i <= 'z'; ++i) {
+ for (ui32 i = 'a'; i <= 'z'; ++i) {
data.push_back(TString(1, (char)i));
- }
- return data;
- }
-
- void TestBuild() {
+ }
+ return data;
+ }
+
+ void TestBuild() {
TVector<TString> data;
- NCodecs::TCodecBuildInfo info;
- info.CodecName = "huffman";
- info.SampleSizeMultiplier = 2;
- info.Timestamp = 1467494385;
- info.RevisionInfo = "r2385905";
- info.TrainingSetComment = "some dummy data";
- info.TrainingSetResId = "sbr://1234";
- auto res = NCodecs::BuildStaticCodec(PrepareData(), info);
- UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(),
- "StoredCodec: \"\\007\\000huffman@S\\000a"
- "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o"
- "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>"
- "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8."
- "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7"
- "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0"
- "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9"
- "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" "
- "DebugInfo { "
- "CodecName: \"huffman\" "
- "Timestamp: 1467494385 "
- "RevisionInfo: \"r2385905\" "
- "SampleSizeMultiplier: 2 "
- "TrainingSetComment: \"some dummy data\" "
- "TrainingSetResId: \"sbr://1234\" "
- "StoredCodecHash: 2509195835471488613 "
- "}");
-
- UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info");
- UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL);
-
- auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res));
- UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString());
- }
-};
-
-UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest);
+ NCodecs::TCodecBuildInfo info;
+ info.CodecName = "huffman";
+ info.SampleSizeMultiplier = 2;
+ info.Timestamp = 1467494385;
+ info.RevisionInfo = "r2385905";
+ info.TrainingSetComment = "some dummy data";
+ info.TrainingSetResId = "sbr://1234";
+ auto res = NCodecs::BuildStaticCodec(PrepareData(), info);
+ UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(),
+ "StoredCodec: \"\\007\\000huffman@S\\000a"
+ "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o"
+ "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>"
+ "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8."
+ "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7"
+ "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0"
+ "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9"
+ "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" "
+ "DebugInfo { "
+ "CodecName: \"huffman\" "
+ "Timestamp: 1467494385 "
+ "RevisionInfo: \"r2385905\" "
+ "SampleSizeMultiplier: 2 "
+ "TrainingSetComment: \"some dummy data\" "
+ "TrainingSetResId: \"sbr://1234\" "
+ "StoredCodecHash: 2509195835471488613 "
+ "}");
+
+ UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info");
+ UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL);
+
+ auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res));
+ UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString());
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest);
diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp
index 57e1e62887..fda9ffcccb 100644
--- a/library/cpp/codecs/static/ut/static_ut.cpp
+++ b/library/cpp/codecs/static/ut/static_ut.cpp
@@ -1,27 +1,27 @@
#include <library/cpp/testing/unittest/registar.h>
#include <library/cpp/codecs/static/example/example.h>
-
+
class TStaticCodecUsageTest: public NUnitTest::TTestBase {
- UNIT_TEST_SUITE(TStaticCodecUsageTest)
+ UNIT_TEST_SUITE(TStaticCodecUsageTest)
UNIT_TEST(TestUsage)
- UNIT_TEST_SUITE_END();
+ UNIT_TEST_SUITE_END();
-private:
- void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) {
+private:
+ void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) {
const TStringBuf letov = "Всё идёт по плану";
-
- TBuffer outEnc, outDec;
- NStaticCodecExample::Encode(outEnc, letov, dv);
+
+ TBuffer outEnc, outDec;
+ NStaticCodecExample::Encode(outEnc, letov, dv);
NStaticCodecExample::Decode(outDec, TStringBuf{outEnc.data(), outEnc.size()});
-
- UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize);
+
+ UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize);
UNIT_ASSERT_EQUAL(TStringBuf(outDec.data(), outDec.size()), letov);
- }
-
- void TestUsage() {
- DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u);
- DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u);
- }
-};
-
-UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest)
+ }
+
+ void TestUsage() {
+ DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u);
+ DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u);
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest)
diff --git a/library/cpp/codecs/static/ut/ya.make b/library/cpp/codecs/static/ut/ya.make
index b9116097d8..5bb2017fac 100644
--- a/library/cpp/codecs/static/ut/ya.make
+++ b/library/cpp/codecs/static/ut/ya.make
@@ -1,14 +1,14 @@
UNITTEST_FOR(library/cpp/codecs/static)
-
-OWNER(velavokr)
-
-SRCS(
- builder_ut.cpp
- static_ut.cpp
-)
-
-PEERDIR(
+
+OWNER(velavokr)
+
+SRCS(
+ builder_ut.cpp
+ static_ut.cpp
+)
+
+PEERDIR(
library/cpp/codecs/static/example
-)
-
-END()
+)
+
+END()
diff --git a/library/cpp/codecs/static/ya.make b/library/cpp/codecs/static/ya.make
index 00e00fd8d4..a2698b9432 100644
--- a/library/cpp/codecs/static/ya.make
+++ b/library/cpp/codecs/static/ya.make
@@ -1,18 +1,18 @@
-LIBRARY()
-
-OWNER(velavokr)
-
-SRCS(
- builder.cpp
- static_codec_info.proto
- static.cpp
-)
-
-PEERDIR(
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ builder.cpp
+ static_codec_info.proto
+ static.cpp
+)
+
+PEERDIR(
library/cpp/codecs
library/cpp/archive
library/cpp/svnversion
- util/draft
-)
-
-END()
+ util/draft
+)
+
+END()
diff --git a/library/cpp/codecs/tls_cache.cpp b/library/cpp/codecs/tls_cache.cpp
index 0a1b32bda1..d54339d869 100644
--- a/library/cpp/codecs/tls_cache.cpp
+++ b/library/cpp/codecs/tls_cache.cpp
@@ -1,4 +1,4 @@
-#include "tls_cache.h"
-
-namespace NCodecs {
-}
+#include "tls_cache.h"
+
+namespace NCodecs {
+}
diff --git a/library/cpp/codecs/tls_cache.h b/library/cpp/codecs/tls_cache.h
index 0184e4bb6c..fa166729c5 100644
--- a/library/cpp/codecs/tls_cache.h
+++ b/library/cpp/codecs/tls_cache.h
@@ -1,100 +1,100 @@
-#pragma once
-
-#include <util/generic/buffer.h>
-#include <util/generic/deque.h>
-#include <util/generic/noncopyable.h>
-#include <util/generic/strbuf.h>
-#include <util/system/tls.h>
-#include <util/thread/singleton.h>
-
-namespace NCodecs {
- template <class TItem>
- struct TClear {
- void operator()(TItem& item) const {
- item.Clear();
- }
- };
-
+#pragma once
+
+#include <util/generic/buffer.h>
+#include <util/generic/deque.h>
+#include <util/generic/noncopyable.h>
+#include <util/generic/strbuf.h>
+#include <util/system/tls.h>
+#include <util/thread/singleton.h>
+
+namespace NCodecs {
+ template <class TItem>
+ struct TClear {
+ void operator()(TItem& item) const {
+ item.Clear();
+ }
+ };
+
template <class TItem, class TCleaner = TClear<TItem>>
- class TTlsCache {
- using TSelf = TTlsCache<TItem, TCleaner>;
-
+ class TTlsCache {
+ using TSelf = TTlsCache<TItem, TCleaner>;
+
struct TItemHolder: public TIntrusiveListItem<TItemHolder> {
- TItemHolder(TSelf& factory)
- : Factory(factory)
+ TItemHolder(TSelf& factory)
+ : Factory(factory)
{
}
-
- void Release() {
- Factory.Release(*this);
- }
-
- TSelf& Factory;
- TItem Item;
- };
-
- class TItemGuard {
- public:
- explicit TItemGuard(TSelf& fact)
- : Holder(fact.Acquire())
+
+ void Release() {
+ Factory.Release(*this);
+ }
+
+ TSelf& Factory;
+ TItem Item;
+ };
+
+ class TItemGuard {
+ public:
+ explicit TItemGuard(TSelf& fact)
+ : Holder(fact.Acquire())
{
}
-
+
TItemGuard(TItemGuard&& other) noexcept {
- *this = std::move(other);
- }
-
+ *this = std::move(other);
+ }
+
TItemGuard& operator=(TItemGuard&& other) noexcept {
- if (&other != this) {
- std::swap(Holder, other.Holder);
- }
- return *this;
- }
-
- ~TItemGuard() {
- if (Holder) {
- Holder->Release();
- }
- }
-
- TItem& Get() & {
- Y_ASSERT(Holder);
- return Holder->Item;
- }
-
- TItem& Get() && = delete;
-
- private:
- TItemHolder* Holder = nullptr;
- };
-
- public:
- TItemGuard Item() {
- return TItemGuard(*this);
- }
-
- static TSelf& TlsInstance() {
- return *FastTlsSingleton<TSelf>();
- }
-
- private:
- TItemHolder* Acquire() {
- if (Free.Empty()) {
- return new TItemHolder(*this);
- } else {
- return Free.PopBack();
- }
- }
-
- void Release(TItemHolder& item) {
- Cleaner(item.Item);
- Free.PushBack(&item);
- }
-
- private:
- TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free;
- TCleaner Cleaner;
- };
-
- using TBufferTlsCache = TTlsCache<TBuffer>;
-}
+ if (&other != this) {
+ std::swap(Holder, other.Holder);
+ }
+ return *this;
+ }
+
+ ~TItemGuard() {
+ if (Holder) {
+ Holder->Release();
+ }
+ }
+
+ TItem& Get() & {
+ Y_ASSERT(Holder);
+ return Holder->Item;
+ }
+
+ TItem& Get() && = delete;
+
+ private:
+ TItemHolder* Holder = nullptr;
+ };
+
+ public:
+ TItemGuard Item() {
+ return TItemGuard(*this);
+ }
+
+ static TSelf& TlsInstance() {
+ return *FastTlsSingleton<TSelf>();
+ }
+
+ private:
+ TItemHolder* Acquire() {
+ if (Free.Empty()) {
+ return new TItemHolder(*this);
+ } else {
+ return Free.PopBack();
+ }
+ }
+
+ void Release(TItemHolder& item) {
+ Cleaner(item.Item);
+ Free.PushBack(&item);
+ }
+
+ private:
+ TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free;
+ TCleaner Cleaner;
+ };
+
+ using TBufferTlsCache = TTlsCache<TBuffer>;
+}
diff --git a/library/cpp/codecs/ut/codecs_ut.cpp b/library/cpp/codecs/ut/codecs_ut.cpp
index caf6089aef..1938202400 100644
--- a/library/cpp/codecs/ut/codecs_ut.cpp
+++ b/library/cpp/codecs/ut/codecs_ut.cpp
@@ -4,15 +4,15 @@
#include <library/cpp/codecs/solar_codec.h>
#include <library/cpp/codecs/zstd_dict_codec.h>
#include <library/cpp/codecs/comptable_codec.h>
-
+
#include <library/cpp/testing/unittest/registar.h>
-
-#include <util/generic/buffer.h>
-#include <util/string/util.h>
-#include <util/string/hex.h>
+
+#include <util/generic/buffer.h>
+#include <util/string/util.h>
+#include <util/string/hex.h>
#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h>
-
-namespace {
+
+namespace {
const char* TextValues[] = {
"! сентября газета",
"!(возмездие это)!",
@@ -855,328 +855,328 @@ namespace {
"lymphomatoid papulosis",
"sez.com",
};
-}
-
-class TCodecsTest: public TTestBase {
+}
+
+class TCodecsTest: public TTestBase {
UNIT_TEST_SUITE(TCodecsTest);
- UNIT_TEST(TestPipeline)
- UNIT_TEST(TestDelta)
- UNIT_TEST(TestHuffman)
- UNIT_TEST(TestZStdDict)
- UNIT_TEST(TestCompTable)
+ UNIT_TEST(TestPipeline)
+ UNIT_TEST(TestDelta)
+ UNIT_TEST(TestHuffman)
+ UNIT_TEST(TestZStdDict)
+ UNIT_TEST(TestCompTable)
UNIT_TEST(TestHuffmanLearnByFreqs)
- UNIT_TEST(TestSolar)
- UNIT_TEST(TestPFor)
- UNIT_TEST(TestRegistry)
-
+ UNIT_TEST(TestSolar)
+ UNIT_TEST(TestPFor)
+ UNIT_TEST(TestRegistry)
+
UNIT_TEST_SUITE_END();
-
-private:
+
+private:
TString PrintError(TStringBuf learn, TStringBuf test, TStringBuf codec, ui32 i) {
TString s;
- TStringOutput sout(s);
+ TStringOutput sout(s);
sout << codec << ": " << i << ", "
<< "\n";
sout << HexEncode(learn.data(), learn.size()); //NEscJ::EscapeJ<true>(learn, sout);
- sout << " != \n";
+ sout << " != \n";
sout << HexEncode(test.data(), test.size()); //NEscJ::EscapeJ<true>(test, sout);
-
- if (s.Size() > 1536) {
+
+ if (s.Size() > 1536) {
TString res = s.substr(0, 512);
- res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">...");
- res.append(s.substr(s.size() - 512));
- }
-
- return s;
- }
-
- TStringBuf AsStrBuf(const TBuffer& b) {
+ res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">...");
+ res.append(s.substr(s.size() - 512));
+ }
+
+ return s;
+ }
+
+ TStringBuf AsStrBuf(const TBuffer& b) {
return TStringBuf(b.data(), b.size());
- }
-
- template <typename TCodec, bool testsaveload>
+ }
+
+ template <typename TCodec, bool testsaveload>
void TestCodec(const TVector<TBuffer>& inlearn = TVector<TBuffer>(), const TVector<TBuffer>& in = TVector<TBuffer>(), NCodecs::TCodecPtr c = new TCodec) {
- using namespace NCodecs;
-
- TBuffer buff;
-
- {
+ using namespace NCodecs;
+
+ TBuffer buff;
+
+ {
TVector<TBuffer> out;
-
- c->Learn(inlearn.begin(), inlearn.end());
-
- if (testsaveload) {
- {
- TBufferOutput bout(buff);
- ICodec::Store(&bout, c);
- }
-
- {
- TBufferInput bin(buff);
- c = ICodec::Restore(&bin);
+
+ c->Learn(inlearn.begin(), inlearn.end());
+
+ if (testsaveload) {
+ {
+ TBufferOutput bout(buff);
+ ICodec::Store(&bout, c);
+ }
+
+ {
+ TBufferInput bin(buff);
+ c = ICodec::Restore(&bin);
UNIT_ASSERT(c->AlreadyTrained());
- }
- }
-
- {
- size_t insz = 0;
- size_t outsz = buff.Size();
-
- for (ui32 i = 0; i < inlearn.size(); ++i) {
+ }
+ }
+
+ {
+ size_t insz = 0;
+ size_t outsz = buff.Size();
+
+ for (ui32 i = 0; i < inlearn.size(); ++i) {
out.emplace_back();
- c->Encode(AsStrBuf(inlearn[i]), out[i]);
-
- insz += inlearn[i].Size();
- outsz += out[i].Size();
- }
-
- TBuffer vecl;
- for (ui32 i = 0; i < out.size(); ++i) {
- vecl.Clear();
- c->Decode(AsStrBuf(out[i]), vecl);
-
- UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl),
+ c->Encode(AsStrBuf(inlearn[i]), out[i]);
+
+ insz += inlearn[i].Size();
+ outsz += out[i].Size();
+ }
+
+ TBuffer vecl;
+ for (ui32 i = 0; i < out.size(); ++i) {
+ vecl.Clear();
+ c->Decode(AsStrBuf(out[i]), vecl);
+
+ UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl),
PrintError(TStringBuf(inlearn[i].data(), inlearn[i].size()),
TStringBuf(vecl.data(), vecl.size()), c->GetName(), i));
- }
- }
- }
-
- {
- if (testsaveload) {
- TBufferInput bin(buff);
- c = ICodec::Restore(&bin);
- }
-
- size_t insz = 0;
- size_t outsz = buff.Size();
-
- TBuffer out, in1;
- for (ui32 i = 0; i < in.size(); ++i) {
- out.Clear();
- in1.Clear();
- c->Encode(AsStrBuf(in[i]), out);
- insz += in[i].Size();
- outsz += out.Size();
- c->Decode(AsStrBuf(out), in1);
- UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1),
+ }
+ }
+ }
+
+ {
+ if (testsaveload) {
+ TBufferInput bin(buff);
+ c = ICodec::Restore(&bin);
+ }
+
+ size_t insz = 0;
+ size_t outsz = buff.Size();
+
+ TBuffer out, in1;
+ for (ui32 i = 0; i < in.size(); ++i) {
+ out.Clear();
+ in1.Clear();
+ c->Encode(AsStrBuf(in[i]), out);
+ insz += in[i].Size();
+ outsz += out.Size();
+ c->Decode(AsStrBuf(out), in1);
+ UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1),
PrintError(TStringBuf(in[i].data(), in[i].size()),
TStringBuf(in1.data(), in1.size()), c->GetName(), i));
- }
- }
- }
-
- template <class T>
- void AppendTo(TBuffer& b, T t) {
- b.Append((char*)&t, sizeof(t));
- }
-
- void TestDelta() {
- using namespace NCodecs;
+ }
+ }
+ }
+
+ template <class T>
+ void AppendTo(TBuffer& b, T t) {
+ b.Append((char*)&t, sizeof(t));
+ }
+
+ void TestDelta() {
+ using namespace NCodecs;
TVector<TBuffer> d;
-
- // 1. common case
+
+ // 1. common case
d.emplace_back();
- AppendTo(d.back(), 1ULL);
- AppendTo(d.back(), 10ULL);
- AppendTo(d.back(), 100ULL);
- AppendTo(d.back(), 1000ULL);
- AppendTo(d.back(), 10000ULL);
- AppendTo(d.back(), 100000ULL);
-
- // 2. delta overflow
+ AppendTo(d.back(), 1ULL);
+ AppendTo(d.back(), 10ULL);
+ AppendTo(d.back(), 100ULL);
+ AppendTo(d.back(), 1000ULL);
+ AppendTo(d.back(), 10000ULL);
+ AppendTo(d.back(), 100000ULL);
+
+ // 2. delta overflow
d.emplace_back();
- AppendTo(d.back(), 1ULL);
- AppendTo(d.back(), 10ULL);
- AppendTo(d.back(), 100ULL);
- AppendTo(d.back(), 1000ULL);
- AppendTo(d.back(), (ui64)-100LL);
- AppendTo(d.back(), (ui64)-10ULL);
-
- // 3. bad sorting
+ AppendTo(d.back(), 1ULL);
+ AppendTo(d.back(), 10ULL);
+ AppendTo(d.back(), 100ULL);
+ AppendTo(d.back(), 1000ULL);
+ AppendTo(d.back(), (ui64)-100LL);
+ AppendTo(d.back(), (ui64)-10ULL);
+
+ // 3. bad sorting
d.emplace_back();
- AppendTo(d.back(), 1ULL);
- AppendTo(d.back(), 10ULL);
- AppendTo(d.back(), 1000ULL);
- AppendTo(d.back(), 100ULL);
- AppendTo(d.back(), 10000ULL);
- AppendTo(d.back(), 100000ULL);
-
- // all bad
+ AppendTo(d.back(), 1ULL);
+ AppendTo(d.back(), 10ULL);
+ AppendTo(d.back(), 1000ULL);
+ AppendTo(d.back(), 100ULL);
+ AppendTo(d.back(), 10000ULL);
+ AppendTo(d.back(), 100000ULL);
+
+ // all bad
d.emplace_back();
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -1LL);
-
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+
TestCodec<TDeltaCodec<ui64, true>, false>(d);
TestCodec<TDeltaCodec<ui64, false>, false>(d);
- }
-
- void TestPFor() {
- using namespace NCodecs;
- {
+ }
+
+ void TestPFor() {
+ using namespace NCodecs;
+ {
TVector<TBuffer> d;
d.emplace_back();
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -1LL);
d.emplace_back();
- AppendTo(d.back(), 0LL);
- AppendTo(d.back(), 1LL);
- AppendTo(d.back(), 2LL);
- AppendTo(d.back(), 1LL);
- AppendTo(d.back(), 0LL);
- AppendTo(d.back(), 1LL);
- AppendTo(d.back(), 2LL);
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 2LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 2LL);
d.emplace_back();
- AppendTo(d.back(), 0LL);
- AppendTo(d.back(), 1LL);
- AppendTo(d.back(), 2LL);
- AppendTo(d.back(), 1LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), 0LL);
- AppendTo(d.back(), 1LL);
- AppendTo(d.back(), 2LL);
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 2LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), 1LL);
+ AppendTo(d.back(), 2LL);
d.emplace_back();
- AppendTo(d.back(), 0LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -2LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -2LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), 0LL);
- AppendTo(d.back(), -1LL);
- AppendTo(d.back(), -2LL);
-
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -2LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -2LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), 0LL);
+ AppendTo(d.back(), -1LL);
+ AppendTo(d.back(), -2LL);
+
TestCodec<TPForCodec<ui64>, false>(d);
- TestCodec<TPForCodec<ui64, true>, true>(d);
- }
- {
+ TestCodec<TPForCodec<ui64, true>, true>(d);
+ }
+ {
TVector<TBuffer> d;
d.emplace_back();
- AppendTo(d.back(), -1);
- AppendTo(d.back(), -1);
- AppendTo(d.back(), -1);
- AppendTo(d.back(), -1);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -1);
d.emplace_back();
- AppendTo(d.back(), 0);
- AppendTo(d.back(), 1);
- AppendTo(d.back(), 2);
- AppendTo(d.back(), 1);
- AppendTo(d.back(), -1);
- AppendTo(d.back(), 0);
- AppendTo(d.back(), 1);
- AppendTo(d.back(), 2);
+ AppendTo(d.back(), 0);
+ AppendTo(d.back(), 1);
+ AppendTo(d.back(), 2);
+ AppendTo(d.back(), 1);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), 0);
+ AppendTo(d.back(), 1);
+ AppendTo(d.back(), 2);
d.emplace_back();
- AppendTo(d.back(), 0);
- AppendTo(d.back(), -1);
- AppendTo(d.back(), -2);
- AppendTo(d.back(), -1);
- AppendTo(d.back(), -2);
- AppendTo(d.back(), -1);
- AppendTo(d.back(), 0);
- AppendTo(d.back(), -1);
- AppendTo(d.back(), -2);
-
+ AppendTo(d.back(), 0);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -2);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -2);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), 0);
+ AppendTo(d.back(), -1);
+ AppendTo(d.back(), -2);
+
TestCodec<TPForCodec<ui32>, false>(d);
- TestCodec<TPForCodec<ui32, true>, false>(d);
- }
- {
+ TestCodec<TPForCodec<ui32, true>, false>(d);
+ }
+ {
TVector<TBuffer> d;
d.emplace_back();
- for (auto& textValue : TextValues) {
- AppendTo(d.back(), (ui32)strlen(textValue));
- }
-
- TestCodec<TPForCodec<ui32>, false>(d);
- TestCodec<TPForCodec<ui32, true>, false>(d);
- }
- {
+ for (auto& textValue : TextValues) {
+ AppendTo(d.back(), (ui32)strlen(textValue));
+ }
+
+ TestCodec<TPForCodec<ui32>, false>(d);
+ TestCodec<TPForCodec<ui32, true>, false>(d);
+ }
+ {
TVector<TBuffer> d;
d.emplace_back();
- for (auto& textValue : TextValues) {
- AppendTo(d.back(), (ui64)strlen(textValue));
- }
-
- TestCodec<TPForCodec<ui64>, false>(d);
- TestCodec<TPForCodec<ui64, true>, false>(d);
- }
- }
-
- template <class TCodec>
- void DoTestSimpleCodec() {
- using namespace NCodecs;
- {
+ for (auto& textValue : TextValues) {
+ AppendTo(d.back(), (ui64)strlen(textValue));
+ }
+
+ TestCodec<TPForCodec<ui64>, false>(d);
+ TestCodec<TPForCodec<ui64, true>, false>(d);
+ }
+ }
+
+ template <class TCodec>
+ void DoTestSimpleCodec() {
+ using namespace NCodecs;
+ {
TVector<TBuffer> learn;
-
+
for (auto& textValue : TextValues) {
learn.emplace_back(textValue, strlen(textValue));
- }
-
- TestCodec<TCodec, true>(learn);
- }
- {
- TestCodec<TCodec, true>();
- }
-
- {
+ }
+
+ TestCodec<TCodec, true>(learn);
+ }
+ {
+ TestCodec<TCodec, true>();
+ }
+
+ {
TVector<TBuffer> learn;
learn.emplace_back();
- learn.back().Append('a');
-
+ learn.back().Append('a');
+
TVector<TBuffer> test;
test.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- test.back().Append((ui8)i);
- }
-
- TestCodec<TCodec, true>(learn, test);
- }
-
- {
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TCodec, true>(learn, test);
+ }
+
+ {
TVector<TBuffer> learn;
learn.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- for (ui32 j = 0; j < i; ++j) {
+ for (ui32 i = 0; i < 256; ++i) {
+ for (ui32 j = 0; j < i; ++j) {
learn.back().Append((ui8)i);
- }
- }
-
+ }
+ }
+
TVector<TBuffer> test;
test.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- test.back().Append((ui8)i);
- }
-
- TestCodec<TCodec, true>(learn, test);
- }
-
- {
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TCodec, true>(learn, test);
+ }
+
+ {
TVector<TBuffer> learn;
learn.emplace_back();
- for (ui32 i = 0; i < 128; ++i) {
- for (ui32 j = 0; j < i; ++j) {
- learn.back().Append((ui8)i);
- }
- }
-
+ for (ui32 i = 0; i < 128; ++i) {
+ for (ui32 j = 0; j < i; ++j) {
+ learn.back().Append((ui8)i);
+ }
+ }
+
TVector<TBuffer> test;
test.emplace_back();
- for (ui32 i = 128; i < 256; ++i) {
- test.back().Append((ui8)i);
- }
-
- TestCodec<TCodec, true>(learn, test);
- }
- }
-
- void TestHuffman() {
- DoTestSimpleCodec<NCodecs::THuffmanCodec>();
- }
-
- void TestZStdDict() {
+ for (ui32 i = 128; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TCodec, true>(learn, test);
+ }
+ }
+
+ void TestHuffman() {
+ DoTestSimpleCodec<NCodecs::THuffmanCodec>();
+ }
+
+ void TestZStdDict() {
using namespace NCodecs;
{
TVector<TBuffer> learn;
@@ -1188,12 +1188,12 @@ private:
TestCodec<TZStdDictCodec, true>(learn);
}
- }
-
- void TestCompTable() {
- DoTestSimpleCodec<NCodecs::TCompTableCodec>();
- }
-
+ }
+
+ void TestCompTable() {
+ DoTestSimpleCodec<NCodecs::TCompTableCodec>();
+ }
+
void TestHuffmanLearnByFreqs() {
using namespace NCodecs;
@@ -1211,7 +1211,7 @@ private:
for (ui32 i = 0; i < data.size(); ++i) {
outLearn.emplace_back();
- codec.Encode(AsStrBuf(data[i]), outLearn[i]);
+ codec.Encode(AsStrBuf(data[i]), outLearn[i]);
}
}
@@ -1228,133 +1228,133 @@ private:
for (auto& textValue : TextValues) {
size_t len = strlen(textValue);
- for (size_t j = 0; j < len; ++j) {
+ for (size_t j = 0; j < len; ++j) {
++freqs[(ui32)(0xFF & textValue[j])].second;
- }
+ }
}
codec.LearnByFreqs(TArrayRef<std::pair<char, ui64>>(freqs, Y_ARRAY_SIZE(freqs)));
for (ui32 i = 0; i < data.size(); ++i) {
outLearnByFreqs.emplace_back();
- codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]);
+ codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]);
}
}
- UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size());
- const size_t sz = outLearn.size();
- for (size_t n = 0; n < sz; ++n) {
- UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n]));
- }
+ UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size());
+ const size_t sz = outLearn.size();
+ for (size_t n = 0; n < sz; ++n) {
+ UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n]));
+ }
}
- void TestSolar() {
- using namespace NCodecs;
- {
+ void TestSolar() {
+ using namespace NCodecs;
+ {
TVector<TBuffer> learn;
-
+
for (auto& textValue : TextValues) {
learn.emplace_back(textValue, strlen(textValue));
- }
-
+ }
+
TestCodec<TSolarCodec, true>(learn, TVector<TBuffer>(), new TSolarCodec(512, 8));
TestCodec<TAdaptiveSolarCodec, false>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));
TestCodec<TAdaptiveSolarCodec, true>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));
TestCodec<TSolarCodecShortInt, true>(learn, TVector<TBuffer>(), new TSolarCodecShortInt(512, 8));
- }
- {
+ }
+ {
TestCodec<TSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodec(512, 8));
TestCodec<TAdaptiveSolarCodec, false>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));
TestCodec<TAdaptiveSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));
TestCodec<TSolarCodecShortInt, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodecShortInt(512, 8));
- }
-
- {
+ }
+
+ {
TVector<TBuffer> learn;
learn.emplace_back();
- learn.back().Append('a');
-
+ learn.back().Append('a');
+
TVector<TBuffer> test;
test.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- test.back().Append((ui8)i);
- }
-
- TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8));
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8));
TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8));
TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8));
TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8));
- }
-
- {
+ }
+
+ {
TVector<TBuffer> learn;
learn.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- for (ui32 j = 0; j < i; ++j) {
- learn.back().Append((ui8)i);
- }
- }
-
+ for (ui32 i = 0; i < 256; ++i) {
+ for (ui32 j = 0; j < i; ++j) {
+ learn.back().Append((ui8)i);
+ }
+ }
+
TVector<TBuffer> test;
test.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- test.back().Append((ui8)i);
- }
-
- TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8));
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8));
TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8));
TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8));
TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8));
- }
- }
-
- void TestPipeline() {
- using namespace NCodecs;
- {
+ }
+ }
+
+ void TestPipeline() {
+ using namespace NCodecs;
+ {
TVector<TBuffer> learn;
learn.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- for (i32 j = i; j >= 0; --j) {
- learn.back().Append((ui8)j);
- }
- }
-
+ for (ui32 i = 0; i < 256; ++i) {
+ for (i32 j = i; j >= 0; --j) {
+ learn.back().Append((ui8)j);
+ }
+ }
+
TVector<TBuffer> test;
test.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- test.back().Append((ui8)i);
- }
-
- TestCodec<TPipelineCodec, true>(learn, test,
+ for (ui32 i = 0; i < 256; ++i) {
+ test.back().Append((ui8)i);
+ }
+
+ TestCodec<TPipelineCodec, true>(learn, test,
new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec));
- }
- {
+ }
+ {
TVector<TBuffer> d;
d.emplace_back();
- for (ui32 i = 0; i < 256; ++i) {
- for (i32 j = i; j >= 0; --j) {
- d.back().Append(i * i);
- }
- }
-
+ for (ui32 i = 0; i < 256; ++i) {
+ for (i32 j = i; j >= 0; --j) {
+ d.back().Append(i * i);
+ }
+ }
+
TestCodec<TPipelineCodec, false>(d, TVector<TBuffer>(),
new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>));
- }
- }
-
- void TestRegistry() {
- using namespace NCodecs;
+ }
+ }
+
+ void TestRegistry() {
+ using namespace NCodecs;
TVector<TString> vs = ICodec::GetCodecsList();
for (const auto& v : vs) {
TCodecPtr p = ICodec::GetInstance(v);
if (v == "none") {
- UNIT_ASSERT(!p);
- continue;
- }
+ UNIT_ASSERT(!p);
+ continue;
+ }
UNIT_ASSERT_C(!!p, v);
UNIT_ASSERT_C(TStringBuf(v).Head(3) == TStringBuf(p->GetName()).Head(3), v + " " + p->GetName());
- }
- }
-};
-
-UNIT_TEST_SUITE_REGISTRATION(TCodecsTest)
+ }
+ }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TCodecsTest)
diff --git a/library/cpp/codecs/ut/tls_cache_ut.cpp b/library/cpp/codecs/ut/tls_cache_ut.cpp
index 8101af761f..11dd5da53c 100644
--- a/library/cpp/codecs/ut/tls_cache_ut.cpp
+++ b/library/cpp/codecs/ut/tls_cache_ut.cpp
@@ -1,15 +1,15 @@
#include <library/cpp/testing/unittest/registar.h>
#include <library/cpp/codecs/tls_cache.h>
-
+
Y_UNIT_TEST_SUITE(CodecsBufferFactoryTest){
void AssignToBuffer(TBuffer & buf, TStringBuf val){
buf.Assign(val.data(), val.size());
}
-
+
TStringBuf AsStringBuf(const TBuffer& b) {
return TStringBuf(b.Data(), b.Size());
}
-
+
Y_UNIT_TEST(TestAcquireReleaseReuse) {
NCodecs::TBufferTlsCache factory;
// acquiring the first buffer
@@ -19,7 +19,7 @@ Y_UNIT_TEST(TestAcquireReleaseReuse) {
// acquiring the second buffer
auto buf2 = factory.Item();
AssignToBuffer(buf2.Get(), "Buffer_02");
- }
+ }
// the first buffer should stay intact
UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01");
{
diff --git a/library/cpp/codecs/ut/ya.make b/library/cpp/codecs/ut/ya.make
index 90841b05ef..0b53eba9e5 100644
--- a/library/cpp/codecs/ut/ya.make
+++ b/library/cpp/codecs/ut/ya.make
@@ -12,7 +12,7 @@ PEERDIR(
)
SRCS(
- tls_cache_ut.cpp
+ tls_cache_ut.cpp
codecs_ut.cpp
float_huffman_ut.cpp
)
diff --git a/library/cpp/codecs/ya.make b/library/cpp/codecs/ya.make
index 7e76fb0c9a..d105d6925e 100644
--- a/library/cpp/codecs/ya.make
+++ b/library/cpp/codecs/ya.make
@@ -1,24 +1,24 @@
-LIBRARY()
-
+LIBRARY()
+
OWNER(
g:base
velavokr
)
-SRCS(
- tls_cache.cpp
- codecs.cpp
- codecs_registry.cpp
- comptable_codec.cpp
- delta_codec.cpp
+SRCS(
+ tls_cache.cpp
+ codecs.cpp
+ codecs_registry.cpp
+ comptable_codec.cpp
+ delta_codec.cpp
float_huffman.cpp
- huffman_codec.cpp
- pfor_codec.cpp
- solar_codec.cpp
- zstd_dict_codec.cpp
-)
-
-PEERDIR(
+ huffman_codec.cpp
+ pfor_codec.cpp
+ solar_codec.cpp
+ zstd_dict_codec.cpp
+)
+
+PEERDIR(
contrib/libs/zstd
library/cpp/bit_io
library/cpp/blockcodecs
@@ -28,6 +28,6 @@ PEERDIR(
library/cpp/deprecated/accessors
library/cpp/packers
library/cpp/string_utils/relaxed_escaper
-)
-
+)
+
END()
diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp
index c42a2879e6..d543736b3d 100644
--- a/library/cpp/codecs/zstd_dict_codec.cpp
+++ b/library/cpp/codecs/zstd_dict_codec.cpp
@@ -1,173 +1,173 @@
-#include "zstd_dict_codec.h"
-
+#include "zstd_dict_codec.h"
+
#include <library/cpp/packers/packers.h>
-
-#include <util/generic/ptr.h>
-#include <util/generic/refcount.h>
-#include <util/generic/noncopyable.h>
-#include <util/string/builder.h>
-#include <util/system/src_location.h>
-#include <util/ysaveload.h>
-
-#define ZDICT_STATIC_LINKING_ONLY
-
+
+#include <util/generic/ptr.h>
+#include <util/generic/refcount.h>
+#include <util/generic/noncopyable.h>
+#include <util/string/builder.h>
+#include <util/system/src_location.h>
+#include <util/ysaveload.h>
+
+#define ZDICT_STATIC_LINKING_ONLY
+
#include <contrib/libs/zstd/include/zdict.h>
#include <contrib/libs/zstd/include/zstd.h>
#include <contrib/libs/zstd/include/zstd_errors.h>
-
-// See IGNIETFERRO-320 for possible bugs
-
-namespace NCodecs {
- class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> {
- template <class T, size_t Deleter(T*)>
- class TPtrHolder : TMoveOnly {
- T* Ptr = nullptr;
-
- public:
- TPtrHolder() = default;
-
- TPtrHolder(T* dict)
- : Ptr(dict)
+
+// See IGNIETFERRO-320 for possible bugs
+
+namespace NCodecs {
+ class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> {
+ template <class T, size_t Deleter(T*)>
+ class TPtrHolder : TMoveOnly {
+ T* Ptr = nullptr;
+
+ public:
+ TPtrHolder() = default;
+
+ TPtrHolder(T* dict)
+ : Ptr(dict)
{
}
-
- T* Get() {
- return Ptr;
- }
-
- const T* Get() const {
- return Ptr;
- }
-
- void Reset(T* dict) {
- Dispose();
- Ptr = dict;
- }
-
- void Dispose() {
- if (Ptr) {
- Deleter(Ptr);
- Ptr = nullptr;
- }
- }
-
- ~TPtrHolder() {
- Dispose();
- }
- };
-
- using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>;
- using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>;
- using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>;
- using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>;
-
- using TSizePacker = NPackers::TPacker<ui64>;
-
- public:
- static const ui32 SampleSize = (1 << 22) * 5;
-
- explicit TImpl(ui32 comprLevel)
- : CompressionLevel(comprLevel)
- {
- const size_t zeroSz = TSizePacker().MeasureLeaf(0);
- Zero.Resize(zeroSz);
+
+ T* Get() {
+ return Ptr;
+ }
+
+ const T* Get() const {
+ return Ptr;
+ }
+
+ void Reset(T* dict) {
+ Dispose();
+ Ptr = dict;
+ }
+
+ void Dispose() {
+ if (Ptr) {
+ Deleter(Ptr);
+ Ptr = nullptr;
+ }
+ }
+
+ ~TPtrHolder() {
+ Dispose();
+ }
+ };
+
+ using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>;
+ using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>;
+ using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>;
+ using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>;
+
+ using TSizePacker = NPackers::TPacker<ui64>;
+
+ public:
+ static const ui32 SampleSize = (1 << 22) * 5;
+
+ explicit TImpl(ui32 comprLevel)
+ : CompressionLevel(comprLevel)
+ {
+ const size_t zeroSz = TSizePacker().MeasureLeaf(0);
+ Zero.Resize(zeroSz);
TSizePacker().PackLeaf(Zero.data(), 0, zeroSz);
- }
-
- ui32 GetCompressionLevel() const {
- return CompressionLevel;
- }
-
- ui8 Encode(TStringBuf in, TBuffer& outbuf) const {
- outbuf.Clear();
-
+ }
+
+ ui32 GetCompressionLevel() const {
+ return CompressionLevel;
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& outbuf) const {
+ outbuf.Clear();
+
if (in.empty()) {
- return 0;
- }
-
- TSizePacker packer;
-
+ return 0;
+ }
+
+ TSizePacker packer;
+
const char* rawBeg = in.data();
const size_t rawSz = in.size();
-
- const size_t szSz = packer.MeasureLeaf(rawSz);
- const size_t maxDatSz = ZSTD_compressBound(rawSz);
-
- outbuf.Resize(szSz + maxDatSz);
+
+ const size_t szSz = packer.MeasureLeaf(rawSz);
+ const size_t maxDatSz = ZSTD_compressBound(rawSz);
+
+ outbuf.Resize(szSz + maxDatSz);
packer.PackLeaf(outbuf.data(), rawSz, szSz);
-
- TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)};
- const size_t resSz = CheckSize(ZSTD_compress_usingCDict(
+
+ TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)};
+ const size_t resSz = CheckSize(ZSTD_compress_usingCDict(
ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()),
__LOCATION__);
-
- if (resSz < rawSz) {
- outbuf.Resize(resSz + szSz);
- } else {
+
+ if (resSz < rawSz) {
+ outbuf.Resize(resSz + szSz);
+ } else {
outbuf.Resize(Zero.size() + rawSz);
memcpy(outbuf.data(), Zero.data(), Zero.size());
memcpy(outbuf.data() + Zero.size(), rawBeg, rawSz);
- }
- return 0;
- }
-
- void Decode(TStringBuf in, TBuffer& outbuf) const {
- outbuf.Clear();
-
+ }
+ return 0;
+ }
+
+ void Decode(TStringBuf in, TBuffer& outbuf) const {
+ outbuf.Clear();
+
if (in.empty()) {
- return;
- }
-
- TSizePacker packer;
-
+ return;
+ }
+
+ TSizePacker packer;
+
const char* rawBeg = in.data();
size_t rawSz = in.size();
-
- const size_t szSz = packer.SkipLeaf(rawBeg);
- ui64 datSz = 0;
- packer.UnpackLeaf(rawBeg, datSz);
-
- rawBeg += szSz;
- rawSz -= szSz;
-
- if (!datSz) {
- outbuf.Resize(rawSz);
+
+ const size_t szSz = packer.SkipLeaf(rawBeg);
+ ui64 datSz = 0;
+ packer.UnpackLeaf(rawBeg, datSz);
+
+ rawBeg += szSz;
+ rawSz -= szSz;
+
+ if (!datSz) {
+ outbuf.Resize(rawSz);
memcpy(outbuf.data(), rawBeg, rawSz);
- } else {
+ } else {
// size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz);
// Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz);
- outbuf.Resize(datSz);
- TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)};
- CheckSize(ZSTD_decompress_usingDDict(
+ outbuf.Resize(datSz);
+ TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)};
+ CheckSize(ZSTD_decompress_usingDDict(
ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()),
__LOCATION__);
- outbuf.Resize(datSz);
- }
- }
-
+ outbuf.Resize(datSz);
+ }
+ }
+
bool Learn(ISequenceReader& in, bool throwOnError) {
- TBuffer data;
+ TBuffer data;
TVector<size_t> lens;
-
- data.Reserve(2 * SampleSize);
- TStringBuf r;
- while (in.NextRegion(r)) {
- if (!r) {
- continue;
- }
+
+ data.Reserve(2 * SampleSize);
+ TStringBuf r;
+ while (in.NextRegion(r)) {
+ if (!r) {
+ continue;
+ }
data.Append(r.data(), r.size());
lens.push_back(r.size());
- }
-
+ }
+
ZDICT_legacy_params_t params;
- memset(&params, 0, sizeof(params));
+ memset(&params, 0, sizeof(params));
params.zParams.compressionLevel = 1;
params.zParams.notificationLevel = 1;
- Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size()));
-
- if (!lens) {
- Dict.Reset();
- } else {
+ Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size()));
+
+ if (!lens) {
+ Dict.Reset();
+ } else {
size_t trainResult = ZDICT_trainFromBuffer_legacy(
Dict.data(), Dict.size(), data.Data(), const_cast<const size_t*>(&lens[0]), lens.size(), params);
if (ZSTD_isError(trainResult)) {
@@ -177,105 +177,105 @@ namespace NCodecs {
CheckSize(trainResult, __LOCATION__);
}
Dict.Resize(trainResult);
- Dict.ShrinkToFit();
- }
- InitContexts();
+ Dict.ShrinkToFit();
+ }
+ InitContexts();
return true;
- }
-
+ }
+
void Save(IOutputStream* out) const {
- ::Save(out, Dict);
- }
-
+ ::Save(out, Dict);
+ }
+
void Load(IInputStream* in) {
- ::Load(in, Dict);
- InitContexts();
- }
-
- void InitContexts() {
+ ::Load(in, Dict);
+ InitContexts();
+ }
+
+ void InitContexts() {
CDict.Reset(CheckPtr(ZSTD_createCDict(Dict.data(), Dict.size(), CompressionLevel), __LOCATION__));
DDict.Reset(CheckPtr(ZSTD_createDDict(Dict.data(), Dict.size()), __LOCATION__));
- }
-
- static size_t CheckSize(size_t sz, TSourceLocation loc) {
- if (ZSTD_isError(sz)) {
- ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")";
- }
- return sz;
- }
-
- template <class T>
- static T* CheckPtr(T* t, TSourceLocation loc) {
+ }
+
+ static size_t CheckSize(size_t sz, TSourceLocation loc) {
+ if (ZSTD_isError(sz)) {
+ ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")";
+ }
+ return sz;
+ }
+
+ template <class T>
+ static T* CheckPtr(T* t, TSourceLocation loc) {
Y_ENSURE_EX(t, TCodecException() << loc << " "
<< "unexpected nullptr");
- return t;
- }
-
- private:
- ui32 CompressionLevel = 1;
-
- TBuffer Zero;
- TBuffer Dict;
-
- TCDict CDict;
- TDDict DDict;
- };
-
- TZStdDictCodec::TZStdDictCodec(ui32 comprLevel)
- : Impl(new TImpl(comprLevel))
- {
- MyTraits.NeedsTraining = true;
- MyTraits.SizeOnEncodeMultiplier = 2;
- MyTraits.SizeOnDecodeMultiplier = 10;
- MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar
- }
-
+ return t;
+ }
+
+ private:
+ ui32 CompressionLevel = 1;
+
+ TBuffer Zero;
+ TBuffer Dict;
+
+ TCDict CDict;
+ TDDict DDict;
+ };
+
+ TZStdDictCodec::TZStdDictCodec(ui32 comprLevel)
+ : Impl(new TImpl(comprLevel))
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.SizeOnEncodeMultiplier = 2;
+ MyTraits.SizeOnDecodeMultiplier = 10;
+ MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar
+ }
+
TZStdDictCodec::~TZStdDictCodec() {
}
-
+
TString TZStdDictCodec::GetName() const {
return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel();
- }
-
- ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const {
- return Impl->Encode(in, out);
- }
-
- void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const {
- Impl->Decode(in, out);
- }
-
- void TZStdDictCodec::DoLearn(ISequenceReader& in) {
- Impl = new TImpl(Impl->GetCompressionLevel());
+ }
+
+ ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const {
+ return Impl->Encode(in, out);
+ }
+
+ void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const {
+ Impl->Decode(in, out);
+ }
+
+ void TZStdDictCodec::DoLearn(ISequenceReader& in) {
+ Impl = new TImpl(Impl->GetCompressionLevel());
Impl->Learn(in, true/*throwOnError*/);
- }
-
+ }
+
bool TZStdDictCodec::DoTryToLearn(ISequenceReader& in) {
Impl = new TImpl(Impl->GetCompressionLevel());
return Impl->Learn(in, false/*throwOnError*/);
}
void TZStdDictCodec::Save(IOutputStream* out) const {
- Impl->Save(out);
- }
-
+ Impl->Save(out);
+ }
+
void TZStdDictCodec::Load(IInputStream* in) {
- Impl->Load(in);
- }
-
+ Impl->Load(in);
+ }
+
TVector<TString> TZStdDictCodec::ListCompressionNames() {
TVector<TString> res;
- for (int i = 1; i <= ZSTD_maxCLevel(); ++i) {
- res.emplace_back(TStringBuilder() << MyName() << "-" << i);
- }
- return res;
- }
-
- int TZStdDictCodec::ParseCompressionName(TStringBuf name) {
- int c = 0;
- TryFromString(name.After('-'), c);
- Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name);
- return c;
- }
-
-}
+ for (int i = 1; i <= ZSTD_maxCLevel(); ++i) {
+ res.emplace_back(TStringBuilder() << MyName() << "-" << i);
+ }
+ return res;
+ }
+
+ int TZStdDictCodec::ParseCompressionName(TStringBuf name) {
+ int c = 0;
+ TryFromString(name.After('-'), c);
+ Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name);
+ return c;
+ }
+
+}
diff --git a/library/cpp/codecs/zstd_dict_codec.h b/library/cpp/codecs/zstd_dict_codec.h
index 59c1ad6c60..cdfc5c8285 100644
--- a/library/cpp/codecs/zstd_dict_codec.h
+++ b/library/cpp/codecs/zstd_dict_codec.h
@@ -1,38 +1,38 @@
-#pragma once
-
-#include "codecs.h"
-
-#include <util/generic/ptr.h>
-
-namespace NCodecs {
+#pragma once
+
+#include "codecs.h"
+
+#include <util/generic/ptr.h>
+
+namespace NCodecs {
// benchmarks are here: https://st.yandex-team.ru/SEARCH-1655
-
+
class TZStdDictCodec: public ICodec {
class TImpl;
TIntrusivePtr<TImpl> Impl;
-
+
public:
explicit TZStdDictCodec(ui32 comprLevel = 1);
~TZStdDictCodec() override;
-
+
static TStringBuf MyName() {
return "zstd08d";
}
-
+
TString GetName() const override;
-
+
ui8 Encode(TStringBuf in, TBuffer& out) const override;
-
+
void Decode(TStringBuf in, TBuffer& out) const override;
-
+
static TVector<TString> ListCompressionNames();
static int ParseCompressionName(TStringBuf);
-
+
protected:
void DoLearn(ISequenceReader& in) override;
bool DoTryToLearn(ISequenceReader& in) final;
void Save(IOutputStream* out) const override;
void Load(IInputStream* in) override;
};
-
-}
+
+}