aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/static
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs/static
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs/static')
-rw-r--r--library/cpp/codecs/static/README1
-rw-r--r--library/cpp/codecs/static/builder.cpp39
-rw-r--r--library/cpp/codecs/static/builder.h29
-rw-r--r--library/cpp/codecs/static/common.h32
-rw-r--r--library/cpp/codecs/static/example/example.cpp43
-rw-r--r--library/cpp/codecs/static/example/example.h17
-rw-r--r--library/cpp/codecs/static/example/huffman.1467494385.codec_infobin0 -> 385 bytes
-rw-r--r--library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_infobin0 -> 3425 bytes
-rw-r--r--library/cpp/codecs/static/example/ya.make24
-rw-r--r--library/cpp/codecs/static/static.cpp98
-rw-r--r--library/cpp/codecs/static/static.h34
-rw-r--r--library/cpp/codecs/static/static_codec_info.proto17
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.cpp74
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.h75
-rw-r--r--library/cpp/codecs/static/tools/common/ya.make19
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp73
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/ya.make16
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp82
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/ya.make17
-rw-r--r--library/cpp/codecs/static/tools/tests/canondata/result.json6
-rw-r--r--library/cpp/codecs/static/tools/tests/static_codec_tools.py18
-rw-r--r--library/cpp/codecs/static/tools/tests/ya.make20
-rw-r--r--library/cpp/codecs/static/tools/ya.make5
-rw-r--r--library/cpp/codecs/static/ut/builder_ut.cpp57
-rw-r--r--library/cpp/codecs/static/ut/static_ut.cpp27
-rw-r--r--library/cpp/codecs/static/ut/ya.make14
-rw-r--r--library/cpp/codecs/static/ya.make18
29 files changed, 863 insertions, 0 deletions
diff --git a/library/cpp/codecs/static/README b/library/cpp/codecs/static/README
new file mode 100644
index 0000000000..1b07f02433
--- /dev/null
+++ b/library/cpp/codecs/static/README
@@ -0,0 +1 @@
+Support of static libraries in library/cpp/codecs. See library/cpp/codecs/static/example.
diff --git a/library/cpp/codecs/static/builder.cpp b/library/cpp/codecs/static/builder.cpp
new file mode 100644
index 0000000000..93e34a3edb
--- /dev/null
+++ b/library/cpp/codecs/static/builder.cpp
@@ -0,0 +1,39 @@
+#include "builder.h"
+#include "common.h"
+
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+
+#include <library/cpp/codecs/codecs.h>
+
+#include <util/generic/yexception.h>
+#include <util/string/subst.h>
+
+namespace NCodecs {
+ // Trains the codec named by info.CodecName on trainingData and returns it,
+ // serialized, together with the debug info copied from `info`.
+ // Throws TCodecException when ICodec::GetInstance yields no codec for that name.
+ TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo& info) {
+     TCodecPtr trained = ICodec::GetInstance(info.CodecName);
+     Y_ENSURE_EX(trained, TCodecException() << "empty codec is not allowed");
+     trained->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier);
+
+     TStaticCodecInfo packed;
+     {
+         // The output stream lives in its own scope so it is gone before the
+         // stored bytes are hashed below.
+         TStringOutput codecOut{*packed.MutableStoredCodec()};
+         ICodec::Store(&codecOut, trained);
+     }
+
+     auto* meta = packed.MutableDebugInfo();
+     meta->SetStoredCodecHash(DataSignature(packed.GetStoredCodec()));
+     meta->SetCodecName(info.CodecName);
+     meta->SetSampleSizeMultiplier(info.SampleSizeMultiplier);
+     meta->SetTimestamp(info.Timestamp);
+     meta->SetRevisionInfo(info.RevisionInfo);
+     meta->SetTrainingSetComment(info.TrainingSetComment);
+     meta->SetTrainingSetResId(info.TrainingSetResId);
+     return packed;
+ }
+
+ // Returns "<codec name>.<timestamp>.codec_info", where every ':' in the codec
+ // name is replaced by '.'.
+ TString GetStandardFileName(const TStaticCodecInfo& info) {
+     const auto& debugInfo = info.GetDebugInfo();
+     TString fileStem = debugInfo.GetCodecName();
+     SubstGlobal(fileStem, ':', '.');
+
+     TStringBuilder fileName;
+     fileName << fileStem << "." << debugInfo.GetTimestamp() << ".codec_info";
+     return fileName;
+ }
+}
diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h
new file mode 100644
index 0000000000..d7533be4d5
--- /dev/null
+++ b/library/cpp/codecs/static/builder.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "static.h"
+
+#include <library/cpp/svnversion/svnversion.h>
+
+#include <util/datetime/base.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/string/builder.h>
+
+namespace NCodecs {
+ // Input of BuildStaticCodec: the codec to train plus the debug info that is
+ // stored next to the trained codec.
+ struct TCodecBuildInfo {
+     // Codec name and sample size multiplier; the defaults are the optimal
+     // values from SEARCH-1655.
+     TString CodecName = "solar-8k-a:zstd08d-1";
+     float SampleSizeMultiplier = 1.0f;
+
+     // Debug info.
+     // Defaults to the moment the struct is constructed.
+     time_t Timestamp = TInstant::Now().TimeT();
+     // Defaults to "r" followed by the svn revision of the running program.
+     TString RevisionInfo = (TStringBuilder() << "r" << GetProgramSvnRevision());
+     // A human comment on the training data.
+     TString TrainingSetComment;
+     // Sandbox resid of the training set.
+     TString TrainingSetResId;
+ };
+
+ TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&);
+
+ TString GetStandardFileName(const TStaticCodecInfo&);
+
+}
diff --git a/library/cpp/codecs/static/common.h b/library/cpp/codecs/static/common.h
new file mode 100644
index 0000000000..211de2a27d
--- /dev/null
+++ b/library/cpp/codecs/static/common.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <util/string/hex.h>
+#include <util/digest/city.h>
+#include <util/system/byteorder.h>
+
+namespace NCodecs {
+ // CityHash64 of the t.size() bytes starting at t.data(); scalars are rejected
+ // at compile time.
+ template <class T>
+ ui64 DataSignature(const T& t) {
+     static_assert(!std::is_scalar<T>::value, "no scalars");
+     const auto* bytes = t.data();
+     const auto length = t.size();
+     return CityHash64(bytes, length);
+ }
+
+ // Formats a scalar as lowercase hex: the value goes through LittleToBig and
+ // all sizeof(T) bytes are then hex-encoded.
+ template <class T>
+ TString HexWriteScalar(T t) {
+     static_assert(std::is_scalar<T>::value, "scalars only");
+     const T converted = LittleToBig(t);
+     TString hex = HexEncode(&converted, sizeof(converted));
+     hex.to_lower();
+     return hex;
+ }
+
+ // Inverse of HexWriteScalar: decodes up to 2 * sizeof(T) leading hex characters
+ // of `s` into a scalar and passes the result through BigToLittle.
+ // Input shorter than 2 * sizeof(T) characters leaves the remaining bytes zero.
+ // NOTE(review): HexDecode is expected to reject an odd number of characters -
+ // confirm against util/string/hex.h.
+ template <class T>
+ T HexReadScalar(TStringBuf s) {
+     static_assert(std::is_scalar<T>::value, "scalars only");
+     T t = 0;
+     // HexDecode takes the number of hex characters, and each byte of T is
+     // written as two of them, so the cap is 2 * sizeof(T). Capping at
+     // sizeof(T) decoded only half of the value.
+     HexDecode(s.data(), Min(s.size(), 2 * sizeof(T)), &t);
+     t = BigToLittle(t);
+     return t;
+ }
+
+}
diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp
new file mode 100644
index 0000000000..5b750b717e
--- /dev/null
+++ b/library/cpp/codecs/static/example/example.cpp
@@ -0,0 +1,43 @@
+#include "example.h"
+
+#include <library/cpp/codecs/static/static.h>
+
+#include <util/generic/yexception.h>
+
+extern "C" {
+extern const ui8 codec_info_huff_20160707[];
+extern const ui32 codec_info_huff_20160707Size;
+extern const ui8 codec_info_sa_huff_20160707[];
+extern const ui32 codec_info_sa_huff_20160707Size;
+};
+
+namespace NStaticCodecExample {
+ // Codecs indexed by EDictVersion. The DV_NULL slot holds no codec; the other
+ // slots are restored from the archives linked into the binary.
+ static const NCodecs::TCodecConstPtr CODECS[] = {
+     /* DV_NULL */ nullptr,
+     /* DV_HUFF_20160707 */ NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size),
+     /* DV_SA_HUFF_20160707 */ NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size),
+ };
+
+ // Every EDictVersion value below DV_COUNT must have a slot.
+ static_assert(DV_COUNT == Y_ARRAY_SIZE(CODECS), "bad array size");
+
+ // Encodes `in` into `out` with the codec selected by `dv`, then appends `dv`
+ // itself as one trailing byte. Empty input leaves `out` empty, without the
+ // version byte. Fails the Y_ENSURE check unless DV_NULL < dv < DV_COUNT.
+ void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) {
+     const bool knownVersion = dv > DV_NULL && dv < DV_COUNT;
+     Y_ENSURE(knownVersion, "invalid dict version: " << (int)dv);
+
+     out.Clear();
+     if (in.empty()) {
+         return;
+     }
+
+     const auto& codec = CODECS[dv];
+     codec->Encode(in, out);
+     out.Append((char)dv);
+ }
+
+ // Decodes data produced by Encode: the last byte of `in` selects the codec and
+ // the bytes before it are the payload. Empty input leaves `out` empty.
+ // Fails the Y_ENSURE check when the trailing byte is not a known dict version.
+ void Decode(TBuffer& out, TStringBuf in) {
+     out.Clear();
+     if (in.empty()) {
+         return;
+     }
+
+     const auto dv = static_cast<EDictVersion>(in.back());
+     const bool knownVersion = dv > DV_NULL && dv < DV_COUNT;
+     Y_ENSURE(knownVersion, "invalid dict version: " << (int)dv);
+
+     // Drop the version byte before handing the payload to the codec.
+     in.Chop(1);
+     const auto& codec = CODECS[dv];
+     codec->Decode(in, out);
+ }
+}
diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h
new file mode 100644
index 0000000000..f9b3a7324b
--- /dev/null
+++ b/library/cpp/codecs/static/example/example.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <util/generic/strbuf.h>
+#include <util/generic/buffer.h>
+
+namespace NStaticCodecExample {
+ // Identifies the dictionary (codec) a blob was encoded with; Encode stores it
+ // as the last byte of its output.
+ enum EDictVersion : ui8 {
+     DV_NULL = 0,             // rejected by Encode and Decode
+     DV_HUFF_20160707 = 1,
+     DV_SA_HUFF_20160707 = 2,
+     DV_COUNT = 3             // number of values above, not a real version
+ };
+
+ void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707);
+
+ void Decode(TBuffer&, TStringBuf);
+}
diff --git a/library/cpp/codecs/static/example/huffman.1467494385.codec_info b/library/cpp/codecs/static/example/huffman.1467494385.codec_info
new file mode 100644
index 0000000000..5fc18270a6
--- /dev/null
+++ b/library/cpp/codecs/static/example/huffman.1467494385.codec_info
Binary files differ
diff --git a/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info
new file mode 100644
index 0000000000..d36d8e24ec
--- /dev/null
+++ b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info
Binary files differ
diff --git a/library/cpp/codecs/static/example/ya.make b/library/cpp/codecs/static/example/ya.make
new file mode 100644
index 0000000000..ca6c5fd900
--- /dev/null
+++ b/library/cpp/codecs/static/example/ya.make
@@ -0,0 +1,24 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ GLOBAL example.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+)
+
+ARCHIVE_ASM(
+ "solar-8k-a.huffman.1467494385.codec_info"
+ NAME codec_info_sa_huff_20160707
+)
+
+ARCHIVE_ASM(
+ "huffman.1467494385.codec_info"
+ NAME codec_info_huff_20160707
+)
+
+END()
diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp
new file mode 100644
index 0000000000..44a07dd73a
--- /dev/null
+++ b/library/cpp/codecs/static/static.cpp
@@ -0,0 +1,98 @@
+#include "static.h"
+#include "common.h"
+
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/archive/yarchive.h>
+
+#include <util/draft/datetime.h>
+
+#include <util/string/builder.h>
+#include <util/stream/buffer.h>
+#include <util/stream/mem.h>
+#include <util/string/hex.h>
+#include <util/ysaveload.h>
+
+namespace NCodecs {
+ // Magic bytes written at the very start of a serialized codec info.
+ static constexpr TStringBuf STATIC_CODEC_INFO_MAGIC = "CodecInf";
+
+ // Accessor for the magic used by the save and load routines below.
+ static TStringBuf GetStaticCodecInfoMagic() {
+     const TStringBuf magic = STATIC_CODEC_INFO_MAGIC;
+     return magic;
+ }
+
+ // Writes `info` to `out` as: the magic, the hash (DataSignature) of the
+ // serialized message, then the serialized message itself.
+ void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo& info) {
+     TBufferOutput serialized;
+     info.SerializeToArcadiaStream(&serialized);
+
+     const TBuffer& payload = serialized.Buffer();
+     const ui64 checksum = DataSignature(payload);
+
+     out.Write(GetStaticCodecInfoMagic());
+     ::Save(&out, checksum);
+     ::Save(&out, payload);
+ }
+
+ // Reads a codec info written by SaveCodecInfoToStream.
+ // Throws TCodecException("bad codec info") when the magic is missing or wrong,
+ // when the stored hash does not match the payload, or when the payload does
+ // not parse as TStaticCodecInfo.
+ TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in) {
+     {
+         const TStringBuf expectedMagic = GetStaticCodecInfoMagic();
+         TBuffer magic;
+         magic.Resize(expectedMagic.size());
+         // Load() keeps reading until the buffer is full or the stream ends.
+         // A single Read() may return fewer bytes than requested even though
+         // more data follows, which would reject a valid stream.
+         Y_ENSURE_EX(in.Load(magic.Data(), expectedMagic.size()) == expectedMagic.size(),
+                     TCodecException() << "bad codec info");
+         Y_ENSURE_EX(TStringBuf(magic.data(), magic.size()) == expectedMagic,
+                     TCodecException() << "bad codec info");
+     }
+
+     ui64 hash = 0;
+     ::Load(&in, hash);
+     TBuffer info;
+     ::Load(&in, info);
+     // The hash covers the serialized message only, not the magic.
+     Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info");
+
+     TStaticCodecInfo result;
+     Y_ENSURE_EX(result.ParseFromArray(info.data(), info.size()), TCodecException() << "bad codec info");
+
+     return result;
+ }
+
+ // Same output as SaveCodecInfoToStream, returned as a string.
+ TString SaveCodecInfoToString(const TStaticCodecInfo& info) {
+     TStringStream serialized;
+     SaveCodecInfoToStream(serialized, info);
+     return serialized.Str();
+ }
+
+ // Parses a codec info from an in-memory buffer; see LoadCodecInfoFromStream
+ // for the failure modes.
+ TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) {
+     TMemoryInput source(data.data(), data.size());
+     return LoadCodecInfoFromStream(source);
+ }
+
+ // Renders the stored codec size and the debug info of `ci` as a multi-line,
+ // human-readable report.
+ TString FormatCodecInfo(const TStaticCodecInfo& ci) {
+     const auto& dbg = ci.GetDebugInfo();
+     const auto timestamp = dbg.GetTimestamp();
+
+     TStringBuilder report;
+     report << "codec name: " << dbg.GetCodecName() << Endl;
+     report << "codec hash: " << HexWriteScalar(dbg.GetStoredCodecHash()) << Endl;
+     report << "dict size: " << ci.GetStoredCodec().Size() << Endl;
+     report << "sample mult: " << dbg.GetSampleSizeMultiplier() << Endl;
+     // Stored as a fraction, printed as a percentage.
+     report << "orig.compress: " << dbg.GetCompression() * 100 << " %" << Endl;
+     // Raw value followed by its local-time rendering.
+     report << "timestamp: " << timestamp << " ("
+            << NDatetime::TSimpleTM::NewLocal(timestamp).ToString()
+            << ")" << Endl;
+     report << "revision: " << dbg.GetRevisionInfo() << Endl;
+     report << "training set comment: " << dbg.GetTrainingSetComment() << Endl;
+     report << "training set resId: " << dbg.GetTrainingSetResId() << Endl;
+     return report;
+ }
+
+ // Returns a copy of the single entry of the archive located at [begin, begin + size).
+ // An archive whose entry count is not exactly one fails the Y_VERIFY check.
+ TString LoadStringFromArchive(const ui8* begin, size_t size) {
+     TArchiveReader archive(TBlob::NoCopy(begin, size));
+     Y_VERIFY(archive.Count() == 1, "invalid number of entries");
+
+     const auto key = archive.KeyByIndex(0);
+     const auto entry = archive.ObjectBlobByKey(key);
+     return TString{entry.AsCharPtr(), entry.Size()};
+ }
+
+ // Restores the codec from the bytes kept in info.StoredCodec.
+ TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) {
+     const auto& storedCodec = info.GetStoredCodec();
+     return ICodec::RestoreFromString(storedCodec);
+ }
+
+ // Archive bytes -> serialized codec info -> codec.
+ // Throws TCodecException("null codec") when the restored codec is null.
+ TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) {
+     const TString archived = LoadStringFromArchive(begin, size);
+     const TStaticCodecInfo info = LoadCodecInfoFromString(archived);
+     TCodecConstPtr codec = RestoreCodecFromCodecInfo(info);
+     Y_ENSURE_EX(codec, TCodecException() << "null codec");
+     return codec;
+ }
+}
diff --git a/library/cpp/codecs/static/static.h b/library/cpp/codecs/static/static.h
new file mode 100644
index 0000000000..c1eaed2a74
--- /dev/null
+++ b/library/cpp/codecs/static/static.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <library/cpp/codecs/codecs.h>
+
+#include <util/generic/strbuf.h>
+#include <util/generic/string.h>
+#include <util/stream/output.h>
+
+namespace NCodecs {
+ class TStaticCodecInfo;
+
+ // load
+
+ TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&);
+
+ TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data);
+
+ TString LoadStringFromArchive(const ui8* begin, size_t size);
+
+ TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size);
+
+ // save
+
+ TString SaveCodecInfoToString(const TStaticCodecInfo&);
+
+ void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo&);
+
+ // misc
+
+ TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in);
+
+ TString FormatCodecInfo(const TStaticCodecInfo&);
+
+}
diff --git a/library/cpp/codecs/static/static_codec_info.proto b/library/cpp/codecs/static/static_codec_info.proto
new file mode 100644
index 0000000000..362abb4dad
--- /dev/null
+++ b/library/cpp/codecs/static/static_codec_info.proto
@@ -0,0 +1,17 @@
+package NCodecs;
+
+message TStaticCodecInfo {
+ message TDebugInfo {
+ optional string CodecName = 1; // the exact codec variant name
+ optional uint64 Timestamp = 2; // when the codec was built
+ optional string RevisionInfo = 3; // the arcadia revision info
+ optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression
+ optional float Compression = 5; // the compression on the training set ((raw_size - coded_size) / raw_size)
+ optional string TrainingSetComment = 6; // a human readable description of the training set
+ optional string TrainingSetResId = 7; // the training set sandbox resource id
+ optional uint64 StoredCodecHash = 8; // cityhash64(data)
+ }
+
+ optional bytes StoredCodec = 1; // the data of the codec
+ optional TDebugInfo DebugInfo = 2; // misc debug info which could be useful in finding whereabouts later
+}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp
new file mode 100644
index 0000000000..fe77691280
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ct_common.cpp
@@ -0,0 +1,74 @@
+#include "ct_common.h"
+
+#include <library/cpp/codecs/codecs.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/string_utils/base64/base64.h>
+
+#include <util/stream/output.h>
+#include <util/string/builder.h>
+#include <util/system/hp_timer.h>
+
+namespace NCodecs {
+ // Renders the collected statistics as a multi-line report. With checkMode set,
+ // the figures derived from the compression recorded in `info` are printed too.
+ TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const {
+     const auto& debugInfo = info.GetDebugInfo();
+
+     TStringBuilder report;
+     report << "raw size/item: " << RawSizePerRecord() << Endl;
+     report << "enc.size/item: " << EncSizePerRecord() << Endl;
+     if (checkMode) {
+         report << "orig.enc.size/item: " << OldEncSizePerRecord(debugInfo.GetCompression()) << Endl;
+     }
+     report << "enc time us/item: " << EncTimePerRecordUS() << Endl;
+     report << "dec time us/item: " << DecTimePerRecordUS() << Endl;
+     report << "dict size: " << info.GetStoredCodec().Size() << Endl;
+     report << "compression: " << AsPercent(Compression()) << " %" << Endl;
+     if (checkMode) {
+         report << "orig.compression: " << AsPercent(debugInfo.GetCompression()) << " %" << Endl;
+     }
+     return report;
+ }
+
+ // Encodes and decodes every record of `input` with `c`, accumulating sizes and
+ // timings. Fails the Y_ENSURE check on the first record that does not survive
+ // the round trip.
+ TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) {
+     TComprStats totals;
+
+     // Both buffers are reused across records and cleared on each iteration.
+     TBuffer encoded;
+     TBuffer decoded;
+     for (const TString& record : input) {
+         encoded.Clear();
+         decoded.Clear();
+
+         ++totals.Records;
+         totals.RawSize += record.size();
+
+         THPTimer stopwatch;
+         c.Encode(record, encoded);
+         totals.EncSize += encoded.size();
+         totals.EncSeconds += stopwatch.PassedReset();
+
+         c.Decode(TStringBuf(encoded.data(), encoded.size()), decoded);
+         totals.DecSeconds += stopwatch.PassedReset();
+
+         const TStringBuf roundTrip(decoded.data(), decoded.size());
+         Y_ENSURE(record == roundTrip, "invalid encoding at record " << totals.Records);
+     }
+
+     return totals;
+ }
+
+ // Splits `blob` into lines and appends every non-empty one to `result`.
+ // With DSF_BASE64_LF each line is base64-decoded first, and the emptiness
+ // check applies to the decoded value.
+ void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) {
+     const bool isBase64 = (DSF_BASE64_LF == fmt);
+
+     TStringBuf rest(blob.AsCharPtr(), blob.Size());
+     TString decoded;
+     for (TStringBuf line; rest.ReadLine(line);) {
+         if (isBase64) {
+             Base64Decode(line, decoded);
+             line = decoded;
+         }
+         if (line.empty()) {
+             continue;
+         }
+         result.emplace_back(line.data(), line.size());
+     }
+ }
+
+ // Reads the whole input: from Cin when dataFile is empty or "-", otherwise
+ // from the named file.
+ TBlob GetInputBlob(const TString& dataFile) {
+     const bool readStdin = dataFile.empty() || dataFile == "-";
+     if (readStdin) {
+         return TBlob::FromStream(Cin);
+     }
+     return TBlob::FromFile(dataFile);
+ }
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h
new file mode 100644
index 0000000000..9d3dcbda93
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ct_common.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/memory/blob.h>
+#include <cmath>
+
+namespace NCodecs {
+ class TStaticCodecInfo;
+ class ICodec;
+
+ // Totals accumulated over a codec test run, plus helpers that turn them into
+ // per-record and percentage figures.
+ struct TComprStats {
+     double EncSeconds = 0; // total encoding time, seconds
+     double DecSeconds = 0; // total decoding time, seconds
+     size_t Records = 0;    // number of records processed
+     size_t RawSize = 0;    // total size of the raw records, bytes
+     size_t EncSize = 0;    // total size of the encoded records, bytes
+
+     // Rounds n to decPlaces decimal places.
+     static double Round(double n, size_t decPlaces = 2) {
+         double p = pow(10, decPlaces);
+         return round(n * p) / p;
+     }
+
+     // Converts a fraction to a percentage rounded to two decimal places.
+     static double AsPercent(double n) {
+         return Round(n * 100);
+     }
+
+     // Converts seconds to microseconds.
+     static double AsMicroSecond(double s) {
+         return s * 1000000;
+     }
+
+     // n divided by the number of records, rounded; 0 when there are no records.
+     double PerRecord(double n) const {
+         return Round((double)(Records ? n / Records : 0));
+     }
+
+     // Fraction of the raw size saved by encoding: (RawSize - EncSize) / RawSize.
+     // Returns 0 for an empty data set; dividing by a zero RawSize would yield
+     // NaN, which would then be printed and stored in the debug info.
+     double Compression() const {
+         if (!RawSize) {
+             return 0;
+         }
+         return ((double)RawSize - (double)EncSize) / RawSize;
+     }
+
+     double EncTimePerRecordUS() const {
+         return PerRecord(AsMicroSecond(EncSeconds));
+     }
+
+     double DecTimePerRecordUS() const {
+         return PerRecord(AsMicroSecond(DecSeconds));
+     }
+
+     double RawSizePerRecord() const {
+         return PerRecord(RawSize);
+     }
+
+     double EncSizePerRecord() const {
+         return PerRecord(EncSize);
+     }
+
+     // Encoded size per record that a compression ratio of `compr` would give
+     // on the current RawSize.
+     double OldEncSizePerRecord(double compr) const {
+         return PerRecord((1 - compr) * RawSize);
+     }
+
+     // Multi-line report; checkMode additionally prints the figures derived
+     // from the compression recorded in the codec info.
+     TString Format(const TStaticCodecInfo&, bool checkMode) const;
+ };
+
+ TComprStats TestCodec(const ICodec&, const TVector<TString>& data);
+
+ enum EDataStreamFormat {
+ DSF_NONE,
+ DSF_PLAIN_LF /* "plain" */,
+ DSF_BASE64_LF /* "base64" */,
+ };
+
+ void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&);
+
+ TBlob GetInputBlob(const TString& dataFile);
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make
new file mode 100644
index 0000000000..d624222dad
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ya.make
@@ -0,0 +1,19 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ ct_common.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/getopt/small
+ library/cpp/string_utils/base64
+ util/draft
+)
+
+GENERATE_ENUM_SERIALIZATION(ct_common.h)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README
new file mode 100644
index 0000000000..723a68300b
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/README
@@ -0,0 +1,4 @@
+This is a viewer for a generated codec and a utility for verifying its compression quality on new data.
+
+Usage:
+static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
new file mode 100644
index 0000000000..9c8d568d82
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
@@ -0,0 +1,73 @@
+#include <library/cpp/codecs/static/tools/common/ct_common.h>
+#include <library/cpp/codecs/static/static.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/codecs/codecs.h>
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/digest/city.h>
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/stream/buffer.h>
+#include <util/stream/format.h>
+#include <util/string/builder.h>
+
+// Prints the contents of a .codec_info file and, with -t, measures how the
+// stored codec performs on the given testing set (free arguments, or stdin
+// when there are none).
+int main(int argc, char** argv) {
+    NCodecs::TCodecPtr codecPtr;
+    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
+    TString codecFile;
+    bool testCompression = false;
+
+    auto opts = NLastGetopt::TOpts::Default();
+    opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator.");
+    opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");
+    NCodecs::TStaticCodecInfo codec;
+
+    // The codec info is loaded and the codec restored as soon as -c is parsed.
+    opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) {
+                                                                            codecFile = name;
+                                                                            codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll()));
+                                                                            codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
+                                                                        })
+        .Required()
+        .Help(".codec_info file with serialized static data for codec");
+
+    opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance");
+
+    opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format");
+
+    opts.SetFreeArgsMin(0);
+    opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");
+
+    NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+
+    Cout << codecFile << Endl;
+    Cout << NCodecs::FormatCodecInfo(codec) << Endl;
+
+    if (testCompression) {
+        if (NCodecs::DSF_NONE == fmt) {
+            Cerr << "Specify format (-f|--format) for testing set input" << Endl;
+            exit(1);
+        }
+
+        // Checked only here so that plain viewing keeps working for a file
+        // whose codec cannot be restored; the test below dereferences it.
+        Y_ENSURE(codecPtr, "failed to restore the codec from " << codecFile);
+
+        Cout << "Reading testing set data ... " << Flush;
+
+        TVector<TString> allData;
+        for (const auto& freeArg : res.GetFreeArgs()) {
+            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
+        }
+
+        // No input files: read the testing set from stdin.
+        if (!res.GetFreeArgs()) {
+            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
+        }
+
+        Cout << "Done" << Endl << Endl;
+
+        Cout << "records: " << allData.size() << Endl;
+        Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
+
+        Cout << "Testing compression ... " << Flush;
+        auto stats = NCodecs::TestCodec(*codecPtr, allData);
+        Cout << "Done" << Endl << Endl;
+
+        Cout << stats.Format(codec, true) << Endl;
+    }
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
new file mode 100644
index 0000000000..90e06ca448
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
@@ -0,0 +1,16 @@
+PROGRAM()
+
+OWNER(velavokr)
+
+SRCS(
+ static_codec_checker.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/codecs/static/tools/common
+ library/cpp/getopt/small
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README
new file mode 100644
index 0000000000..e6bb52b959
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/README
@@ -0,0 +1,4 @@
+This is a utility for reproducibly training a codec and saving it to a uniquely named file that can be statically compiled in as a resource.
+
+Usage:
+static_codec_generator -m 'the training data description' -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
new file mode 100644
index 0000000000..45fdb5c5fe
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
@@ -0,0 +1,82 @@
+#include <library/cpp/codecs/static/tools/common/ct_common.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/codecs/static/builder.h>
+#include <library/cpp/codecs/codecs.h>
+
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/string/builder.h>
+
+// Trains a codec on the training set (free arguments, or stdin when there are
+// none), measures it on that same set and saves the result under the name
+// returned by NCodecs::GetStandardFileName.
+int main(int argc, char** argv) {
+    NCodecs::TCodecBuildInfo info;
+    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
+
+    auto opts = NLastGetopt::TOpts::Default();
+    opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");
+    // The title names the file as GetStandardFileName actually builds it.
+    opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.TIMESTAMP.codec_info");
+
+    opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set");
+
+    opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set");
+
+    opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName);
+
+    opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size");
+
+    opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format");
+
+    opts.AddLongOption("list-codecs").NoArgument().Handler0([]() {
+                                                      Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;
+                                                      exit(0);
+                                                  })
+        .Optional()
+        .Help("list available codecs");
+
+    opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info
+
+    opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info
+
+    opts.SetFreeArgsMin(0);
+    opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files");
+
+    NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+
+    Cout << "Reading training set data ... " << Flush;
+    TVector<TString> allData;
+    for (const auto& freeArg : res.GetFreeArgs()) {
+        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
+    }
+
+    // No input files: read the training set from stdin.
+    if (!res.GetFreeArgs()) {
+        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
+    }
+    Cout << "Done" << Endl << Endl;
+
+    Cout << "records: " << allData.size() << Endl;
+    Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
+
+    Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush;
+    auto codec = NCodecs::BuildStaticCodec(allData, info);
+    Cout << "Done" << Endl;
+
+    TString codecName = NCodecs::GetStandardFileName(codec);
+    // The codec is restored from its serialized form, so the test below
+    // exercises exactly what gets written to the file.
+    NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
+    Y_ENSURE(codecPtr, "failed to restore the trained codec " << info.CodecName);
+
+    Cout << "Testing compression ... " << Flush;
+    auto stats = NCodecs::TestCodec(*codecPtr, allData);
+    Cout << "Done" << Endl << Endl;
+
+    codec.MutableDebugInfo()->SetCompression(stats.Compression());
+
+    Cout << stats.Format(codec, false) << Endl;
+
+    Cout << "Saving as " << codecName << " ... " << Flush;
+    {
+        TUnbufferedFileOutput fout{codecName};
+        NCodecs::SaveCodecInfoToStream(fout, codec);
+        fout.Finish();
+    }
+    Cout << "Done" << Endl << Endl;
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
new file mode 100644
index 0000000000..efbc440dd1
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
@@ -0,0 +1,17 @@
+PROGRAM()
+
+OWNER(velavokr)
+
+SRCS(
+ static_codec_generator.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/codecs/static/tools/common
+ library/cpp/digest/md5
+ library/cpp/getopt/small
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/tests/canondata/result.json b/library/cpp/codecs/static/tools/tests/canondata/result.json
new file mode 100644
index 0000000000..7a637c6763
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/canondata/result.json
@@ -0,0 +1,6 @@
+{
+ "static_codec_tools.test_static_codec_tools": {
+ "checksum": "960e3c8c57fb846ab53ccbd07e287233",
+ "uri": "sbr://144512644/static_codec_tools.test_static_codec_tools/solar-8k-a.huffman.1467494385.codec_info"
+ }
+} \ No newline at end of file
diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
new file mode 100644
index 0000000000..db4140e370
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import yatest.common as tt
+import os.path as op
+
+def test_static_codec_tools():
+ tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")]
+ + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1",
+ "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"],
+ timeout=60)
+ assert(op.exists("solar-8k-a.huffman.1467494385.codec_info"))
+ tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"),
+ args=["-c", "solar-8k-a.huffman.1467494385.codec_info"],
+ timeout=60)
+ tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")]
+ + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"],
+ timeout=60)
+ return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info")
diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make
new file mode 100644
index 0000000000..c5324eaf53
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/ya.make
@@ -0,0 +1,20 @@
+PY2TEST()
+
+OWNER(velavokr)
+
+TEST_SRCS(static_codec_tools.py)
+
+DATA(sbr://143310406)
+
+TIMEOUT(4200)
+
+TAG(ya:not_autocheck)
+
+DEPENDS(
+ library/cpp/codecs/static/tools/static_codec_checker
+ library/cpp/codecs/static/tools/static_codec_generator
+)
+
+
+
+END()
diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make
new file mode 100644
index 0000000000..dd3e8437aa
--- /dev/null
+++ b/library/cpp/codecs/static/tools/ya.make
@@ -0,0 +1,5 @@
+RECURSE(
+ common
+ static_codec_generator
+ static_codec_checker
+)
diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp
new file mode 100644
index 0000000000..b47c279ed1
--- /dev/null
+++ b/library/cpp/codecs/static/ut/builder_ut.cpp
@@ -0,0 +1,57 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/codecs/static/builder.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <util/string/vector.h>
+
+// Unit test for NCodecs::BuildStaticCodec: builds a huffman codec from a tiny
+// fixed sample and pins the resulting message, its standard file name, its
+// stored hash and a save/load round trip.
+class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase {
+    UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest)
+    UNIT_TEST(TestBuild)
+    UNIT_TEST_SUITE_END();
+
+private:
+    // Returns the sample fed to the builder: 26 one-character strings,
+    // "a" through "z", in alphabetical order.
+    TVector<TString> PrepareData() {
+        TVector<TString> data;
+        data.reserve('z' - 'a' + 1);
+        // Iterate over char directly so no narrowing cast is needed.
+        for (char c = 'a'; c <= 'z'; ++c) {
+            data.push_back(TString(1, c));
+        }
+        return data;
+    }
+
+    // Builds the codec with a fully specified TCodecBuildInfo and compares the
+    // result against a pinned debug string.
+    void TestBuild() {
+        NCodecs::TCodecBuildInfo info;
+        info.CodecName = "huffman";
+        info.SampleSizeMultiplier = 2;
+        info.Timestamp = 1467494385;
+        info.RevisionInfo = "r2385905";
+        info.TrainingSetComment = "some dummy data";
+        info.TrainingSetResId = "sbr://1234";
+        auto res = NCodecs::BuildStaticCodec(PrepareData(), info);
+        // The expected text holds the serialized codec followed by the
+        // DebugInfo fields set above plus the stored codec hash.
+        UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(),
+                                 "StoredCodec: \"\\007\\000huffman@S\\000a"
+                                 "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o"
+                                 "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>"
+                                 "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8."
+                                 "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7"
+                                 "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0"
+                                 "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9"
+                                 "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" "
+                                 "DebugInfo { "
+                                 "CodecName: \"huffman\" "
+                                 "Timestamp: 1467494385 "
+                                 "RevisionInfo: \"r2385905\" "
+                                 "SampleSizeMultiplier: 2 "
+                                 "TrainingSetComment: \"some dummy data\" "
+                                 "TrainingSetResId: \"sbr://1234\" "
+                                 "StoredCodecHash: 2509195835471488613 "
+                                 "}");
+
+        // The standard file name is "<codec name>.<timestamp>.codec_info" for this input.
+        UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info");
+        UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL);
+
+        // Saving to a string and loading back must give an identical message.
+        auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res));
+        UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString());
+    }
+};
+
+UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest);
diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp
new file mode 100644
index 0000000000..57e1e62887
--- /dev/null
+++ b/library/cpp/codecs/static/ut/static_ut.cpp
@@ -0,0 +1,27 @@
+#include <library/cpp/testing/unittest/registar.h>
+#include <library/cpp/codecs/static/example/example.h>
+
+// Usage test for the example static codecs: encodes a fixed phrase with each
+// dictionary version and checks both the encoded size and the round trip.
+class TStaticCodecUsageTest: public NUnitTest::TTestBase {
+    UNIT_TEST_SUITE(TStaticCodecUsageTest)
+    UNIT_TEST(TestUsage)
+    UNIT_TEST_SUITE_END();
+
+private:
+    // Encodes the sample phrase with dictionary version `dv`, decodes the
+    // result, and asserts that the encoded length equals `expectedSize` and
+    // that the decoded bytes equal the original phrase.
+    void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) {
+        const TStringBuf letov = "Всё идёт по плану";
+
+        TBuffer encoded;
+        NStaticCodecExample::Encode(encoded, letov, dv);
+
+        // Decode is called without a dictionary version; only the encoded bytes are passed.
+        TBuffer decoded;
+        NStaticCodecExample::Decode(decoded, TStringBuf{encoded.data(), encoded.size()});
+
+        UNIT_ASSERT_VALUES_EQUAL(encoded.size(), expectedSize);
+        UNIT_ASSERT_EQUAL(TStringBuf(decoded.data(), decoded.size()), letov);
+    }
+
+    // Runs the round trip for both example dictionaries with their pinned encoded sizes.
+    void TestUsage() {
+        DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u);
+        DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u);
+    }
+};
+
+// Written with a trailing semicolon, like the registration in builder_ut.cpp.
+UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest);
diff --git a/library/cpp/codecs/static/ut/ya.make b/library/cpp/codecs/static/ut/ya.make
new file mode 100644
index 0000000000..b9116097d8
--- /dev/null
+++ b/library/cpp/codecs/static/ut/ya.make
@@ -0,0 +1,14 @@
+# Unit tests for library/cpp/codecs/static.
+UNITTEST_FOR(library/cpp/codecs/static)
+
+OWNER(velavokr)
+
+SRCS(
+ builder_ut.cpp
+ static_ut.cpp
+)
+
+# static_ut.cpp includes library/cpp/codecs/static/example/example.h.
+PEERDIR(
+ library/cpp/codecs/static/example
+)
+
+END()
diff --git a/library/cpp/codecs/static/ya.make b/library/cpp/codecs/static/ya.make
new file mode 100644
index 0000000000..00e00fd8d4
--- /dev/null
+++ b/library/cpp/codecs/static/ya.make
@@ -0,0 +1,18 @@
+# Static codec library: C++ sources plus the static_codec_info protobuf definitions.
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ builder.cpp
+ static_codec_info.proto
+ static.cpp
+)
+
+# Libraries this one depends on.
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/archive
+ library/cpp/svnversion
+ util/draft
+)
+
+END()