aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/static/tools
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs/static/tools
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs/static/tools')
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.cpp74
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.h75
-rw-r--r--library/cpp/codecs/static/tools/common/ya.make19
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp73
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/ya.make16
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp82
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/ya.make17
-rw-r--r--library/cpp/codecs/static/tools/tests/canondata/result.json6
-rw-r--r--library/cpp/codecs/static/tools/tests/static_codec_tools.py18
-rw-r--r--library/cpp/codecs/static/tools/tests/ya.make20
-rw-r--r--library/cpp/codecs/static/tools/ya.make5
13 files changed, 413 insertions, 0 deletions
diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp
new file mode 100644
index 00000000000..fe776912805
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ct_common.cpp
@@ -0,0 +1,74 @@
+#include "ct_common.h"
+
+#include <library/cpp/codecs/codecs.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/string_utils/base64/base64.h>
+
+#include <util/stream/output.h>
+#include <util/string/builder.h>
+#include <util/system/hp_timer.h>
+
+namespace NCodecs {
+ // Builds a multi-line, human-readable report of the collected statistics.
+ // `info` supplies the stored codec size and, when `checkMode` is set, the
+ // compression ratio recorded in its debug info, which is printed next to the
+ // freshly measured values for comparison.
+ TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const {
+     TStringBuilder report;
+
+     report << "raw size/item: " << RawSizePerRecord() << Endl
+            << "enc.size/item: " << EncSizePerRecord() << Endl;
+     if (checkMode) {
+         const auto recordedCompression = info.GetDebugInfo().GetCompression();
+         report << "orig.enc.size/item: " << OldEncSizePerRecord(recordedCompression) << Endl;
+     }
+
+     report << "enc time us/item: " << EncTimePerRecordUS() << Endl
+            << "dec time us/item: " << DecTimePerRecordUS() << Endl
+            << "dict size: " << info.GetStoredCodec().Size() << Endl
+            << "compression: " << AsPercent(Compression()) << " %" << Endl;
+     if (checkMode) {
+         const auto recordedCompression = info.GetDebugInfo().GetCompression();
+         report << "orig.compression: " << AsPercent(recordedCompression) << " %" << Endl;
+     }
+
+     return report;
+ }
+
+ // Runs every record of `input` through an encode/decode round trip with codec `c`
+ // and accumulates record count, raw/encoded sizes and encode/decode times.
+ // Throws (via Y_ENSURE) if a decoded record differs from the original one.
+ TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) {
+     TComprStats collected;
+
+     // The buffers live outside the loop and are only cleared between records.
+     TBuffer encoded;
+     TBuffer decoded;
+
+     for (const TString& record : input) {
+         encoded.Clear();
+         decoded.Clear();
+
+         ++collected.Records;
+         collected.RawSize += record.size();
+
+         THPTimer stopwatch;
+         c.Encode(record, encoded);
+         collected.EncSize += encoded.size();
+         collected.EncSeconds += stopwatch.PassedReset();
+
+         const TStringBuf encodedView{encoded.data(), encoded.size()};
+         c.Decode(encodedView, decoded);
+         collected.DecSeconds += stopwatch.PassedReset();
+
+         // The round-trip comparison runs after the decode time has been taken.
+         const TStringBuf roundTrip(decoded.data(), decoded.size());
+         Y_ENSURE(record == roundTrip, "invalid encoding at record " << collected.Records);
+     }
+
+     return collected;
+ }
+
+ // Splits `blob` into lines and appends every non-empty record to `result`.
+ // For DSF_BASE64_LF each line is base64-decoded first; for any other format
+ // the line is taken verbatim. Records that end up empty are skipped.
+ void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) {
+     const bool decodeBase64 = (fmt == DSF_BASE64_LF);
+
+     TStringBuf remaining(blob.AsCharPtr(), blob.Size());
+     TString decoded; // reused across lines as the base64 output buffer
+
+     for (TStringBuf row; remaining.ReadLine(row);) {
+         if (decodeBase64) {
+             Base64Decode(row, decoded);
+             row = decoded;
+         }
+         if (!row.empty()) {
+             result.emplace_back(row.data(), row.size());
+         }
+     }
+ }
+
+ // Loads the whole input into a blob: from standard input when `dataFile`
+ // is empty or "-", otherwise from the named file.
+ TBlob GetInputBlob(const TString& dataFile) {
+     const bool readStdin = dataFile.empty() || dataFile == "-";
+     if (readStdin) {
+         return TBlob::FromStream(Cin);
+     }
+     return TBlob::FromFile(dataFile);
+ }
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h
new file mode 100644
index 00000000000..9d3dcbda934
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ct_common.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/memory/blob.h>
+#include <cmath>
+
+namespace NCodecs {
+ class TStaticCodecInfo;
+ class ICodec;
+
+ // Size and timing totals accumulated while running a codec over a set of
+ // records, plus helpers that turn the totals into per-record and percentage
+ // figures for reporting.
+ struct TComprStats {
+     double EncSeconds = 0; // total encoding time, seconds
+     double DecSeconds = 0; // total decoding time, seconds
+     size_t Records = 0;    // number of records processed
+     size_t RawSize = 0;    // total size of the original records
+     size_t EncSize = 0;    // total size of the encoded records
+
+     // Rounds `n` to `decPlaces` digits after the decimal point.
+     static double Round(double n, size_t decPlaces = 2) {
+         const double scale = std::pow(10.0, static_cast<double>(decPlaces));
+         return std::round(n * scale) / scale;
+     }
+
+     // Converts a fraction (e.g. 0.1234) into a rounded percentage (12.34).
+     static double AsPercent(double n) {
+         return Round(n * 100);
+     }
+
+     // Converts seconds to microseconds.
+     static double AsMicroSecond(double s) {
+         return s * 1000000;
+     }
+
+     // Divides `n` by the number of records and rounds; 0 when there are no records.
+     double PerRecord(double n) const {
+         return Round(Records ? n / static_cast<double>(Records) : 0.0);
+     }
+
+     // Fraction of the raw size saved by encoding: (RawSize - EncSize) / RawSize.
+     // Returns 0 when no raw data was seen: dividing by a zero RawSize would
+     // otherwise produce NaN or -inf, which then ends up in reports and in the
+     // compression value callers store.
+     double Compression() const {
+         if (RawSize == 0) {
+             return 0;
+         }
+         const double raw = static_cast<double>(RawSize);
+         return (raw - static_cast<double>(EncSize)) / raw;
+     }
+
+     // Average encoding time per record, microseconds.
+     double EncTimePerRecordUS() const {
+         return PerRecord(AsMicroSecond(EncSeconds));
+     }
+
+     // Average decoding time per record, microseconds.
+     double DecTimePerRecordUS() const {
+         return PerRecord(AsMicroSecond(DecSeconds));
+     }
+
+     // Average original record size.
+     double RawSizePerRecord() const {
+         return PerRecord(static_cast<double>(RawSize));
+     }
+
+     // Average encoded record size.
+     double EncSizePerRecord() const {
+         return PerRecord(static_cast<double>(EncSize));
+     }
+
+     // Average encoded record size this data would have at compression ratio `compr`
+     // (a fraction as returned by Compression()).
+     double OldEncSizePerRecord(double compr) const {
+         return PerRecord((1 - compr) * static_cast<double>(RawSize));
+     }
+
+     // Renders the statistics as text; `checkMode` additionally prints the values
+     // derived from the compression ratio stored in the codec info.
+     TString Format(const TStaticCodecInfo&, bool checkMode) const;
+ };
+
+ TComprStats TestCodec(const ICodec&, const TVector<TString>& data);
+
+ // Line-oriented input formats understood by ParseBlob: with DSF_BASE64_LF every
+ // line is base64-decoded, any other value leaves the line as is.
+ // NOTE(review): the quoted names in the inline comments below are presumably the
+ // textual spellings picked up by the enum serialization generated for this
+ // header (plain / base64 on the command line) - keep them intact; confirm.
+ enum EDataStreamFormat {
+ DSF_NONE,
+ DSF_PLAIN_LF /* "plain" */,
+ DSF_BASE64_LF /* "base64" */,
+ };
+
+ void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&);
+
+ TBlob GetInputBlob(const TString& dataFile);
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make
new file mode 100644
index 00000000000..d624222dad0
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ya.make
@@ -0,0 +1,19 @@
+# Helper library built from ct_common.cpp; the static_codec_checker and
+# static_codec_generator programs list this directory in their PEERDIR.
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+ ct_common.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/getopt/small
+ library/cpp/string_utils/base64
+ util/draft
+)
+
+# NOTE(review): presumably generates the string conversions for EDataStreamFormat
+# declared in ct_common.h, which the tools stream and parse - confirm.
+GENERATE_ENUM_SERIALIZATION(ct_common.h)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README
new file mode 100644
index 00000000000..723a68300b0
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/README
@@ -0,0 +1,4 @@
+This is a viewer for a generated codec and a utility for verifying its compression quality on new data.
+
+Usage:
+static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
new file mode 100644
index 00000000000..9c8d568d823
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
@@ -0,0 +1,73 @@
+#include <library/cpp/codecs/static/tools/common/ct_common.h>
+#include <library/cpp/codecs/static/static.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/codecs/codecs.h>
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/digest/city.h>
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/stream/buffer.h>
+#include <util/stream/format.h>
+#include <util/string/builder.h>
+
+// Prints the contents of a .codec_info file and, when -t is given, measures the
+// stored codec on a testing set read from the free arguments (or from standard
+// input when there are none).
+int main(int argc, char** argv) {
+    NCodecs::TCodecPtr restoredCodec;
+    NCodecs::EDataStreamFormat inputFormat = NCodecs::DSF_NONE;
+    TString codecInfoPath;
+    bool runTest = false;
+
+    auto options = NLastGetopt::TOpts::Default();
+    options.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator.");
+    options.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");
+    NCodecs::TStaticCodecInfo codecInfo;
+
+    // The codec info is loaded and the codec restored inside the option handler,
+    // i.e. while the command line is being parsed.
+    const auto loadCodecInfo = [&codecInfoPath, &codecInfo, &restoredCodec](TString name) {
+        codecInfoPath = name;
+        codecInfo.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll()));
+        restoredCodec = NCodecs::ICodec::RestoreFromString(codecInfo.GetStoredCodec());
+    };
+    options.AddLongOption('c', "codec-info")
+        .RequiredArgument("codec_info")
+        .Handler1T<TString>(loadCodecInfo)
+        .Required()
+        .Help(".codec_info file with serialized static data for codec");
+
+    options.AddLongOption('t', "test")
+        .NoArgument()
+        .StoreValue(&runTest, true)
+        .Optional()
+        .Help("test current performance");
+
+    const TString formatHint = TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")";
+    options.AddLongOption('f', "format")
+        .RequiredArgument(formatHint)
+        .StoreResult(&inputFormat)
+        .Optional()
+        .Help("test set input file format");
+
+    options.SetFreeArgsMin(0);
+    options.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");
+
+    NLastGetopt::TOptsParseResult parsed(&options, argc, argv);
+
+    Cout << codecInfoPath << Endl;
+    Cout << NCodecs::FormatCodecInfo(codecInfo) << Endl;
+
+    if (!runTest) {
+        return 0;
+    }
+
+    if (NCodecs::DSF_NONE == inputFormat) {
+        Cerr << "Specify format (-f|--format) for testing set input" << Endl;
+        exit(1);
+    }
+
+    Cout << "Reading testing set data ... " << Flush;
+
+    TVector<TString> samples;
+    const auto inputFiles = parsed.GetFreeArgs();
+    if (inputFiles.empty()) {
+        // No input files given: read the testing set from standard input.
+        NCodecs::ParseBlob(samples, inputFormat, NCodecs::GetInputBlob("-"));
+    } else {
+        for (const auto& inputFile : inputFiles) {
+            NCodecs::ParseBlob(samples, inputFormat, NCodecs::GetInputBlob(inputFile));
+        }
+    }
+
+    Cout << "Done" << Endl << Endl;
+
+    Cout << "records: " << samples.size() << Endl;
+    Cout << "raw size: " << NCodecs::GetInputSize(samples.begin(), samples.end()) << " bytes" << Endl << Endl;
+
+    Cout << "Testing compression ... " << Flush;
+    const auto measured = NCodecs::TestCodec(*restoredCodec, samples);
+    Cout << "Done" << Endl << Endl;
+
+    Cout << measured.Format(codecInfo, true) << Endl;
+    return 0;
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
new file mode 100644
index 00000000000..90e06ca448d
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
@@ -0,0 +1,16 @@
+# Builds the static_codec_checker program from static_codec_checker.cpp.
+PROGRAM()
+
+OWNER(velavokr)
+
+SRCS(
+ static_codec_checker.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/codecs/static/tools/common
+ library/cpp/getopt/small
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README
new file mode 100644
index 00000000000..e6bb52b9591
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/README
@@ -0,0 +1,4 @@
+This is a utility for reproducible training of a codec and for saving the result into a uniquely named file so that it can be statically compiled in as a resource.
+
+Usage:
+static_codec_generator -t -m 'the training data description' -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
new file mode 100644
index 00000000000..45fdb5c5fe8
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
@@ -0,0 +1,82 @@
+#include <library/cpp/codecs/static/tools/common/ct_common.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/codecs/static/builder.h>
+#include <library/cpp/codecs/codecs.h>
+
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/string/builder.h>
+
+// Trains a codec on the training set read from the free arguments (or from
+// standard input when there are none), measures it on the same data and saves
+// the resulting codec info into a file named by NCodecs::GetStandardFileName.
+int main(int argc, char** argv) {
+    NCodecs::TCodecBuildInfo buildInfo;
+    NCodecs::EDataStreamFormat inputFormat = NCodecs::DSF_NONE;
+
+    auto options = NLastGetopt::TOpts::Default();
+    options.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");
+    options.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin");
+
+    options.AddLongOption('m', "message")
+        .RequiredArgument("training_set_comment")
+        .StoreResult(&buildInfo.TrainingSetComment)
+        .Required()
+        .Help("a human description for the training set");
+
+    options.AddLongOption('r', "resource")
+        .RequiredArgument("training_set_res_id")
+        .StoreResult(&buildInfo.TrainingSetResId)
+        .Optional()
+        .Help("sandbox resource id for the training set");
+
+    options.AddLongOption('c', "codec")
+        .RequiredArgument("codec_name")
+        .StoreResult(&buildInfo.CodecName)
+        .Optional()
+        .DefaultValue(buildInfo.CodecName);
+
+    options.AddLongOption('s', "sample-multiplier")
+        .RequiredArgument("multiplier")
+        .StoreResult(&buildInfo.SampleSizeMultiplier)
+        .Optional()
+        .DefaultValue(ToString(buildInfo.SampleSizeMultiplier))
+        .Help("multiplier for default sample size");
+
+    const TString formatHint = TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")";
+    options.AddLongOption('f', "format")
+        .RequiredArgument(formatHint)
+        .StoreResult(&inputFormat)
+        .Required()
+        .Help("training set input file format");
+
+    // Prints the codec list and terminates the process right from the handler.
+    const auto listCodecsAndExit = []() {
+        Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;
+        exit(0);
+    };
+    options.AddLongOption("list-codecs")
+        .NoArgument()
+        .Handler0(listCodecsAndExit)
+        .Optional()
+        .Help("list available codecs");
+
+    // Hidden option: replaces the static_codec_generator revision in the debug info.
+    options.AddLongOption("fake-revision")
+        .RequiredArgument("revision")
+        .StoreResult(&buildInfo.RevisionInfo)
+        .Optional()
+        .Hidden();
+
+    // Hidden option: replaces the generation timestamp in the debug info.
+    options.AddLongOption("fake-timestamp")
+        .RequiredArgument("timestamp")
+        .StoreResult(&buildInfo.Timestamp)
+        .Optional()
+        .Hidden();
+
+    options.SetFreeArgsMin(0);
+    options.SetFreeArgTitle(0, "training_set_input_file", "training set input files");
+
+    NLastGetopt::TOptsParseResult parsed(&options, argc, argv);
+
+    Cout << "Reading training set data ... " << Flush;
+    TVector<TString> samples;
+    const auto inputFiles = parsed.GetFreeArgs();
+    if (inputFiles.empty()) {
+        // No input files given: read the training set from standard input.
+        NCodecs::ParseBlob(samples, inputFormat, NCodecs::GetInputBlob("-"));
+    } else {
+        for (const auto& inputFile : inputFiles) {
+            NCodecs::ParseBlob(samples, inputFormat, NCodecs::GetInputBlob(inputFile));
+        }
+    }
+    Cout << "Done" << Endl << Endl;
+
+    Cout << "records: " << samples.size() << Endl;
+    Cout << "raw size: " << NCodecs::GetInputSize(samples.begin(), samples.end()) << " bytes" << Endl << Endl;
+
+    Cout << "Training " << buildInfo.CodecName << " , sample size multiplier is " << buildInfo.SampleSizeMultiplier << " ... " << Flush;
+    auto codecInfo = NCodecs::BuildStaticCodec(samples, buildInfo);
+    Cout << "Done" << Endl;
+
+    // The output file name is taken before the measured compression is written
+    // into the debug info below.
+    TString outputName = NCodecs::GetStandardFileName(codecInfo);
+    NCodecs::TCodecPtr trainedCodec = NCodecs::ICodec::RestoreFromString(codecInfo.GetStoredCodec());
+
+    Cout << "Testing compression ... " << Flush;
+    const auto measured = NCodecs::TestCodec(*trainedCodec, samples);
+    Cout << "Done" << Endl << Endl;
+
+    codecInfo.MutableDebugInfo()->SetCompression(measured.Compression());
+
+    Cout << measured.Format(codecInfo, false) << Endl;
+
+    Cout << "Saving as " << outputName << " ... " << Flush;
+    {
+        TUnbufferedFileOutput out{outputName};
+        NCodecs::SaveCodecInfoToStream(out, codecInfo);
+        out.Finish();
+    }
+    Cout << "Done" << Endl << Endl;
+    return 0;
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
new file mode 100644
index 00000000000..efbc440dd18
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
@@ -0,0 +1,17 @@
+# Builds the static_codec_generator program from static_codec_generator.cpp.
+PROGRAM()
+
+OWNER(velavokr)
+
+SRCS(
+ static_codec_generator.cpp
+)
+
+PEERDIR(
+ library/cpp/codecs
+ library/cpp/codecs/static
+ library/cpp/codecs/static/tools/common
+ library/cpp/digest/md5
+ library/cpp/getopt/small
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/tests/canondata/result.json b/library/cpp/codecs/static/tools/tests/canondata/result.json
new file mode 100644
index 00000000000..7a637c6763a
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/canondata/result.json
@@ -0,0 +1,6 @@
+{
+ "static_codec_tools.test_static_codec_tools": {
+ "checksum": "960e3c8c57fb846ab53ccbd07e287233",
+ "uri": "sbr://144512644/static_codec_tools.test_static_codec_tools/solar-8k-a.huffman.1467494385.codec_info"
+ }
+} \ No newline at end of file
diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
new file mode 100644
index 00000000000..db4140e3703
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+
+import yatest.common as tt
+import os.path as op
+
+def test_static_codec_tools():
+ tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")]
+ + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1",
+ "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"],
+ timeout=60)
+ assert(op.exists("solar-8k-a.huffman.1467494385.codec_info"))
+ tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"),
+ args=["-c", "solar-8k-a.huffman.1467494385.codec_info"],
+ timeout=60)
+ tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")]
+ + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"],
+ timeout=60)
+ return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info")
diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make
new file mode 100644
index 00000000000..c5324eaf53b
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/ya.make
@@ -0,0 +1,20 @@
+# Test module running static_codec_tools.py against the two tool binaries
+# listed in DEPENDS.
+PY2TEST()
+
+OWNER(velavokr)
+
+TEST_SRCS(static_codec_tools.py)
+
+# NOTE(review): presumably the resource that provides sample.txt used by the test - confirm.
+DATA(sbr://143310406)
+
+TIMEOUT(4200)
+
+TAG(ya:not_autocheck)
+
+DEPENDS(
+ library/cpp/codecs/static/tools/static_codec_checker
+ library/cpp/codecs/static/tools/static_codec_generator
+)
+
+
+
+END()
diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make
new file mode 100644
index 00000000000..dd3e8437aa4
--- /dev/null
+++ b/library/cpp/codecs/static/tools/ya.make
@@ -0,0 +1,5 @@
+# Subdirectories of the static codec tools.
+# NOTE(review): the tests directory is not listed here - confirm that is intentional.
+RECURSE(
+ common
+ static_codec_generator
+ static_codec_checker
+)