summaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/static/tools
diff options
context:
space:
mode:
authormonster <[email protected]>2022-07-07 14:41:37 +0300
committermonster <[email protected]>2022-07-07 14:41:37 +0300
commit06e5c21a835c0e923506c4ff27929f34e00761c2 (patch)
tree75efcbc6854ef9bd476eb8bf00cc5c900da436a2 /library/cpp/codecs/static/tools
parent03f024c4412e3aa613bb543cf1660176320ba8f4 (diff)
fix ya.make
Diffstat (limited to 'library/cpp/codecs/static/tools')
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.cpp74
-rw-r--r--library/cpp/codecs/static/tools/common/ct_common.h75
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp73
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/README4
-rw-r--r--library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp82
-rw-r--r--library/cpp/codecs/static/tools/tests/canondata/result.json6
-rw-r--r--library/cpp/codecs/static/tools/tests/static_codec_tools.py18
8 files changed, 0 insertions, 336 deletions
diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp
deleted file mode 100644
index fe776912805..00000000000
--- a/library/cpp/codecs/static/tools/common/ct_common.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-#include "ct_common.h"
-
-#include <library/cpp/codecs/codecs.h>
-#include <library/cpp/codecs/static/static_codec_info.pb.h>
-#include <library/cpp/string_utils/base64/base64.h>
-
-#include <util/stream/output.h>
-#include <util/string/builder.h>
-#include <util/system/hp_timer.h>
-
-namespace NCodecs {
- TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const {
- TStringBuilder s;
- s << "raw size/item: " << RawSizePerRecord() << Endl;
- s << "enc.size/item: " << EncSizePerRecord() << Endl;
- if (checkMode) {
- s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl;
- }
- s << "enc time us/item: " << EncTimePerRecordUS() << Endl;
- s << "dec time us/item: " << DecTimePerRecordUS() << Endl;
- s << "dict size: " << info.GetStoredCodec().Size() << Endl;
- s << "compression: " << AsPercent(Compression()) << " %" << Endl;
- if (checkMode) {
- s << "orig.compression: " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl;
- }
- return s;
- }
-
- TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) {
- TComprStats stats;
-
- TBuffer encodeBuffer;
- TBuffer decodeBuffer;
- for (const auto& data : input) {
- encodeBuffer.Clear();
- decodeBuffer.Clear();
-
- stats.Records += 1;
- stats.RawSize += data.size();
-
- THPTimer timer;
- c.Encode(data, encodeBuffer);
- stats.EncSize += encodeBuffer.size();
- stats.EncSeconds += timer.PassedReset();
-
- c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer);
- stats.DecSeconds += timer.PassedReset();
- Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records);
- }
-
- return stats;
- }
-
- void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) {
- TStringBuf bin(blob.AsCharPtr(), blob.Size());
- TStringBuf line;
- TString buffer;
- while (bin.ReadLine(line)) {
- if (DSF_BASE64_LF == fmt) {
- Base64Decode(line, buffer);
- line = buffer;
- }
- if (!line) {
- continue;
- }
- result.emplace_back(line.data(), line.size());
- }
- }
-
- TBlob GetInputBlob(const TString& dataFile) {
- return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin);
- }
-
-}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h
deleted file mode 100644
index 9d3dcbda934..00000000000
--- a/library/cpp/codecs/static/tools/common/ct_common.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#pragma once
-
-#include <util/generic/string.h>
-#include <util/generic/vector.h>
-#include <util/memory/blob.h>
-#include <cmath>
-
-namespace NCodecs {
- class TStaticCodecInfo;
- class ICodec;
-
- struct TComprStats {
- double EncSeconds = 0;
- double DecSeconds = 0;
- size_t Records = 0;
- size_t RawSize = 0;
- size_t EncSize = 0;
-
- static double Round(double n, size_t decPlaces = 2) {
- double p = pow(10, decPlaces);
- return round(n * p) / p;
- }
-
- static double AsPercent(double n) {
- return Round(n * 100);
- }
-
- static double AsMicroSecond(double s) {
- return s * 1000000;
- }
-
- double PerRecord(double n) const {
- return Round((double)(Records ? n / Records : 0));
- }
-
- double Compression() const {
- return ((double)RawSize - (double)EncSize) / RawSize;
- }
-
- double EncTimePerRecordUS() const {
- return PerRecord(AsMicroSecond(EncSeconds));
- }
-
- double DecTimePerRecordUS() const {
- return PerRecord(AsMicroSecond(DecSeconds));
- }
-
- double RawSizePerRecord() const {
- return PerRecord(RawSize);
- }
-
- double EncSizePerRecord() const {
- return PerRecord(EncSize);
- }
-
- double OldEncSizePerRecord(double compr) const {
- return PerRecord((1 - compr) * RawSize);
- }
-
- TString Format(const TStaticCodecInfo&, bool checkMode) const;
- };
-
- TComprStats TestCodec(const ICodec&, const TVector<TString>& data);
-
- enum EDataStreamFormat {
- DSF_NONE,
- DSF_PLAIN_LF /* "plain" */,
- DSF_BASE64_LF /* "base64" */,
- };
-
- void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&);
-
- TBlob GetInputBlob(const TString& dataFile);
-
-}
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README
deleted file mode 100644
index 723a68300b0..00000000000
--- a/library/cpp/codecs/static/tools/static_codec_checker/README
+++ /dev/null
@@ -1,4 +0,0 @@
-This is a viewer for generated codec and utility for verification of the compression quality on a new data.
-
-Usage:
-static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
deleted file mode 100644
index 9c8d568d823..00000000000
--- a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-#include <library/cpp/codecs/static/tools/common/ct_common.h>
-#include <library/cpp/codecs/static/static.h>
-#include <library/cpp/codecs/static/static_codec_info.pb.h>
-#include <library/cpp/codecs/codecs.h>
-#include <library/cpp/getopt/small/last_getopt.h>
-
-#include <util/digest/city.h>
-#include <util/generic/yexception.h>
-#include <util/stream/file.h>
-#include <util/stream/buffer.h>
-#include <util/stream/format.h>
-#include <util/string/builder.h>
-
-int main(int argc, char** argv) {
- NCodecs::TCodecPtr codecPtr;
- NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
- TString codecFile;
- bool testCompression = false;
-
- auto opts = NLastGetopt::TOpts::Default();
- opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator.");
- opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");
- NCodecs::TStaticCodecInfo codec;
-
- opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) {
- codecFile = name;
- codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll()));
- codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
- })
- .Required()
- .Help(".codec_info file with serialized static data for codec");
-
- opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance");
-
- opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format");
-
- opts.SetFreeArgsMin(0);
- opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");
-
- NLastGetopt::TOptsParseResult res(&opts, argc, argv);
-
- Cout << codecFile << Endl;
- Cout << NCodecs::FormatCodecInfo(codec) << Endl;
-
- if (testCompression) {
- if (NCodecs::DSF_NONE == fmt) {
- Cerr << "Specify format (-f|--format) for testing set input" << Endl;
- exit(1);
- }
-
- Cout << "Reading testing set data ... " << Flush;
-
- TVector<TString> allData;
- for (const auto& freeArg : res.GetFreeArgs()) {
- NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
- }
-
- if (!res.GetFreeArgs()) {
- NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
- }
-
- Cout << "Done" << Endl << Endl;
-
- Cout << "records: " << allData.size() << Endl;
- Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
-
- Cout << "Testing compression ... " << Flush;
- auto stats = NCodecs::TestCodec(*codecPtr, allData);
- Cout << "Done" << Endl << Endl;
-
- Cout << stats.Format(codec, true) << Endl;
- }
-}
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README
deleted file mode 100644
index e6bb52b9591..00000000000
--- a/library/cpp/codecs/static/tools/static_codec_generator/README
+++ /dev/null
@@ -1,4 +0,0 @@
-This is a utility for reproducible teaching of a codec. And also for saving it into a file with a unique name for a static compilation as a resource.
-
-Usage:
-static_codec_generator -t -m 'the training data description' -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
deleted file mode 100644
index 45fdb5c5fe8..00000000000
--- a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#include <library/cpp/codecs/static/tools/common/ct_common.h>
-#include <library/cpp/codecs/static/static_codec_info.pb.h>
-#include <library/cpp/codecs/static/builder.h>
-#include <library/cpp/codecs/codecs.h>
-
-#include <library/cpp/getopt/small/last_getopt.h>
-
-#include <util/generic/yexception.h>
-#include <util/stream/file.h>
-#include <util/string/builder.h>
-
-int main(int argc, char** argv) {
- NCodecs::TCodecBuildInfo info;
- NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
-
- auto opts = NLastGetopt::TOpts::Default();
- opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");
- opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin");
-
- opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set");
-
- opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set");
-
- opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName);
-
- opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size");
-
- opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format");
-
- opts.AddLongOption("list-codecs").NoArgument().Handler0([]() {
- Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;
- exit(0);
- })
- .Optional()
- .Help("list available codecs");
-
- opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info
-
- opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info
-
- opts.SetFreeArgsMin(0);
- opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files");
-
- NLastGetopt::TOptsParseResult res(&opts, argc, argv);
-
- Cout << "Reading training set data ... " << Flush;
- TVector<TString> allData;
- for (const auto& freeArg : res.GetFreeArgs()) {
- NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
- }
-
- if (!res.GetFreeArgs()) {
- NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
- }
- Cout << "Done" << Endl << Endl;
-
- Cout << "records: " << allData.size() << Endl;
- Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
-
- Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush;
- auto codec = NCodecs::BuildStaticCodec(allData, info);
- Cout << "Done" << Endl;
-
- TString codecName = NCodecs::GetStandardFileName(codec);
- NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
-
- Cout << "Testing compression ... " << Flush;
- auto stats = NCodecs::TestCodec(*codecPtr, allData);
- Cout << "Done" << Endl << Endl;
-
- codec.MutableDebugInfo()->SetCompression(stats.Compression());
-
- Cout << stats.Format(codec, false) << Endl;
-
- Cout << "Saving as " << codecName << " ... " << Flush;
- {
- TUnbufferedFileOutput fout{codecName};
- NCodecs::SaveCodecInfoToStream(fout, codec);
- fout.Finish();
- }
- Cout << "Done" << Endl << Endl;
-}
diff --git a/library/cpp/codecs/static/tools/tests/canondata/result.json b/library/cpp/codecs/static/tools/tests/canondata/result.json
deleted file mode 100644
index 7a637c6763a..00000000000
--- a/library/cpp/codecs/static/tools/tests/canondata/result.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "static_codec_tools.test_static_codec_tools": {
- "checksum": "960e3c8c57fb846ab53ccbd07e287233",
- "uri": "sbr://144512644/static_codec_tools.test_static_codec_tools/solar-8k-a.huffman.1467494385.codec_info"
- }
-} \ No newline at end of file
diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
deleted file mode 100644
index db4140e3703..00000000000
--- a/library/cpp/codecs/static/tools/tests/static_codec_tools.py
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env python
-
-import yatest.common as tt
-import os.path as op
-
-def test_static_codec_tools():
- tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")]
- + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1",
- "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"],
- timeout=60)
- assert(op.exists("solar-8k-a.huffman.1467494385.codec_info"))
- tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"),
- args=["-c", "solar-8k-a.huffman.1467494385.codec_info"],
- timeout=60)
- tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")]
- + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"],
- timeout=60)
- return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info")