diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs/static/tools | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs/static/tools')
13 files changed, 413 insertions, 0 deletions
diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp
new file mode 100644
index 00000000000..fe776912805
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ct_common.cpp
@@ -0,0 +1,86 @@
+#include "ct_common.h"
+
+#include <library/cpp/codecs/codecs.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/string_utils/base64/base64.h>
+
+#include <util/generic/buffer.h>
+#include <util/generic/yexception.h>
+#include <util/stream/output.h>
+#include <util/string/builder.h>
+#include <util/system/hp_timer.h>
+
+namespace NCodecs {
+    // Renders the statistics as a multi-line, human-readable report.
+    // With checkMode set, the compression ratio recorded in the codec's debug info
+    // (and the encoded size per record it implies) is printed next to the measured values.
+    TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const {
+        TStringBuilder s;
+        s << "raw size/item: " << RawSizePerRecord() << Endl;
+        s << "enc.size/item: " << EncSizePerRecord() << Endl;
+        if (checkMode) {
+            s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl;
+        }
+        s << "enc time us/item: " << EncTimePerRecordUS() << Endl;
+        s << "dec time us/item: " << DecTimePerRecordUS() << Endl;
+        s << "dict size: " << info.GetStoredCodec().Size() << Endl;
+        s << "compression: " << AsPercent(Compression()) << " %" << Endl;
+        if (checkMode) {
+            s << "orig.compression: " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl;
+        }
+        return s;
+    }
+
+    // Runs every record of `input` through an encode/decode round trip with codec `c`,
+    // accumulating the record count, raw/encoded sizes and the time spent in each direction.
+    // Throws via Y_ENSURE when a decoded record does not match the original.
+    TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) {
+        TComprStats stats;
+
+        // Both buffers are reused across records and cleared before each round trip.
+        TBuffer encodeBuffer;
+        TBuffer decodeBuffer;
+        for (const auto& data : input) {
+            encodeBuffer.Clear();
+            decodeBuffer.Clear();
+
+            stats.Records += 1;
+            stats.RawSize += data.size();
+
+            THPTimer timer;
+            c.Encode(data, encodeBuffer);
+            stats.EncSize += encodeBuffer.size();
+            stats.EncSeconds += timer.PassedReset();
+
+            c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer);
+            stats.DecSeconds += timer.PassedReset();
+            Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records);
+        }
+
+        return stats;
+    }
+
+    // Splits `blob` into lines and appends every non-empty record to `result`.
+    // For DSF_BASE64_LF each line is base64-decoded first; lines that are (or decode to) empty are skipped.
+    void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) {
+        TStringBuf bin(blob.AsCharPtr(), blob.Size());
+        TStringBuf line;
+        TString buffer;
+        while (bin.ReadLine(line)) {
+            if (DSF_BASE64_LF == fmt) {
+                Base64Decode(line, buffer);
+                line = buffer;
+            }
+            if (!line) {
+                continue;
+            }
+            result.emplace_back(line.data(), line.size());
+        }
+    }
+
+    // Returns the contents of `dataFile`, or of standard input when the name is empty or "-".
+    TBlob GetInputBlob(const TString& dataFile) {
+        return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin);
+    }
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h
new file mode 100644
index 00000000000..9d3dcbda934
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ct_common.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/memory/blob.h>
+#include <cmath>
+
+namespace NCodecs {
+    class TStaticCodecInfo;
+    class ICodec;
+
+    // Size and timing totals gathered over a set of records, plus derived per-record figures.
+    struct TComprStats {
+        // Totals accumulated over all processed records (times in seconds, sizes in bytes).
+        double EncSeconds = 0;
+        double DecSeconds = 0;
+        size_t Records = 0;
+        size_t RawSize = 0;
+        size_t EncSize = 0;
+
+        // Rounds n to decPlaces decimal places.
+        static double Round(double n, size_t decPlaces = 2) {
+            const double p = std::pow(10.0, static_cast<double>(decPlaces));
+            return std::round(n * p) / p;
+        }
+
+        // Converts a fraction to a percentage rounded to two decimal places.
+        static double AsPercent(double n) {
+            return Round(n * 100);
+        }
+
+        static double AsMicroSecond(double s) {
+            return s * 1000000;
+        }
+
+        // Divides a total by the record count (rounded); yields 0 when there are no records.
+        double PerRecord(double n) const {
+            return Records ? Round(n / static_cast<double>(Records)) : 0.0;
+        }
+
+        // Fraction of the raw size saved by encoding; negative when the encoded data is larger.
+        double Compression() const {
+            // With no input nothing was saved; returning 0 avoids a 0/0 division (NaN).
+            if (!RawSize) {
+                return 0.0;
+            }
+            const double raw = static_cast<double>(RawSize);
+            return (raw - static_cast<double>(EncSize)) / raw;
+        }
+
+        double EncTimePerRecordUS() const {
+            return PerRecord(AsMicroSecond(EncSeconds));
+        }
+
+        double DecTimePerRecordUS() const {
+            return PerRecord(AsMicroSecond(DecSeconds));
+        }
+
+        double RawSizePerRecord() const {
+            return PerRecord(RawSize);
+        }
+
+        double EncSizePerRecord() const {
+            return PerRecord(EncSize);
+        }
+
+        // Encoded size per record implied by a previously recorded compression ratio `compr`.
+        double OldEncSizePerRecord(double compr) const {
+            return PerRecord((1 - compr) * RawSize);
+        }
+
+        TString Format(const TStaticCodecInfo&, bool checkMode) const;
+    };
+
+    TComprStats TestCodec(const ICodec&, const TVector<TString>& data);
+
+    // NOTE(review): this header is passed to GENERATE_ENUM_SERIALIZATION in ya.make; the quoted
+    // comments presumably supply the textual names of the values, so keep them verbatim (confirm).
+    enum EDataStreamFormat {
+        DSF_NONE,
+        DSF_PLAIN_LF /* "plain" */,
+        DSF_BASE64_LF /* "base64" */,
+    };
+
+    void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&);
+
+    TBlob GetInputBlob(const TString& dataFile);
+
+}
diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make
new file mode 100644
index 00000000000..d624222dad0
--- /dev/null
+++ b/library/cpp/codecs/static/tools/common/ya.make
@@ -0,0 +1,19 @@
+LIBRARY()
+
+OWNER(velavokr)
+
+SRCS(
+    ct_common.cpp
+)
+
+PEERDIR(
+    library/cpp/codecs
+    library/cpp/codecs/static
+    library/cpp/getopt/small
+    library/cpp/string_utils/base64
+    util/draft
+)
+
+GENERATE_ENUM_SERIALIZATION(ct_common.h)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README
new file mode 100644
index 00000000000..723a68300b0
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/README
@@ -0,0 +1,4 @@
+This is a viewer for a generated codec and a utility for verifying its compression quality on new data.
+
+Usage:
+static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
new file mode 100644
index 00000000000..9c8d568d823
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp
@@ -0,0 +1,81 @@
+#include <library/cpp/codecs/static/tools/common/ct_common.h>
+#include <library/cpp/codecs/static/static.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/codecs/codecs.h>
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/digest/city.h>
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/stream/buffer.h>
+#include <util/stream/format.h>
+#include <util/string/builder.h>
+
+// Prints the contents of a .codec_info file and, with -t, measures the stored codec
+// on a test set read from the given files (or from stdin when none are given).
+int main(int argc, char** argv) {
+    NCodecs::TCodecPtr codecPtr;
+    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
+    TString codecFile;
+    bool testCompression = false;
+
+    auto opts = NLastGetopt::TOpts::Default();
+    opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator.");
+    opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");
+    NCodecs::TStaticCodecInfo codec;
+
+    // The handler loads the codec info from the file given to -c and restores the codec stored in it.
+    opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) {
+        codecFile = name;
+        codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll()));
+        codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
+    })
+        .Required()
+        .Help(".codec_info file with serialized static data for codec");
+
+    opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance");
+
+    opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format");
+
+    opts.SetFreeArgsMin(0);
+    opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");
+
+    NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+
+    Cout << codecFile << Endl;
+    Cout << NCodecs::FormatCodecInfo(codec) << Endl;
+
+    if (testCompression) {
+        if (NCodecs::DSF_NONE == fmt) {
+            Cerr << "Specify format (-f|--format) for testing set input" << Endl;
+            // Return instead of calling exit() so that locals are destroyed normally.
+            return 1;
+        }
+
+        // Fail with a clear message rather than dereferencing an empty codec pointer below.
+        Y_ENSURE(codecPtr, "no codec could be restored from " << codecFile);
+
+        Cout << "Reading testing set data ... " << Flush;
+
+        TVector<TString> allData;
+        for (const auto& freeArg : res.GetFreeArgs()) {
+            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
+        }
+
+        // No input files: read the test set from stdin.
+        if (!res.GetFreeArgs()) {
+            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
+        }
+
+        Cout << "Done" << Endl << Endl;
+
+        Cout << "records: " << allData.size() << Endl;
+        Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
+
+        Cout << "Testing compression ... " << Flush;
+        auto stats = NCodecs::TestCodec(*codecPtr, allData);
+        Cout << "Done" << Endl << Endl;
+
+        Cout << stats.Format(codec, true) << Endl;
+    }
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
new file mode 100644
index 00000000000..90e06ca448d
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make
@@ -0,0 +1,16 @@
+PROGRAM()
+
+OWNER(velavokr)
+
+SRCS(
+    static_codec_checker.cpp
+)
+
+PEERDIR(
+    library/cpp/codecs
+    library/cpp/codecs/static
+    library/cpp/codecs/static/tools/common
+    library/cpp/getopt/small
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README
new file mode 100644
index 00000000000..e6bb52b9591
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/README
@@ -0,0 +1,4 @@
+This is a utility for reproducible training of a codec and for saving it into a uniquely named file that can be statically compiled in as a resource.
+
+Usage:
+static_codec_generator -m 'the training data description' -f plain samples.txt
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
new file mode 100644
index 00000000000..45fdb5c5fe8
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp
@@ -0,0 +1,87 @@
+#include <library/cpp/codecs/static/tools/common/ct_common.h>
+#include <library/cpp/codecs/static/static_codec_info.pb.h>
+#include <library/cpp/codecs/static/builder.h>
+#include <library/cpp/codecs/codecs.h>
+
+#include <library/cpp/getopt/small/last_getopt.h>
+
+#include <util/generic/yexception.h>
+#include <util/stream/file.h>
+#include <util/string/builder.h>
+
+// Trains a codec on the training set read from the given files (or from stdin when none are given),
+// measures it on that same data and saves the result under its standard file name.
+int main(int argc, char** argv) {
+    NCodecs::TCodecBuildInfo info;
+    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;
+
+    auto opts = NLastGetopt::TOpts::Default();
+    opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");
+    // Output name example: codec "solar-8k-a:huffman" with timestamp 1467494385 is saved as solar-8k-a.huffman.1467494385.codec_info
+    opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.TIMESTAMP.codec_info");
+
+    opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set");
+
+    opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set");
+
+    opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName);
+
+    opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size");
+
+    opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format");
+
+    opts.AddLongOption("list-codecs").NoArgument().Handler0([]() {
+        Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;
+        exit(0);
+    })
+        .Optional()
+        .Help("list available codecs");
+
+    opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info
+
+    opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info
+
+    opts.SetFreeArgsMin(0);
+    opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files");
+
+    NLastGetopt::TOptsParseResult res(&opts, argc, argv);
+
+    Cout << "Reading training set data ... " << Flush;
+    TVector<TString> allData;
+    for (const auto& freeArg : res.GetFreeArgs()) {
+        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));
+    }
+
+    // No input files: read the training set from stdin.
+    if (!res.GetFreeArgs()) {
+        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));
+    }
+    Cout << "Done" << Endl << Endl;
+
+    Cout << "records: " << allData.size() << Endl;
+    Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;
+
+    Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush;
+    auto codec = NCodecs::BuildStaticCodec(allData, info);
+    Cout << "Done" << Endl;
+
+    TString codecName = NCodecs::GetStandardFileName(codec);
+    NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());
+
+    Cout << "Testing compression ... " << Flush;
+    auto stats = NCodecs::TestCodec(*codecPtr, allData);
+    Cout << "Done" << Endl << Endl;
+
+    // Record the measured ratio in the debug info; static_codec_checker prints it as the original compression.
+    codec.MutableDebugInfo()->SetCompression(stats.Compression());
+
+    Cout << stats.Format(codec, false) << Endl;
+
+    Cout << "Saving as " << codecName << " ... " << Flush;
+    {
+        TUnbufferedFileOutput fout{codecName};
+        NCodecs::SaveCodecInfoToStream(fout, codec);
+        fout.Finish();
+    }
+    Cout << "Done" << Endl << Endl;
+}
diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
new file mode 100644
index 00000000000..efbc440dd18
--- /dev/null
+++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make
@@ -0,0 +1,17 @@
+PROGRAM()
+
+OWNER(velavokr)
+
+SRCS(
+    static_codec_generator.cpp
+)
+
+PEERDIR(
+    library/cpp/codecs
+    library/cpp/codecs/static
+    library/cpp/codecs/static/tools/common
+    library/cpp/digest/md5
+    library/cpp/getopt/small
+)
+
+END()
diff --git a/library/cpp/codecs/static/tools/tests/canondata/result.json b/library/cpp/codecs/static/tools/tests/canondata/result.json
new file mode 100644
index 00000000000..7a637c6763a
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/canondata/result.json
@@ -0,0 +1,6 @@
+{
+    "static_codec_tools.test_static_codec_tools": {
+        "checksum": "960e3c8c57fb846ab53ccbd07e287233",
+        "uri": "sbr://144512644/static_codec_tools.test_static_codec_tools/solar-8k-a.huffman.1467494385.codec_info"
+    }
+}
\ No newline at end of file
diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
new file mode 100644
index 00000000000..db4140e3703
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+import yatest.common as tt
+import os.path as op
+
+
+def test_static_codec_tools():
+    """Train a codec on sample.txt, then run static_codec_checker on the result.
+
+    The produced .codec_info file is returned for canonization.
+    """
+    codec_info = "solar-8k-a.huffman.1467494385.codec_info"
+
+    # Revision and timestamp are pinned via the --fake-* options; the timestamp is part of the output file name.
+    generator = tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")
+    generator_args = [
+        "-m", "test codec",
+        "-r", "sbr://143310406",
+        "-f", "plain",
+        "-c", "solar-8k-a:huffman",
+        "-s", "1",
+        "--fake-revision", "r2385905",
+        "--fake-timestamp", "1467494385",
+        "sample.txt",
+    ]
+    tt.execute([generator] + generator_args, timeout=60)
+    assert op.exists(codec_info)
+
+    # First print the codec info only, then also test compression on the sample.
+    checker = tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")
+    tt.canonical_execute(checker, args=["-c", codec_info], timeout=60)
+    tt.execute([checker, "-c", codec_info, "-f", "plain", "-t", "sample.txt"], timeout=60)
+    return tt.canonical_file(codec_info)
diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make
new file mode 100644
index 00000000000..c5324eaf53b
--- /dev/null
+++ b/library/cpp/codecs/static/tools/tests/ya.make
@@ -0,0 +1,20 @@
+PY2TEST()
+
+OWNER(velavokr)
+
+TEST_SRCS(static_codec_tools.py)
+
+DATA(sbr://143310406)
+
+TIMEOUT(4200)
+
+TAG(ya:not_autocheck)
+
+DEPENDS(
+    library/cpp/codecs/static/tools/static_codec_checker
+    library/cpp/codecs/static/tools/static_codec_generator
+)
+
+
+
+END()
diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make
new file mode 100644
index 00000000000..dd3e8437aa4
--- /dev/null
+++ b/library/cpp/codecs/static/tools/ya.make
@@ -0,0 +1,5 @@
+RECURSE(
+    common
+    static_codec_generator
+    static_codec_checker
+)