diff options
| author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
|---|---|---|
| committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
| commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
| tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs/static | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs/static')
29 files changed, 863 insertions, 0 deletions
| diff --git a/library/cpp/codecs/static/README b/library/cpp/codecs/static/README new file mode 100644 index 00000000000..1b07f02433d --- /dev/null +++ b/library/cpp/codecs/static/README @@ -0,0 +1 @@ +Support of static libraries in library/cpp/codecs. See library/cpp/codecs/static/example. diff --git a/library/cpp/codecs/static/builder.cpp b/library/cpp/codecs/static/builder.cpp new file mode 100644 index 00000000000..93e34a3edbb --- /dev/null +++ b/library/cpp/codecs/static/builder.cpp @@ -0,0 +1,39 @@ +#include "builder.h" +#include "common.h" + +#include <library/cpp/codecs/static/static_codec_info.pb.h> + +#include <library/cpp/codecs/codecs.h> + +#include <util/generic/yexception.h> +#include <util/string/subst.h> + +namespace NCodecs { +    TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo& info) { +        TStaticCodecInfo result; +        TCodecPtr codec = ICodec::GetInstance(info.CodecName); +        Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed"); + +        codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier); +        { +            TStringOutput sout{*result.MutableStoredCodec()}; +            ICodec::Store(&sout, codec); +        } + +        auto& debugInfo = *result.MutableDebugInfo(); +        debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec())); +        debugInfo.SetCodecName(info.CodecName); +        debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier); +        debugInfo.SetTimestamp(info.Timestamp); +        debugInfo.SetRevisionInfo(info.RevisionInfo); +        debugInfo.SetTrainingSetComment(info.TrainingSetComment); +        debugInfo.SetTrainingSetResId(info.TrainingSetResId); +        return result; +    } + +    TString GetStandardFileName(const TStaticCodecInfo& info) { +        TString cName = info.GetDebugInfo().GetCodecName(); +        SubstGlobal(cName, ':', '.'); +        return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info"; +    } +} diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h new file mode 100644 index 00000000000..d7533be4d58 --- /dev/null +++ b/library/cpp/codecs/static/builder.h @@ -0,0 +1,29 @@ +#pragma once + +#include "static.h" + +#include <library/cpp/svnversion/svnversion.h> + +#include <util/datetime/base.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/string/builder.h> + +namespace NCodecs { +    struct TCodecBuildInfo { +        // optimal values from SEARCH-1655 +        TString CodecName = "solar-8k-a:zstd08d-1"; +        float SampleSizeMultiplier = 1; + +        // debug info: +        time_t Timestamp = TInstant::Now().TimeT(); +        TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision())); +        TString TrainingSetComment; // a human comment on the training data +        TString TrainingSetResId;   // sandbox resid of the training set +    }; + +    TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&); + +    TString GetStandardFileName(const TStaticCodecInfo&); + +} diff --git a/library/cpp/codecs/static/common.h b/library/cpp/codecs/static/common.h new file mode 100644 index 00000000000..211de2a27d2 --- /dev/null +++ b/library/cpp/codecs/static/common.h @@ -0,0 +1,32 @@ +#pragma once + +#include <util/string/hex.h> +#include <util/digest/city.h> +#include <util/system/byteorder.h> + +namespace NCodecs { +    template <class T> +    ui64 DataSignature(const T& t) { +        static_assert(!std::is_scalar<T>::value, "no scalars"); +        return CityHash64(t.data(), t.size()); +    } + +    template <class T> +    TString HexWriteScalar(T t) { +        static_assert(std::is_scalar<T>::value, "scalars only"); +        t = LittleToBig(t); +        TString res = HexEncode(&t, sizeof(t)); +        res.to_lower(); +        return res; +    } + +    template <class T> +    T HexReadScalar(TStringBuf s) { +        static_assert(std::is_scalar<T>::value, "scalars only"); +        T t = 0; +        HexDecode(s.data(), Min(s.size(), sizeof(T)), &t); +        t = BigToLittle(t); +        return t; +    } + +} diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp new file mode 100644 index 00000000000..5b750b717e1 --- /dev/null +++ b/library/cpp/codecs/static/example/example.cpp @@ -0,0 +1,43 @@ +#include "example.h" + +#include <library/cpp/codecs/static/static.h> + +#include <util/generic/yexception.h> + +extern "C" { +extern const ui8 codec_info_huff_20160707[]; +extern const ui32 codec_info_huff_20160707Size; +extern const ui8 codec_info_sa_huff_20160707[]; +extern const ui32 codec_info_sa_huff_20160707Size; +}; + +namespace NStaticCodecExample { +    static const NCodecs::TCodecConstPtr CODECS[] = { +        nullptr, +        NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size), +        NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size), +    }; + +    static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size"); + +    void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) { +        Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); +        out.Clear(); +        if (!in) { +            return; +        } +        CODECS[dv]->Encode(in, out); +        out.Append((char)dv); +    } + +    void Decode(TBuffer& out, TStringBuf in) { +        out.Clear(); +        if (!in) { +            return; +        } +        EDictVersion dv = (EDictVersion)in.back(); +        Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); +        in.Chop(1); +        CODECS[dv]->Decode(in, out); +    } +} diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h new file mode 100644 index 00000000000..f9b3a7324b7 --- /dev/null +++ b/library/cpp/codecs/static/example/example.h @@ -0,0 +1,17 @@ +#pragma once + +#include <util/generic/strbuf.h> +#include <util/generic/buffer.h> + +namespace NStaticCodecExample { +    enum EDictVersion : ui8 { +        DV_NULL = 0, +        DV_HUFF_20160707, +        DV_SA_HUFF_20160707, +        DV_COUNT +    }; + +    void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707); + +    void Decode(TBuffer&, TStringBuf); +} diff --git a/library/cpp/codecs/static/example/huffman.1467494385.codec_info b/library/cpp/codecs/static/example/huffman.1467494385.codec_infoBinary files differ new file mode 100644 index 00000000000..5fc18270a6b --- /dev/null +++ b/library/cpp/codecs/static/example/huffman.1467494385.codec_info diff --git a/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_infoBinary files differ new file mode 100644 index 00000000000..d36d8e24ec9 --- /dev/null +++ b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info diff --git a/library/cpp/codecs/static/example/ya.make b/library/cpp/codecs/static/example/ya.make new file mode 100644 index 00000000000..ca6c5fd900a --- /dev/null +++ b/library/cpp/codecs/static/example/ya.make @@ -0,0 +1,24 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( +    GLOBAL example.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/codecs/static +) + +ARCHIVE_ASM( +    "solar-8k-a.huffman.1467494385.codec_info" +    NAME codec_info_sa_huff_20160707 +) + +ARCHIVE_ASM( +    "huffman.1467494385.codec_info" +    NAME codec_info_huff_20160707 +) + +END() diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp new file mode 100644 index 00000000000..44a07dd73a2 --- /dev/null +++ b/library/cpp/codecs/static/static.cpp @@ -0,0 +1,98 @@ +#include "static.h" +#include "common.h" + +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/archive/yarchive.h> + +#include <util/draft/datetime.h> + +#include <util/string/builder.h> +#include <util/stream/buffer.h> +#include <util/stream/mem.h> +#include <util/string/hex.h> +#include <util/ysaveload.h> + +namespace NCodecs { +    static constexpr TStringBuf STATIC_CODEC_INFO_MAGIC = "CodecInf"; + +    static TStringBuf GetStaticCodecInfoMagic() { +        return STATIC_CODEC_INFO_MAGIC; +    } + +    void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo& info) { +        TBufferOutput bout; +        info.SerializeToArcadiaStream(&bout); +        ui64 hash = DataSignature(bout.Buffer()); +        out.Write(GetStaticCodecInfoMagic()); +        ::Save(&out, hash); +        ::Save(&out, bout.Buffer()); +    } + +    TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in) { +        { +            TBuffer magic; +            magic.Resize(GetStaticCodecInfoMagic().size()); +            Y_ENSURE_EX(in.Read(magic.Data(), GetStaticCodecInfoMagic().size()) == GetStaticCodecInfoMagic().size(), +                        TCodecException() << "bad codec info"); +            Y_ENSURE_EX(TStringBuf(magic.data(), magic.size()) == GetStaticCodecInfoMagic(), +                        TCodecException() << "bad codec info"); +        } + +        ui64 hash; +        ::Load(&in, hash); +        TBuffer info; +        ::Load(&in, info); +        Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info"); + +        TStaticCodecInfo result; +        Y_ENSURE_EX(result.ParseFromArray(info.data(), info.size()), TCodecException() << "bad codec info"); + +        return result; +    } + +    TString SaveCodecInfoToString(const TStaticCodecInfo& info) { +        TStringStream s; +        SaveCodecInfoToStream(s, info); +        return s.Str(); +    } + +    TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) { +        TMemoryInput m{data.data(), data.size()}; +        return LoadCodecInfoFromStream(m); +    } + +    TString FormatCodecInfo(const TStaticCodecInfo& ci) { +        TStringBuilder s; +        s << "codec name:      " << ci.GetDebugInfo().GetCodecName() << Endl; +        s << "codec hash:      " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl; +        s << "dict size:       " << ci.GetStoredCodec().Size() << Endl; +        s << "sample mult:     " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl; +        s << "orig.compress:   " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl; +        s << "timestamp:       " << ci.GetDebugInfo().GetTimestamp() << " (" +          << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString() +          << ")" << Endl; +        s << "revision:        " << ci.GetDebugInfo().GetRevisionInfo() << Endl; +        s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl; +        s << "training set resId:   " << ci.GetDebugInfo().GetTrainingSetResId() << Endl; +        return s; +    } + +    TString LoadStringFromArchive(const ui8* begin, size_t size) { +        TArchiveReader ar(TBlob::NoCopy(begin, size)); +        Y_VERIFY(ar.Count() == 1, "invalid number of entries"); +        auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0)); +        return TString{blob.AsCharPtr(), blob.Size()}; +    } + +    TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) { +        return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec()); +    } + +    TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) { +        const auto& data = LoadStringFromArchive(begin, size); +        const auto& info = LoadCodecInfoFromString(data); +        const auto& codec = RestoreCodecFromCodecInfo(info); +        Y_ENSURE_EX(codec, TCodecException() << "null codec"); +        return codec; +    } +} diff --git a/library/cpp/codecs/static/static.h b/library/cpp/codecs/static/static.h new file mode 100644 index 00000000000..c1eaed2a742 --- /dev/null +++ b/library/cpp/codecs/static/static.h @@ -0,0 +1,34 @@ +#pragma once + +#include <library/cpp/codecs/codecs.h> + +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/stream/output.h> + +namespace NCodecs { +    class TStaticCodecInfo; + +    // load + +    TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&); + +    TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data); + +    TString LoadStringFromArchive(const ui8* begin, size_t size); + +    TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size); + +    // save + +    TString SaveCodecInfoToString(const TStaticCodecInfo&); + +    void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo&); + +    // misc + +    TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in); + +    TString FormatCodecInfo(const TStaticCodecInfo&); + +} diff --git a/library/cpp/codecs/static/static_codec_info.proto b/library/cpp/codecs/static/static_codec_info.proto new file mode 100644 index 00000000000..362abb4dadf --- /dev/null +++ b/library/cpp/codecs/static/static_codec_info.proto @@ -0,0 +1,17 @@ +package NCodecs; + +message TStaticCodecInfo { +    message TDebugInfo { +        optional string CodecName = 1;           // the exact codec variant name +        optional uint64 Timestamp = 2;           // when the codec was built +        optional string RevisionInfo = 3;        // the arcadia revision info +        optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression +        optional float Compression = 5;          // the compression on the training set ((raw_size - coded_size) / raw_size) +        optional string TrainingSetComment = 6;  // a human readable description of the training set +        optional string TrainingSetResId = 7;    // the training set sandbox resource id +        optional uint64 StoredCodecHash = 8;     // cityhash64(data) +    } +     +    optional bytes StoredCodec = 1;           // the data of the codec +    optional TDebugInfo DebugInfo = 2;        // misc debug info which could be useful in finding whereabouts later +} diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp new file mode 100644 index 00000000000..fe776912805 --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ct_common.cpp @@ -0,0 +1,74 @@ +#include "ct_common.h" + +#include <library/cpp/codecs/codecs.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/string_utils/base64/base64.h> + +#include <util/stream/output.h> +#include <util/string/builder.h> +#include <util/system/hp_timer.h> + +namespace NCodecs { +    TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const { +        TStringBuilder s; +        s << "raw size/item:      " << RawSizePerRecord() << Endl; +        s << "enc.size/item:      " << EncSizePerRecord() << Endl; +        if (checkMode) { +            s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl; +        } +        s << "enc time us/item:   " << EncTimePerRecordUS() << Endl; +        s << "dec time us/item:   " << DecTimePerRecordUS() << Endl; +        s << "dict size:          " << info.GetStoredCodec().Size() << Endl; +        s << "compression:        " << AsPercent(Compression()) << " %" << Endl; +        if (checkMode) { +            s << "orig.compression:   " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl; +        } +        return s; +    } + +    TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) { +        TComprStats stats; + +        TBuffer encodeBuffer; +        TBuffer decodeBuffer; +        for (const auto& data : input) { +            encodeBuffer.Clear(); +            decodeBuffer.Clear(); + +            stats.Records += 1; +            stats.RawSize += data.size(); + +            THPTimer timer; +            c.Encode(data, encodeBuffer); +            stats.EncSize += encodeBuffer.size(); +            stats.EncSeconds += timer.PassedReset(); + +            c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer); +            stats.DecSeconds += timer.PassedReset(); +            Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records); +        } + +        return stats; +    } + +    void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) { +        TStringBuf bin(blob.AsCharPtr(), blob.Size()); +        TStringBuf line; +        TString buffer; +        while (bin.ReadLine(line)) { +            if (DSF_BASE64_LF == fmt) { +                Base64Decode(line, buffer); +                line = buffer; +            } +            if (!line) { +                continue; +            } +            result.emplace_back(line.data(), line.size()); +        } +    } + +    TBlob GetInputBlob(const TString& dataFile) { +        return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin); +    } + +} diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h new file mode 100644 index 00000000000..9d3dcbda934 --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ct_common.h @@ -0,0 +1,75 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/memory/blob.h> +#include <cmath> + +namespace NCodecs { +    class TStaticCodecInfo; +    class ICodec; + +    struct TComprStats { +        double EncSeconds = 0; +        double DecSeconds = 0; +        size_t Records = 0; +        size_t RawSize = 0; +        size_t EncSize = 0; + +        static double Round(double n, size_t decPlaces = 2) { +            double p = pow(10, decPlaces); +            return round(n * p) / p; +        } + +        static double AsPercent(double n) { +            return Round(n * 100); +        } + +        static double AsMicroSecond(double s) { +            return s * 1000000; +        } + +        double PerRecord(double n) const { +            return Round((double)(Records ? n / Records : 0)); +        } + +        double Compression() const { +            return ((double)RawSize - (double)EncSize) / RawSize; +        } + +        double EncTimePerRecordUS() const { +            return PerRecord(AsMicroSecond(EncSeconds)); +        } + +        double DecTimePerRecordUS() const { +            return PerRecord(AsMicroSecond(DecSeconds)); +        } + +        double RawSizePerRecord() const { +            return PerRecord(RawSize); +        } + +        double EncSizePerRecord() const { +            return PerRecord(EncSize); +        } + +        double OldEncSizePerRecord(double compr) const { +            return PerRecord((1 - compr) * RawSize); +        } + +        TString Format(const TStaticCodecInfo&, bool checkMode) const; +    }; + +    TComprStats TestCodec(const ICodec&, const TVector<TString>& data); + +    enum EDataStreamFormat { +        DSF_NONE, +        DSF_PLAIN_LF /* "plain" */, +        DSF_BASE64_LF /* "base64" */, +    }; + +    void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&); + +    TBlob GetInputBlob(const TString& dataFile); + +} diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make new file mode 100644 index 00000000000..d624222dad0 --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ya.make @@ -0,0 +1,19 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( +    ct_common.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/codecs/static +    library/cpp/getopt/small +    library/cpp/string_utils/base64 +    util/draft +) + +GENERATE_ENUM_SERIALIZATION(ct_common.h) + +END() diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README new file mode 100644 index 00000000000..723a68300b0 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/README @@ -0,0 +1,4 @@ +This is a viewer for generated codec and utility for verification of the compression quality on a new data. + +Usage: +static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp new file mode 100644 index 00000000000..9c8d568d823 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp @@ -0,0 +1,73 @@ +#include <library/cpp/codecs/static/tools/common/ct_common.h> +#include <library/cpp/codecs/static/static.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/codecs/codecs.h> +#include <library/cpp/getopt/small/last_getopt.h> + +#include <util/digest/city.h> +#include <util/generic/yexception.h> +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <util/stream/format.h> +#include <util/string/builder.h> + +int main(int argc, char** argv) { +    NCodecs::TCodecPtr codecPtr; +    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; +    TString codecFile; +    bool testCompression = false; + +    auto opts = NLastGetopt::TOpts::Default(); +    opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator."); +    opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt"); +    NCodecs::TStaticCodecInfo codec; + +    opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) { +                                                                            codecFile = name; +                                                                            codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll())); +                                                                            codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); +                                                                        }) +        .Required() +        .Help(".codec_info file with serialized static data for codec"); + +    opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance"); + +    opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format"); + +    opts.SetFreeArgsMin(0); +    opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files"); + +    NLastGetopt::TOptsParseResult res(&opts, argc, argv); + +    Cout << codecFile << Endl; +    Cout << NCodecs::FormatCodecInfo(codec) << Endl; + +    if (testCompression) { +        if (NCodecs::DSF_NONE == fmt) { +            Cerr << "Specify format (-f|--format) for testing set input" << Endl; +            exit(1); +        } + +        Cout << "Reading testing set data ... " << Flush; + +        TVector<TString> allData; +        for (const auto& freeArg : res.GetFreeArgs()) { +            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); +        } + +        if (!res.GetFreeArgs()) { +            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); +        } + +        Cout << "Done" << Endl << Endl; + +        Cout << "records:  " << allData.size() << Endl; +        Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; + +        Cout << "Testing compression ... " << Flush; +        auto stats = NCodecs::TestCodec(*codecPtr, allData); +        Cout << "Done" << Endl << Endl; + +        Cout << stats.Format(codec, true) << Endl; +    } +} diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make new file mode 100644 index 00000000000..90e06ca448d --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make @@ -0,0 +1,16 @@ +PROGRAM() + +OWNER(velavokr) + +SRCS( +    static_codec_checker.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/codecs/static +    library/cpp/codecs/static/tools/common +    library/cpp/getopt/small +) + +END() diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README new file mode 100644 index 00000000000..e6bb52b9591 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/README @@ -0,0 +1,4 @@ +This is a utility for reproducible  teaching of a codec. And also for saving it into a file with a unique name for a static compilation as a resource. + +Usage: +static_codec_generator -t -m 'the training data description' -f plain samples.txt diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp new file mode 100644 index 00000000000..45fdb5c5fe8 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp @@ -0,0 +1,82 @@ +#include <library/cpp/codecs/static/tools/common/ct_common.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/codecs/static/builder.h> +#include <library/cpp/codecs/codecs.h> + +#include <library/cpp/getopt/small/last_getopt.h> + +#include <util/generic/yexception.h> +#include <util/stream/file.h> +#include <util/string/builder.h> + +int main(int argc, char** argv) { +    NCodecs::TCodecBuildInfo info; +    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; + +    auto opts = NLastGetopt::TOpts::Default(); +    opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt"); +    opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin"); + +    opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set"); + +    opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set"); + +    opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName); + +    opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size"); + +    opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format"); + +    opts.AddLongOption("list-codecs").NoArgument().Handler0([]() { +                                                      Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl; +                                                      exit(0); +                                                  }) +        .Optional() +        .Help("list available codecs"); + +    opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info + +    opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info + +    opts.SetFreeArgsMin(0); +    opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files"); + +    NLastGetopt::TOptsParseResult res(&opts, argc, argv); + +    Cout << "Reading training set data ... " << Flush; +    TVector<TString> allData; +    for (const auto& freeArg : res.GetFreeArgs()) { +        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); +    } + +    if (!res.GetFreeArgs()) { +        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); +    } +    Cout << "Done" << Endl << Endl; + +    Cout << "records:  " << allData.size() << Endl; +    Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; + +    Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush; +    auto codec = NCodecs::BuildStaticCodec(allData, info); +    Cout << "Done" << Endl; + +    TString codecName = NCodecs::GetStandardFileName(codec); +    NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); + +    Cout << "Testing compression ... " << Flush; +    auto stats = NCodecs::TestCodec(*codecPtr, allData); +    Cout << "Done" << Endl << Endl; + +    codec.MutableDebugInfo()->SetCompression(stats.Compression()); + +    Cout << stats.Format(codec, false) << Endl; + +    Cout << "Saving as " << codecName << " ... " << Flush; +    { +        TUnbufferedFileOutput fout{codecName}; +        NCodecs::SaveCodecInfoToStream(fout, codec); +        fout.Finish(); +    } +    Cout << "Done" << Endl << Endl; +} diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make new file mode 100644 index 00000000000..efbc440dd18 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make @@ -0,0 +1,17 @@ +PROGRAM() + +OWNER(velavokr) + +SRCS( +    static_codec_generator.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/codecs/static +    library/cpp/codecs/static/tools/common +    library/cpp/digest/md5 +    library/cpp/getopt/small +) + +END() diff --git a/library/cpp/codecs/static/tools/tests/canondata/result.json b/library/cpp/codecs/static/tools/tests/canondata/result.json new file mode 100644 index 00000000000..7a637c6763a --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/canondata/result.json @@ -0,0 +1,6 @@ +{ +    "static_codec_tools.test_static_codec_tools": { +        "checksum": "960e3c8c57fb846ab53ccbd07e287233", +        "uri": "sbr://144512644/static_codec_tools.test_static_codec_tools/solar-8k-a.huffman.1467494385.codec_info" +    } +}
\ No newline at end of file diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py new file mode 100644 index 00000000000..db4140e3703 --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +import yatest.common as tt +import os.path as op + +def test_static_codec_tools(): +    tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")] +        + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1", +            "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"], +        timeout=60) +    assert(op.exists("solar-8k-a.huffman.1467494385.codec_info")) +    tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"), +        args=["-c", "solar-8k-a.huffman.1467494385.codec_info"], +        timeout=60) +    tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")] +        + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"], +        timeout=60) +    return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info") diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make new file mode 100644 index 00000000000..c5324eaf53b --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/ya.make @@ -0,0 +1,20 @@ +PY2TEST() + +OWNER(velavokr) + +TEST_SRCS(static_codec_tools.py) + +DATA(sbr://143310406) + +TIMEOUT(4200) + +TAG(ya:not_autocheck) + +DEPENDS( +    library/cpp/codecs/static/tools/static_codec_checker +    library/cpp/codecs/static/tools/static_codec_generator +) + + + +END() diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make new file mode 100644 index 00000000000..dd3e8437aa4 --- /dev/null +++ b/library/cpp/codecs/static/tools/ya.make @@ -0,0 +1,5 @@ +RECURSE( +    common +    static_codec_generator +    static_codec_checker +) diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp new file mode 100644 index 00000000000..b47c279ed14 --- /dev/null +++ b/library/cpp/codecs/static/ut/builder_ut.cpp @@ -0,0 +1,57 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/codecs/static/builder.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <util/string/vector.h> + +class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase { +    UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest) +    UNIT_TEST(TestBuild) +    UNIT_TEST_SUITE_END(); + +private: +    TVector<TString> PrepareData() { +        TVector<TString> data; +        for (ui32 i = 'a'; i <= 'z'; ++i) { +            data.push_back(TString(1, (char)i)); +        } +        return data; +    } + +    void TestBuild() { +        TVector<TString> data; +        NCodecs::TCodecBuildInfo info; +        info.CodecName = "huffman"; +        info.SampleSizeMultiplier = 2; +        info.Timestamp = 1467494385; +        info.RevisionInfo = "r2385905"; +        info.TrainingSetComment = "some dummy data"; +        info.TrainingSetResId = "sbr://1234"; +        auto res = NCodecs::BuildStaticCodec(PrepareData(), info); +        UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(), +                                 "StoredCodec: \"\\007\\000huffman@S\\000a" +                                 "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o" +                                 "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>" +                                 "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8." +                                 "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7" +                                 "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0" +                                 "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9" +                                 "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" " +                                 "DebugInfo { " +                                 "CodecName: \"huffman\" " +                                 "Timestamp: 1467494385 " +                                 "RevisionInfo: \"r2385905\" " +                                 "SampleSizeMultiplier: 2 " +                                 "TrainingSetComment: \"some dummy data\" " +                                 "TrainingSetResId: \"sbr://1234\" " +                                 "StoredCodecHash: 2509195835471488613 " +                                 "}"); + +        UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info"); +        UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL); + +        auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res)); +        UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString()); +    } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest); diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp new file mode 100644 index 00000000000..57e1e628874 --- /dev/null +++ b/library/cpp/codecs/static/ut/static_ut.cpp @@ -0,0 +1,27 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/codecs/static/example/example.h> + +class TStaticCodecUsageTest: public NUnitTest::TTestBase { +    UNIT_TEST_SUITE(TStaticCodecUsageTest) +    UNIT_TEST(TestUsage) +    UNIT_TEST_SUITE_END(); + +private: +    void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) { +        const TStringBuf letov = "Всё идёт по плану"; + +        TBuffer outEnc, outDec; +        NStaticCodecExample::Encode(outEnc, letov, dv); +        NStaticCodecExample::Decode(outDec, TStringBuf{outEnc.data(), outEnc.size()}); + +        UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize); +        UNIT_ASSERT_EQUAL(TStringBuf(outDec.data(), outDec.size()), letov); +    } + +    void TestUsage() { +        DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u); +        DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u); +    } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest) diff --git a/library/cpp/codecs/static/ut/ya.make b/library/cpp/codecs/static/ut/ya.make new file mode 100644 index 00000000000..b9116097d87 --- /dev/null +++ b/library/cpp/codecs/static/ut/ya.make @@ -0,0 +1,14 @@ +UNITTEST_FOR(library/cpp/codecs/static) + +OWNER(velavokr) + +SRCS( +    builder_ut.cpp +    static_ut.cpp +) + +PEERDIR( +    library/cpp/codecs/static/example +) + +END() diff --git a/library/cpp/codecs/static/ya.make b/library/cpp/codecs/static/ya.make new file mode 100644 index 00000000000..00e00fd8d43 --- /dev/null +++ b/library/cpp/codecs/static/ya.make @@ -0,0 +1,18 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( +    builder.cpp +    static_codec_info.proto +    static.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/archive +    library/cpp/svnversion +    util/draft +) + +END() | 
