diff options
| author | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
|---|---|---|
| committer | Devtools Arcadia <[email protected]> | 2022-02-07 18:08:42 +0300 | 
| commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
| tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/codecs | |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/codecs')
66 files changed, 6758 insertions, 0 deletions
diff --git a/library/cpp/codecs/README.md b/library/cpp/codecs/README.md new file mode 100644 index 00000000000..42646ccd978 --- /dev/null +++ b/library/cpp/codecs/README.md @@ -0,0 +1,46 @@ +This is a library of compression algorithms with a unified interface and serialization. +See also library/cpp/codecs/static, where a support for statically compiled dictionaries is implemented. + +All algorithms have a common `ICodec` interface (described in codecs.h). + +The `ICodec` interface has the following methods:\ +    `virtual ui8 ICodec::Encode (TMemoryRegion, TBuffer&) const;`\ +            - Input - memory region. Output - filled buffer and the rest of the last byte, if it was not filled to the end.\ +    `virtual void ICodec::Decode (TMemoryRegion, TBuffer&) const;`\ +            - Input - memory region. Output - filled buffer.\ +    `virtual void Save (TOutputStream*) const;`\ +            - Serialization.\ +    `virtual void Load (TInputStream*);`\ +            - Deserialization.\ +    `virtual bool NeedsTraining() const;`\ +            - Returns if it is necessary or not to teach this codec.\ +    `virtual bool PaddingBit() const;`\ +            - Returns which values should fill the free bits of the last byte.\ +    `virtual size_t ApproximateSizeOnEncode(size_t sz) const;`\ +            - Returns an approximate estimate of the size of the result after encoding.\ +                    For example could be used for a more accurate preallocation of a buffer.\ +    `virtual size_t ApproximateSizeOnDecode(size_t sz) const;`\ +            - Returns an approximate estimate of the size of the result after decoding.\ +                    For example could be used for a more accurate preallocation of a buffer.\ +    `virtual TString GetName() const;`\ +            - The name of the codec. It is required for registration of the codec in the system of serialization/deserialization.\ +                    For example, it allows you to save information about which combination of codecs was in use (see below).\ +    `virtual void Learn(ISequenceReader*);`\ +            - The interface for teaching codecs that use information about the distribution of data. + +In addition, the library has a number of utilities that allow a more flexible use of it. + +In the `ICodec` class the following methods are available:\ +    `static TCodecPtr GetInstance(const TString& name);`\ +            - Creation of a codec instance by a symbolic name\ +                    (See `GetName()` above)\ +    `static void Store(TOutputStream*, TCodecPtr p);`\ +            - Serialization of the codec along with its name\ +                    Allows you to save information about which particular combination of codecs was used for encoding,\ +                    and then reassemble the same combination for decoding\ +    `static TCodecPtr Restore(TInputStream* in);`\ +            - Serialization of the codec along with its name\ +    `static TCodecPtr RestoreFromString(TStringBuf data);`\ +            - Loads the codec instance from the string\ +    `static TVector<TString> GetCodecsList();`\ +            - The list of registered codecs diff --git a/library/cpp/codecs/codecs.cpp b/library/cpp/codecs/codecs.cpp new file mode 100644 index 00000000000..b17a3156d21 --- /dev/null +++ b/library/cpp/codecs/codecs.cpp @@ -0,0 +1,190 @@ +#include "codecs.h" +#include "tls_cache.h" + +#include <util/stream/mem.h> + +namespace NCodecs { +    void ICodec::Store(IOutputStream* out, TCodecPtr p) { +        if (!p.Get()) { +            ::Save(out, (ui16)0); +            return; +        } + +        Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName()); +        const TString& n = p->GetName(); +        Y_VERIFY(n.size() <= Max<ui16>()); +        ::Save(out, (ui16)n.size()); +        out->Write(n.data(), n.size()); +        p->Save(out); +    } + +    TCodecPtr ICodec::Restore(IInputStream* in) { +        ui16 l = 0; +        ::Load(in, l); + +        if (!l) { +            return nullptr; +        } + +        TString n; +        n.resize(l); + +        Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException()); + +        TCodecPtr p = ICodec::GetInstance(n); +        p->Load(in); +        p->Trained = true; +        return p; +    } + +    TCodecPtr ICodec::RestoreFromString(TStringBuf s) { +        TMemoryInput minp{s.data(), s.size()}; +        return Restore(&minp); +    } + +    TString ICodec::GetNameSafe(TCodecPtr p) { +        return !p ? TString("none") : p->GetName(); +    } + +    ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const { +        size_t res = Traits().ApproximateSizeOnEncode(in.size()); +        out.Reserve(res); +        out.Clear(); + +        if (Pipeline.empty()) { +            out.Append(in.data(), in.size()); +            return 0; +        } else if (Pipeline.size() == 1) { +            return Pipeline.front()->Encode(in, out); +        } + +        ui8 freelastbits = 0; + +        auto buffer = TBufferTlsCache::TlsInstance().Item(); +        TBuffer& tmp = buffer.Get(); +        tmp.Reserve(res); + +        for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) { +            if (it != Pipeline.begin()) { +                tmp.Clear(); +                tmp.Swap(out); +                in = TStringBuf{tmp.data(), tmp.size()}; +            } +            freelastbits = (*it)->Encode(in, out); +        } + +        return freelastbits; +    } + +    void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const { +        size_t res = Traits().ApproximateSizeOnDecode(in.size()); +        out.Reserve(res); +        out.Clear(); + +        if (Pipeline.empty()) { +            out.Append(in.data(), in.size()); +            return; +        } else if (Pipeline.size() == 1) { +            Pipeline.front()->Decode(in, out); +            return; +        } + +        auto buffer = TBufferTlsCache::TlsInstance().Item(); + +        TBuffer& tmp = buffer.Get(); +        tmp.Reserve(res); + +        for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) { +            if (it != Pipeline.rbegin()) { +                tmp.Clear(); +                tmp.Swap(out); +                in = TStringBuf{tmp.data(), tmp.size()}; +            } +            (*it)->Decode(in, out); +        } +    } + +    void TPipelineCodec::Save(IOutputStream* out) const { +        for (const auto& it : Pipeline) +            it->Save(out); +    } + +    void TPipelineCodec::Load(IInputStream* in) { +        for (const auto& it : Pipeline) { +            it->Load(in); +            it->SetTrained(true); +        } +    } + +    void TPipelineCodec::SetTrained(bool t) { +        for (const auto& it : Pipeline) { +            it->SetTrained(t); +        } +    } + +    TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) { +        if (!codec) +            return *this; + +        TCodecTraits tr = codec->Traits(); + +        if (!MyName) { +            MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput; +            MyTraits.SizeOfInputElement = tr.SizeOfInputElement; +        } else { +            MyName.append(':'); +        } + +        MyName.append(codec->GetName()); +        MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping; +        MyTraits.PaddingBit = tr.PaddingBit; +        MyTraits.NeedsTraining |= tr.NeedsTraining; +        MyTraits.Irreversible |= tr.Irreversible; +        MyTraits.SizeOnEncodeAddition = MyTraits.SizeOnEncodeAddition * tr.SizeOnEncodeMultiplier + tr.SizeOnEncodeAddition; +        MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier; +        MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier; +        MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize); + +        Pipeline.push_back(codec); +        return *this; +    } + +    void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) { +        if (!Traits().NeedsTraining) { +            return; +        } + +        if (Pipeline.size() == 1) { +            Pipeline.back()->Learn(in); +            return; +        } + +        TVector<TBuffer> trainingInput; + +        TStringBuf r; +        while (in.NextRegion(r)) { +            trainingInput.emplace_back(r.data(), r.size()); +        } + +        TBuffer buff; +        for (const auto& it : Pipeline) { +            it->LearnX(trainingInput.begin(), trainingInput.end(), sampleSizeMult); + +            for (auto& bit : trainingInput) { +                buff.Clear(); +                it->Encode(TStringBuf{bit.data(), bit.size()}, buff); +                buff.Swap(bit); +            } +        } +    } + +    bool TPipelineCodec::AlreadyTrained() const { +        for (const auto& it : Pipeline) { +            if (!it->AlreadyTrained()) +                return false; +        } + +        return true; +    } + +} diff --git a/library/cpp/codecs/codecs.h b/library/cpp/codecs/codecs.h new file mode 100644 index 00000000000..cc5e72b2850 --- /dev/null +++ b/library/cpp/codecs/codecs.h @@ -0,0 +1,259 @@ +#pragma once + +#include "sample.h" + +#include <util/generic/bt_exception.h> +#include <util/generic/hash.h> +#include <util/generic/ptr.h> +#include <util/generic/singleton.h> + +#include <util/stream/input.h> +#include <util/stream/output.h> + +#include <util/string/cast.h> +#include <util/string/vector.h> +#include <util/system/tls.h> +#include <util/ysaveload.h> + +namespace NCodecs { +    class TCodecException: public TWithBackTrace<yexception> {}; + +    class ICodec; + +    using TCodecPtr = TIntrusivePtr<ICodec>; +    using TCodecConstPtr = TIntrusiveConstPtr<ICodec>; + +    struct TCodecTraits { +        ui32 RecommendedSampleSize = 0; +        ui16 SizeOfInputElement = 1; +        ui8 SizeOnEncodeMultiplier = 1; +        ui8 SizeOnEncodeAddition = 0; +        ui8 SizeOnDecodeMultiplier = 1; + +        bool NeedsTraining = false; +        bool PreservesPrefixGrouping = false; +        bool Irreversible = false; +        bool PaddingBit = 0; +        bool AssumesStructuredInput = false; + +        size_t ApproximateSizeOnEncode(size_t sz) const { +            return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition; +        } + +        size_t ApproximateSizeOnDecode(size_t sz) const { +            return sz * SizeOnDecodeMultiplier; +        } +    }; + +    class ICodec: public TAtomicRefCount<ICodec> { +    protected: +        bool Trained = false; +        TCodecTraits MyTraits; + +    public: +        TCodecTraits Traits() const { +            return MyTraits; +        } + +        // the name of the codec (or its variant) to be used in the codec registry +        virtual TString GetName() const = 0; + +        virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0; +        virtual ui8 Encode(const TBuffer& input, TBuffer& output) const { +            return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output); +        } +        virtual void Decode(TStringBuf, TBuffer&) const = 0; +        virtual void Decode(const TBuffer& input, TBuffer& output) const { +            Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output); +        } + +        virtual ~ICodec() = default; + +        virtual bool AlreadyTrained() const { +            return !Traits().NeedsTraining || Trained; +        } +        virtual void SetTrained(bool t) { +            Trained = t; +        } + +        bool TryToLearn(ISequenceReader& r) { +            Trained = DoTryToLearn(r); +            return Trained; +        } + +        void Learn(ISequenceReader& r) { +            LearnX(r, 1); +        } + +        template <class TIter> +        void Learn(TIter beg, TIter end) { +            Learn(beg, end, IterToStringBuf<TIter>); +        } + +        template <class TIter, class TGetter> +        void Learn(TIter beg, TIter end, TGetter getter) { +            auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter); +            TSimpleSequenceReader<TBuffer> reader{sample}; +            Learn(reader); +        } + +        static TCodecPtr GetInstance(TStringBuf name); + +        static TVector<TString> GetCodecsList(); + +        static TString GetNameSafe(TCodecPtr p); + +        static void Store(IOutputStream* out, TCodecPtr p); +        static TCodecPtr Restore(IInputStream* in); +        static TCodecPtr RestoreFromString(TStringBuf); + +    protected: +        virtual void DoLearn(ISequenceReader&) = 0; + +        virtual bool DoTryToLearn(ISequenceReader& r) { +            DoLearn(r); +            return true; +        } + +        // so the pipeline codec will know to adjust the sample for the subcodecs +        virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) { +            DoLearn(r); +        } + +        virtual void Save(IOutputStream*) const { +        } +        virtual void Load(IInputStream*) { +        } +        friend class TPipelineCodec; + +    public: +        // so the pipeline codec will know to adjust the sample for the subcodecs +        void LearnX(ISequenceReader& r, double sampleSizeMult) { +            DoLearnX(r, sampleSizeMult); +            Trained = true; +        } + +        template <class TIter> +        void LearnX(TIter beg, TIter end, double sampleSizeMult) { +            auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult); +            TSimpleSequenceReader<TBuffer> reader{sample}; +            LearnX(reader, sampleSizeMult); +        } +    }; + +    class TBasicTrivialCodec: public ICodec { +    public: +        ui8 Encode(TStringBuf in, TBuffer& out) const override { +            out.Assign(in.data(), in.size()); +            return 0; +        } + +        void Decode(TStringBuf in, TBuffer& out) const override { +            Encode(in, out); +        } + +    protected: +        void DoLearn(ISequenceReader&) override { +        } +    }; + +    class TTrivialCodec: public TBasicTrivialCodec { +    public: +        TTrivialCodec() { +            MyTraits.PreservesPrefixGrouping = true; +        } + +        static TStringBuf MyName() { +            return "trivial"; +        } + +        TString GetName() const override { +            return ToString(MyName()); +        } +    }; + +    class TTrivialTrainableCodec: public TBasicTrivialCodec { +    public: +        TTrivialTrainableCodec() { +            MyTraits.PreservesPrefixGrouping = true; +            MyTraits.NeedsTraining = true; +        } + +        static TStringBuf MyName() { +            return "trivial-trainable"; +        } + +        TString GetName() const override { +            return ToString(MyName()); +        } +    }; + +    class TNullCodec: public ICodec { +    public: +        TNullCodec() { +            MyTraits.Irreversible = true; +            MyTraits.SizeOnDecodeMultiplier = 0; +            MyTraits.SizeOnEncodeMultiplier = 0; +        } + +        TString GetName() const override { +            return "null"; +        } + +        ui8 Encode(TStringBuf, TBuffer& out) const override { +            out.Clear(); +            return 0; +        } + +        void Decode(TStringBuf, TBuffer& out) const override { +            out.Clear(); +        } + +    protected: +        void DoLearn(ISequenceReader&) override { +        } +    }; + +    class TPipelineCodec: public ICodec { +        typedef TVector<TCodecPtr> TPipeline; + +        TPipeline Pipeline; +        TString MyName; + +    public: +        explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) { +            MyTraits.PreservesPrefixGrouping = true; +            AddCodec(c0); +            AddCodec(c1); +            AddCodec(c2); +            AddCodec(c3); +        } + +        TString GetName() const override { +            return MyName; +        } + +        ui8 Encode(TStringBuf in, TBuffer& out) const override; +        void Decode(TStringBuf in, TBuffer& out) const override; + +    public: +        /* +     * Add codecs in the following order: +     * uncompressed -> codec0 | codec1 | ... | codecN -> compressed +     */ +        TPipelineCodec& AddCodec(TCodecPtr codec); + +        bool AlreadyTrained() const override; +        void SetTrained(bool t) override; + +    protected: +        void DoLearn(ISequenceReader& in) override { +            DoLearnX(in, 1); +        } + +        void DoLearnX(ISequenceReader& in, double sampleSizeMult) override; +        void Save(IOutputStream* out) const override; +        void Load(IInputStream* in) override; +    }; + +} diff --git a/library/cpp/codecs/codecs_registry.cpp b/library/cpp/codecs/codecs_registry.cpp new file mode 100644 index 00000000000..6fd7ff67b08 --- /dev/null +++ b/library/cpp/codecs/codecs_registry.cpp @@ -0,0 +1,226 @@ +#include "codecs_registry.h" +#include "delta_codec.h" +#include "huffman_codec.h" +#include "pfor_codec.h" +#include "solar_codec.h" +#include "comptable_codec.h" +#include "zstd_dict_codec.h" + +#include <library/cpp/blockcodecs/codecs.h> + +#include <util/string/builder.h> +#include <util/string/cast.h> + +namespace NCodecs { +    TCodecPtr ICodec::GetInstance(TStringBuf name) { +        return Default<NPrivate::TCodecRegistry>().GetCodec(name); +    } + +    TVector<TString> ICodec::GetCodecsList() { +        return Default<NPrivate::TCodecRegistry>().GetCodecsList(); +    } + +    namespace NPrivate { +        void TCodecRegistry::RegisterFactory(TFactoryPtr fac) { +            TVector<TString> names = fac->ListNames(); +            for (const auto& name : names) { +                Y_VERIFY(!Registry.contains(name), "already has %s", name.data()); +                Registry[name] = fac; +            } +        } + +        TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const { +            using namespace NPrivate; + +            if (!name || "none" == name) { +                return nullptr; +            } + +            if (TStringBuf::npos == name.find(':')) { +                Y_ENSURE_EX(Registry.contains(name), TNoCodecException(name)); +                return Registry.find(name)->second->MakeCodec(name); +            } else { +                TPipelineCodec* pipe = new TPipelineCodec; + +                do { +                    TStringBuf v = name.NextTok(':'); +                    pipe->AddCodec(GetCodec(v)); +                } while (name); + +                return pipe; +            } +        } + +        TVector<TString> TCodecRegistry::GetCodecsList() const { +            using namespace NPrivate; +            TVector<TString> vs; +            vs.push_back("none"); + +            for (const auto& it : Registry) { +                vs.push_back(it.first); +            } + +            Sort(vs.begin(), vs.end()); +            return vs; +        } + +        struct TSolarCodecFactory : ICodecFactory { +            TCodecPtr MakeCodec(TStringBuf name) const override { +                if (TSolarCodec::MyNameShortInt() == name) { +                    return new TSolarCodecShortInt(); +                } +                if (TSolarCodec::MyName() == name) { +                    return new TSolarCodec(); +                } +                if (name.EndsWith(TStringBuf("-a"))) { +                    return MakeCodecImpl<TAdaptiveSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size()).Chop(2)); +                } else { +                    return MakeCodecImpl<TSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size())); +                } +            } + +            template <class TCodecCls> +            TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const { +                if (TStringBuf("-8k") == type) { +                    return new TCodecCls(1 << 13); +                } +                if (TStringBuf("-16k") == type) { +                    return new TCodecCls(1 << 14); +                } +                if (TStringBuf("-32k") == type) { +                    return new TCodecCls(1 << 15); +                } +                if (TStringBuf("-64k") == type) { +                    return new TCodecCls(1 << 16); +                } +                if (TStringBuf("-256k") == type) { +                    return new TCodecCls(1 << 18); +                } +                ythrow TNoCodecException(name); +            } + +            TVector<TString> ListNames() const override { +                TVector<TString> vs; +                vs.push_back(ToString(TSolarCodec::MyName())); +                vs.push_back(ToString(TSolarCodec::MyName8k())); +                vs.push_back(ToString(TSolarCodec::MyName16k())); +                vs.push_back(ToString(TSolarCodec::MyName32k())); +                vs.push_back(ToString(TSolarCodec::MyName64k())); +                vs.push_back(ToString(TSolarCodec::MyName256k())); +                vs.push_back(ToString(TSolarCodec::MyName8kAdapt())); +                vs.push_back(ToString(TSolarCodec::MyName16kAdapt())); +                vs.push_back(ToString(TSolarCodec::MyName32kAdapt())); +                vs.push_back(ToString(TSolarCodec::MyName64kAdapt())); +                vs.push_back(ToString(TSolarCodec::MyName256kAdapt())); +                vs.push_back(ToString(TSolarCodec::MyNameShortInt())); +                return vs; +            } +        }; + +        struct TZStdDictCodecFactory : ICodecFactory { +            TCodecPtr MakeCodec(TStringBuf name) const override { +                return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name)); +            } + +            TVector<TString> ListNames() const override { +                return TZStdDictCodec::ListCompressionNames(); +            } +        }; + +        struct TCompTableCodecFactory : ICodecFactory { +            TCodecPtr MakeCodec(TStringBuf name) const override { +                if (TCompTableCodec::MyNameHQ() == name) { +                    return new TCompTableCodec(TCompTableCodec::Q_HIGH); +                } else if (TCompTableCodec::MyNameLQ() == name) { +                    return new TCompTableCodec(TCompTableCodec::Q_LOW); +                } else { +                    Y_ENSURE_EX(false, TNoCodecException(name)); +                    return nullptr; +                } +            } + +            TVector<TString> ListNames() const override { +                TVector<TString> vs; +                vs.push_back(ToString(TCompTableCodec::MyNameHQ())); +                vs.push_back(ToString(TCompTableCodec::MyNameLQ())); +                return vs; +            } +        }; + +        struct TBlockCodec : ICodec { +            const NBlockCodecs::ICodec* Codec; + +            TBlockCodec(TStringBuf name) +                : Codec(NBlockCodecs::Codec(name)) +            { +            } + +            TString GetName() const override { +                return ToString(Codec->Name()); +            } + +            ui8 Encode(TStringBuf r, TBuffer& b) const override { +                Codec->Encode(r, b); +                return 0; +            } + +            void Decode(TStringBuf r, TBuffer& b) const override { +                // TODO: throws exception that is not TCodecException +                Codec->Decode(r, b); +            } + +        protected: +            void DoLearn(ISequenceReader&) override { +            } +        }; + +        struct TBlockCodecsFactory : ICodecFactory { +            using TRegistry = THashMap<TString, TCodecPtr>; +            TRegistry Registry; + +            TBlockCodecsFactory() { +                for (TStringBuf codec : NBlockCodecs::ListAllCodecs()) { +                    Register(codec); +                } +            } + +            void Register(TStringBuf name) { +                TCodecPtr p = Registry[name] = new TBlockCodec(name); +                Registry[p->GetName()] = p; +            } + +            TCodecPtr MakeCodec(TStringBuf name) const override { +                if (!Registry.contains(name)) { +                    ythrow TNoCodecException(name); +                } +                return Registry.find(name)->second; +            } + +            TVector<TString> ListNames() const override { +                TVector<TString> res; +                for (const auto& it : Registry) { +                    res.push_back(it.first); +                } +                return res; +            } +        }; + +        TCodecRegistry::TCodecRegistry() { +            RegisterFactory(new TInstanceFactory<TTrivialCodec>); +            RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>); +            RegisterFactory(new TInstanceFactory<THuffmanCodec>); +            RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>); +            RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>); +            RegisterFactory(new TSolarCodecFactory); +            RegisterFactory(new TZStdDictCodecFactory); +            RegisterFactory(new TCompTableCodecFactory); +            RegisterFactory(new TBlockCodecsFactory); +        } + +    } + +    void RegisterCodecFactory(TCodecFactoryPtr fact) { +        Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact); +    } + +} diff --git a/library/cpp/codecs/codecs_registry.h b/library/cpp/codecs/codecs_registry.h new file mode 100644 index 00000000000..53710310d56 --- /dev/null +++ b/library/cpp/codecs/codecs_registry.h @@ -0,0 +1,60 @@ +#pragma once + +#include "codecs.h" +#include <util/string/cast.h> + +namespace NCodecs { +    struct TNoCodecException : TCodecException { +        TNoCodecException(TStringBuf name) { +            (*this) << "unknown codec: " << name; +        } +    }; + +    struct ICodecFactory : TAtomicRefCount<ICodecFactory> { +        virtual ~ICodecFactory() = default; +        virtual TCodecPtr MakeCodec(TStringBuf name) const = 0; +        virtual TVector<TString> ListNames() const = 0; +    }; + +    typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr; + +    namespace NPrivate { +        template <typename TCodec> +        struct TInstanceFactory : ICodecFactory { +            TCodecPtr MakeCodec(TStringBuf) const override { +                return new TCodec; +            } + +            TVector<TString> ListNames() const override { +                TVector<TString> vs; +                vs.push_back(ToString(TCodec::MyName())); +                return vs; +            } +        }; + +        class TCodecRegistry { +            using TRegistry = THashMap<TString, TIntrusivePtr<ICodecFactory>>; +            TRegistry Registry; + +        public: +            using TFactoryPtr = TIntrusivePtr<ICodecFactory>; + +            TCodecRegistry(); + +            void RegisterFactory(TFactoryPtr fac); + +            TCodecPtr GetCodec(TStringBuf name) const; + +            TVector<TString> GetCodecsList() const; +        }; + +    } + +    void RegisterCodecFactory(TCodecFactoryPtr fact); + +    template <typename TCodec> +    void RegisterCodec() { +        RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>()); +    } + +} diff --git a/library/cpp/codecs/comptable_codec.cpp b/library/cpp/codecs/comptable_codec.cpp new file mode 100644 index 00000000000..476b8ada80c --- /dev/null +++ b/library/cpp/codecs/comptable_codec.cpp @@ -0,0 +1,108 @@ +#include "comptable_codec.h" + +#include <library/cpp/comptable/comptable.h> +#include <util/string/cast.h> + +namespace NCodecs { +    class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> { +    public: +        TImpl(EQuality q) +            : Quality(q) +        { +        } + +        void Init() { +            Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table}); +            Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table}); +        } + +        ui8 Encode(TStringBuf in, TBuffer& out) const { +            out.Clear(); +            if (!in) { +                return 0; +            } + +            TVector<char> result; +            Compressor->Compress(in, &result); +            out.Assign(&result[0], result.size()); +            return 0; +        } + +        void Decode(TStringBuf in, TBuffer& out) const { +            out.Clear(); +            if (!in) { +                return; +            } + +            TVector<char> result; +            Decompressor->Decompress(in, &result); +            out.Assign(&result[0], result.size()); +        } + +        void DoLearn(ISequenceReader& in) { +            NCompTable::TDataSampler sampler; +            TStringBuf region; +            while (in.NextRegion(region)) { +                if (!region) { +                    continue; +                } + +                sampler.AddStat(region); +            } + +            sampler.BuildTable(Table); +            Init(); +        } + +        void Save(IOutputStream* out) const { +            ::Save(out, Table); +        } + +        void Load(IInputStream* in) { +            ::Load(in, Table); +            Init(); +        } + +        NCompTable::TCompressorTable Table; +        THolder<NCompTable::TChunkCompressor> Compressor; +        THolder<NCompTable::TChunkDecompressor> Decompressor; +        const EQuality Quality; +        static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5); +    }; + +    TCompTableCodec::TCompTableCodec(EQuality q) +        : Impl(new TImpl{q}) +    { +        MyTraits.NeedsTraining = true; +        MyTraits.SizeOnEncodeMultiplier = 2; +        MyTraits.SizeOnDecodeMultiplier = 10; +        MyTraits.RecommendedSampleSize = TImpl::SampleSize; +    } + +    TCompTableCodec::~TCompTableCodec() = default; + +    TString TCompTableCodec::GetName() const { +        return ToString(Impl->Quality ? MyNameHQ() : MyNameLQ()); +    } + +    ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const { +        return Impl->Encode(in, out); +    } + +    void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const { +        Impl->Decode(in, out); +    } + +    void TCompTableCodec::DoLearn(ISequenceReader& in) { +        Impl->DoLearn(in); +    } + +    void TCompTableCodec::Save(IOutputStream* out) const { +        Impl->Save(out); +    } + +    void TCompTableCodec::Load(IInputStream* in) { +        Impl->Load(in); +    } + +} diff --git a/library/cpp/codecs/comptable_codec.h b/library/cpp/codecs/comptable_codec.h new file mode 100644 index 00000000000..7ba4f4c5432 --- /dev/null +++ b/library/cpp/codecs/comptable_codec.h @@ -0,0 +1,40 @@ +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> + +namespace NCodecs { +    class TCompTableCodec: public ICodec { +        class TImpl; +        TIntrusivePtr<TImpl> Impl; + +    public: +        enum EQuality { +            Q_LOW = 0, +            Q_HIGH = 1 +        }; + +        explicit TCompTableCodec(EQuality q = Q_HIGH); +        ~TCompTableCodec() override; + +        static TStringBuf MyNameHQ() { +            return "comptable-hq"; +        } +        static TStringBuf MyNameLQ() { +            return "comptable-lq"; +        } + +        TString GetName() const override; + +        ui8 Encode(TStringBuf in, TBuffer& out) const override; + +        void Decode(TStringBuf in, TBuffer& out) const override; + +    protected: +        void DoLearn(ISequenceReader& in) override; +        void Save(IOutputStream* out) const override; +        void Load(IInputStream* in) override; +    }; + +} diff --git a/library/cpp/codecs/delta_codec.cpp b/library/cpp/codecs/delta_codec.cpp new file mode 100644 index 00000000000..61606d6f6f6 --- /dev/null +++ b/library/cpp/codecs/delta_codec.cpp @@ -0,0 +1,21 @@ +#include "delta_codec.h" + +namespace NCodecs { +    template <> +    TStringBuf TDeltaCodec<ui64, true>::MyName() { +        return "delta64-unsigned"; +    } +    template <> +    TStringBuf TDeltaCodec<ui32, true>::MyName() { +        return "delta32-unsigned"; +    } +    template <> +    TStringBuf TDeltaCodec<ui64, false>::MyName() { +        return "delta64-signed"; +    } +    template <> +    TStringBuf TDeltaCodec<ui32, false>::MyName() { +        return "delta32-signed"; +    } + +} diff --git a/library/cpp/codecs/delta_codec.h b/library/cpp/codecs/delta_codec.h new file mode 100644 index 00000000000..21325825e6a --- /dev/null +++ b/library/cpp/codecs/delta_codec.h @@ -0,0 +1,143 @@ +#pragma once + +#include "codecs.h" + +#include <util/generic/array_ref.h> +#include <util/generic/typetraits.h> +#include <util/generic/bitops.h> +#include <util/string/cast.h> + +namespace NCodecs { +    template <typename T = ui64, bool UnsignedDelta = true> +    class TDeltaCodec: public ICodec { +        static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value"); + +    public: +        using TUnsigned = std::make_unsigned_t<T>; +        using TSigned = std::make_signed_t<T>; +        using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>; + +    private: +        const TDelta MinDelta{Min<TDelta>()}; +        const TDelta MaxDelta{Max<TDelta>() - 1}; +        const TDelta InvalidDelta{MaxDelta + 1}; + +        Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) { +            return a + b; +        } + +        Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) { +            return a - b; +        } + +    public: +        struct TDecoder { +            const TDelta InvalidDelta{Max<TDelta>()}; + +            T Last = 0; +            T Result = 0; + +            bool First = true; +            bool Invalid = false; + +            Y_FORCE_INLINE bool Decode(TDelta t) { +                if (Y_UNLIKELY(First)) { +                    First = false; +                    Result = Last = t; +                    return true; +                } + +                if (Y_UNLIKELY(Invalid)) { +                    Invalid = false; +                    Last = 0; +                    Result = t; +                    return true; +                } + +                Result = (Last += t); +                Invalid = t == InvalidDelta; + +                return !Invalid; +            } +        }; + +    public: +        static TStringBuf MyName(); + +        TDeltaCodec() { +            MyTraits.SizeOfInputElement = sizeof(T); +            MyTraits.AssumesStructuredInput = true; +        } + +        TString GetName() const override { +            return ToString(MyName()); +        } + +        template <class TItem> +        static void AppendTo(TBuffer& b, TItem t) { +            b.Append((char*)&t, sizeof(t)); +        } + +        ui8 Encode(TStringBuf s, TBuffer& b) const override { +            b.Clear(); +            if (s.empty()) { +                return 0; +            } + +            b.Reserve(s.size()); +            TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; + +            const T* it = tin.begin(); +            TDelta last = *(it++); +            AppendTo(b, last); + +            TDelta maxt = SubSafe(MaxDelta, last); +            TDelta mint = AddSafe(MinDelta, last); + +            for (; it != tin.end(); ++it) { +                TDelta t = *it; + +                if (Y_LIKELY((t >= mint) & (t <= maxt))) { +                    AppendTo(b, t - last); +                    last = t; +                    maxt = SubSafe(MaxDelta, last); +                    mint = AddSafe(MinDelta, last); +                } else { +                    // delta overflow +                    AppendTo(b, InvalidDelta); +                    AppendTo(b, t); +                    last = 0; +                    maxt = MaxDelta; +                    mint = MinDelta; +                } +            } + +            return 0; +        } + +        void Decode(TStringBuf s, TBuffer& b) const override { +            b.Clear(); +            if (s.empty()) { +                return; +            } + +            b.Reserve(s.size()); +            TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; + +            TDecoder dec; + +            for (const T* it = tin.begin(); it != tin.end(); ++it) { +                T tmp; +                memcpy(&tmp, it, sizeof(tmp)); +                if (dec.Decode(tmp)) { +                    AppendTo(b, dec.Result); +                } +            } +        } + +    protected: +        void DoLearn(ISequenceReader&) override { +        } +    }; + +} diff --git a/library/cpp/codecs/float_huffman.cpp b/library/cpp/codecs/float_huffman.cpp new file mode 100644 index 00000000000..c4a8bd228fc --- /dev/null +++ b/library/cpp/codecs/float_huffman.cpp @@ -0,0 +1,333 @@ +#include "float_huffman.h" + +#include <util/generic/array_ref.h> +#include <util/generic/bitops.h> +#include <util/generic/cast.h> +#include <util/generic/yexception.h> +#include <util/system/unaligned_mem.h> +#include <util/system/yassert.h> +#include <util/stream/format.h> + +namespace NCodecs::NFloatHuff { +    namespace { +        struct THuffEntry { +            ui32 CodeBase; +            ui16 Prefix; +            int PrefLength; +            int CodeLength; +            int TotalLength; +            ui32 Mask; +            ui64 Offset; + +            THuffEntry() = default; + +            constexpr THuffEntry(ui32 codeBase, ui16 prefix, int prefLength, int codeLength) +                : CodeBase(codeBase) +                , Prefix(prefix) +                , PrefLength(prefLength) +                , CodeLength(codeLength) +                , TotalLength(prefLength + codeLength) +                , Mask(Mask64(codeLength)) +                , Offset(-(ui64(codeBase) << prefLength) + prefix) +            {} + +            bool Fit(ui32 code) const { +                return code >= CodeBase && code < CodeBase + (1ULL << CodeLength); +            } +        }; + +        // NB. There is a typo in the penultimate line (34 instead of 24). It was there from the very +        // first commit and we cannot fix it without breaking all the clients. +        constexpr THuffEntry entries[16] = { +            {0x00000000, 0x01, 1, 0}, // Only +0.0f, 1 bit, prefix                     [1] +            {0x3f800000, 0x0e, 4, 0}, // Only +1.0f, 4 bits, prefix                    [0111] +            {0x3f700000, 0x08, 5, 20}, // [0.9375, 1.0), 25 bits, prefix               [00010] +            {0x3f000000, 0x00, 5, 20}, // [0.5, 0.5625), 25 bits, prefx                [00000] +            {0x3f400000, 0x06, 6, 20}, // [0.75, 0.8125), 26 bits, prefix              [011000] +            {0x3f500000, 0x22, 6, 20}, // [0.8125, 0.875), 26 bits, prefix             [010001] +            {0x3f200000, 0x02, 6, 20}, // [0.625, 0.6875), 26 bits, prefix             [010000] +            {0x3f100000, 0x38, 6, 20}, // [0.5625, 0.625), 26 bits, prefix             [000111] +            {0x3f600000, 0x18, 6, 20}, // [0.875, 0.9375), 26 bits, prefix             [000110] +            {0x3f300000, 0x30, 6, 20}, // [0.6875, 0.75), 26 bits, prefix              [000011] +            {0x3e800000, 0x10, 6, 20}, // [0.25, 0.28125), 26 bits, prefix             [000010] +            {0x3e000000, 0x04, 3, 24}, // [0.125, 0.5), 27 bits, prefix                [001] +            {0x3d000000, 0x0a, 4, 24}, // [0.03125, 0.125), 28 bits, prefix            [0101] +            {0x3c000000, 0x12, 5, 24}, // [0.0078125, 0.03125), 29 bits, prefix        [01001] +            {0x3b000000, 0x26, 6, 34}, // [0.001953125, end of range), 40 bits, prefix [011001] +            {0x00000000, 0x16, 5, 32}, // whole range, 37 bits, prefix                 [01101] +        }; + +        [[noreturn]] Y_NO_INLINE void ThrowInvalidOffset(size_t size, size_t byteOffset) { +            ythrow yexception() << +                "Decompression error: requested decoding 8 bytes past end of input buffer of " << size << " bytes size at position " << byteOffset << ". "; +        } + +        struct THuffInfo { +            constexpr THuffInfo() { +                for (size_t i = 0; i < 64; ++i) { +                    bool init = false; +                    for (size_t j = 0; j != 16; ++j) { +                        ui16 prefix = i & Mask64(entries[j].PrefLength); +                        if (entries[j].Prefix == prefix) { +                            init = true; +                            DecodeLookup[i] = entries[j]; +                            break; +                        } +                    } +                    Y_ASSERT(init); +                } + +                for (ui32 i = 0; i < (1 << 12); ++i) { +                    // First two entries (+0.0f and +1.0f) are not present in the lookup, they are handled separately +                    for (int value = 2; value < 16; ++value) { +                        if (entries[value].Fit(i << 20)) { +                            EncodeLookup[i] = value; +                            break; +                        } +                    } +                } +            } + +            std::pair<ui64, int> GetCode(ui32 value) const { +                // Zeros are handled separately in the main loop +                Y_ASSERT(value != 0); + +                if (value == 0x3f800000) { +                    return {0x0e, 4}; +                } + +                const auto& entry = entries[EncodeLookup[value >> 20]]; + +                return { +                    (ui64(value) << entry.PrefLength) + entry.Offset, +                    entry.TotalLength +                }; +            } + +            THuffEntry DecodeLookup[64]; +            ui8 EncodeLookup[1 << 12]; +        }; + +        const THuffInfo huffInfo; +        /// End Of Stream +        const ui32 EOS = ui32(-1); +    } + +    TString Encode(TArrayRef<const float> factors) { +        TString result; +        result.resize((factors.size() + 1) * 40 / 8 + 8, 0); // Max code length is 40 bits +        int usedBits = 0; +        ui64 buffer = 0; +        char* writePtr = result.begin(); + +        auto writeBits = [&](ui64 code, int size) { +            const auto bitsToTransfer = Min(size, 64 - usedBits); +            buffer |= (code << usedBits); +            usedBits += bitsToTransfer; +            if (usedBits == 64) { +                memcpy(writePtr, &buffer, 8); +                usedBits = size - bitsToTransfer; +                if (bitsToTransfer != 64) { +                    buffer = code >> bitsToTransfer; +                } else { +                    buffer = 0; +                } +                writePtr += 8; +            } +        }; + +        for (size_t i = 0; i != factors.size();) { +            if (BitCast<ui32>(factors[i]) == 0) { +                int zeroCount = 1; +                for (;;) { +                    ++i; +                    if (i == factors.size() || BitCast<ui32>(factors[i]) != 0) { +                        break; +                    } +                    ++zeroCount; +                } +                for (; zeroCount >= 64; zeroCount -= 64) { +                    writeBits(ui64(-1), 64); +                } +                writeBits(Mask64(zeroCount), zeroCount); +            } else { +                const auto [code, codeSize] = huffInfo.GetCode(BitCast<ui32>(factors[i])); +                writeBits(code, codeSize); +                ++i; +            } +        } +        // Write EOS. +        // We use precomputed constants instead of the following: +        //      auto [code, codeSize] = huffInfo.GetCode(EOS); +        //      writeBits(code, codeSize); +        writeBits(211527139302, 40); +        memcpy(writePtr, &buffer, 8); +        result.resize(writePtr - result.begin() + usedBits / 8 + 8); + +        return result; +    } + +    TDecoder::TDecoder(TStringBuf data) +        : State{ +            .Workspace = data.size() < 8 ? ThrowInvalidOffset(data.size(), 0), 0 : ReadUnaligned<ui64>(data.data()), +            .WorkspaceSize = 64, +            .Position = 8, +            .Data = data +        } +    { +        FillDecodeBuffer(); +    } + +    TVector<float> TDecoder::DecodeAll(size_t sizeHint) { +        TVector<float> result; +        result.reserve(sizeHint); + +        while (Begin != End) { +            result.insert(result.end(), Begin, End); +            FillDecodeBuffer(); +        } +        return result; +    } + +    size_t TDecoder::Decode(TArrayRef<float> dest) { +        size_t count = 0; +        while (count < dest.size()) { +            if (dest.size() - count < size_t(End - Begin)) { +                const auto size = dest.size() - count; +                std::copy(Begin, Begin + size, dest.data() + count); +                Begin += size; +                return dest.size(); +            } else { +                std::copy(Begin, End, dest.data() + count); +                count += End - Begin; +                FillDecodeBuffer(); +                if (Begin == End) { +                    break; +                } +            } +        } +        return count; +    } + +    size_t TDecoder::Skip(size_t count) { +        size_t skippedCount = 0; +        while (skippedCount < count) { +            if (count - skippedCount < size_t(End - Begin)) { +                const auto size = count - skippedCount; +                Begin += size; +                return count; +            } else { +                skippedCount += End - Begin; +                FillDecodeBuffer(); +                if (Begin == End) { +                    break; +                } +            } +        } +        return skippedCount; +    } + +    bool TDecoder::HasMore() const { +        return Begin != End; +    } + +    void TDecoder::FillDecodeBuffer() { +        Begin = DecodeBuffer.data(); +        End = DecodeBuffer.data(); + +        if (HitEos) { +            return; +        } + +        // This helps to keep most of the variables in the registers. +        float* end = End; +        TState state = State; + +        // It is faster to just zero all the memory here in one go +        // and then avoid inner loop when writing zeros. There we +        // can just increment end pointer. +        std::fill(DecodeBuffer.begin(), DecodeBuffer.end(), 0.0f); + +        // Make sure that inside the loop we always have space to put 64 zeros and one other +        // value. +        float* cap = DecodeBuffer.data() + DecodeBuffer.size() - 64 - 1; + +        while (end < cap) { +            if (state.Workspace % 2 == 1) { +                // Decode zeros +                // There we can just scan whole state.Workspace for ones because it contains +                // zeros outside of the WorkspaceSize bits. +                const auto negWorkspace = ~state.Workspace; +                const int zeroCount = negWorkspace ? CountTrailingZeroBits(negWorkspace) : 64; +                end += zeroCount; +                state.SkipBits(zeroCount); +                continue; +            } +            if (state.PeekBits(4) == 0x0e) { +                *end++ = 1.0f; +                state.SkipBits(4); +                continue; +            } +            const auto& entry = huffInfo.DecodeLookup[state.PeekBits(6)]; +            const auto code = ui32((state.NextBitsUnmasked(entry.TotalLength) >> entry.PrefLength) & entry.Mask) + entry.CodeBase; +            if (Y_UNLIKELY(code == EOS)) { +                HitEos = true; +                break; +            } +            *end++ = BitCast<float>(code); +        } + +        End = end; +        State = state; +    } + +    ui64 TDecoder::TState::PeekBits(int count) { +        if (WorkspaceSize > count) { +            return Workspace & Mask64(count); +        } else { +            if (Y_UNLIKELY(Position + 8 > Data.size())) { +                ThrowInvalidOffset(Data.size(), Position); +            } +            return (Workspace | (ReadUnaligned<ui64>(Data.data() + Position) << WorkspaceSize)) & Mask64(count); +        } +    } + +    ui64 TDecoder::TState::NextBitsUnmasked(int count) { +        if (WorkspaceSize > count) { +            const auto result = Workspace; +            Workspace >>= count; +            WorkspaceSize -= count; +            return result; +        } else { +            if (Y_UNLIKELY(Position + 8 > Data.size())) { +                ThrowInvalidOffset(Data.size(), Position); +            } +            ui64 result = Workspace; +            Workspace = ReadUnaligned<ui64>(Data.data() + Position); +            Position += 8; +            result |= Workspace << WorkspaceSize; +            Workspace >>= count - WorkspaceSize; +            WorkspaceSize += 64 - count; +            return result; +        } +    } + +    void TDecoder::TState::SkipBits(int count) { +        if (WorkspaceSize > count) { +            Workspace >>= count; +            WorkspaceSize -= count; +        } else { +            if (Y_UNLIKELY(Position + 8 > Data.size())) { +                ThrowInvalidOffset(Data.size(), Position); +            } +            Workspace = ReadUnaligned<ui64>(Data.data() + Position); +            Position += 8; +            Workspace >>= count - WorkspaceSize; +            WorkspaceSize += 64 - count; +        } +    } + +    TVector<float> Decode(TStringBuf data, size_t sizeHint) { +        return TDecoder(data).DecodeAll(sizeHint); +    } +} // namespace NCodecs::NFloatHuff diff --git a/library/cpp/codecs/float_huffman.h b/library/cpp/codecs/float_huffman.h new file mode 100644 index 00000000000..786a8eae1d0 --- /dev/null +++ b/library/cpp/codecs/float_huffman.h @@ -0,0 +1,50 @@ +#pragma once + +#include <util/generic/array_ref.h> +#include <util/generic/vector.h> +#include <util/generic/strbuf.h> + +#include <array> + +namespace NCodecs::NFloatHuff { +    TString Encode(TArrayRef<const float> factors); + +    class TDecoder { +    public: +        explicit TDecoder(TStringBuf data); + +        TVector<float> DecodeAll(size_t sizeHint = 0); + +        // Returns number of decoded floats. May be fewer than requested if the EOS is found. +        size_t Decode(TArrayRef<float> dest); + +        // Returns the number of skipped values. +        size_t Skip(size_t count); + +        bool HasMore() const; + +    private: +        struct TState { +            ui64 Workspace = 0; +            int WorkspaceSize = 0; +            ui64 Position = 0; +            TStringBuf Data; + +            ui64 NextBitsUnmasked(int count); // The upper 64 - count bits may be arbitrary +            ui64 PeekBits(int count); +            void SkipBits(int count); +        }; + +        void FillDecodeBuffer(); + +        TState State; +        std::array<float, 128> DecodeBuffer; +        // The range of already decompressed numbers inside the DecodeBuffer. +        // Always kept non-empty until the EOS is encountered. +        float* Begin; +        float* End; +        bool HitEos = false; +    }; + +    TVector<float> Decode(TStringBuf data, size_t sizeHint = 0); +} diff --git a/library/cpp/codecs/float_huffman_bench/main.cpp b/library/cpp/codecs/float_huffman_bench/main.cpp new file mode 100644 index 00000000000..1018c17834a --- /dev/null +++ b/library/cpp/codecs/float_huffman_bench/main.cpp @@ -0,0 +1,145 @@ +#include <library/cpp/codecs/float_huffman.h> + +#include <benchmark/benchmark.h> + +#include <util/generic/vector.h> + +const float Factors[] = { +    0.340582, 0.000974026, 0.487168, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0.411765, 0.921569, +    0.00390625, 0.109371, 0, 1, 0, 0, 0, 0, 0.523322, 0, 1, 0, 0, 0, 0, 0.285714, 1, +    0.008253, 1, 0, 0, 0.00993935, 0.450213, 0.000974026, 1, 1, 1, 1, 0, 0, 0.20564, +    0.97561, 0.913896, 1, 1, 0, 1, 0, 0, 0.5, 0, 0, 0, 0.1, 1, 0, 0, 0, 0, 0, 0.450923, +    0, 0.5, 0, 0, 0.20564, 0, 0.5, 0, 0, 0.20564, 0, 0, 0.0313726, 0, 1, 1, 1, 0.363636, +    0.5, 0.686073, 0.45121, 0.00574382, 0.366166, 0.413295, 1, 1, 1, 0, 0, 0, 0, 0.160784, +    0, 0.937255, 0.537255, 0.133333, 0, 0, 0, 0, 0.00392157, 0, 0.333333, 0.027451, 0.0156863, +    1, 0.105882, 1, 0.00220908, 0.000112501, 0.0111262, 0.102384, 0.00140808, 0.123581, +    0.29308, 6.57282e-06, 0.00489498, 2.10209e-05, 0.00140559, 5.907e-06, 0, 0.559322, +    0.559322, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.794765, 0, +    0.352648, 0.225904, 1, 0.047619, 0.0107276, 0.399461, 0.0304838, 0.292932, 0.00969929, +    0, 0, 0.886904, 0.714693, 0, 0.00223213, 0.000544069, 0, 0, 0, 0, 0, 0, 0, 0, 0, +    0, 0.00507403, 0, 0, 0, 0, 0, 0.875, 0, 0, 1, 1, 1, 0, 0.20564, 0, 0.00176048, 0, +    0.000440121, 0, 0, 0, 0.000974026, 0.487168, 0, 0, 0.533333, 0, 0, 0, 0, 0, 0, 0, +    0, 0, 0, 1, 0, 0, 1, 0, 0, 0.723187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +    0, 0, 0, 0, 0, 1, 0, 0.206882, 0.00483367, 0.792983, 0.00126106, 1, 0.0313726, 0.470588, +    0.254902, 0.188235, 0.188235, 0.388235, 0.164706, 0, 0.870588, 0.843137, 0.635294, +    0.384314, 0.384314, 0.643137, 0, 0, 0, 0, 0, 0, 0, 0, 0.541176, 0, 0.541176, 0, 0, +    0.0532634, 1, 0, 0, 0, 0.015044, 1, 0, 1, 1, 1, 0.47451, 0.329412, 0.964706, 0, 0, +    0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0.0941176, 0.970588, 0.970588, 0, 0.970588, 0.97561, +    0, 0.0431373, 0.47451, 0.329412, 0.964706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +    0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0.231373, 0.00392157, 0, 0, 0, 0.054902, 0, 0, +    1, 0, 0, 0.0235294, 0, 1, 0, 0, 0, 0, 0.34902, 0.0352941, 0.925379, 0.623681, 0, +    0.954543, 0, 0, 0.00102756, 0.709804, 0.498039, 0.0901961, 0.631373, 0.847059, 0.270588, +    0.0156863, 0.133333, 0.980392, 1e-12, 1e-12, 1e-12, 1e-12, 0.497159, 0, 0.407487, +    0, 0, 0, 0.00392157, 0.00202156, 0.046875, 0.187159, 0.046875, 0.15625, 0.434232, +    0.15625, 0, 2.95083e-07, 0.20564, 0.20564, 0.97561, 0.913896, 0, 0, 0, 0, 0, 0, 0.00784314, +    0, 0.695525, 1, 0.07205, 0, 0, 0.176471, 0, 0, 0, 1, 1, 0.98, 0.01, 0.01, 0, 0.00690702, +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.29078, 0.29078, 1, 0, 0, 0, 0, 0.192157, 0.188235, +    0.0941176, 0, 0.0313726, 0, 0.141176, 0.207843, 0.0901961, 0.00784314, 0.0784314, +    0, 0, 0, 0, 0, 0.203922, 0.0196078, 0.34902, 0.0235294, 0.0980392, 0.164706, 0.133333, +    0.368627, 0, 0.0941176, 0, 1, 0.313726, 0, 0, 0.433582, 0.384508, 0.0532186, 0.0833333, +    0.01609, 0, 1, 0, 0, 0, 0.0666667, 0, 0, 0, 0, 1, 0, 0.564706, 0.501961, 0, 0, 0, +    0, 0, 0.0516447, 0.000173065, 0, 0, 0, 0, 0, 0, 0, 0.996309, 0, 0, 0.00392157, 1, +    0, 0.01, 0, 0, 0, 0, 0, 0.439505, 0.206882, 0.206882, 0.260891, 0, 0.875, 0, 0, 0, +    0, 0, 0.185657, 1, 1, 0, 0, 0, 0.0332647, 0.206106, 0.0688878, 0.239216, 0, 0, 0, +    0, 0.054902, 0, 0.101961, 0.160784, 0.180392, 0, 0.737828, 0, 0, 0.875, 0.0142566, +    0, 0.662745, 1, 0, 0, 0, 0.225806, 0.99992, 0.631373, 0.00392157, 1, 0, 0.143647, +    0.00270085, 1, 0.231482, 0.246735, 0.0428062, 0, 0, 1, 0, 0.186441, 0.0115358, 0, +    0.221762, 0, 0.2, 0, 0.0156863, 0, 0, 0, 0.976471, 0, 0.231373, 0, 0, 0, 0, 0, 0, +    0, 0, 0, 0, 0.00392157, 0.00392157, 0.0666667, 0, 0, 0, 0, 0.0117647, 0.580392, 0.98737, +    1, 1, 1, 0, 0, 0, 0.153, 0.847, 0.931373, 0.94697, 0.94697, 0, 0.946294, 0.408118, +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0.99992, 0.97561, 0, 0, 0, 0, 0, 0, +    0.274677, 0.153017, 0, 0.642356, 0, 0, 0.1, 0, 0, 0, 0, 0.327944, 0.327944, 0, 0, +    0.815686, 0, 0, 0, 0, 0.206106, 0.439126, 0, 0, 0, 0, 0, 1, 1, 1, 0.00392157, 0.232788, +    0.232465, 0.999899, 0.00309296, 0.0636097, 0.445954, 0.156863, 0, 0, 0, 0, 0, 0, +    0.3796, 0.0784, 0.0651664, 0, 0, 0.254902, 0.266667, 1, 0, 0, 0, 0, 0, 0.596073, +    0.517876, 0.145833, 0.372549, 0, 0.991667, 0.602125, 0.161979, 0, 0, 0, 0, 0.0255146, +    0.947855, 0, 0, 0, 0, 0, 0, 0, 0, 0.847059, 0.679841, 0, 0.156863, 0, 0, 1, 0, 0, +    0, 0, 0.969697, 0, 0, 0.564706, 0, 0, 0, 0, 0, 1, 0.0367282, 0.0395228, 0, 0, 0, +    0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0}; + +const ui8 CodedFactors[] = { +    0x24, 0x06, 0x73, 0xB5, 0xC7, 0x55, 0x7F, 0x3A, 0xB4, 0x70, 0xCB, 0xEF, 0xEE, 0xFE, 0xB3, 0x5B, +    0x5A, 0x1A, 0x93, 0x5F, 0x5F, 0x13, 0x00, 0x00, 0x10, 0x00, 0x3D, 0xEF, 0xFF, 0xEE, 0x0F, 0xDC, +    0xF0, 0xAB, 0x3F, 0x37, 0x92, 0x24, 0x5D, 0x5E, 0xDE, 0x1C, 0xF8, 0x12, 0x15, 0x5B, 0x84, 0x51, +    0x82, 0xE6, 0xF6, 0xB8, 0xEA, 0x4F, 0xC7, 0xDD, 0x7D, 0x2E, 0x4D, 0x4A, 0x21, 0xCA, 0xE0, 0xC4, +    0x2E, 0xEA, 0xD3, 0xBD, 0x0F, 0x00, 0x00, 0xE0, 0xDA, 0xCC, 0xCC, 0xEC, 0x9F, 0x61, 0xDF, 0xE6, +    0x01, 0x00, 0x00, 0xCC, 0xA5, 0x49, 0xA9, 0x00, 0x00, 0x00, 0xE6, 0xD2, 0xA4, 0xD4, 0xEA, 0x08, +    0x08, 0xD0, 0xDD, 0xF9, 0xE7, 0xA2, 0x0B, 0x00, 0x00, 0x40, 0xD8, 0x13, 0x7D, 0xFE, 0x13, 0x9C, +    0x9B, 0xA8, 0x36, 0xBC, 0x00, 0x90, 0x43, 0x6F, 0x97, 0x67, 0x9B, 0xD3, 0xEE, 0xFE, 0x84, 0x24, +    0x25, 0x89, 0xC9, 0xBF, 0x3F, 0x58, 0x4C, 0x4C, 0xCA, 0x21, 0x22, 0xBC, 0x39, 0x08, 0x08, 0x08, +    0x40, 0x7E, 0xAA, 0xAA, 0xCA, 0x75, 0x70, 0x70, 0xE9, 0x08, 0x08, 0xE8, 0x9A, 0x8A, 0x8D, 0xED, +    0xA6, 0x8D, 0x31, 0x04, 0x00, 0x96, 0xD0, 0x7D, 0x1D, 0x47, 0xAA, 0x2A, 0xD9, 0x28, 0xAD, 0x6B, +    0xB4, 0x9D, 0x7A, 0xC4, 0xD5, 0xD1, 0x04, 0x8C, 0x7E, 0x56, 0x3A, 0x58, 0x5A, 0x0C, 0x46, 0x6E, +    0x1B, 0x53, 0xC2, 0x0C, 0x14, 0x00, 0xAB, 0x60, 0x05, 0x7B, 0x63, 0x8D, 0x77, 0x70, 0x75, 0xAC, +    0x2F, 0x8D, 0xB1, 0x4D, 0xA0, 0xFB, 0xF2, 0x40, 0xF7, 0xE5, 0x7F, 0xDF, 0xDD, 0xFD, 0xBB, 0x1B, +    0xB8, 0x75, 0x9B, 0x47, 0x8E, 0xB4, 0x0C, 0x9B, 0x3A, 0x73, 0x25, 0x61, 0x18, 0x92, 0xD1, 0xC2, +    0x2F, 0x3C, 0x31, 0x64, 0x96, 0x2A, 0xB9, 0xF9, 0x7C, 0xD9, 0xAF, 0x94, 0xC5, 0xE9, 0x1E, 0x63, +    0x24, 0x0C, 0x03, 0x7F, 0xD8, 0x5B, 0xB3, 0x1D, 0x49, 0x02, 0x00, 0xAB, 0xFD, 0xE9, 0xA0, 0xF3, +    0xBF, 0xC9, 0x40, 0x64, 0x0A, 0xC0, 0xC7, 0x00, 0x00, 0x60, 0x77, 0xCF, 0xA5, 0x49, 0xA9, 0x16, +    0xFD, 0xD7, 0x5C, 0xA7, 0x55, 0x00, 0x36, 0xCF, 0xB9, 0x3D, 0xAE, 0xFA, 0xD3, 0xA1, 0x85, 0x5B, +    0xFE, 0x60, 0x10, 0x11, 0xFF, 0xF7, 0x7D, 0x38, 0x59, 0x24, 0xFF, 0xFF, 0xDF, 0x13, 0x1C, 0x7B, +    0xCA, 0x1C, 0x1E, 0xF3, 0x04, 0xC0, 0x78, 0x07, 0x58, 0x7B, 0xA2, 0x54, 0xAA, 0xE3, 0xEA, 0x08, +    0x08, 0xC0, 0x74, 0x78, 0x78, 0x88, 0x50, 0x50, 0xD8, 0x0A, 0x0C, 0xC4, 0x56, 0x60, 0x20, 0xF6, +    0x1A, 0x1B, 0x33, 0x16, 0x15, 0xA5, 0xB8, 0xED, 0xED, 0x22, 0xF5, 0xF5, 0x09, 0xA1, 0xA2, 0x42, +    0x67, 0x62, 0x62, 0x3A, 0x13, 0x13, 0x0B, 0xA0, 0xA4, 0xF4, 0x0F, 0x06, 0x15, 0x35, 0x18, 0x54, +    0xD4, 0x35, 0x57, 0x45, 0xCB, 0x2F, 0x39, 0xF6, 0xEC, 0xBC, 0xBB, 0x53, 0x5F, 0x5E, 0x9E, 0xB1, +    0xA8, 0xA8, 0x28, 0xDF, 0xDE, 0x3E, 0x00, 0x00, 0x80, 0x5F, 0x75, 0x81, 0x81, 0x51, 0x1D, 0x1E, +    0xA2, 0x3A, 0x3C, 0x8C, 0xEA, 0xF0, 0x10, 0x51, 0x06, 0x67, 0xED, 0x85, 0x85, 0xA1, 0xBE, 0xBC, +    0x3C, 0x63, 0x51, 0x51, 0x51, 0xBE, 0xBD, 0xFD, 0xFF, 0xFD, 0xFE, 0xCE, 0x85, 0x76, 0x36, 0x73, +    0x10, 0x10, 0x10, 0x80, 0xEB, 0x3A, 0x38, 0xD8, 0xBE, 0xD4, 0x05, 0x06, 0xEE, 0x4F, 0x60, 0x59, +    0x59, 0x65, 0x84, 0x84, 0xC0, 0x46, 0xCB, 0x19, 0x7F, 0x4C, 0xFD, 0xC8, 0x9D, 0x8B, 0xB6, 0x31, +    0xAF, 0x86, 0x3A, 0xF0, 0x6D, 0x6D, 0x11, 0xDF, 0xDF, 0x5F, 0x79, 0x71, 0x71, 0x85, 0xD4, 0xD0, +    0x10, 0xB9, 0xB1, 0x11, 0x1A, 0x54, 0x54, 0xE9, 0x08, 0x08, 0x48, 0x39, 0x44, 0x04, 0x84, 0xAF, +    0xAF, 0x96, 0x99, 0x97, 0x71, 0xC5, 0x32, 0xF3, 0x32, 0xAE, 0x58, 0x66, 0x5E, 0xC6, 0x15, 0xCB, +    0xCC, 0xCB, 0xB8, 0x42, 0xD0, 0x45, 0xFF, 0x1C, 0x11, 0x85, 0xBE, 0x39, 0x08, 0x08, 0x08, 0x80, +    0x69, 0xC2, 0x47, 0x00, 0x80, 0x02, 0x00, 0x00, 0x91, 0xD3, 0xF4, 0x47, 0x01, 0x00, 0x80, 0x08, +    0x00, 0x00, 0x42, 0xD4, 0x29, 0x6F, 0x02, 0x00, 0x80, 0xB4, 0xE6, 0x6B, 0x9E, 0x34, 0x5C, 0x9A, +    0x94, 0xE2, 0xD2, 0xA4, 0x14, 0xA2, 0x0C, 0x4E, 0xEC, 0xA2, 0x3E, 0x7F, 0x39, 0x08, 0x08, 0x10, +    0x6E, 0x6F, 0x10, 0xD7, 0x79, 0xC7, 0xC9, 0x09, 0x4D, 0x4B, 0x73, 0x77, 0x84, 0x14, 0xAE, 0x52, +    0xE1, 0x7A, 0x44, 0x2A, 0x5C, 0x8F, 0x34, 0x93, 0xA8, 0xC4, 0x01, 0xF8, 0x3F, 0x3D, 0xC2, 0x29, +    0xE9, 0x11, 0x4E, 0xE9, 0x4F, 0x67, 0x62, 0x22, 0xB6, 0x02, 0x03, 0xA9, 0x2E, 0x30, 0x70, 0x75, +    0x04, 0x04, 0xC8, 0x38, 0x48, 0x08, 0x32, 0x53, 0x53, 0x29, 0x2F, 0x2E, 0xAE, 0x1C, 0x04, 0x04, +    0x50, 0x52, 0x50, 0xD0, 0x4F, 0x77, 0x68, 0x28, 0x99, 0x08, 0x0A, 0x4A, 0x60, 0x59, 0x59, 0xA9, +    0x0B, 0x0C, 0xAC, 0xC7, 0xC8, 0xC8, 0x8C, 0x45, 0x45, 0xA1, 0x1C, 0x22, 0x02, 0x5D, 0x79, 0x79, +    0xAB, 0x2E, 0x30, 0x70, 0xA7, 0x2C, 0x28, 0xE8, 0xB4, 0xF3, 0xEF, 0x26, 0x8F, 0x37, 0xB1, 0xFE, +    0xEE, 0x67, 0xA9, 0xA9, 0xAA, 0xAA, 0x6C, 0x79, 0x1E, 0xEC, 0xD7, 0x46, 0x44, 0xC4, 0xF7, 0xF8, +    0x24, 0x24, 0x00, 0x42, 0x40, 0xF8, 0x5A, 0x96, 0x38, 0x65, 0x91, 0xF1, 0x6A, 0x72, 0xFE, 0x68, +    0xC3, 0xE1, 0x37, 0x07, 0x01, 0x01, 0x01, 0xF0, 0x52, 0xE1, 0x7A, 0xE4, 0xB3, 0xD9, 0x20, 0x9C, +    0xE0, 0xD8, 0x53, 0x04, 0xC7, 0x9E, 0x82, 0x02, 0x27, 0x2B, 0x06, 0x00, 0x00, 0x9F, 0xDE, 0x1C, +    0x3E, 0xEE, 0xD7, 0x48, 0x20, 0x04, 0xD2, 0x35, 0x4C, 0x29, 0x43, 0x45, 0x23, 0x15, 0xEA, 0xE9, +    0x5E, 0xD7, 0xC1, 0xC1, 0xAA, 0x3B, 0x34, 0x34, 0x21, 0x49, 0x49, 0xE8, 0x8A, 0x8B, 0x13, 0x66, +    0x12, 0xE7, 0x31, 0x00, 0x00, 0x90, 0x84, 0x94, 0x69, 0x05, 0xD4, 0xD4, 0xF4, 0x13, 0x36, 0xE7, +    0x0C, 0x09, 0xEB, 0xBF, 0x90, 0x1A, 0x1A, 0xE6, 0x20, 0x20, 0x20, 0x00, 0x9E, 0x33, 0x18, 0x13, +    0xA6, 0x2F, 0x40, 0x0C, 0x00, 0x4E, 0xCF, 0x84, 0x36, 0x6A, 0xA0, 0xF2, 0xA9, 0x63, 0xD5, 0xCB, +    0x9E, 0x64, 0xEA, 0x3E, 0xF2, 0x14, 0xA0, 0x27, 0x29, 0x2B, 0xC6, 0xB2, 0x99, 0x99, 0xA9, 0x74, +    0x04, 0x04, 0x3C, 0x0A, 0xD0, 0xCF, 0x5C, 0x68, 0x67, 0xFB, 0xDF, 0x1C, 0x04, 0x04, 0x04, 0xC0, +    0x1C, 0x04, 0x04, 0x04, 0x40, 0x1B, 0x11, 0x11, 0x5F, 0xEA, 0x02, 0x03, 0xE1, 0x92, 0x94, 0x84, +    0x90, 0x88, 0xD9, 0xDD, 0x4F, 0x04, 0x56, 0x0E, 0xD1, 0x9F, 0x1A, 0x31, 0x3B, 0x37, 0x47, 0xA0, +    0x6C, 0x82, 0x40, 0xD9, 0x24, 0x9A, 0x02, 0x12, 0x62, 0xD3, 0x43, 0xFF, 0xBF, 0x8F, 0x84, 0xF5, +    0x1F, 0x51, 0x06, 0xE7, 0x0F, 0xDD, 0x89, 0x32, 0xFB, 0x60, 0x39, 0x0A, 0x71, 0x71, 0xB4, 0x36, +    0x33, 0x33, 0x3F, 0x8F, 0xD0, 0x4F, 0x79, 0x84, 0x7E, 0xBA, 0xC8, 0x0C, 0x0D, 0x4F, 0xBA, 0x86, +    0x29, 0x82, 0x54, 0x83, 0x7F, 0x77, 0x37, 0x07, 0x01, 0x01, 0x01, 0xA0, 0xFE, 0x97, 0x1B, 0x9D, +    0x16, 0xDC, 0x90, 0x58, 0xFE, 0x9B, 0x42, 0xB3, 0x4A, 0x00, 0x68, 0x73, 0x91, 0x20, 0x2B, 0xA8, +    0xC8, 0x29, 0x0B, 0x0A, 0xF2, 0xD3, 0x5D, 0x4B, 0x58, 0x5D, 0x20, 0x41, 0xD5, 0xBE, 0xAE, 0x70, +    0x88, 0x50, 0x50, 0x20, 0x4A, 0x44, 0xF4, 0x8F, 0xF7, 0x60, 0x22, 0x30, 0x9C, 0x24, 0xFE, 0x54, +    0x55, 0xD0, 0xD7, 0xD7, 0x37, 0x1A, 0xEF, 0x6E, 0xBC, 0x9B, 0x44, 0x39, 0xDD, 0x5D, 0xF2, 0xF2, +    0x7F, 0x20, 0x1A, 0x81, 0x9A, 0xCA, 0xBF, 0xC8, 0x8D, 0x8D, 0xC2, 0x83, 0x82, 0xA7, 0x2C, 0x28, +    0xC8, 0xFE, 0x08, 0xC2, 0x07, 0xC7, 0x27, 0x21, 0xE1, 0xBB, 0x3E, 0xC1, 0x59, 0x68, 0xAA, 0x78, +    0xC8, 0x57, 0x5D, 0x60, 0x20, 0xC6, 0x41, 0x42, 0xE8, 0x3A, 0x38, 0xD8, 0x9B, 0xFF, 0xFF, 0xFF, +    0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +static const TStringBuf CodedFactorsBuf(reinterpret_cast<const char*>(CodedFactors), Y_ARRAY_SIZE(CodedFactors)); + +void BM_Encode(benchmark::State& state) { +    for (const auto _ : state) { +        NCodecs::NFloatHuff::Encode(Factors); +    } +} + +void BM_Decode(benchmark::State& state) { +    for (const auto _ : state) { +        NCodecs::NFloatHuff::Decode(CodedFactorsBuf, Y_ARRAY_SIZE(Factors)); +    } +} + +BENCHMARK(BM_Encode); +BENCHMARK(BM_Decode); diff --git a/library/cpp/codecs/float_huffman_bench/ya.make b/library/cpp/codecs/float_huffman_bench/ya.make new file mode 100644 index 00000000000..c8fae6873a1 --- /dev/null +++ b/library/cpp/codecs/float_huffman_bench/ya.make @@ -0,0 +1,13 @@ +OWNER(eeight) + +G_BENCHMARK() + +SRCS( +    main.cpp +) + +PEERDIR( +    library/cpp/codecs +) + +END() diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp new file mode 100644 index 00000000000..561bfbca015 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp @@ -0,0 +1,142 @@ +#include "gd_builder.h" + +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> +#include <util/generic/algorithm.h> + +#include <util/random/shuffle.h> +#include <util/stream/output.h> +#include <util/string/printf.h> +#include <util/system/rusage.h> + +namespace NGreedyDict { +    void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) { +        if (!Current) { +            Current = MakeHolder<TEntrySet>(); +            Current->InitWithAlpha(); +        } + +        TEntrySet& set = *Current; + +        for (auto& it : set) +            it.Count = 0; + +        CompoundCounts = nullptr; +        CompoundCountsPool.Clear(); + +        if (!final) { +            CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool); +            CompoundCounts->reserve(maxcand); +        } + +        Shuffle(Input.begin(), Input.end(), Rng); + +        for (auto str : Input) { +            if (!final && CompoundCounts->size() > maxcand) +                break; + +            i32 prev = -1; + +            while (!!str) { +                TEntry* e = set.FindPrefix(str); +                ui32 num = e->Number; + +                e->Count += 1; +                if (!final && prev >= 0) { +                    (*CompoundCounts)[Compose(prev, num)] += 1; +                } + +                prev = num; +                ++set.TotalCount; +            } +        } + +        Current->SetModelP(); +    } + +    ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) { +        TAutoPtr<TEntrySet> newset = new TEntrySet; +        newset->InitWithAlpha(); +        maxent -= newset->size(); + +        ui32 additions = 0; +        ui32 deletions = 0; + +        { +            const TEntrySet& set = *Current; + +            Candidates.clear(); +            const ui32 total = set.TotalCount; +            const float minpval = Settings.MinPValue; +            const EEntryStatTest test = Settings.StatTest; +            const EEntryScore score = Settings.Score; +            const ui32 mincnt = Settings.MinAbsCount; + +            for (const auto& it : set) { +                const TEntry& e = it; +                float modelp = e.ModelP; +                ui32 cnt = e.Count; + +                if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval) +                    Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number)); +            } + +            if (!!CompoundCounts) { +                for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) { +                    const TEntry& prev = set.Get(Prev(it->first)); +                    const TEntry& next = set.Get(Next(it->first)); +                    float modelp = ModelP(prev.Count, next.Count, total); +                    ui32 cnt = it->second; +                    if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval) +                        Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first)); +                } +            } + +            Sort(Candidates.begin(), Candidates.end()); + +            if (Candidates.size() > maxent) +                Candidates.resize(maxent); + +            for (const auto& candidate : Candidates) { +                if (IsCompound(candidate.second)) { +                    additions++; +                    newset->Add(set.Get(Prev(candidate.second)).Str, set.Get(Next(candidate.second)).Str); +                } else { +                    newset->Add(set.Get(candidate.second).Str); +                } +            } + +            deletions = set.size() - (newset->size() - additions); +        } + +        Current = newset; +        Current->BuildHierarchy(); +        return deletions + additions; +    } + +    ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) { +        size_t totalsz = 0; +        for (auto it : Input) +            totalsz += it.size(); + +        while (maxiters) { +            maxiters--; + +            RebuildCounts(maxentries * Settings.GrowLimit, false); + +            if (Settings.Verbose) { +                TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size()); +                Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl; +            } + +            ui32 diff = BuildNextGeneration(maxentries); + +            if (Current->size() == maxentries && diff < mindiff) +                break; +        } + +        RebuildCounts(0, true); +        Current->SetScores(Settings.Score); +        return maxiters; +    } + +} diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h new file mode 100644 index 00000000000..b8e9a5e37be --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_builder.h @@ -0,0 +1,94 @@ +#pragma once + +#include "gd_entry.h" + +#include <util/generic/hash.h> +#include <util/random/fast.h> + +namespace NGreedyDict { +    struct TBuildSettings { +        EEntryStatTest StatTest = EST_SIMPLE_NORM; +        EEntryScore Score = ES_LEN_SIMPLE; + +        float MinPValue = 0.75; +        ui32 MinAbsCount = 10; +        ui32 GrowLimit = 10; // times of maxentries +        bool Verbose = false; +    }; + +    class TDictBuilder { +        using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>; +        using TCandidate = std::pair<float, ui64>; +        using TCandidates = TVector<TCandidate>; + +    private: +        TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb}; +        TStringBufs Input; + +        THolder<TEntrySet> Current; + +        TMemoryPool CompoundCountsPool; +        THolder<TCompoundCounts> CompoundCounts; + +        TCandidates Candidates; + +        TBuildSettings Settings; + +    public: +        TDictBuilder(const TBuildSettings& s = TBuildSettings()) +            : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance()) +            , Settings(s) +        { +        } + +        void SetInput(const TStringBufs& in) { +            Input = in; +        } + +        const TBuildSettings& GetSettings() const { +            return Settings; +        } + +        TBuildSettings& GetSettings() { +            return Settings; +        } + +        void SetSettings(const TBuildSettings& s) { +            Settings = s; +        } + +        TEntrySet& EntrySet() { +            return *Current; +        } + +        const TEntrySet& EntrySet() const { +            return *Current; +        } + +        THolder<TEntrySet> ReleaseEntrySet() { +            return std::move(Current); +        } + +        ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); + +    public: +        void RebuildCounts(ui32 maxcand, bool final); +        ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); + +        static bool IsCompound(ui64 ent) { +            return ent & 0xFFFFFFFF00000000ULL; +        } + +        static ui32 Next(ui64 ent) { +            return ent; +        } +        static ui32 Prev(ui64 ent) { +            return (ent >> 32) - 1; +        } + +        static ui64 Compose(ui32 prev, ui32 next) { +            return ((prev + 1ULL) << 32) | next; +        } +    }; + +} diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp new file mode 100644 index 00000000000..2c315c7f7cf --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp @@ -0,0 +1,98 @@ +#include "gd_entry.h" +#include "gd_stats.h" + +#include <util/generic/algorithm.h> +#include <util/generic/singleton.h> + +namespace NGreedyDict { +    class TAlphas { +        char Memory[512]; + +    public: +        TStringBufs Alphas; + +        TAlphas() { +            for (ui32 i = 0; i < 256; ++i) { +                Memory[2 * i] = (char)i; +                Memory[2 * i + 1] = 0; + +                Alphas.push_back(TStringBuf(&Memory[2 * i], 1)); +            } +        } +    }; + +    void TEntrySet::InitWithAlpha() { +        Pool.ClearKeepFirstChunk(); +        const TStringBufs& a = Singleton<TAlphas>()->Alphas; +        for (auto it : a) { +            Add(it); +        } +        BuildHierarchy(); +    } + +    void TEntrySet::BuildHierarchy() { +        Sort(begin(), end(), TEntry::StrLess); + +        TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED); + +        for (iterator it = begin(); it != end(); ++it) { +            it->Number = (it - begin()); +            TStringBuf suff = it->Str; +            size_t len = 0; +            ui32 val = 0; + +            if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) { +                it->NearestPrefix = val; +            } + +            builder.Add(suff.data(), suff.size(), it->Number); +        } + +        TBufferOutput bout; +        builder.Save(bout); +        Trie.Init(TBlob::FromBuffer(bout.Buffer())); +    } + +    TEntry* TEntrySet::FindPrefix(TStringBuf& str) { +        size_t len = 0; +        ui32 off = 0; + +        if (!Trie.FindLongestPrefix(str, &len, &off)) { +            return nullptr; +        } + +        str.Skip(len); +        return &Get(off); +    } + +    void TEntrySet::SetModelP() { +        for (iterator it = begin(); it != end(); ++it) { +            TEntry& e = *it; + +            if (!e.HasPrefix()) { +                e.ModelP = 0; +                continue; +            } + +            TStringBuf suff = e.Str; +            const TEntry& p = Get(e.NearestPrefix); +            suff.Skip(p.Len()); + +            float modelp = float(p.Count + e.Count) / TotalCount; + +            while (!!suff) { +                TEntry* pp = FindPrefix(suff); +                modelp *= float(pp->Count + e.Count) / TotalCount; +            } + +            e.ModelP = modelp; +        } +    } + +    void TEntrySet::SetScores(EEntryScore s) { +        for (auto& it : *this) { +            it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount); +        } +    } + +} diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h new file mode 100644 index 00000000000..18b5be0e156 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_entry.h @@ -0,0 +1,103 @@ +#pragma once + +#include "gd_stats.h" + +#include <library/cpp/containers/comptrie/comptrie.h> + +#include <util/generic/ptr.h> +#include <util/generic/strbuf.h> +#include <util/generic/vector.h> + +#include <util/memory/pool.h> + +namespace NGreedyDict { +    using TStringBufs = TVector<TStringBuf>; + +    struct TEntry { +        static const i32 NoPrefix = -1; + +        TStringBuf Str; + +        i32 NearestPrefix = NoPrefix; +        ui32 Count = 0; +        ui32 Number = 0; +        float ModelP = 0; +        float Score = 0; + +        TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0) +            : Str(b) +            , Count(cnt) +        { +        } + +        bool HasPrefix() const { +            return NearestPrefix != NoPrefix; +        } +        ui32 Len() const { +            return Str.size(); +        } + +        static bool StrLess(const TEntry& a, const TEntry& b) { +            return a.Str < b.Str; +        } +        static bool NumberLess(const TEntry& a, const TEntry& b) { +            return a.Number < b.Number; +        } +        static bool ScoreMore(const TEntry& a, const TEntry& b) { +            return a.Score > b.Score; +        } +    }; + +    class TEntrySet: public TVector<TEntry>, TNonCopyable { +        TMemoryPool Pool{8112}; +        TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie; + +    public: +        ui32 TotalCount = 0; + +        void InitWithAlpha(); + +        void Add(TStringBuf a) { +            push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size())); +        } + +        void Add(TStringBuf a, TStringBuf b) { +            size_t sz = a.size() + b.size(); +            char* p = (char*)Pool.Allocate(sz); +            memcpy(p, a.data(), a.size()); +            memcpy(p + a.size(), b.data(), b.size()); +            push_back(TStringBuf(p, sz)); +        } + +        TEntry& Get(ui32 idx) { +            return (*this)[idx]; +        } + +        const TEntry& Get(ui32 idx) const { +            return (*this)[idx]; +        } + +        void BuildHierarchy(); + +        // longest prefix +        TEntry* FindPrefix(TStringBuf& str); + +        const TEntry* FindPrefix(TStringBuf& str) const { +            return ((TEntrySet*)this)->FindPrefix(str); +        } + +        const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) { +            if (!e.HasPrefix()) +                return nullptr; + +            const TEntry& p = Get(e.NearestPrefix); +            suff = e.Str; +            suff.Skip(p.Str.size()); +            return &p; +        } + +        void SetModelP(); +        void SetScores(EEntryScore); +    }; + +} diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h new file mode 100644 index 00000000000..b63c4c38d23 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/gd_stats.h @@ -0,0 +1,79 @@ +#pragma once + +#include <util/generic/ymath.h> +#include <util/generic/algorithm.h> +#include <util/generic/yexception.h> + +namespace NGreedyDict { +    enum EEntryScore { +        ES_COUNT, +        ES_LEN_COUNT, +        ES_SIMPLE, +        ES_LEN_SIMPLE, +        ES_SOLAR +    }; + +    enum EEntryStatTest { +        EST_NONE = 0, +        EST_SIMPLE_NORM = 2 +    }; + +    inline float ModelP(ui32 countA, ui32 countB, ui32 total) { +        return float(countA) * countB / total / total; +    } + +    // P (ab | dependent) +    inline float SimpleTest(float modelp, ui32 countAB, ui32 total) { +        float realp = float(countAB) / total; +        return modelp >= realp ? 0 : (realp - modelp); +    } + +    inline float SolarTest(float modelp, ui32 countAB, ui32 total) { +        float realp = float(countAB) / total; +        return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1)); +    } + +    // P (ab | dependent) / P (ab) +    inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) { +        float realp = float(countAB) / total; +        return modelp >= realp ? 0 : (realp - modelp) / realp; +    } + +    inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) { +        if (!total) { +            return 0; +        } +        switch (test) { +            case EST_NONE: +                return 1; +            case EST_SIMPLE_NORM: +                return SimpleTestNorm(modelp, countAB, total); +        } +        Y_FAIL("no way!"); +        return 0; +    } + +    inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) { +        if (!total) { +            return 0; +        } +        ui32 m = 1; +        switch (score) { +            case ES_LEN_COUNT: +                m = len; +                [[fallthrough]]; +            case ES_COUNT: +                return m * count; +            case ES_LEN_SIMPLE: +                m = len; +                [[fallthrough]]; +            case ES_SIMPLE: +                return m * SimpleTest(modelp, count, total); +            case ES_SOLAR: +                return SolarTest(modelp, count, total); +        } +        Y_FAIL("no way!"); +        return 0; +    } + +} diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp new file mode 100644 index 00000000000..679089a11be --- /dev/null +++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp @@ -0,0 +1,282 @@ +#include "gd_builder.h" + +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> +#include <util/string/printf.h> +#include <util/generic/ymath.h> + +class TGreedyDictTest: public TTestBase { +    UNIT_TEST_SUITE(TGreedyDictTest); +    UNIT_TEST(TestEntrySet) +    UNIT_TEST(TestBuilder0) +    UNIT_TEST(TestBuilder) +    UNIT_TEST_SUITE_END(); + +    void TestEntrySet() { +        using namespace NGreedyDict; + +        { +            TEntrySet d; + +            d.InitWithAlpha(); + +            for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) { +                UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix)); +                UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin())); +            } + +            UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u); +            TStringBuf s = "aaabbb"; +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); +            UNIT_ASSERT_VALUES_EQUAL(s, "aabbb"); +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); +            UNIT_ASSERT_VALUES_EQUAL(s, "abbb"); +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); +            UNIT_ASSERT_VALUES_EQUAL(s, "bbb"); +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); +            UNIT_ASSERT_VALUES_EQUAL(s, "bb"); +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); +            UNIT_ASSERT_VALUES_EQUAL(s, "b"); +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); +            UNIT_ASSERT_VALUES_EQUAL(s, ""); +            s = TStringBuf("", 1); +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1)); +            UNIT_ASSERT_VALUES_EQUAL(s, ""); +            s = "\xFF"; +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF"); +            UNIT_ASSERT_VALUES_EQUAL(s, ""); +        } +        { +            TEntrySet d; +            d.Add("a"); +            d.Add("b"); +            d.Add("b", "a"); +            d.BuildHierarchy(); + +            UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u); + +            TStringBuf s = "bab"; +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba"); +            UNIT_ASSERT_VALUES_EQUAL(s, "b"); +        } +        { +            TEntrySet d; + +            d.Add("a"); +            d.Add("aa"); +            d.Add("aaa"); +            d.Add("aab"); +            d.Add("b"); +            d.Add("ba"); + +            d.BuildHierarchy(); + +            UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u); +            { +                TStringBuf s = "aaaaa"; +                const TEntry* e = d.FindPrefix(s); +                UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa"); +                UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u); +                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1); +                UNIT_ASSERT_VALUES_EQUAL(s, "aa"); +            } + +            { +                TStringBuf s = "a"; +                const TEntry* e = d.FindPrefix(s); +                UNIT_ASSERT_VALUES_EQUAL(e->Str, "a"); +                UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u); +                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); +                UNIT_ASSERT_VALUES_EQUAL(s, ""); +            } + +            { +                TStringBuf s = "bab"; +                const TEntry* e = d.FindPrefix(s); +                UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba"); +                UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u); +                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4); +                UNIT_ASSERT_VALUES_EQUAL(s, "b"); +            } + +            { +                TStringBuf s = "bba"; +                const TEntry* e = d.FindPrefix(s); +                UNIT_ASSERT_VALUES_EQUAL(e->Str, "b"); +                UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u); +                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); +                UNIT_ASSERT_VALUES_EQUAL(s, "ba"); +            } +        } +    } + +    void TestBuilder0() { +        using namespace NGreedyDict; +        ui32 a = 1, b = 11; +        ui64 ab = TDictBuilder::Compose(a, b); +        UNIT_ASSERT(TDictBuilder::IsCompound(ab)); +        UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a); +        UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b); +    } + +    void FillData(NGreedyDict::TStringBufs& data) { +        static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"}; +        data.clear(); +        data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls)); +    } + +    typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict; + +    TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s, +                                                   TDict& res) { +        using namespace NGreedyDict; + +        TStringBufs data; +        FillData(data); + +        TDictBuilder b(s); +        b.SetInput(data); +        b.Build(256 + 128); + +        TEntrySet& set = b.EntrySet(); + +        for (const auto& it : set) { +            if (it.Score) { +                res[it.Str] = it; +            } +        } + +        return b.ReleaseEntrySet(); +    } + +    void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) { +        TDict::const_iterator it = dict.find(entry); +        UNIT_ASSERT_C(it != dict.end(), entry); +        UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry); +        UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry); +        UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry); +    } + +    void TestBuilder() { +        TAutoPtr<NGreedyDict::TEntrySet> set; +        THashMap<TStringBuf, NGreedyDict::TEntry> res; +        NGreedyDict::TBuildSettings s; +        set = DoTestBuilder(s, res); + +        UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u); +        UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u); + +        DoAssertEntry("%", 37, -1, 0.00375193, res); +        DoAssertEntry("%7", 38, 37, 0.00513299, res); +        DoAssertEntry("&", 39, -1, 0.00794527, res); +        DoAssertEntry("+", 44, -1, 0.000441404, res); +        DoAssertEntry(",", 45, -1, 0.000441404, res); +        DoAssertEntry("-", 46, -1, 0.0417126, res); +        DoAssertEntry(".", 47, -1, 0.0196425, res); +        DoAssertEntry(".com/", 48, 47, 0.0374482, res); +        DoAssertEntry(".html", 49, 47, 0.0496577, res); +        DoAssertEntry(".html?", 50, 49, 0.0153908, res); +        DoAssertEntry(".php", 51, 47, 0.0123585, res); +        DoAssertEntry(".ru/", 52, 47, 0.0150027, res); +        DoAssertEntry("/", 53, -1, 0.0452439, res); +        DoAssertEntry("/index", 54, 53, 0.0158905, res); +        DoAssertEntry("0", 55, -1, 0.00816597, res); +        DoAssertEntry("1", 56, -1, 0.0167733, res); +        DoAssertEntry("10", 57, 56, 0.00530474, res); +        DoAssertEntry("2", 58, -1, 0.0101523, res); +        DoAssertEntry("20", 59, 58, 0.00674234, res); +        DoAssertEntry("3", 60, -1, 0.01258, res); +        DoAssertEntry("32", 61, 60, 0.00490697, res); +        DoAssertEntry("4", 62, -1, 0.00993158, res); +        DoAssertEntry("5", 63, -1, 0.00617965, res); +        DoAssertEntry("6", 64, -1, 0.00971088, res); +        DoAssertEntry("7", 65, -1, 0.0101523, res); +        DoAssertEntry("8", 66, -1, 0.00728316, res); +        DoAssertEntry("9", 67, -1, 0.00728316, res); +        DoAssertEntry(":", 68, -1, 0.000662106, res); +        DoAssertEntry(";", 69, -1, 0.000882807, res); +        DoAssertEntry("=", 71, -1, 0.01258, res); +        DoAssertEntry("?", 73, -1, 0.00397263, res); +        DoAssertEntry("A", 75, -1, 0.00264842, res); +        DoAssertEntry("B", 76, -1, 0.00220702, res); +        DoAssertEntry("C", 77, -1, 0.00353123, res); +        DoAssertEntry("D", 78, -1, 0.00375193, res); +        DoAssertEntry("E", 79, -1, 0.00286912, res); +        DoAssertEntry("F", 80, -1, 0.00110351, res); +        DoAssertEntry("G", 81, -1, 0.00110351, res); +        DoAssertEntry("H", 82, -1, 0.000220702, res); +        DoAssertEntry("I", 83, -1, 0.00198632, res); +        DoAssertEntry("K", 85, -1, 0.000441404, res); +        DoAssertEntry("L", 86, -1, 0.00198632, res); +        DoAssertEntry("M", 87, -1, 0.00154491, res); +        DoAssertEntry("N", 88, -1, 0.00154491, res); +        DoAssertEntry("O", 89, -1, 0.00132421, res); +        DoAssertEntry("P", 90, -1, 0.00308983, res); +        DoAssertEntry("R", 92, -1, 0.000662106, res); +        DoAssertEntry("S", 93, -1, 0.00264842, res); +        DoAssertEntry("T", 94, -1, 0.00110351, res); +        DoAssertEntry("U", 95, -1, 0.000220702, res); +        DoAssertEntry("V", 96, -1, 0.000441404, res); +        DoAssertEntry("W", 97, -1, 0.000441404, res); +        DoAssertEntry("X", 98, -1, 0.000220702, res); +        DoAssertEntry("Y", 99, -1, 0.000220702, res); +        DoAssertEntry("_", 105, -1, 0.00904877, res); +        DoAssertEntry("a", 107, -1, 0.0505407, res); +        DoAssertEntry("an", 108, 107, 0.018273, res); +        DoAssertEntry("ar", 109, 107, 0.0169385, res); +        DoAssertEntry("b", 110, -1, 0.0156698, res); +        DoAssertEntry("c", 111, -1, 0.018539, res); +        DoAssertEntry("cat", 112, 111, 0.00846732, res); +        DoAssertEntry("ch", 113, 111, 0.00644872, res); +        DoAssertEntry("com", 114, 111, 0.00724235, res); +        DoAssertEntry("ct", 115, 111, 0.00605729, res); +        DoAssertEntry("d", 116, -1, 0.020746, res); +        DoAssertEntry("di", 117, 116, 0.00730659, res); +        DoAssertEntry("e", 118, -1, 0.0624586, res); +        DoAssertEntry("en", 119, 118, 0.0108999, res); +        DoAssertEntry("ent", 120, 119, 0.00616002, res); +        DoAssertEntry("f", 121, -1, 0.00860737, res); +        DoAssertEntry("fi", 122, 121, 0.00423196, res); +        DoAssertEntry("g", 123, -1, 0.0180975, res); +        DoAssertEntry("go", 124, 123, 0.00601862, res); +        DoAssertEntry("h", 125, -1, 0.010373, res); +        DoAssertEntry("ho", 126, 125, 0.00570298, res); +        DoAssertEntry("http://", 127, 125, 0.0494372, res); +        DoAssertEntry("http://www.", 128, 127, 0.0849702, res); +        DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res); +        DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res); +        DoAssertEntry("i", 131, -1, 0.0258221, res); +        DoAssertEntry("id=", 132, 131, 0.00725369, res); +        DoAssertEntry("im", 133, 131, 0.00373318, res); +        DoAssertEntry("in", 134, 131, 0.013625, res); +        DoAssertEntry("ing", 135, 134, 0.00795491, res); +        DoAssertEntry("ion", 136, 131, 0.00796149, res); +        DoAssertEntry("it", 137, 131, 0.00953416, res); +        DoAssertEntry("j", 138, -1, 0.00132421, res); +        DoAssertEntry("k", 139, -1, 0.0134628, res); +        DoAssertEntry("l", 140, -1, 0.0381814, res); +        DoAssertEntry("m", 141, -1, 0.0174354, res); +        DoAssertEntry("mer", 142, 141, 0.00711846, res); +        DoAssertEntry("n", 143, -1, 0.0132421, res); +        DoAssertEntry("o", 144, -1, 0.0302362, res); +        DoAssertEntry("on", 145, 144, 0.00802271, res); +        DoAssertEntry("ou", 146, 144, 0.00414545, res); +        DoAssertEntry("p", 147, -1, 0.0225116, res); +        DoAssertEntry("port", 148, 147, 0.0123532, res); +        DoAssertEntry("q", 149, -1, 0.00176561, res); +        DoAssertEntry("r", 150, -1, 0.0401677, res); +        DoAssertEntry("ran", 151, 150, 0.00686918, res); +        DoAssertEntry("s", 152, -1, 0.0487751, res); +        DoAssertEntry("sho", 153, 152, 0.0113876, res); +        DoAssertEntry("t", 154, -1, 0.0379607, res); +        DoAssertEntry("u", 155, -1, 0.0211874, res); +        DoAssertEntry("v", 156, -1, 0.00595895, res); +        DoAssertEntry("vi", 157, 156, 0.00480673, res); +        DoAssertEntry("w", 158, -1, 0.00816597, res); +        DoAssertEntry("x", 159, -1, 0.00375193, res); +        DoAssertEntry("y", 160, -1, 0.0130214, res); +        DoAssertEntry("z", 161, -1, 0.00353123, res); +    } +}; + +UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest); diff --git a/library/cpp/codecs/greedy_dict/ut/ya.make b/library/cpp/codecs/greedy_dict/ut/ya.make new file mode 100644 index 00000000000..bd67d1a4522 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/ut/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(library/cpp/codecs/greedy_dict) + +OWNER(velavokr) + +SRCS( +    greedy_dict_ut.cpp +) + +END() diff --git a/library/cpp/codecs/greedy_dict/ya.make b/library/cpp/codecs/greedy_dict/ya.make new file mode 100644 index 00000000000..2a57224f7e1 --- /dev/null +++ b/library/cpp/codecs/greedy_dict/ya.make @@ -0,0 +1,15 @@ +OWNER(velavokr) + +LIBRARY() + +SRCS( +    gd_builder.cpp +    gd_entry.cpp +) + +PEERDIR( +    library/cpp/containers/comptrie +    library/cpp/string_utils/relaxed_escaper +) + +END() diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp new file mode 100644 index 00000000000..650fe7cdfdd --- /dev/null +++ b/library/cpp/codecs/huffman_codec.cpp @@ -0,0 +1,592 @@ +#include "huffman_codec.h" +#include <library/cpp/bit_io/bitinput.h> +#include <library/cpp/bit_io/bitoutput.h> + +#include <util/generic/algorithm.h> +#include <util/generic/bitops.h> +#include <util/stream/buffer.h> +#include <util/stream/length.h> +#include <util/string/printf.h> + +namespace NCodecs { +    template <typename T> +    struct TCanonicalCmp { +        bool operator()(const T& a, const T& b) const { +            if (a.CodeLength == b.CodeLength) { +                return a.Char < b.Char; +            } else { +                return a.CodeLength < b.CodeLength; +            } +        } +    }; + +    template <typename T> +    struct TByCharCmp { +        bool operator()(const T& a, const T& b) const { +            return a.Char < b.Char; +        } +    }; + +    struct TTreeEntry { +        static const ui32 InvalidBranch = (ui32)-1; + +        ui64 Freq = 0; +        ui32 Branches[2]{InvalidBranch, InvalidBranch}; + +        ui32 CodeLength = 0; +        ui8 Char = 0; +        bool Invalid = false; + +        TTreeEntry() = default; + +        static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) { +            return a.Freq < b.Freq; +        } + +        static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) { +            return a.Freq > b.Freq; +        } +    }; + +    using TCodeTree = TVector<TTreeEntry>; + +    void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) { +        tree.reserve(255 * 256 / 2); // worst case - balanced tree + +        for (ui32 i = 0; i < 256; ++i) { +            tree.emplace_back(); +            tree.back().Char = i; +            tree.back().Freq = freqs[i]; +        } + +        StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq); +    } + +    void InitTree(TCodeTree& tree, ISequenceReader* in) { +        using namespace NPrivate; +        ui64 freqs[256]; +        Zero(freqs); + +        TStringBuf r; +        while (in->NextRegion(r)) { +            for (ui64 i = 0; i < r.size(); ++i) +                ++freqs[(ui8)r[i]]; +        } + +        InitTreeByFreqs(tree, freqs); +    } + +    void CalculateCodeLengths(TCodeTree& tree) { +        Y_ENSURE(tree.size() == 256, " "); +        const ui32 firstbranch = tree.size(); + +        ui32 curleaf = 0; +        ui32 curbranch = firstbranch; + +        // building code tree. two priority queues are combined in one. +        while (firstbranch - curleaf + tree.size() - curbranch >= 2) { +            TTreeEntry e; + +            for (auto& branche : e.Branches) { +                ui32 br; + +                if (curleaf >= firstbranch) +                    br = curbranch++; +                else if (curbranch >= tree.size()) +                    br = curleaf++; +                else if (tree[curleaf].Freq < tree[curbranch].Freq) +                    br = curleaf++; +                else +                    br = curbranch++; + +                Y_ENSURE(br < tree.size(), " "); +                branche = br; +                e.Freq += tree[br].Freq; +            } + +            tree.push_back(e); +            PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev); +        } + +        // computing code lengths +        for (ui64 i = tree.size() - 1; i >= firstbranch; --i) { +            TTreeEntry e = tree[i]; + +            for (auto branche : e.Branches) +                tree[branche].CodeLength = e.CodeLength + 1; +        } + +        // chopping off the branches +        tree.resize(firstbranch); + +        Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>()); + +        // simplification: we are stripping codes longer than 64 bits +        while (!tree.empty() && tree.back().CodeLength > 64) +            tree.pop_back(); + +        // will not compress +        if (tree.empty()) +            return; + +        // special invalid code word +        tree.back().Invalid = true; +    } + +    struct TEncoderEntry { +        ui64 Code = 0; + +        ui8 CodeLength = 0; +        ui8 Char = 0; +        ui8 Invalid = true; + +        explicit TEncoderEntry(TTreeEntry e) +            : CodeLength(e.CodeLength) +            , Char(e.Char) +            , Invalid(e.Invalid) +        { +        } + +        TEncoderEntry() = default; +    }; + +    struct TEncoderTable { +        TEncoderEntry Entries[256]; + +        void Save(IOutputStream* out) const { +            ui16 nval = 0; + +            for (auto entrie : Entries) +                nval += !entrie.Invalid; + +            ::Save(out, nval); + +            for (auto entrie : Entries) { +                if (!entrie.Invalid) { +                    ::Save(out, entrie.Char); +                    ::Save(out, entrie.CodeLength); +                } +            } +        } + +        void Load(IInputStream* in) { +            ui16 nval = 0; +            ::Load(in, nval); + +            for (ui32 i = 0; i < 256; ++i) +                Entries[i].Char = i; + +            for (ui32 i = 0; i < nval; ++i) { +                ui8 ch = 0; +                ui8 len = 0; +                ::Load(in, ch); +                ::Load(in, len); +                Entries[ch].CodeLength = len; +                Entries[ch].Invalid = false; +            } +        } +    }; + +    struct TDecoderEntry { +        ui32 NextTable : 10; +        ui32 Char : 8; +        ui32 Invalid : 1; +        ui32 Bad : 1; + +        TDecoderEntry() +            : NextTable() +            , Char() +            , Invalid() +            , Bad() +        { +        } +    }; + +    struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> { +        ui64 Length = 0; +        ui64 BaseCode = 0; + +        TDecoderEntry Entries[256]; + +        TDecoderTable() { +            Zero(Entries); +        } +    }; + +    const int CACHE_BITS_COUNT = 16; +    class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> { +        TEncoderTable Encoder; +        TDecoderTable Decoder[256]; + +        TEncoderEntry Invalid; + +        ui32 SubTablesNum; + +        class THuffmanCache { +            struct TCacheEntry { +                int EndOffset : 24; +                int BitsLeft : 8; +            }; +            TVector<char> DecodeCache; +            TVector<TCacheEntry> CacheEntries; +            const TImpl& Original; + +        public: +            THuffmanCache(const THuffmanCodec::TImpl& encoder); + +            void Decode(NBitIO::TBitInput& in, TBuffer& out) const; +        }; + +        THolder<THuffmanCache> Cache; + +    public: +        TImpl() +            : SubTablesNum(1) +        { +            Invalid.CodeLength = 255; +        } + +        ui8 Encode(TStringBuf in, TBuffer& out) const { +            out.Clear(); + +            if (in.empty()) { +                return 0; +            } + +            out.Reserve(in.size() * 2); + +            { +                NBitIO::TBitOutputVector<TBuffer> bout(&out); +                TStringBuf tin = in; + +                // data is under compression +                bout.Write(1, 1); + +                for (auto t : tin) { +                    const TEncoderEntry& ce = Encoder.Entries[(ui8)t]; + +                    bout.Write(ce.Code, ce.CodeLength); + +                    if (ce.Invalid) { +                        bout.Write(t, 8); +                    } +                } + +                // in canonical huffman coding there cannot be a code having no 0 in the suffix +                // and shorter than 8 bits. +                bout.Write((ui64)-1, bout.GetByteReminder()); +                return bout.GetByteReminder(); +            } +        } + +        void Decode(TStringBuf in, TBuffer& out) const { +            out.Clear(); + +            if (in.empty()) { +                return; +            } + +            NBitIO::TBitInput bin(in); +            ui64 f = 0; +            bin.ReadK<1>(f); + +            // if data is uncompressed +            if (!f) { +                in.Skip(1); +                out.Append(in.data(), in.size()); +            } else { +                out.Reserve(in.size() * 8); + +                if (Cache.Get()) { +                    Cache->Decode(bin, out); +                } else { +                    while (ReadNextChar(bin, out)) { +                    } +                } +            } +        } + +        Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const { +            const TDecoderTable* table = Decoder; +            TDecoderEntry e; + +            int bitsRead = 0; +            while (true) { +                ui64 code = 0; + +                if (Y_UNLIKELY(!bin.Read(code, table->Length))) +                    return 0; +                bitsRead += table->Length; + +                if (Y_UNLIKELY(code < table->BaseCode)) +                    return 0; + +                code -= table->BaseCode; + +                if (Y_UNLIKELY(code > 255)) +                    return 0; + +                e = table->Entries[code]; + +                if (Y_UNLIKELY(e.Bad)) +                    return 0; + +                if (e.NextTable) { +                    table = Decoder + e.NextTable; +                } else { +                    if (e.Invalid) { +                        code = 0; +                        bin.ReadK<8>(code); +                        bitsRead += 8; +                        out.Append((ui8)code); +                    } else { +                        out.Append((ui8)e.Char); +                    } + +                    return bitsRead; +                } +            } + +            Y_ENSURE(false, " could not decode input"); +            return 0; +        } + +        void GenerateEncoder(TCodeTree& tree) { +            const ui64 sz = tree.size(); + +            TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]); + +            for (ui32 i = 1; i < sz; ++i) { +                const TTreeEntry& te = tree[i]; +                TEncoderEntry& e = Encoder.Entries[te.Char]; +                e = TEncoderEntry(te); + +                e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); +                lastcode = e; + +                e.Code = ReverseBits(e.Code, e.CodeLength); + +                if (e.Invalid) +                    Invalid = e; +            } + +            for (auto& e : Encoder.Entries) { +                if (e.Invalid) +                    e = Invalid; + +                Y_ENSURE(e.CodeLength, " "); +            } +        } + +        void RegenerateEncoder() { +            for (auto& entrie : Encoder.Entries) { +                if (entrie.Invalid) +                    entrie.CodeLength = Invalid.CodeLength; +            } + +            Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>()); + +            TEncoderEntry lastcode = Encoder.Entries[0]; + +            for (ui32 i = 1; i < 256; ++i) { +                TEncoderEntry& e = Encoder.Entries[i]; +                e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); +                lastcode = e; + +                e.Code = ReverseBits(e.Code, e.CodeLength); +            } + +            for (auto& entrie : Encoder.Entries) { +                if (entrie.Invalid) { +                    Invalid = entrie; +                    break; +                } +            } + +            Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>()); + +            for (auto& entrie : Encoder.Entries) { +                if (entrie.Invalid) +                    entrie = Invalid; +            } +        } + +        void BuildDecoder() { +            TEncoderTable enc = Encoder; +            Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>()); + +            TEncoderEntry& e1 = enc.Entries[0]; +            Decoder[0].BaseCode = e1.Code; +            Decoder[0].Length = e1.CodeLength; + +            for (auto e2 : enc.Entries) { +                SetEntry(Decoder, e2.Code, e2.CodeLength, e2); +            } +            Cache.Reset(new THuffmanCache(*this)); +        } + +        void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) { +            Y_ENSURE(len >= t->Length, len << " < " << t->Length); + +            ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode; +            TDecoderEntry& d = t->Entries[idx]; + +            if (len == t->Length) { +                Y_ENSURE(!d.NextTable, " "); + +                d.Char = e.Char; +                d.Invalid = e.Invalid; +                return; +            } + +            if (!d.NextTable) { +                Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " "); +                d.NextTable = SubTablesNum++; +                TDecoderTable* nt = Decoder + d.NextTable; +                nt->Length = Min<ui64>(8, len - t->Length); +                nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length); +            } + +            SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e); +        } + +        void Learn(ISequenceReader* in) { +            { +                TCodeTree tree; +                InitTree(tree, in); +                CalculateCodeLengths(tree); +                Y_ENSURE(!tree.empty(), " "); +                GenerateEncoder(tree); +            } +            BuildDecoder(); +        } + +        void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { +            TCodeTree tree; + +            ui64 freqsArray[256]; +            Zero(freqsArray); + +            for (const auto& freq : freqs) +                freqsArray[static_cast<ui8>(freq.first)] += freq.second; + +            InitTreeByFreqs(tree, freqsArray); +            CalculateCodeLengths(tree); + +            Y_ENSURE(!tree.empty(), " "); + +            GenerateEncoder(tree); +            BuildDecoder(); +        } + +        void Save(IOutputStream* out) { +            ::Save(out, Invalid.CodeLength); +            Encoder.Save(out); +        } + +        void Load(IInputStream* in) { +            ::Load(in, Invalid.CodeLength); +            Encoder.Load(in); +            RegenerateEncoder(); +            BuildDecoder(); +        } +    }; + +    THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec) +        : Original(codec) +    { +        CacheEntries.resize(1 << CACHE_BITS_COUNT); +        DecodeCache.reserve(CacheEntries.size() * 2); +        char buffer[2]; +        TBuffer decoded; +        for (size_t i = 0; i < CacheEntries.size(); i++) { +            buffer[1] = i >> 8; +            buffer[0] = i; +            NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer)); +            int totalBits = 0; +            while (true) { +                decoded.Resize(0); +                int bits = codec.ReadNextChar(bin, decoded); +                if (totalBits + bits > 16 || !bits) { +                    TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits}; +                    CacheEntries[i] = e; +                    break; +                } + +                for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) { +                    DecodeCache.push_back(*it); +                } +                totalBits += bits; +            } +        } +        DecodeCache.push_back(0); +        CacheEntries.shrink_to_fit(); +        DecodeCache.shrink_to_fit(); +    } + +    void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const { +        int bits = 0; +        ui64 code = 0; +        while (!bin.Eof()) { +            ui64 f = 0; +            const int toRead = 16 - bits; +            if (toRead > 0 && bin.Read(f, toRead)) { +                code = (code >> (16 - bits)) | (f << bits); +                code &= 0xFFFF; +                TCacheEntry entry = CacheEntries[code]; +                int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0; +                out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]); +                bits = entry.BitsLeft; +            } else { // should never happen until there are exceptions or unaligned input +                bin.Back(bits); +                if (!Original.ReadNextChar(bin, out)) +                    break; + +                code = 0; +                bits = 0; +            } +        } +    } + +    THuffmanCodec::THuffmanCodec() +        : Impl(new TImpl) +    { +        MyTraits.NeedsTraining = true; +        MyTraits.PreservesPrefixGrouping = true; +        MyTraits.PaddingBit = 1; +        MyTraits.SizeOnEncodeMultiplier = 2; +        MyTraits.SizeOnDecodeMultiplier = 8; +        MyTraits.RecommendedSampleSize = 1 << 21; +    } + +    THuffmanCodec::~THuffmanCodec() = default; + +    ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const { +        if (Y_UNLIKELY(!Trained)) +            ythrow TCodecException() << " not trained"; + +        return Impl->Encode(in, bbb); +    } + +    void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const { +        Impl->Decode(in, bbb); +    } + +    void THuffmanCodec::Save(IOutputStream* out) const { +        Impl->Save(out); +    } + +    void THuffmanCodec::Load(IInputStream* in) { +        Impl->Load(in); +    } + +    void THuffmanCodec::DoLearn(ISequenceReader& in) { +        Impl->Learn(&in); +    } + +    void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { +        Impl->LearnByFreqs(freqs); +        Trained = true; +    } + +} diff --git a/library/cpp/codecs/huffman_codec.h b/library/cpp/codecs/huffman_codec.h new file mode 100644 index 00000000000..559545b90d9 --- /dev/null +++ b/library/cpp/codecs/huffman_codec.h @@ -0,0 +1,39 @@ +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> +#include <util/string/cast.h> + +namespace NCodecs { +    // for types greater than char, pipeline with TFreqCodec. + +    class THuffmanCodec: public ICodec { +        class TImpl; +        TIntrusivePtr<TImpl> Impl; + +    public: +        THuffmanCodec(); +        ~THuffmanCodec() override; + +        static TStringBuf MyName() { +            return "huffman"; +        } + +        TString GetName() const override { +            return ToString(MyName()); +        } + +        ui8 Encode(TStringBuf in, TBuffer& bbb) const override; + +        void Decode(TStringBuf in, TBuffer& bbb) const override; + +        void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs); + +    protected: +        void DoLearn(ISequenceReader& in) override; +        void Save(IOutputStream* out) const override; +        void Load(IInputStream* in) override; +    }; + +} diff --git a/library/cpp/codecs/pfor_codec.cpp b/library/cpp/codecs/pfor_codec.cpp new file mode 100644 index 00000000000..f6b3b0920bd --- /dev/null +++ b/library/cpp/codecs/pfor_codec.cpp @@ -0,0 +1,22 @@ +#include "pfor_codec.h" + +namespace NCodecs { +    template <> +    TStringBuf TPForCodec<ui64, true>::MyName() { +        return "pfor-delta64-sorted"; +    } +    template <> +    TStringBuf TPForCodec<ui32, true>::MyName() { +        return "pfor-delta32-sorted"; +    } + +    template <> +    TStringBuf TPForCodec<ui64, false>::MyName() { +        return "pfor-ui64"; +    } +    template <> +    TStringBuf TPForCodec<ui32, false>::MyName() { +        return "pfor-ui32"; +    } + +} diff --git a/library/cpp/codecs/pfor_codec.h b/library/cpp/codecs/pfor_codec.h new file mode 100644 index 00000000000..d7d4bb8bf48 --- /dev/null +++ b/library/cpp/codecs/pfor_codec.h @@ -0,0 +1,211 @@ +#pragma once + +#include "codecs.h" + +#include "delta_codec.h" +#include "tls_cache.h" + +#include <library/cpp/bit_io/bitinput.h> +#include <library/cpp/bit_io/bitoutput.h> +#include <util/string/cast.h> + +namespace NCodecs { +    template <typename T, bool WithDelta = false> +    class TPForCodec: public ICodec { +        using TUnsigned = std::make_unsigned_t<T>; +        typedef TDeltaCodec<TUnsigned> TDCodec; + +        typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue; +        static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value"); + +        static const ui64 BitsInT = sizeof(TUnsigned) * 8; + +        TDCodec DeltaCodec; + +    public: +        static TStringBuf MyName(); + +        TPForCodec() { +            MyTraits.AssumesStructuredInput = true; +            MyTraits.SizeOfInputElement = sizeof(T); +            MyTraits.SizeOnDecodeMultiplier = sizeof(T); +        } + +        TString GetName() const override { +            return ToString(MyName()); +        } + +        ui8 Encode(TStringBuf s, TBuffer& b) const override { +            b.Clear(); +            if (s.empty()) { +                return 0; +            } + +            b.Reserve(2 * s.size() + b.Size()); + +            if (WithDelta) { +                auto buffer = TBufferTlsCache::TlsInstance().Item(); +                TBuffer& db = buffer.Get(); +                db.Clear(); +                db.Reserve(2 * s.size()); +                DeltaCodec.Encode(s, db); +                s = TStringBuf{db.data(), db.size()}; +            } + +            TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)}; + +            const ui64 sz = tin.size(); +            ui64 bitcounts[BitsInT + 1]; +            Zero(bitcounts); + +            ui32 zeros = 0; + +            for (const TValue* it = tin.begin(); it != tin.end(); ++it) { +                TUnsigned v = 1 + (TUnsigned)*it; +                ui64 l = MostSignificantBit(v) + 1; +                ++bitcounts[l]; + +                if (!v) { +                    ++zeros; +                } +            } + +            // cumulative bit counts +            for (ui64 i = 0; i < BitsInT; ++i) { +                bitcounts[i + 1] += bitcounts[i]; +            } + +            bool hasexceptions = zeros; +            ui64 optimalbits = BitsInT; + +            { +                ui64 excsize = 0; +                ui64 minsize = sz * BitsInT; + +                for (ui64 current = BitsInT; current; --current) { +                    ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6); + +                    excsize += current * bitcounts[current]; + +                    if (size < minsize) { +                        minsize = size; +                        optimalbits = current; +                        hasexceptions = zeros || sz - bitcounts[current]; +                    } +                } +            } + +            if (!optimalbits || BitsInT == optimalbits) { +                b.Append((ui8)-1); +                b.Append(s.data(), s.size()); +                return 0; +            } else { +                NBitIO::TBitOutputVector<TBuffer> bout(&b); +                bout.Write(0, 1); +                bout.Write(hasexceptions, 1); +                bout.Write(optimalbits, 6); + +                for (const TValue* it = tin.begin(); it != tin.end(); ++it) { +                    TUnsigned word = 1 + (TUnsigned)*it; +                    ui64 len = MostSignificantBit(word) + 1; +                    if (len > optimalbits || !word) { +                        Y_ENSURE(hasexceptions, " "); +                        bout.Write(0, optimalbits); +                        bout.Write(len, 6); +                        bout.Write(word, len); +                    } else { +                        bout.Write(word, optimalbits); +                    } +                } + +                return bout.GetByteReminder(); +            } // the rest of the last byte is zero padded. BitsInT is always > 7. +        } + +        void Decode(TStringBuf s, TBuffer& b) const override { +            b.Clear(); +            if (s.empty()) { +                return; +            } + +            b.Reserve(s.size() * sizeof(T) + b.Size()); + +            ui64 isplain = 0; +            ui64 hasexceptions = 0; +            ui64 bits = 0; + +            NBitIO::TBitInput bin(s); +            bin.ReadK<1>(isplain); +            bin.ReadK<1>(hasexceptions); +            bin.ReadK<6>(bits); + +            if (Y_UNLIKELY(isplain)) { +                s.Skip(1); + +                if (WithDelta) { +                    DeltaCodec.Decode(s, b); +                } else { +                    b.Append(s.data(), s.size()); +                } +            } else { +                typename TDCodec::TDecoder decoder; + +                if (hasexceptions) { +                    ui64 word = 0; +                    while (bin.Read(word, bits)) { +                        if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) { +                            --word; + +                            TValue t = word; + +                            if (WithDelta) { +                                if (decoder.Decode(t)) { +                                    TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)}; +                                    b.Append(r.data(), r.size()); +                                } +                            } else { +                                TStringBuf r{(char*)&t, sizeof(t)}; +                                b.Append(r.data(), r.size()); +                            } +                        } +                    } +                } else { +                    ui64 word = 0; +                    T outarr[256 / sizeof(T)]; +                    ui32 cnt = 0; +                    while (true) { +                        ui64 v = bin.Read(word, bits); + +                        if ((!v) | (!word)) +                            break; + +                        --word; +                        TValue t = word; + +                        if (WithDelta) { +                            if (decoder.Decode(t)) { +                                outarr[cnt++] = decoder.Result; +                            } +                        } else { +                            outarr[cnt++] = t; +                        } + +                        if (cnt == Y_ARRAY_SIZE(outarr)) { +                            b.Append((const char*)outarr, sizeof(outarr)); +                            cnt = 0; +                        } +                    } + +                    if (cnt) { +                        b.Append((const char*)outarr, cnt * sizeof(T)); +                    } +                } +            } +        } + +    protected: +        void DoLearn(ISequenceReader&) override { +        } +    }; + +} diff --git a/library/cpp/codecs/sample.h b/library/cpp/codecs/sample.h new file mode 100644 index 00000000000..15f03afcc5d --- /dev/null +++ b/library/cpp/codecs/sample.h @@ -0,0 +1,89 @@ +#pragma once + +#include <library/cpp/deprecated/accessors/accessors.h> + +#include <util/generic/buffer.h> +#include <util/generic/vector.h> +#include <util/random/fast.h> +#include <util/random/shuffle.h> + +#include <functional> +#include <type_traits> + +namespace NCodecs { +    class ISequenceReader { +    public: +        virtual bool NextRegion(TStringBuf& s) = 0; + +        virtual ~ISequenceReader() = default; +    }; + +    template <class TValue> +    TStringBuf ValueToStringBuf(TValue&& t) { +        return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)}; +    } + +    template <class TIter> +    TStringBuf IterToStringBuf(TIter iter) { +        return ValueToStringBuf(*iter); +    } + +    template <class TItem> +    class TSimpleSequenceReader: public ISequenceReader { +        const TVector<TItem>& Items; +        size_t Idx = 0; + +    public: +        TSimpleSequenceReader(const TVector<TItem>& items) +            : Items(items) +        { +        } + +        bool NextRegion(TStringBuf& s) override { +            if (Idx >= Items.size()) { +                return false; +            } + +            s = ValueToStringBuf(Items[Idx++]); +            return true; +        } +    }; + +    template <class TIter, class TGetter> +    size_t GetInputSize(TIter begin, TIter end, TGetter getter) { +        size_t totalBytes = 0; +        for (TIter iter = begin; iter != end; ++iter) { +            totalBytes += getter(iter).size(); +        } +        return totalBytes; +    } + +    template <class TIter> +    size_t GetInputSize(TIter begin, TIter end) { +        return GetInputSize(begin, end, IterToStringBuf<TIter>); +    } + +    template <class TIter, class TGetter> +    TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes, TGetter getter) { +        TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce}; + +        size_t totalBytes = GetInputSize(begin, end, getter); +        double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes); + +        TVector<TBuffer> result; +        for (TIter iter = begin; iter != end; ++iter) { +            if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) { +                TStringBuf reg = getter(iter); +                result.emplace_back(reg.data(), reg.size()); +            } +        } +        Shuffle(result.begin(), result.end(), rng); +        return result; +    } + +    template <class TIter> +    TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes) { +        return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>); +    } + +} diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp new file mode 100644 index 00000000000..d0692fe2a46 --- /dev/null +++ b/library/cpp/codecs/solar_codec.cpp @@ -0,0 +1,133 @@ +#include "solar_codec.h" + +#include <library/cpp/codecs/greedy_dict/gd_builder.h> + +#include <library/cpp/containers/comptrie/comptrie_builder.h> +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> +#include <util/stream/length.h> +#include <util/string/printf.h> +#include <util/ysaveload.h> + +namespace NCodecs { +    static inline ui32 Append(TBuffer& pool, TStringBuf data) { +        pool.Append(data.data(), data.size()); +        return pool.Size(); +    } + +    void TSolarCodec::DoLearn(ISequenceReader& r) { +        using namespace NGreedyDict; + +        Decoder.clear(); +        Pool.Clear(); + +        THolder<TEntrySet> set; + +        { +            TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance()); +            TStringBufs bufs; + +            TStringBuf m; +            while (r.NextRegion(m)) { +                bufs.push_back(pool.AppendString(m)); +            } + +            { +                TDictBuilder b(Settings); +                b.SetInput(bufs); +                b.Build(MaxEntries, MaxIterations); + +                set = b.ReleaseEntrySet(); +            } +        } + +        set->SetScores(ES_LEN_COUNT); + +        { +            TVector<std::pair<float, TStringBuf>> tmp; +            tmp.reserve(set->size()); + +            for (const auto& it : *set) { +                tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1)))); +            } + +            Sort(tmp.begin(), tmp.end()); + +            Decoder.reserve(tmp.size() + 1); +            Decoder.push_back(0); + +            for (const auto& it : tmp) { +                Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed"); +                ui32 endoff = Append(Pool, it.second); +                Decoder.push_back(endoff); +            } +        } + +        Pool.ShrinkToFit(); +        Decoder.shrink_to_fit(); + +        TBufferOutput bout; + +        { +            TVector<std::pair<TStringBuf, ui32>> tmp2; +            tmp2.reserve(Decoder.size()); + +            for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) { +                TStringBuf s = DoDecode(i); +                tmp2.push_back(std::make_pair(s, i - 1)); +                Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed"); +            } + +            Sort(tmp2.begin(), tmp2.end()); + +            { +                TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED); +                for (const auto& it : tmp2) { +                    builder.Add(it.first.data(), it.first.size(), it.second); +                } + +                builder.Save(bout); +            } +        } + +        Encoder.Init(TBlob::FromBuffer(bout.Buffer())); +    } + +    void TSolarCodec::Save(IOutputStream* out) const { +        TBlob b = Encoder.Data(); +        ::Save(out, (ui32)b.Size()); +        out->Write(b.Data(), b.Size()); +    } + +    void TSolarCodec::Load(IInputStream* in) { +        ui32 sz; +        ::Load(in, sz); +        TLengthLimitedInput lin(in, sz); +        Encoder.Init(TBlob::FromStream(lin)); +        Pool.Clear(); +        Decoder.clear(); + +        TVector<std::pair<ui32, TString>> tmp; + +        ui32 poolsz = 0; +        for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) { +            const TString& s = it.GetKey(); +            tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s)); +            poolsz += Max<ui32>(s.size(), 1); +        } + +        Sort(tmp.begin(), tmp.end()); + +        Pool.Reserve(poolsz); +        Decoder.reserve(tmp.size() + 1); +        Decoder.push_back(0); + +        for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) { +            Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first); +            Decoder.push_back(Append(Pool, tmp[i].second)); +        } + +        Pool.ShrinkToFit(); +        Decoder.shrink_to_fit(); +    } + +} diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h new file mode 100644 index 00000000000..7158ae79262 --- /dev/null +++ b/library/cpp/codecs/solar_codec.h @@ -0,0 +1,244 @@ +#pragma once + +#include "codecs.h" +#include <library/cpp/containers/comptrie/comptrie_trie.h> +#include <library/cpp/codecs/greedy_dict/gd_builder.h> + +#include <util/string/cast.h> +#include <util/string/escape.h> + +namespace NCodecs { +    // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. +    // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. + +    struct TVarIntTraits { +        static const size_t MAX_VARINT32_BYTES = 5; + +        static void Write(ui32 value, TBuffer& b) { +            while (value > 0x7F) { +                b.Append(static_cast<ui8>(value) | 0x80); +                value >>= 7; +            } +            b.Append(static_cast<ui8>(value) & 0x7F); +        } + +        static void Read(TStringBuf& r, ui32& value) { +            ui32 result = 0; +            for (ui32 count = 0; count < MAX_VARINT32_BYTES; ++count) { +                const ui32 b = static_cast<ui8>(r[0]); +                r.Skip(1); +                result |= static_cast<ui32>(b & 0x7F) << (7 * count); +                if (!(b & 0x80)) { +                    value = result; +                    return; +                } else if (Y_UNLIKELY(r.empty())) { +                    break; +                } +            } +            Y_ENSURE_EX(false, TCodecException() << "Bad data"); +        } +    }; + +    struct TShortIntTraits { +        static const size_t SHORTINT_SIZE_LIMIT = 0x8000; + +        Y_FORCE_INLINE static void Write(ui32 value, TBuffer& b) { +            Y_ENSURE_EX(value < SHORTINT_SIZE_LIMIT, TCodecException() << "Bad write method"); +            if (value >= 0x80) { +                b.Append(static_cast<ui8>(value >> 8) | 0x80); +            } +            b.Append(static_cast<ui8>(value)); +        } + +        Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) { +            ui32 result = static_cast<ui8>(r[0]); +            r.Skip(1); +            if (result >= 0x80) { +                Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data"); +                result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); +                r.Skip(1); +            } +            value = result; +        } +    }; + +    class TSolarCodec: public ICodec { +    public: +        static TStringBuf MyName8k() { +            return TStringBuf("solar-8k"); +        } +        static TStringBuf MyName16k() { +            return TStringBuf("solar-16k"); +        } +        static TStringBuf MyName32k() { +            return TStringBuf("solar-32k"); +        } +        static TStringBuf MyName64k() { +            return TStringBuf("solar-64k"); +        } +        static TStringBuf MyName256k() { +            return TStringBuf("solar-256k"); +        } +        static TStringBuf MyName() { +            return TStringBuf("solar"); +        } +        static TStringBuf MyName8kAdapt() { +            return TStringBuf("solar-8k-a"); +        } +        static TStringBuf MyName16kAdapt() { +            return TStringBuf("solar-16k-a"); +        } +        static TStringBuf MyName32kAdapt() { +            return TStringBuf("solar-32k-a"); +        } +        static TStringBuf MyName64kAdapt() { +            return TStringBuf("solar-64k-a"); +        } +        static TStringBuf MyName256kAdapt() { +            return TStringBuf("solar-256k-a"); +        } +        static TStringBuf MyNameShortInt() { +            return TStringBuf("solar-si"); +        } + +        explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) +            : Settings(s) +            , MaxEntries(maxentries) +            , MaxIterations(maxiter) +        { +            MyTraits.NeedsTraining = true; +            MyTraits.SizeOnDecodeMultiplier = 2; +            MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8; +        } + +        ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { +            EncodeImpl<TVarIntTraits>(r, b); +            return 0; +        } + +        void Decode(TStringBuf r, TBuffer& b) const override { +            DecodeImpl<TVarIntTraits>(r, b); +        } + +        TString GetName() const override { +            return ToString(MyName()); +        } + +    protected: +        void DoLearn(ISequenceReader&) override; +        void Save(IOutputStream*) const override; +        void Load(IInputStream*) override; + +        Y_FORCE_INLINE TStringBuf SubStr(ui32 begoff, ui32 endoff) const { +            return TStringBuf(Pool.Data() + begoff, endoff - begoff); +        } + +        Y_FORCE_INLINE TStringBuf DoDecode(ui32 num) const { +            return SubStr(Decoder[num - 1], Decoder[num]); +        } + +        template <class TTraits> +        Y_FORCE_INLINE void EncodeImpl(TStringBuf r, TBuffer& b) const { +            b.Clear(); +            b.Reserve(r.size()); +            while (!r.empty()) { +                size_t sz = 0; +                ui32 val = (ui32)-1; +                Encoder.FindLongestPrefix(r, &sz, &val); +                TTraits::Write(val + 1, b); +                r.Skip(Max<size_t>(sz, 1)); +            } +        } + +        template <class TTraits> +        Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const { +            b.Clear(); +            b.Reserve(r.size()); +            ui32 v = 0; +            while (!r.empty()) { +                TTraits::Read(r, v); +                TStringBuf s = DoDecode(v); +                b.Append(s.data(), s.size()); +            } +        } + +        inline bool CanUseShortInt() const { +            return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT; +        } + +    private: +        typedef TCompactTrie<char, ui32> TEncoder; +        typedef TVector<ui32> TDecoder; + +        TBuffer Pool; +        TEncoder Encoder; +        TDecoder Decoder; + +        NGreedyDict::TBuildSettings Settings; +        ui32 MaxEntries; +        ui32 MaxIterations; +    }; + +    // Uses varints or shortints depending on the decoder size +    class TAdaptiveSolarCodec: public TSolarCodec { +    public: +        explicit TAdaptiveSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) +            : TSolarCodec(maxentries, maxiter, s) +        { +        } + +        ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { +            if (CanUseShortInt()) { +                EncodeImpl<TShortIntTraits>(r, b); +            } else { +                EncodeImpl<TVarIntTraits>(r, b); +            } + +            return 0; +        } + +        void Decode(TStringBuf r, TBuffer& b) const override { +            if (CanUseShortInt()) { +                DecodeImpl<TShortIntTraits>(r, b); +            } else { +                DecodeImpl<TVarIntTraits>(r, b); +            } +        } + +        TString GetName() const override { +            if (CanUseShortInt()) { +                return ToString(MyNameShortInt()); +            } else { +                return ToString(MyName()); +            } +        } +    }; + +    class TSolarCodecShortInt: public TSolarCodec { +    public: +        explicit TSolarCodecShortInt(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings()) +            : TSolarCodec(maxentries, maxiter, s) +        { +        } + +        ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override { +            EncodeImpl<TShortIntTraits>(r, b); +            return 0; +        } + +        void Decode(TStringBuf r, TBuffer& b) const override { +            DecodeImpl<TShortIntTraits>(r, b); +        } + +        TString GetName() const override { +            return ToString(MyNameShortInt()); +        } + +    protected: +        void Load(IInputStream* in) override { +            TSolarCodec::Load(in); +            Y_ENSURE_EX(CanUseShortInt(), TCodecException() << "Bad data"); +        } +    }; + +} diff --git a/library/cpp/codecs/static/README b/library/cpp/codecs/static/README new file mode 100644 index 00000000000..1b07f02433d --- /dev/null +++ b/library/cpp/codecs/static/README @@ -0,0 +1 @@ +Support of static libraries in library/cpp/codecs. See library/cpp/codecs/static/example. diff --git a/library/cpp/codecs/static/builder.cpp b/library/cpp/codecs/static/builder.cpp new file mode 100644 index 00000000000..93e34a3edbb --- /dev/null +++ b/library/cpp/codecs/static/builder.cpp @@ -0,0 +1,39 @@ +#include "builder.h" +#include "common.h" + +#include <library/cpp/codecs/static/static_codec_info.pb.h> + +#include <library/cpp/codecs/codecs.h> + +#include <util/generic/yexception.h> +#include <util/string/subst.h> + +namespace NCodecs { +    TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo& info) { +        TStaticCodecInfo result; +        TCodecPtr codec = ICodec::GetInstance(info.CodecName); +        Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed"); + +        codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier); +        { +            TStringOutput sout{*result.MutableStoredCodec()}; +            ICodec::Store(&sout, codec); +        } + +        auto& debugInfo = *result.MutableDebugInfo(); +        debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec())); +        debugInfo.SetCodecName(info.CodecName); +        debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier); +        debugInfo.SetTimestamp(info.Timestamp); +        debugInfo.SetRevisionInfo(info.RevisionInfo); +        debugInfo.SetTrainingSetComment(info.TrainingSetComment); +        debugInfo.SetTrainingSetResId(info.TrainingSetResId); +        return result; +    } + +    TString GetStandardFileName(const TStaticCodecInfo& info) { +        TString cName = info.GetDebugInfo().GetCodecName(); +        SubstGlobal(cName, ':', '.'); +        return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info"; +    } +} diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h new file mode 100644 index 00000000000..d7533be4d58 --- /dev/null +++ b/library/cpp/codecs/static/builder.h @@ -0,0 +1,29 @@ +#pragma once + +#include "static.h" + +#include <library/cpp/svnversion/svnversion.h> + +#include <util/datetime/base.h> +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/string/builder.h> + +namespace NCodecs { +    struct TCodecBuildInfo { +        // optimal values from SEARCH-1655 +        TString CodecName = "solar-8k-a:zstd08d-1"; +        float SampleSizeMultiplier = 1; + +        // debug info: +        time_t Timestamp = TInstant::Now().TimeT(); +        TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision())); +        TString TrainingSetComment; // a human comment on the training data +        TString TrainingSetResId;   // sandbox resid of the training set +    }; + +    TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&); + +    TString GetStandardFileName(const TStaticCodecInfo&); + +} diff --git a/library/cpp/codecs/static/common.h b/library/cpp/codecs/static/common.h new file mode 100644 index 00000000000..211de2a27d2 --- /dev/null +++ b/library/cpp/codecs/static/common.h @@ -0,0 +1,32 @@ +#pragma once + +#include <util/string/hex.h> +#include <util/digest/city.h> +#include <util/system/byteorder.h> + +namespace NCodecs { +    template <class T> +    ui64 DataSignature(const T& t) { +        static_assert(!std::is_scalar<T>::value, "no scalars"); +        return CityHash64(t.data(), t.size()); +    } + +    template <class T> +    TString HexWriteScalar(T t) { +        static_assert(std::is_scalar<T>::value, "scalars only"); +        t = LittleToBig(t); +        TString res = HexEncode(&t, sizeof(t)); +        res.to_lower(); +        return res; +    } + +    template <class T> +    T HexReadScalar(TStringBuf s) { +        static_assert(std::is_scalar<T>::value, "scalars only"); +        T t = 0; +        HexDecode(s.data(), Min(s.size(), sizeof(T)), &t); +        t = BigToLittle(t); +        return t; +    } + +} diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp new file mode 100644 index 00000000000..5b750b717e1 --- /dev/null +++ b/library/cpp/codecs/static/example/example.cpp @@ -0,0 +1,43 @@ +#include "example.h" + +#include <library/cpp/codecs/static/static.h> + +#include <util/generic/yexception.h> + +extern "C" { +extern const ui8 codec_info_huff_20160707[]; +extern const ui32 codec_info_huff_20160707Size; +extern const ui8 codec_info_sa_huff_20160707[]; +extern const ui32 codec_info_sa_huff_20160707Size; +}; + +namespace NStaticCodecExample { +    static const NCodecs::TCodecConstPtr CODECS[] = { +        nullptr, +        NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size), +        NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size), +    }; + +    static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size"); + +    void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) { +        Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); +        out.Clear(); +        if (!in) { +            return; +        } +        CODECS[dv]->Encode(in, out); +        out.Append((char)dv); +    } + +    void Decode(TBuffer& out, TStringBuf in) { +        out.Clear(); +        if (!in) { +            return; +        } +        EDictVersion dv = (EDictVersion)in.back(); +        Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); +        in.Chop(1); +        CODECS[dv]->Decode(in, out); +    } +} diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h new file mode 100644 index 00000000000..f9b3a7324b7 --- /dev/null +++ b/library/cpp/codecs/static/example/example.h @@ -0,0 +1,17 @@ +#pragma once + +#include <util/generic/strbuf.h> +#include <util/generic/buffer.h> + +namespace NStaticCodecExample { +    enum EDictVersion : ui8 { +        DV_NULL = 0, +        DV_HUFF_20160707, +        DV_SA_HUFF_20160707, +        DV_COUNT +    }; + +    void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707); + +    void Decode(TBuffer&, TStringBuf); +} diff --git a/library/cpp/codecs/static/example/huffman.1467494385.codec_info b/library/cpp/codecs/static/example/huffman.1467494385.codec_info Binary files differnew file mode 100644 index 00000000000..5fc18270a6b --- /dev/null +++ b/library/cpp/codecs/static/example/huffman.1467494385.codec_info diff --git a/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info Binary files differnew file mode 100644 index 00000000000..d36d8e24ec9 --- /dev/null +++ b/library/cpp/codecs/static/example/solar-8k-a.huffman.1467494385.codec_info diff --git a/library/cpp/codecs/static/example/ya.make b/library/cpp/codecs/static/example/ya.make new file mode 100644 index 00000000000..ca6c5fd900a --- /dev/null +++ b/library/cpp/codecs/static/example/ya.make @@ -0,0 +1,24 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( +    GLOBAL example.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/codecs/static +) + +ARCHIVE_ASM( +    "solar-8k-a.huffman.1467494385.codec_info" +    NAME codec_info_sa_huff_20160707 +) + +ARCHIVE_ASM( +    "huffman.1467494385.codec_info" +    NAME codec_info_huff_20160707 +) + +END() diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp new file mode 100644 index 00000000000..44a07dd73a2 --- /dev/null +++ b/library/cpp/codecs/static/static.cpp @@ -0,0 +1,98 @@ +#include "static.h" +#include "common.h" + +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/archive/yarchive.h> + +#include <util/draft/datetime.h> + +#include <util/string/builder.h> +#include <util/stream/buffer.h> +#include <util/stream/mem.h> +#include <util/string/hex.h> +#include <util/ysaveload.h> + +namespace NCodecs { +    static constexpr TStringBuf STATIC_CODEC_INFO_MAGIC = "CodecInf"; + +    static TStringBuf GetStaticCodecInfoMagic() { +        return STATIC_CODEC_INFO_MAGIC; +    } + +    void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo& info) { +        TBufferOutput bout; +        info.SerializeToArcadiaStream(&bout); +        ui64 hash = DataSignature(bout.Buffer()); +        out.Write(GetStaticCodecInfoMagic()); +        ::Save(&out, hash); +        ::Save(&out, bout.Buffer()); +    } + +    TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in) { +        { +            TBuffer magic; +            magic.Resize(GetStaticCodecInfoMagic().size()); +            Y_ENSURE_EX(in.Read(magic.Data(), GetStaticCodecInfoMagic().size()) == GetStaticCodecInfoMagic().size(), +                        TCodecException() << "bad codec info"); +            Y_ENSURE_EX(TStringBuf(magic.data(), magic.size()) == GetStaticCodecInfoMagic(), +                        TCodecException() << "bad codec info"); +        } + +        ui64 hash; +        ::Load(&in, hash); +        TBuffer info; +        ::Load(&in, info); +        Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info"); + +        TStaticCodecInfo result; +        Y_ENSURE_EX(result.ParseFromArray(info.data(), info.size()), TCodecException() << "bad codec info"); + +        return result; +    } + +    TString SaveCodecInfoToString(const TStaticCodecInfo& info) { +        TStringStream s; +        SaveCodecInfoToStream(s, info); +        return s.Str(); +    } + +    TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) { +        TMemoryInput m{data.data(), data.size()}; +        return LoadCodecInfoFromStream(m); +    } + +    TString FormatCodecInfo(const TStaticCodecInfo& ci) { +        TStringBuilder s; +        s << "codec name:      " << ci.GetDebugInfo().GetCodecName() << Endl; +        s << "codec hash:      " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl; +        s << "dict size:       " << ci.GetStoredCodec().Size() << Endl; +        s << "sample mult:     " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl; +        s << "orig.compress:   " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl; +        s << "timestamp:       " << ci.GetDebugInfo().GetTimestamp() << " (" +          << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString() +          << ")" << Endl; +        s << "revision:        " << ci.GetDebugInfo().GetRevisionInfo() << Endl; +        s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl; +        s << "training set resId:   " << ci.GetDebugInfo().GetTrainingSetResId() << Endl; +        return s; +    } + +    TString LoadStringFromArchive(const ui8* begin, size_t size) { +        TArchiveReader ar(TBlob::NoCopy(begin, size)); +        Y_VERIFY(ar.Count() == 1, "invalid number of entries"); +        auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0)); +        return TString{blob.AsCharPtr(), blob.Size()}; +    } + +    TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) { +        return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec()); +    } + +    TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) { +        const auto& data = LoadStringFromArchive(begin, size); +        const auto& info = LoadCodecInfoFromString(data); +        const auto& codec = RestoreCodecFromCodecInfo(info); +        Y_ENSURE_EX(codec, TCodecException() << "null codec"); +        return codec; +    } +} diff --git a/library/cpp/codecs/static/static.h b/library/cpp/codecs/static/static.h new file mode 100644 index 00000000000..c1eaed2a742 --- /dev/null +++ b/library/cpp/codecs/static/static.h @@ -0,0 +1,34 @@ +#pragma once + +#include <library/cpp/codecs/codecs.h> + +#include <util/generic/strbuf.h> +#include <util/generic/string.h> +#include <util/stream/output.h> + +namespace NCodecs { +    class TStaticCodecInfo; + +    // load + +    TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&); + +    TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data); + +    TString LoadStringFromArchive(const ui8* begin, size_t size); + +    TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size); + +    // save + +    TString SaveCodecInfoToString(const TStaticCodecInfo&); + +    void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo&); + +    // misc + +    TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in); + +    TString FormatCodecInfo(const TStaticCodecInfo&); + +} diff --git a/library/cpp/codecs/static/static_codec_info.proto b/library/cpp/codecs/static/static_codec_info.proto new file mode 100644 index 00000000000..362abb4dadf --- /dev/null +++ b/library/cpp/codecs/static/static_codec_info.proto @@ -0,0 +1,17 @@ +package NCodecs; + +message TStaticCodecInfo { +    message TDebugInfo { +        optional string CodecName = 1;           // the exact codec variant name +        optional uint64 Timestamp = 2;           // when the codec was built +        optional string RevisionInfo = 3;        // the arcadia revision info +        optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression +        optional float Compression = 5;          // the compression on the training set ((raw_size - coded_size) / raw_size) +        optional string TrainingSetComment = 6;  // a human readable description of the training set +        optional string TrainingSetResId = 7;    // the training set sandbox resource id +        optional uint64 StoredCodecHash = 8;     // cityhash64(data) +    } +     +    optional bytes StoredCodec = 1;           // the data of the codec +    optional TDebugInfo DebugInfo = 2;        // misc debug info which could be useful in finding whereabouts later +} diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp new file mode 100644 index 00000000000..fe776912805 --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ct_common.cpp @@ -0,0 +1,74 @@ +#include "ct_common.h" + +#include <library/cpp/codecs/codecs.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/string_utils/base64/base64.h> + +#include <util/stream/output.h> +#include <util/string/builder.h> +#include <util/system/hp_timer.h> + +namespace NCodecs { +    TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const { +        TStringBuilder s; +        s << "raw size/item:      " << RawSizePerRecord() << Endl; +        s << "enc.size/item:      " << EncSizePerRecord() << Endl; +        if (checkMode) { +            s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl; +        } +        s << "enc time us/item:   " << EncTimePerRecordUS() << Endl; +        s << "dec time us/item:   " << DecTimePerRecordUS() << Endl; +        s << "dict size:          " << info.GetStoredCodec().Size() << Endl; +        s << "compression:        " << AsPercent(Compression()) << " %" << Endl; +        if (checkMode) { +            s << "orig.compression:   " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl; +        } +        return s; +    } + +    TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) { +        TComprStats stats; + +        TBuffer encodeBuffer; +        TBuffer decodeBuffer; +        for (const auto& data : input) { +            encodeBuffer.Clear(); +            decodeBuffer.Clear(); + +            stats.Records += 1; +            stats.RawSize += data.size(); + +            THPTimer timer; +            c.Encode(data, encodeBuffer); +            stats.EncSize += encodeBuffer.size(); +            stats.EncSeconds += timer.PassedReset(); + +            c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer); +            stats.DecSeconds += timer.PassedReset(); +            Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records); +        } + +        return stats; +    } + +    void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) { +        TStringBuf bin(blob.AsCharPtr(), blob.Size()); +        TStringBuf line; +        TString buffer; +        while (bin.ReadLine(line)) { +            if (DSF_BASE64_LF == fmt) { +                Base64Decode(line, buffer); +                line = buffer; +            } +            if (!line) { +                continue; +            } +            result.emplace_back(line.data(), line.size()); +        } +    } + +    TBlob GetInputBlob(const TString& dataFile) { +        return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin); +    } + +} diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h new file mode 100644 index 00000000000..9d3dcbda934 --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ct_common.h @@ -0,0 +1,75 @@ +#pragma once + +#include <util/generic/string.h> +#include <util/generic/vector.h> +#include <util/memory/blob.h> +#include <cmath> + +namespace NCodecs { +    class TStaticCodecInfo; +    class ICodec; + +    struct TComprStats { +        double EncSeconds = 0; +        double DecSeconds = 0; +        size_t Records = 0; +        size_t RawSize = 0; +        size_t EncSize = 0; + +        static double Round(double n, size_t decPlaces = 2) { +            double p = pow(10, decPlaces); +            return round(n * p) / p; +        } + +        static double AsPercent(double n) { +            return Round(n * 100); +        } + +        static double AsMicroSecond(double s) { +            return s * 1000000; +        } + +        double PerRecord(double n) const { +            return Round((double)(Records ? n / Records : 0)); +        } + +        double Compression() const { +            return ((double)RawSize - (double)EncSize) / RawSize; +        } + +        double EncTimePerRecordUS() const { +            return PerRecord(AsMicroSecond(EncSeconds)); +        } + +        double DecTimePerRecordUS() const { +            return PerRecord(AsMicroSecond(DecSeconds)); +        } + +        double RawSizePerRecord() const { +            return PerRecord(RawSize); +        } + +        double EncSizePerRecord() const { +            return PerRecord(EncSize); +        } + +        double OldEncSizePerRecord(double compr) const { +            return PerRecord((1 - compr) * RawSize); +        } + +        TString Format(const TStaticCodecInfo&, bool checkMode) const; +    }; + +    TComprStats TestCodec(const ICodec&, const TVector<TString>& data); + +    enum EDataStreamFormat { +        DSF_NONE, +        DSF_PLAIN_LF /* "plain" */, +        DSF_BASE64_LF /* "base64" */, +    }; + +    void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&); + +    TBlob GetInputBlob(const TString& dataFile); + +} diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make new file mode 100644 index 00000000000..d624222dad0 --- /dev/null +++ b/library/cpp/codecs/static/tools/common/ya.make @@ -0,0 +1,19 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( +    ct_common.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/codecs/static +    library/cpp/getopt/small +    library/cpp/string_utils/base64 +    util/draft +) + +GENERATE_ENUM_SERIALIZATION(ct_common.h) + +END() diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README new file mode 100644 index 00000000000..723a68300b0 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/README @@ -0,0 +1,4 @@ +This is a viewer for generated codec and utility for verification of the compression quality on a new data. + +Usage: +static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp new file mode 100644 index 00000000000..9c8d568d823 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp @@ -0,0 +1,73 @@ +#include <library/cpp/codecs/static/tools/common/ct_common.h> +#include <library/cpp/codecs/static/static.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/codecs/codecs.h> +#include <library/cpp/getopt/small/last_getopt.h> + +#include <util/digest/city.h> +#include <util/generic/yexception.h> +#include <util/stream/file.h> +#include <util/stream/buffer.h> +#include <util/stream/format.h> +#include <util/string/builder.h> + +int main(int argc, char** argv) { +    NCodecs::TCodecPtr codecPtr; +    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; +    TString codecFile; +    bool testCompression = false; + +    auto opts = NLastGetopt::TOpts::Default(); +    opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator."); +    opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt"); +    NCodecs::TStaticCodecInfo codec; + +    opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) { +                                                                            codecFile = name; +                                                                            codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll())); +                                                                            codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); +                                                                        }) +        .Required() +        .Help(".codec_info file with serialized static data for codec"); + +    opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance"); + +    opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format"); + +    opts.SetFreeArgsMin(0); +    opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files"); + +    NLastGetopt::TOptsParseResult res(&opts, argc, argv); + +    Cout << codecFile << Endl; +    Cout << NCodecs::FormatCodecInfo(codec) << Endl; + +    if (testCompression) { +        if (NCodecs::DSF_NONE == fmt) { +            Cerr << "Specify format (-f|--format) for testing set input" << Endl; +            exit(1); +        } + +        Cout << "Reading testing set data ... " << Flush; + +        TVector<TString> allData; +        for (const auto& freeArg : res.GetFreeArgs()) { +            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); +        } + +        if (!res.GetFreeArgs()) { +            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); +        } + +        Cout << "Done" << Endl << Endl; + +        Cout << "records:  " << allData.size() << Endl; +        Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; + +        Cout << "Testing compression ... " << Flush; +        auto stats = NCodecs::TestCodec(*codecPtr, allData); +        Cout << "Done" << Endl << Endl; + +        Cout << stats.Format(codec, true) << Endl; +    } +} diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make new file mode 100644 index 00000000000..90e06ca448d --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make @@ -0,0 +1,16 @@ +PROGRAM() + +OWNER(velavokr) + +SRCS( +    static_codec_checker.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/codecs/static +    library/cpp/codecs/static/tools/common +    library/cpp/getopt/small +) + +END() diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README new file mode 100644 index 00000000000..e6bb52b9591 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/README @@ -0,0 +1,4 @@ +This is a utility for reproducible  teaching of a codec. And also for saving it into a file with a unique name for a static compilation as a resource. + +Usage: +static_codec_generator -t -m 'the training data description' -f plain samples.txt diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp new file mode 100644 index 00000000000..45fdb5c5fe8 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp @@ -0,0 +1,82 @@ +#include <library/cpp/codecs/static/tools/common/ct_common.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <library/cpp/codecs/static/builder.h> +#include <library/cpp/codecs/codecs.h> + +#include <library/cpp/getopt/small/last_getopt.h> + +#include <util/generic/yexception.h> +#include <util/stream/file.h> +#include <util/string/builder.h> + +int main(int argc, char** argv) { +    NCodecs::TCodecBuildInfo info; +    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; + +    auto opts = NLastGetopt::TOpts::Default(); +    opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt"); +    opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin"); + +    opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set"); + +    opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set"); + +    opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName); + +    opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size"); + +    opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format"); + +    opts.AddLongOption("list-codecs").NoArgument().Handler0([]() { +                                                      Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl; +                                                      exit(0); +                                                  }) +        .Optional() +        .Help("list available codecs"); + +    opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info + +    opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info + +    opts.SetFreeArgsMin(0); +    opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files"); + +    NLastGetopt::TOptsParseResult res(&opts, argc, argv); + +    Cout << "Reading training set data ... " << Flush; +    TVector<TString> allData; +    for (const auto& freeArg : res.GetFreeArgs()) { +        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); +    } + +    if (!res.GetFreeArgs()) { +        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); +    } +    Cout << "Done" << Endl << Endl; + +    Cout << "records:  " << allData.size() << Endl; +    Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; + +    Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush; +    auto codec = NCodecs::BuildStaticCodec(allData, info); +    Cout << "Done" << Endl; + +    TString codecName = NCodecs::GetStandardFileName(codec); +    NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); + +    Cout << "Testing compression ... " << Flush; +    auto stats = NCodecs::TestCodec(*codecPtr, allData); +    Cout << "Done" << Endl << Endl; + +    codec.MutableDebugInfo()->SetCompression(stats.Compression()); + +    Cout << stats.Format(codec, false) << Endl; + +    Cout << "Saving as " << codecName << " ... " << Flush; +    { +        TUnbufferedFileOutput fout{codecName}; +        NCodecs::SaveCodecInfoToStream(fout, codec); +        fout.Finish(); +    } +    Cout << "Done" << Endl << Endl; +} diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make new file mode 100644 index 00000000000..efbc440dd18 --- /dev/null +++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make @@ -0,0 +1,17 @@ +PROGRAM() + +OWNER(velavokr) + +SRCS( +    static_codec_generator.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/codecs/static +    library/cpp/codecs/static/tools/common +    library/cpp/digest/md5 +    library/cpp/getopt/small +) + +END() diff --git a/library/cpp/codecs/static/tools/tests/canondata/result.json b/library/cpp/codecs/static/tools/tests/canondata/result.json new file mode 100644 index 00000000000..7a637c6763a --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/canondata/result.json @@ -0,0 +1,6 @@ +{ +    "static_codec_tools.test_static_codec_tools": { +        "checksum": "960e3c8c57fb846ab53ccbd07e287233", +        "uri": "sbr://144512644/static_codec_tools.test_static_codec_tools/solar-8k-a.huffman.1467494385.codec_info" +    } +}
\ No newline at end of file diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py new file mode 100644 index 00000000000..db4140e3703 --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +import yatest.common as tt +import os.path as op + +def test_static_codec_tools(): +    tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")] +        + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1", +            "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"], +        timeout=60) +    assert(op.exists("solar-8k-a.huffman.1467494385.codec_info")) +    tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"), +        args=["-c", "solar-8k-a.huffman.1467494385.codec_info"], +        timeout=60) +    tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")] +        + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"], +        timeout=60) +    return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info") diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make new file mode 100644 index 00000000000..c5324eaf53b --- /dev/null +++ b/library/cpp/codecs/static/tools/tests/ya.make @@ -0,0 +1,20 @@ +PY2TEST() + +OWNER(velavokr) + +TEST_SRCS(static_codec_tools.py) + +DATA(sbr://143310406) + +TIMEOUT(4200) + +TAG(ya:not_autocheck) + +DEPENDS( +    library/cpp/codecs/static/tools/static_codec_checker +    library/cpp/codecs/static/tools/static_codec_generator +) + + + +END() diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make new file mode 100644 index 00000000000..dd3e8437aa4 --- /dev/null +++ b/library/cpp/codecs/static/tools/ya.make @@ -0,0 +1,5 @@ +RECURSE( +    common +    static_codec_generator +    static_codec_checker +) diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp new file mode 100644 index 00000000000..b47c279ed14 --- /dev/null +++ b/library/cpp/codecs/static/ut/builder_ut.cpp @@ -0,0 +1,57 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/codecs/static/builder.h> +#include <library/cpp/codecs/static/static_codec_info.pb.h> +#include <util/string/vector.h> + +class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase { +    UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest) +    UNIT_TEST(TestBuild) +    UNIT_TEST_SUITE_END(); + +private: +    TVector<TString> PrepareData() { +        TVector<TString> data; +        for (ui32 i = 'a'; i <= 'z'; ++i) { +            data.push_back(TString(1, (char)i)); +        } +        return data; +    } + +    void TestBuild() { +        TVector<TString> data; +        NCodecs::TCodecBuildInfo info; +        info.CodecName = "huffman"; +        info.SampleSizeMultiplier = 2; +        info.Timestamp = 1467494385; +        info.RevisionInfo = "r2385905"; +        info.TrainingSetComment = "some dummy data"; +        info.TrainingSetResId = "sbr://1234"; +        auto res = NCodecs::BuildStaticCodec(PrepareData(), info); +        UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(), +                                 "StoredCodec: \"\\007\\000huffman@S\\000a" +                                 "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o" +                                 "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>" +                                 "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8." +                                 "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7" +                                 "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0" +                                 "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9" +                                 "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" " +                                 "DebugInfo { " +                                 "CodecName: \"huffman\" " +                                 "Timestamp: 1467494385 " +                                 "RevisionInfo: \"r2385905\" " +                                 "SampleSizeMultiplier: 2 " +                                 "TrainingSetComment: \"some dummy data\" " +                                 "TrainingSetResId: \"sbr://1234\" " +                                 "StoredCodecHash: 2509195835471488613 " +                                 "}"); + +        UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info"); +        UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL); + +        auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res)); +        UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString()); +    } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest); diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp new file mode 100644 index 00000000000..57e1e628874 --- /dev/null +++ b/library/cpp/codecs/static/ut/static_ut.cpp @@ -0,0 +1,27 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/codecs/static/example/example.h> + +class TStaticCodecUsageTest: public NUnitTest::TTestBase { +    UNIT_TEST_SUITE(TStaticCodecUsageTest) +    UNIT_TEST(TestUsage) +    UNIT_TEST_SUITE_END(); + +private: +    void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) { +        const TStringBuf letov = "Всё идёт по плану"; + +        TBuffer outEnc, outDec; +        NStaticCodecExample::Encode(outEnc, letov, dv); +        NStaticCodecExample::Decode(outDec, TStringBuf{outEnc.data(), outEnc.size()}); + +        UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize); +        UNIT_ASSERT_EQUAL(TStringBuf(outDec.data(), outDec.size()), letov); +    } + +    void TestUsage() { +        DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u); +        DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u); +    } +}; + +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest) diff --git a/library/cpp/codecs/static/ut/ya.make b/library/cpp/codecs/static/ut/ya.make new file mode 100644 index 00000000000..b9116097d87 --- /dev/null +++ b/library/cpp/codecs/static/ut/ya.make @@ -0,0 +1,14 @@ +UNITTEST_FOR(library/cpp/codecs/static) + +OWNER(velavokr) + +SRCS( +    builder_ut.cpp +    static_ut.cpp +) + +PEERDIR( +    library/cpp/codecs/static/example +) + +END() diff --git a/library/cpp/codecs/static/ya.make b/library/cpp/codecs/static/ya.make new file mode 100644 index 00000000000..00e00fd8d43 --- /dev/null +++ b/library/cpp/codecs/static/ya.make @@ -0,0 +1,18 @@ +LIBRARY() + +OWNER(velavokr) + +SRCS( +    builder.cpp +    static_codec_info.proto +    static.cpp +) + +PEERDIR( +    library/cpp/codecs +    library/cpp/archive +    library/cpp/svnversion +    util/draft +) + +END() diff --git a/library/cpp/codecs/tls_cache.cpp b/library/cpp/codecs/tls_cache.cpp new file mode 100644 index 00000000000..0a1b32bda14 --- /dev/null +++ b/library/cpp/codecs/tls_cache.cpp @@ -0,0 +1,4 @@ +#include "tls_cache.h" + +namespace NCodecs { +} diff --git a/library/cpp/codecs/tls_cache.h b/library/cpp/codecs/tls_cache.h new file mode 100644 index 00000000000..0184e4bb6c2 --- /dev/null +++ b/library/cpp/codecs/tls_cache.h @@ -0,0 +1,100 @@ +#pragma once + +#include <util/generic/buffer.h> +#include <util/generic/deque.h> +#include <util/generic/noncopyable.h> +#include <util/generic/strbuf.h> +#include <util/system/tls.h> +#include <util/thread/singleton.h> + +namespace NCodecs { +    template <class TItem> +    struct TClear { +        void operator()(TItem& item) const { +            item.Clear(); +        } +    }; + +    template <class TItem, class TCleaner = TClear<TItem>> +    class TTlsCache { +        using TSelf = TTlsCache<TItem, TCleaner>; + +        struct TItemHolder: public TIntrusiveListItem<TItemHolder> { +            TItemHolder(TSelf& factory) +                : Factory(factory) +            { +            } + +            void Release() { +                Factory.Release(*this); +            } + +            TSelf& Factory; +            TItem Item; +        }; + +        class TItemGuard { +        public: +            explicit TItemGuard(TSelf& fact) +                : Holder(fact.Acquire()) +            { +            } + +            TItemGuard(TItemGuard&& other) noexcept { +                *this = std::move(other); +            } + +            TItemGuard& operator=(TItemGuard&& other) noexcept { +                if (&other != this) { +                    std::swap(Holder, other.Holder); +                } +                return *this; +            } + +            ~TItemGuard() { +                if (Holder) { +                    Holder->Release(); +                } +            } + +            TItem& Get() & { +                Y_ASSERT(Holder); +                return Holder->Item; +            } + +            TItem& Get() && = delete; + +        private: +            TItemHolder* Holder = nullptr; +        }; + +    public: +        TItemGuard Item() { +            return TItemGuard(*this); +        } + +        static TSelf& TlsInstance() { +            return *FastTlsSingleton<TSelf>(); +        } + +    private: +        TItemHolder* Acquire() { +            if (Free.Empty()) { +                return new TItemHolder(*this); +            } else { +                return Free.PopBack(); +            } +        } + +        void Release(TItemHolder& item) { +            Cleaner(item.Item); +            Free.PushBack(&item); +        } + +    private: +        TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free; +        TCleaner Cleaner; +    }; + +    using TBufferTlsCache = TTlsCache<TBuffer>; +} diff --git a/library/cpp/codecs/ut/codecs_ut.cpp b/library/cpp/codecs/ut/codecs_ut.cpp new file mode 100644 index 00000000000..caf6089aef7 --- /dev/null +++ b/library/cpp/codecs/ut/codecs_ut.cpp @@ -0,0 +1,1360 @@ +#include <library/cpp/codecs/delta_codec.h> +#include <library/cpp/codecs/huffman_codec.h> +#include <library/cpp/codecs/pfor_codec.h> +#include <library/cpp/codecs/solar_codec.h> +#include <library/cpp/codecs/zstd_dict_codec.h> +#include <library/cpp/codecs/comptable_codec.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/generic/buffer.h> +#include <util/string/util.h> +#include <util/string/hex.h> +#include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> + +namespace { +    const char* TextValues[] = { +        "! сентября газета", +        "!(возмездие это)!", +        "!(материнский капитал)", +        "!(пермь березники)", +        "!биография | !жизнь / + розинг | зворыгин & изобретение | телевидение | электронно лучевая трубка", +        "!овсиенко николай павлович", +        "!путин", +        "\"i'm on you\" p. diddy тимати клип", +        "\"билайн\" представит собственный планшет", +        "\"в особо крупном размере\"", +        "\"викиликс\" джулиан ассанж", +        "\"вимм билль данн", +        "\"газэнергосеть астрахань", +        "\"газэнергосеть астрахань\"", +        "\"домодедово\" ту-154", +        "\"жилина\" \"спартак\" видео", +        "\"зелёнsq шершнm\"", +        "\"зелёного шершня\"", +        "\"золотой граммофон\" марины яблоковой", +        "\"золотой граммофон-2010\"", +        "\"калинниковы\"", +        "\"манчестер юнайтед\" (англия) \"валенсия\" (испания) 1:1 (0:1)", +        "\"маркер\"", +        "\"моника\" засыпает москву снегом", +        "\"моника\" снегопад", +        "\"о безопасности\",", +        "\"памятку\" для пассажиров воздушных международных рейсов", +        "\"петровский парк\" и \"ходынское поле\"", +        "\"путинская\" трава", +        "\"пятерочка\"купила \"копейку\"", +        "\"пятёрочка\" и \"копейка\" объединились", +        "\"реал\" \"осер\" 4:0", +        "\"речь мутко\"", +        "\"российский лес 2010\"", +        "\"ростехинвентаризация федеральное бти\" рубцов", +        "\"саня останется с нами\",", +        "\"следопыт\" реалити шоу", +        "\"слышишь\" молодые авторы", +        "\"стадион\"", +        "\"ходынское поле\" метро", +        "\"хроники нарнии\"", +        "\"чистая вода\"", +        "\"школа деда мороза\"", +        "# asus -1394", +        "# сторонники wikileaks", +        "#106#", +        "#11", +        "#8 какой цвет", +        "#если клиент", +        "$ 13,79", +        "$ xnj ,s dct ,skb ljdjkmys !!!", +        "$ в день", +        "$ диск компьютера", +        "$.ajax", +        "$125 000", +        "$курс", +        "% в си", +        "% влады", +        "% годовых", +        "% женщин и % мужчин в россии", +        "% занятости персонала", +        "% инфляции 2010", +        "% инфляции в 2010 г.", +        "% налога", +        "% налогов в 2010г.", +        "% общего количества", +        "% от числа", +        "% по налогу на прибыль организации", +        "%24", +        "%академия%", +        "%комарова%татьяна", +        "& в 1с", +        "&& (+не существует | !такой проблемы)", +        ">>>скачать | download c cs strikez.clan.su<<<", +        ">hbq nbityrjd", +        "< какой знак", +        "< лицей | < техническая школа# < история#< лицей сегодня#< перечень профессий#< руководство лицея#< прием учащихся#< контакты#< схема проезда#< фотогалереяистория создания лицея и основные этапы путиулица купчинская дом 28", +        "<<link>>", +        "</storage>", +        "<bfnkjy", +        "<bktntd", +        "<cr", +        "<ddr3>", +        "<e[ufknthcrbq abyfycjdsq", +        "<fcctqys", +        "<fhcf", +        "<fhctkjyf he,by", +        "<firbhbz", +        "<fyr djphj;ltybt", +        "<fyr vjcrds", +        "<fyr резерв", +        "<fyufkjh", +        "<index>", +        "<jkmifz jrhe;yfz rbtd", +        "<kbpytws", +        "<megafon> интернет", +        "<thtpybrb gthvcrbq rhfq", +        "<tkjxrf", +        "<беларусь это мы", +        "<бокс, версия ibf", +        "designer tree svc", +        "seriesg810", +        "doll makers", +        "rotten.com", +        "evening gowns", +        "discover", +        "south carolina escorts", +        "forkliftjobsinhousron", +        "mailbox", +        "alexis", +        "espn.com mlb", +        "gypsy.chat.2k", +        "the man in the mirror", +        "azteca", +        "sebastian telfair - jamel thomas", +        "kirby", +        "java", +        "trike motorcycles", +        "piasecki helicopter", +        "wicca binding spells", +        "pier park panama city beach .com", +        "continente europeo", +        "asswatchers.com", +        "asswatchers.com", +        "easton stealth stiff flex cnt adult baseball bat - 3", +        "facesofdeath", +        "video of 9 11", +        "profileedit.myspace.com", +        "georgia snakes", +        "yahoo.com", +        "google", +        "http wwwclassicindustries .corvettes-roadsters.com", +        "arington training stable", +        "find bred of dog", +        "southpark contact tables for myspace", +        "symptoms of laryngitis", +        "suzuki stickers", +        "avianca", +        "radio shack", +        "dominican republic pictures", +        "recent", +        "mapquest", +        "http myspace .com", +        "research chemicals supplies", +        "winn dixie.com", +        "drivers 20guide.com", +        "dylan whitley north carolina", +        "google com", +        "order wild horses cigarettes", +        "yahoocom", +        "fl runners", +        "aol companion install", +        "nbc.comdond 59595 6", +        "directv.com", +        "motorsports insurance", +        "cartoonnetwork", +        "pop warner-victorville", +        "black iorn spars", +        "goog", +        "the suns", +        "ebay", +        "pop warner", +        "philadelphia cream cheese", +        "oklahoma", +        "doudleday books.com", +        "javascript download", +        "city of nacogdoches", +        "sfyl", +        "myspace.com", +        "baptism pictures", +        "games", +        "depredadores sexuales", +        "mycl.cravelyrics.com", +        "become a bone marrow donner", +        "vintage copies", +        "ford dealership", +        "candystand", +        "smarthairypussyom", +        "yahoo.com", +        "vanderbilt.edu", +        "ebay", +        "grouper", +        "mys", +        "myrsa and birth defects", +        "hatteras rentals", +        "female escorts", +        "ja rule", +        "meat bluesheet", +        "yahoo", +        "american disability act court cases", +        "clearview cinemas", +        "hard69.com", +        "make a living will for free", +        "fat asses", +        "flashback concert in atlanta ga", +        "fucking", +        "flat abdomen exercises", +        "big brother facial", +        "german dictionary", +        "black dick", +        "ebonymovies", +        "airsoft rifles", +        "best fishing days calander", +        "tattoo", +        "impressions", +        "cs.com", +        "northwest airlines reservations", +        "halo 3", +        "wallbaums", +        "chat room listings", +        "waterbury ct warrants", +        "pictures of chad michael murry", +        "yahoo", +        "install wallpaper", +        "halo 3", +        "clits and tits", +        "prothsmouth general circuit courts", +        "old hawthorne columbia", +        "jess lee photos", +        "no deposit casino bonus", +        "bbc gladiator dressed to kill", +        "anemagazine.com", +        "lyrics unfaithful", +        "gold bars found", +        "art.comhttp", +        "free unlock key", +        "man o war lost a race", +        "blue cross and blue shield", +        "phenergan", +        "myspace.com", +        "http www.constitutional court.com", +        "monster trucks", +        "the breeze fort myers fla.newspaper", +        "davis origin name", +        "upper deck.com", +        "arizona", +        "akira lane", +        "ebaumsworld", +        "union pacific jobs", +        "google.cm", +        "free bigt girls nudes", +        "abcnews.com", +        "tootse.com", +        "az lyrics", +        "freddy", +        "georgia.com", +        "johncombest.com", +        "nelly", +        "gussi mane", +        "university of illinois", +        "oregan valcano's", +        "mythbusters", +        "sailormoon hentai", +        "international cub tractor", +        "desert sky movie green valley az", +        "evite", +        "nelly nud epics", +        "penndot.com", +        "first banks", +        "psp manual", +        "google", +        "jackieaudet hotmail.com", +        "internet", +        "shootinggames", +        "shootinggames", +        "montana western rendezvous of art", +        "hello kitty layouts", +        "yahoo", +        "translation", +        "glenn scott attorney", +        "hallofshame", +        "capitolone.com", +        "recipe for popovers", +        "pictures of demons", +        "barnes and nobles.com", +        "rbd", +        "hart and hunnington tattoo shop", +        "janepowellmovies.com", +        "ged schools in the military", +        "kelis", +        "hvacagent", +        "neat home organizer television show", +        "2719 24-2-crime and courts", +        "fsu", +        "torpedo bomber games", +        "love poems", +        "polly pocket'toys", +        "yweatherahoo.com", +        "jungle gin", +        "flemington new jersey real estate", +        "milf hunter stories", +        "budget.com", +        "chopperstyle", +        "keno player", +        "up skirt", +        "dogs", +        "beerballers", +        "phat white butt", +        "phat white butt", +        "va licensing for interpeters for the deaf", +        "white page phone book maiden north carolina", +        "controlled 20solutions 20corp.com", +        "friedman jewelery", +        "kelis", +        "curtains", +        "curtains", +        "fuck me harder", +        "naked girls", +        "southwest airlines boarding pass", +        "mailbox", +        "1976 mavrick", +        "adult diapers", +        "horse nasal discharge", +        "charles ludlam", +        "google", +        "himnos en espanol", +        "quarter horses for sale in nebraska", +        "cosmo", +        "hi", +        "mattel", +        "aouto 20trader.com", +        "sunsetter awnings", +        "bl.cfm", +        "at", +        "tattoo designs", +        "bubs", +        "yahoo", +        "free live gay cam chats", +        "antibiotics", +        "upgrade", +        "aessuccess.org", +        "yahoo", +        "boobdex", +        "the jackle", +        "plus size lingerie magazines for home", +        "lehigh valley little league", +        "ancient trade coins", +        "pillsbury", +        "colorado springs", +        "canada aviation jobs", +        "free guitar tablature", +        "kids aol", +        "capitol community colage", +        "kevin thomas bermuda", +        "missouri lotto", +        "homedepotsportscomplex.com", +        "dr. franklin schneier", +        "williamsburg va. hotels", +        "aim", +        "morningbuzz", +        "probusines.com", +        "wwwalbasoul.com", +        "w.runehints.com", +        "yahoo.com", +        "yahoo.com", +        "yahoo.com", +        "fantasy 5", +        "xxx rape", +        "hawaiian gift baskets", +        "madonna.com", +        "myspace contact tables", +        "white cock", +        "safe space", +        "drinks", +        "o rly", +        "dsl", +        "wwww.uncc.edu", +        "wwww.uncc.edu", +        "wwww.uncc.edu", +        "online overseas checkt.westernunion.com", +        "angina", +        "heba technologies", +        "hebrew ancient coins", +        "games", +        "recent", +        "international male.com", +        "sex pics", +        "paul wall layouts for myspace", +        "health", +        "wire lamp shade frames", +        "windows", +        "top business colleges", +        "mary jo eustace", +        "attored", +        "oklahoma indian legal services", +        "6arab", +        "santo nino", +        "10.1.0.199", +        "http www.myspace.com daffydonn07", +        "marine electrical", +        "sandy creek cabins weekend new york", +        "onionbutts", +        "tucson classifieds", +        "new york times", +        "recently deleted screen names", +        "goldeneagle.net", +        "fta support forums", +        "low protein in bloos", +        "datring", +        "lilwayne", +        "free billiards games", +        "yahoo", +        "ako", +        "a.tribalfusion.c script language", +        "dustin davis", +        "cooking", +        "yahoo.com", +        "universal studios", +        "adult chat", +        "santa monica flea market", +        "carpevino.us", +        "wine vinyard in stewertstown pa", +        "y", +        "craigslist", +        "ups.com", +        "1-866-347-3292", +        "renegade boats", +        "renegade boats", +        "sunset state beach caping", +        "artofstanlync.org", +        "heart-i want make love to you video", +        "triangles around the world", +        "mycl.cravelyrics.com", +        "in the bible what type of persons were forced to walk around in public and say unclean unclean", +        "providence water fire", +        "googlecom", +        "yahoo.com", +        "b.g", +        "website de rebelde", +        "stoplinks", +        "allison 2000 transmission", +        "thepriceanduseofgasoline.com", +        "chamillinaire", +        "veryspecialhomescom", +        "crashbandicoot", +        "a short sex story", +        "yahoo.com", +        "music now", +        "east carolina university", +        "vandalism in new york", +        "the bainde soleil company", +        "dicaprio movies", +        "xxx dvds", +        "visual basic scripting support", +        "english bulldogs", +        "travelocity.com", +        "website for asstr.org", +        "hypnotic slave training", +        "pogo", +        "university at buffalo addmissions", +        "screen name services", +        "superdrol", +        "art institute", +        "online business cards", +        "aolfinancial", +        "upgrade shop", +        "anderson abrasive", +        "weatherchannel.com", +        "recent", +        "ebay", +        "diagram and xray of a normal shouldercheck out surgicalpoker.comfor more sports medicine and orthopedic information and images check out emedx.com by dr. allan mishranormal diagram normal x-ray", +        "95 mustang gt chips", +        "gold grills", +        "hap housing in portland or", +        "car sales", +        "swimming with dolphins", +        "jennifer lopez nude", +        "wwwdubcnn.com", +        "dominicks pizza", +        "fl studio", +        "http blackplanet .com", +        "http blackplanet .com", +        "http blackplanet .com", +        "A$AP Rocky", +        "benie mac", +        "fujifilm.com", +        "aol dialup setup", +        "metal fabrication tools", +        "internet", +        "buy my painting", +        "pulaski va classifieds", +        "w.coj.net", +        "postopia.com", +        "no medical records hydrocodone", +        "auto completes for deal or no deal contest", +        "http www. big monster dicks .com", +        "invacare wheelchairs", +        "musicdownload.com", +        "president bush", +        "heavy equipment", +        "inmate information", +        "allina.com", +        "megan law.gov", +        "wwwl.eharmony.com", +        "jobs in colombiaoqx0nq", +        "beastsex", +        "ferguisson", +        "heart-i wanna make love to you vedio", +        "west georgia university", +        "west georgia university", +        "hsn", +        "bb&t", +        "midas realty", +        "yahoo", +        "mytrip.com", +        "donna texas mcdonalds", +        "free picture of our lady", +        "bubs", +        "taken chemo for 5 month's cancer can still be seen on ct scan", +        "porn 20video 20clips", +        "lake monsters", +        "freedj mix vibes", +        "myspace.coim", +        "la joya school district tx", +        "colorado bungee jumping", +        "yahoo", +        "google.com", +        "lafayette co vampire grave", +        "ice cube", +        "internet", +        "tccd.edu", +        "google", +        "people", +        "instructions on putting together a filing cabinet", +        "click.babycenter.com", +        "90minut", +        "ramien noodles", +        "lilwayne", +        "danni virgin", +        "nice sexy girls.com", +        "guttural pouch", +        "free male masturbating", +        "good", +        "rotton 20dot.com", +        "fox sports", +        "seth rogen", +        "desb.mspaceads.com", +        "betjc.com", +        "pictures of quebec", +        "gold in quartz", +        "evergreen college", +        "runescape", +        "gastons white river resort", +        "sunset beach santa cruz", +        "auto parts", +        "travelocity", +        "myspace.com", +        "laptops", +        "beyaonce and j", +        "free gay ebony knights webcams", +        "google", +        "derek watson", +        "alice in wonderland tshirts", +        "hippa p rivacy act", +        "down payment mortgage", +        "believe it or not", +        "mys", +        "datatreca", +        "onesuite", +        "names", +        "lil john", +        "scales of justice cuff links", +        "localsales.com", +        "alametris denise lipsey", +        "adam for adam", +        "flip flops crochet", +        "arbors", +        "heb hospital", +        "myspae.com", +        "midevil breast torture", +        "askjeeves", +        "assparade", +        ".comhttp", +        "weekly hotels reston virginia", +        "noiceinparadise.com", +        "pre diabetic diet", +        "h.i.m.com", +        "myspace", +        "myspace", +        "wwww.sex.lp.cpm", +        "mcso mugshots", +        "roush", +        "wellfargo", +        "lilwayne", +        "hopecherie", +        "frontgate.com", +        "barbados registration department", +        "american pitbull", +        "free pc full flight simulation game downloads", +        "google", +        "vaginal secretion grey stuff", +        "myspace layouts", +        "kanye west", +        "walmart", +        "pain in hip and leg", +        "tenneesseeaquarium.com", +        "suncom.com", +        "alysseandrachelwerehere", +        "pimiclo", +        "starmagazine.com", +        "classifieds", +        "mount rushmore in dakota", +        "sams", +        "disney com", +        "beastyality", +        "chief joseph paintings", +        "henry scott", +        "paris hilton", +        "kb903235", +        "autotrader", +        "irish traveller", +        "ajcobs.com", +        "art of stanlync.org", +        "fox news", +        "freeporn", +        "depo provera", +        "air france", +        "talk city active chats", +        "codes for the gamecube game resident evil 4", +        "good food to eat for sugar diabetes", +        "warpmymind", +        "arc jacksonville fl", +        "7fwww.sendspace.com", +        "j blackfoot", +        "mcso madison street jail inmate", +        "macys", +        "eduscapes", +        "free picture of our lady", +        "http www.eastman.org", +        "minneapolisstartribune localnews", +        "minneapolisstartribune localnews", +        "tennessee", +        "foodtown", +        "anti virous download", +        "http www.mdland rec.net", +        "ed edd eddy", +        "maryjbilge", +        "shipping services", +        "baseball videogames", +        "egyption ancient coins", +        "internet", +        "what is sodomy", +        "international cub lowboy", +        "mary j. bilge", +        "scenic backgrounds", +        "google.com", +        "rosettalangueges.com", +        "titanpoker.net", +        "titie show", +        "edelen realtor", +        "lil cim", +        "china.com", +        "boost mobile", +        "nc eipa", +        "people's 20pharmacy 20guide 20to", +        "costco", +        "charles schultz drawings", +        "nicisterling", +        "a picture of author stephen crane", +        "yahoo.com", +        "sponge bob myspace layouts", +        "g", +        "calendar creator", +        "careerbuilder.com", +        "cool tex for web pages", +        "yahoo.com", +        "mcdougal littel", +        "sign on", +        "superman", +        "radio", +        "lajollaindians.com", +        "mike tyson died", +        "pink panther", +        "lolita newgroups", +        "nude girls", +        "galveston 20texas", +        "gerlach meat co.", +        "thetakeover2006.com", +        "yahoo", +        "simpsons movie", +        "saxy", +        "yahoo", +        "21st century realty", +        "new zealand", +        "dogs", +        "weather", +        "free porn sex", +        "bugs bunny parties", +        "mortal kombat 2 fatalities", +        "sea life park hawaii", +        "songs for middle school choir", +        "rocky mountain jeep", +        "householdbank.com", +        "birdville isd", +        "brutal dildo", +        "brutal dildo", +        "free live gay cam chats", +        "wonder woman", +        "ebay com", +        "myspace.com", +        "boost mobile", +        "desktop themes sex", +        "myspace.com", +        "myspace.com", +        "maroon chevy auto dealership", +        "beyonce", +        "cleopatra vii", +        "accountcentralonline.com", +        "juvenile", +        "the game cock", +        "pics of ashland city tennessee", +        "coherent deos", +        "microwsoft wireless connection", +        "best buy", +        "southwest airlines", +        "southwest airlines", +        "pogo games", +        "family court record room in brooklyn newyork", +        "60.ufc.net", +        "us mint", +        "people", +        "firstcitycreditunion", +        "washington mutual careers", +        "beyonce", +        "tab energy drink", +        "http vemmabuilder.com", +        "new york state lottery", +        "yahoo", +        "tmobile", +        "yellow pages.com", +        "az.central.com", +        "pasco auto salvage", +        "im help", +        "home based businesses", +        "studyisland", +        "bible study from king james on 1 corinthians chapter 6 verses 18- 20", +        "bellevue-ne", +        "msn.com", +        "aolsignupfree", +        "the simsons", +        "nevada", +        "forsyth central high school", +        "road state college", +        "does my child have adhd", +        "les tucanesde tijuana", +        "yahoo.com", +        "mexican pharmacy hyrocodone", +        "ford motor co year end sales", +        "google.com", +        "google.com", +        "person.com", +        "marylyn monroe", +        "nfl", +        "the hun.net", +        "nkena anderson", +        "free netscape download", +        "top fifty colleges", +        "wil.", +        "memphis tennessee", +        "yahoo mail", +        "corrections officer of juveniles", +        "jada pinkett smith", +        "mapquest.com", +        "apartments", +        "msn.com", +        "msn.com", +        "wasco state prison", +        "solitaire", +        "http", +        "freeport seaman center", +        "futbol soccer", +        "screen names", +        "kmov.com", +        "survey.otxresearch.com", +        "facial shaves", +        "gle", +        "flw.com", +        "seasportboats.com", +        "toysrus.com", +        "animated sexy graphics", +        "colombia", +        "unitarian univeralist association", +        "fr", +        "google video.com", +        "660-342-1072", +        "suzan-lori parks", +        "male facial", +        "william bouguereau first kiss how much it is worth", +        "streetfighter", +        "nick.com", +        "wonder woman", +        "pentagram", +        "mcafee virus protection", +        "diary", +        "037f34742140a5f761ad51d95180b4f8", +        "free porn", +        "no deposit casino bonus", +        "spongebob the movie myspace layouts", +        "on line banking", +        "equestrian properties for sale", +        "kazaa free muisc download", +        "gay truckers", +        "24", +        "pay-pal", +        "www yahoo.com", +        "phatazz.white hoes", +        "planets of the universe", +        "free movies", +        "budget rentals special", +        "yahoogames", +        "talaat pasha", +        "mariah carey song lyrics don't forget about us", +        "futbol soccer", +        "msn groups", +        "martha steward", +        "martha steward", +        "soap opera scoops cbs", +        "cingular", +        "stuwie", +        "womengiving blowjobs", +        "hear dancing queen by abba", +        "love song", +        "fhsaa.org", +        "any dvd", +        "any dvd", +        "gallery.brookeskye.com", +        "gibson ranch", +        "wachovia com", +        "kzg golf information", +        "skylight curtains", +        "c", +        "123freeweblayouts.com", +        "yahoo.com", +        "allie.com", +        "ghosts of bingham cemetery", +        "resume maker", +        "resume maker", +        "resume maker", +        "lymphomatoid papulosis", +        "sez.com", +    }; +} + +class TCodecsTest: public TTestBase { +    UNIT_TEST_SUITE(TCodecsTest); +    UNIT_TEST(TestPipeline) +    UNIT_TEST(TestDelta) +    UNIT_TEST(TestHuffman) +    UNIT_TEST(TestZStdDict) +    UNIT_TEST(TestCompTable) +    UNIT_TEST(TestHuffmanLearnByFreqs) +    UNIT_TEST(TestSolar) +    UNIT_TEST(TestPFor) +    UNIT_TEST(TestRegistry) + +    UNIT_TEST_SUITE_END(); + +private: +    TString PrintError(TStringBuf learn, TStringBuf test, TStringBuf codec, ui32 i) { +        TString s; +        TStringOutput sout(s); +        sout << codec << ": " << i << ", " +             << "\n"; +        sout << HexEncode(learn.data(), learn.size()); //NEscJ::EscapeJ<true>(learn, sout); +        sout << " != \n"; +        sout << HexEncode(test.data(), test.size()); //NEscJ::EscapeJ<true>(test, sout); + +        if (s.Size() > 1536) { +            TString res = s.substr(0, 512); +            res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">..."); +            res.append(s.substr(s.size() - 512)); +        } + +        return s; +    } + +    TStringBuf AsStrBuf(const TBuffer& b) { +        return TStringBuf(b.data(), b.size()); +    } + +    template <typename TCodec, bool testsaveload> +    void TestCodec(const TVector<TBuffer>& inlearn = TVector<TBuffer>(), const TVector<TBuffer>& in = TVector<TBuffer>(), NCodecs::TCodecPtr c = new TCodec) { +        using namespace NCodecs; + +        TBuffer buff; + +        { +            TVector<TBuffer> out; + +            c->Learn(inlearn.begin(), inlearn.end()); + +            if (testsaveload) { +                { +                    TBufferOutput bout(buff); +                    ICodec::Store(&bout, c); +                } + +                { +                    TBufferInput bin(buff); +                    c = ICodec::Restore(&bin); +                    UNIT_ASSERT(c->AlreadyTrained()); +                } +            } + +            { +                size_t insz = 0; +                size_t outsz = buff.Size(); + +                for (ui32 i = 0; i < inlearn.size(); ++i) { +                    out.emplace_back(); +                    c->Encode(AsStrBuf(inlearn[i]), out[i]); + +                    insz += inlearn[i].Size(); +                    outsz += out[i].Size(); +                } + +                TBuffer vecl; +                for (ui32 i = 0; i < out.size(); ++i) { +                    vecl.Clear(); +                    c->Decode(AsStrBuf(out[i]), vecl); + +                    UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl), +                                        PrintError(TStringBuf(inlearn[i].data(), inlearn[i].size()), +                                                   TStringBuf(vecl.data(), vecl.size()), c->GetName(), i)); +                } +            } +        } + +        { +            if (testsaveload) { +                TBufferInput bin(buff); +                c = ICodec::Restore(&bin); +            } + +            size_t insz = 0; +            size_t outsz = buff.Size(); + +            TBuffer out, in1; +            for (ui32 i = 0; i < in.size(); ++i) { +                out.Clear(); +                in1.Clear(); +                c->Encode(AsStrBuf(in[i]), out); +                insz += in[i].Size(); +                outsz += out.Size(); +                c->Decode(AsStrBuf(out), in1); +                UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1), +                                    PrintError(TStringBuf(in[i].data(), in[i].size()), +                                               TStringBuf(in1.data(), in1.size()), c->GetName(), i)); +            } +        } +    } + +    template <class T> +    void AppendTo(TBuffer& b, T t) { +        b.Append((char*)&t, sizeof(t)); +    } + +    void TestDelta() { +        using namespace NCodecs; +        TVector<TBuffer> d; + +        // 1. common case +        d.emplace_back(); +        AppendTo(d.back(), 1ULL); +        AppendTo(d.back(), 10ULL); +        AppendTo(d.back(), 100ULL); +        AppendTo(d.back(), 1000ULL); +        AppendTo(d.back(), 10000ULL); +        AppendTo(d.back(), 100000ULL); + +        // 2. delta overflow +        d.emplace_back(); +        AppendTo(d.back(), 1ULL); +        AppendTo(d.back(), 10ULL); +        AppendTo(d.back(), 100ULL); +        AppendTo(d.back(), 1000ULL); +        AppendTo(d.back(), (ui64)-100LL); +        AppendTo(d.back(), (ui64)-10ULL); + +        // 3. bad sorting +        d.emplace_back(); +        AppendTo(d.back(), 1ULL); +        AppendTo(d.back(), 10ULL); +        AppendTo(d.back(), 1000ULL); +        AppendTo(d.back(), 100ULL); +        AppendTo(d.back(), 10000ULL); +        AppendTo(d.back(), 100000ULL); + +        // all bad +        d.emplace_back(); +        AppendTo(d.back(), -1LL); +        AppendTo(d.back(), -1LL); +        AppendTo(d.back(), -1LL); +        AppendTo(d.back(), -1LL); + +        TestCodec<TDeltaCodec<ui64, true>, false>(d); +        TestCodec<TDeltaCodec<ui64, false>, false>(d); +    } + +    void TestPFor() { +        using namespace NCodecs; +        { +            TVector<TBuffer> d; +            d.emplace_back(); +            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), -1LL); +            d.emplace_back(); +            AppendTo(d.back(), 0LL); +            AppendTo(d.back(), 1LL); +            AppendTo(d.back(), 2LL); +            AppendTo(d.back(), 1LL); +            AppendTo(d.back(), 0LL); +            AppendTo(d.back(), 1LL); +            AppendTo(d.back(), 2LL); +            d.emplace_back(); +            AppendTo(d.back(), 0LL); +            AppendTo(d.back(), 1LL); +            AppendTo(d.back(), 2LL); +            AppendTo(d.back(), 1LL); +            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), 0LL); +            AppendTo(d.back(), 1LL); +            AppendTo(d.back(), 2LL); +            d.emplace_back(); +            AppendTo(d.back(), 0LL); +            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), -2LL); +            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), -2LL); +            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), 0LL); +            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), -2LL); + +            TestCodec<TPForCodec<ui64>, false>(d); +            TestCodec<TPForCodec<ui64, true>, true>(d); +        } +        { +            TVector<TBuffer> d; +            d.emplace_back(); +            AppendTo(d.back(), -1); +            AppendTo(d.back(), -1); +            AppendTo(d.back(), -1); +            AppendTo(d.back(), -1); +            d.emplace_back(); +            AppendTo(d.back(), 0); +            AppendTo(d.back(), 1); +            AppendTo(d.back(), 2); +            AppendTo(d.back(), 1); +            AppendTo(d.back(), -1); +            AppendTo(d.back(), 0); +            AppendTo(d.back(), 1); +            AppendTo(d.back(), 2); +            d.emplace_back(); +            AppendTo(d.back(), 0); +            AppendTo(d.back(), -1); +            AppendTo(d.back(), -2); +            AppendTo(d.back(), -1); +            AppendTo(d.back(), -2); +            AppendTo(d.back(), -1); +            AppendTo(d.back(), 0); +            AppendTo(d.back(), -1); +            AppendTo(d.back(), -2); + +            TestCodec<TPForCodec<ui32>, false>(d); +            TestCodec<TPForCodec<ui32, true>, false>(d); +        } +        { +            TVector<TBuffer> d; +            d.emplace_back(); +            for (auto& textValue : TextValues) { +                AppendTo(d.back(), (ui32)strlen(textValue)); +            } + +            TestCodec<TPForCodec<ui32>, false>(d); +            TestCodec<TPForCodec<ui32, true>, false>(d); +        } +        { +            TVector<TBuffer> d; +            d.emplace_back(); +            for (auto& textValue : TextValues) { +                AppendTo(d.back(), (ui64)strlen(textValue)); +            } + +            TestCodec<TPForCodec<ui64>, false>(d); +            TestCodec<TPForCodec<ui64, true>, false>(d); +        } +    } + +    template <class TCodec> +    void DoTestSimpleCodec() { +        using namespace NCodecs; +        { +            TVector<TBuffer> learn; + +            for (auto& textValue : TextValues) { +                learn.emplace_back(textValue, strlen(textValue)); +            } + +            TestCodec<TCodec, true>(learn); +        } +        { +            TestCodec<TCodec, true>(); +        } + +        { +            TVector<TBuffer> learn; +            learn.emplace_back(); +            learn.back().Append('a'); + +            TVector<TBuffer> test; +            test.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                test.back().Append((ui8)i); +            } + +            TestCodec<TCodec, true>(learn, test); +        } + +        { +            TVector<TBuffer> learn; +            learn.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                for (ui32 j = 0; j < i; ++j) { +                    learn.back().Append((ui8)i); +                } +            } + +            TVector<TBuffer> test; +            test.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                test.back().Append((ui8)i); +            } + +            TestCodec<TCodec, true>(learn, test); +        } + +        { +            TVector<TBuffer> learn; +            learn.emplace_back(); +            for (ui32 i = 0; i < 128; ++i) { +                for (ui32 j = 0; j < i; ++j) { +                    learn.back().Append((ui8)i); +                } +            } + +            TVector<TBuffer> test; +            test.emplace_back(); +            for (ui32 i = 128; i < 256; ++i) { +                test.back().Append((ui8)i); +            } + +            TestCodec<TCodec, true>(learn, test); +        } +    } + +    void TestHuffman() { +        DoTestSimpleCodec<NCodecs::THuffmanCodec>(); +    } + +    void TestZStdDict() { +        using namespace NCodecs; +        { +            TVector<TBuffer> learn; + +            for (auto& textValue : TextValues) { +                learn.emplace_back(textValue, strlen(textValue)); +            } + +            TestCodec<TZStdDictCodec, true>(learn); +        } + +    } + +    void TestCompTable() { +        DoTestSimpleCodec<NCodecs::TCompTableCodec>(); +    } + +    void TestHuffmanLearnByFreqs() { +        using namespace NCodecs; + +        TVector<TBuffer> data; + +        for (auto& textValue : TextValues) { +            data.emplace_back(textValue, strlen(textValue)); +        } + +        TVector<TBuffer> outLearn; + +        { +            THuffmanCodec codec; +            static_cast<ICodec&>(codec).Learn(data.begin(), data.end()); + +            for (ui32 i = 0; i < data.size(); ++i) { +                outLearn.emplace_back(); +                codec.Encode(AsStrBuf(data[i]), outLearn[i]); +            } +        } + +        TVector<TBuffer> outLearnByFreqs; + +        { +            THuffmanCodec codec; +            std::pair<char, ui64> freqs[256]; + +            for (size_t i = 0; i < Y_ARRAY_SIZE(freqs); ++i) { +                freqs[i].first = (char)i; +                freqs[i].second = 0; +            } + +            for (auto& textValue : TextValues) { +                size_t len = strlen(textValue); +                for (size_t j = 0; j < len; ++j) { +                    ++freqs[(ui32)(0xFF & textValue[j])].second; +                } +            } + +            codec.LearnByFreqs(TArrayRef<std::pair<char, ui64>>(freqs, Y_ARRAY_SIZE(freqs))); + +            for (ui32 i = 0; i < data.size(); ++i) { +                outLearnByFreqs.emplace_back(); +                codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]); +            } +        } + +        UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size()); +        const size_t sz = outLearn.size(); +        for (size_t n = 0; n < sz; ++n) { +            UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n])); +        } +    } + +    void TestSolar() { +        using namespace NCodecs; +        { +            TVector<TBuffer> learn; + +            for (auto& textValue : TextValues) { +                learn.emplace_back(textValue, strlen(textValue)); +            } + +            TestCodec<TSolarCodec, true>(learn, TVector<TBuffer>(), new TSolarCodec(512, 8)); +            TestCodec<TAdaptiveSolarCodec, false>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); +            TestCodec<TAdaptiveSolarCodec, true>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); +            TestCodec<TSolarCodecShortInt, true>(learn, TVector<TBuffer>(), new TSolarCodecShortInt(512, 8)); +        } +        { +            TestCodec<TSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodec(512, 8)); +            TestCodec<TAdaptiveSolarCodec, false>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); +            TestCodec<TAdaptiveSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8)); +            TestCodec<TSolarCodecShortInt, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodecShortInt(512, 8)); +        } + +        { +            TVector<TBuffer> learn; +            learn.emplace_back(); +            learn.back().Append('a'); + +            TVector<TBuffer> test; +            test.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                test.back().Append((ui8)i); +            } + +            TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); +            TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8)); +            TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8)); +            TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8)); +        } + +        { +            TVector<TBuffer> learn; +            learn.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                for (ui32 j = 0; j < i; ++j) { +                    learn.back().Append((ui8)i); +                } +            } + +            TVector<TBuffer> test; +            test.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                test.back().Append((ui8)i); +            } + +            TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); +            TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8)); +            TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8)); +            TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8)); +        } +    } + +    void TestPipeline() { +        using namespace NCodecs; +        { +            TVector<TBuffer> learn; +            learn.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                for (i32 j = i; j >= 0; --j) { +                    learn.back().Append((ui8)j); +                } +            } + +            TVector<TBuffer> test; +            test.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                test.back().Append((ui8)i); +            } + +            TestCodec<TPipelineCodec, true>(learn, test, +                                            new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec)); +        } +        { +            TVector<TBuffer> d; +            d.emplace_back(); +            for (ui32 i = 0; i < 256; ++i) { +                for (i32 j = i; j >= 0; --j) { +                    d.back().Append(i * i); +                } +            } + +            TestCodec<TPipelineCodec, false>(d, TVector<TBuffer>(), +                                             new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>)); +        } +    } + +    void TestRegistry() { +        using namespace NCodecs; +        TVector<TString> vs = ICodec::GetCodecsList(); +        for (const auto& v : vs) { +            TCodecPtr p = ICodec::GetInstance(v); +            if (v == "none") { +                UNIT_ASSERT(!p); +                continue; +            } +            UNIT_ASSERT_C(!!p, v); +            UNIT_ASSERT_C(TStringBuf(v).Head(3) == TStringBuf(p->GetName()).Head(3), v + " " + p->GetName()); +        } +    } +}; + +UNIT_TEST_SUITE_REGISTRATION(TCodecsTest) diff --git a/library/cpp/codecs/ut/float_huffman_ut.cpp b/library/cpp/codecs/ut/float_huffman_ut.cpp new file mode 100644 index 00000000000..3156fb1f461 --- /dev/null +++ b/library/cpp/codecs/ut/float_huffman_ut.cpp @@ -0,0 +1,237 @@ +#include <library/cpp/codecs/float_huffman.h> + +#include <library/cpp/testing/unittest/registar.h> + +#include <util/stream/format.h> +#include <util/stream/output.h> +#include <library/cpp/string_utils/base64/base64.h> + +namespace fh = NCodecs::NFloatHuff; + +Y_UNIT_TEST_SUITE(FloatHuffmanTest) { +    static const float Factors[] = { +        0.340582, 0.000974026, 0.487168, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0.411765, 0.921569, +        0.00390625, 0.109371, 0, 1, 0, 0, 0, 0, 0.523322, 0, 1, 0, 0, 0, 0, 0.285714, 1, +        0.008253, 1, 0, 0, 0.00993935, 0.450213, 0.000974026, 1, 1, 1, 1, 0, 0, 0.20564, +        0.97561, 0.913896, 1, 1, 0, 1, 0, 0, 0.5, 0, 0, 0, 0.1, 1, 0, 0, 0, 0, 0, 0.450923, +        0, 0.5, 0, 0, 0.20564, 0, 0.5, 0, 0, 0.20564, 0, 0, 0.0313726, 0, 1, 1, 1, 0.363636, +        0.5, 0.686073, 0.45121, 0.00574382, 0.366166, 0.413295, 1, 1, 1, 0, 0, 0, 0, 0.160784, +        0, 0.937255, 0.537255, 0.133333, 0, 0, 0, 0, 0.00392157, 0, 0.333333, 0.027451, 0.0156863, +        1, 0.105882, 1, 0.00220908, 0.000112501, 0.0111262, 0.102384, 0.00140808, 0.123581, +        0.29308, 6.57282e-06, 0.00489498, 2.10209e-05, 0.00140559, 5.907e-06, 0, 0.559322, +        0.559322, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0.794765, 0, +        0.352648, 0.225904, 1, 0.047619, 0.0107276, 0.399461, 0.0304838, 0.292932, 0.00969929, +        0, 0, 0.886904, 0.714693, 0, 0.00223213, 0.000544069, 0, 0, 0, 0, 0, 0, 0, 0, 0, +        0, 0.00507403, 0, 0, 0, 0, 0, 0.875, 0, 0, 1, 1, 1, 0, 0.20564, 0, 0.00176048, 0, +        0.000440121, 0, 0, 0, 0.000974026, 0.487168, 0, 0, 0.533333, 0, 0, 0, 0, 0, 0, 0, +        0, 0, 0, 1, 0, 0, 1, 0, 0, 0.723187, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +        0, 0, 0, 0, 0, 1, 0, 0.206882, 0.00483367, 0.792983, 0.00126106, 1, 0.0313726, 0.470588, +        0.254902, 0.188235, 0.188235, 0.388235, 0.164706, 0, 0.870588, 0.843137, 0.635294, +        0.384314, 0.384314, 0.643137, 0, 0, 0, 0, 0, 0, 0, 0, 0.541176, 0, 0.541176, 0, 0, +        0.0532634, 1, 0, 0, 0, 0.015044, 1, 0, 1, 1, 1, 0.47451, 0.329412, 0.964706, 0, 0, +        0, 0, 0, 0.5, 0, 0, 0, 0, 0, 0, 0.0941176, 0.970588, 0.970588, 0, 0.970588, 0.97561, +        0, 0.0431373, 0.47451, 0.329412, 0.964706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0.231373, 0.00392157, 0, 0, 0, 0.054902, 0, 0, +        1, 0, 0, 0.0235294, 0, 1, 0, 0, 0, 0, 0.34902, 0.0352941, 0.925379, 0.623681, 0, +        0.954543, 0, 0, 0.00102756, 0.709804, 0.498039, 0.0901961, 0.631373, 0.847059, 0.270588, +        0.0156863, 0.133333, 0.980392, 1e-12, 1e-12, 1e-12, 1e-12, 0.497159, 0, 0.407487, +        0, 0, 0, 0.00392157, 0.00202156, 0.046875, 0.187159, 0.046875, 0.15625, 0.434232, +        0.15625, 0, 2.95083e-07, 0.20564, 0.20564, 0.97561, 0.913896, 0, 0, 0, 0, 0, 0, 0.00784314, +        0, 0.695525, 1, 0.07205, 0, 0, 0.176471, 0, 0, 0, 1, 1, 0.98, 0.01, 0.01, 0, 0.00690702, +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.29078, 0.29078, 1, 0, 0, 0, 0, 0.192157, 0.188235, +        0.0941176, 0, 0.0313726, 0, 0.141176, 0.207843, 0.0901961, 0.00784314, 0.0784314, +        0, 0, 0, 0, 0, 0.203922, 0.0196078, 0.34902, 0.0235294, 0.0980392, 0.164706, 0.133333, +        0.368627, 0, 0.0941176, 0, 1, 0.313726, 0, 0, 0.433582, 0.384508, 0.0532186, 0.0833333, +        0.01609, 0, 1, 0, 0, 0, 0.0666667, 0, 0, 0, 0, 1, 0, 0.564706, 0.501961, 0, 0, 0, +        0, 0, 0.0516447, 0.000173065, 0, 0, 0, 0, 0, 0, 0, 0.996309, 0, 0, 0.00392157, 1, +        0, 0.01, 0, 0, 0, 0, 0, 0.439505, 0.206882, 0.206882, 0.260891, 0, 0.875, 0, 0, 0, +        0, 0, 0.185657, 1, 1, 0, 0, 0, 0.0332647, 0.206106, 0.0688878, 0.239216, 0, 0, 0, +        0, 0.054902, 0, 0.101961, 0.160784, 0.180392, 0, 0.737828, 0, 0, 0.875, 0.0142566, +        0, 0.662745, 1, 0, 0, 0, 0.225806, 0.99992, 0.631373, 0.00392157, 1, 0, 0.143647, +        0.00270085, 1, 0.231482, 0.246735, 0.0428062, 0, 0, 1, 0, 0.186441, 0.0115358, 0, +        0.221762, 0, 0.2, 0, 0.0156863, 0, 0, 0, 0.976471, 0, 0.231373, 0, 0, 0, 0, 0, 0, +        0, 0, 0, 0, 0.00392157, 0.00392157, 0.0666667, 0, 0, 0, 0, 0.0117647, 0.580392, 0.98737, +        1, 1, 1, 0, 0, 0, 0.153, 0.847, 0.931373, 0.94697, 0.94697, 0, 0.946294, 0.408118, +        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0.99992, 0.97561, 0, 0, 0, 0, 0, 0, +        0.274677, 0.153017, 0, 0.642356, 0, 0, 0.1, 0, 0, 0, 0, 0.327944, 0.327944, 0, 0, +        0.815686, 0, 0, 0, 0, 0.206106, 0.439126, 0, 0, 0, 0, 0, 1, 1, 1, 0.00392157, 0.232788, +        0.232465, 0.999899, 0.00309296, 0.0636097, 0.445954, 0.156863, 0, 0, 0, 0, 0, 0, +        0.3796, 0.0784, 0.0651664, 0, 0, 0.254902, 0.266667, 1, 0, 0, 0, 0, 0, 0.596073, +        0.517876, 0.145833, 0.372549, 0, 0.991667, 0.602125, 0.161979, 0, 0, 0, 0, 0.0255146, +        0.947855, 0, 0, 0, 0, 0, 0, 0, 0, 0.847059, 0.679841, 0, 0.156863, 0, 0, 1, 0, 0, +        0, 0, 0.969697, 0, 0, 0.564706, 0, 0, 0, 0, 0, 1, 0.0367282, 0.0395228, 0, 0, 0, +        0, 0, 0.0470588, 0.141176, 0.054902, 0, 0, 0, 0}; +    static const size_t FactorCount = Y_ARRAY_SIZE(Factors); + +    static const ui8 CodedFactors[] = { +        0x24, 0x06, 0x73, 0xB5, 0xC7, 0x55, 0x7F, 0x3A, 0xB4, 0x70, 0xCB, 0xEF, 0xEE, 0xFE, 0xB3, 0x5B, +        0x5A, 0x1A, 0x93, 0x5F, 0x5F, 0x13, 0x00, 0x00, 0x10, 0x00, 0x3D, 0xEF, 0xFF, 0xEE, 0x0F, 0xDC, +        0xF0, 0xAB, 0x3F, 0x37, 0x92, 0x24, 0x5D, 0x5E, 0xDE, 0x1C, 0xF8, 0x12, 0x15, 0x5B, 0x84, 0x51, +        0x82, 0xE6, 0xF6, 0xB8, 0xEA, 0x4F, 0xC7, 0xDD, 0x7D, 0x2E, 0x4D, 0x4A, 0x21, 0xCA, 0xE0, 0xC4, +        0x2E, 0xEA, 0xD3, 0xBD, 0x0F, 0x00, 0x00, 0xE0, 0xDA, 0xCC, 0xCC, 0xEC, 0x9F, 0x61, 0xDF, 0xE6, +        0x01, 0x00, 0x00, 0xCC, 0xA5, 0x49, 0xA9, 0x00, 0x00, 0x00, 0xE6, 0xD2, 0xA4, 0xD4, 0xEA, 0x08, +        0x08, 0xD0, 0xDD, 0xF9, 0xE7, 0xA2, 0x0B, 0x00, 0x00, 0x40, 0xD8, 0x13, 0x7D, 0xFE, 0x13, 0x9C, +        0x9B, 0xA8, 0x36, 0xBC, 0x00, 0x90, 0x43, 0x6F, 0x97, 0x67, 0x9B, 0xD3, 0xEE, 0xFE, 0x84, 0x24, +        0x25, 0x89, 0xC9, 0xBF, 0x3F, 0x58, 0x4C, 0x4C, 0xCA, 0x21, 0x22, 0xBC, 0x39, 0x08, 0x08, 0x08, +        0x40, 0x7E, 0xAA, 0xAA, 0xCA, 0x75, 0x70, 0x70, 0xE9, 0x08, 0x08, 0xE8, 0x9A, 0x8A, 0x8D, 0xED, +        0xA6, 0x8D, 0x31, 0x04, 0x00, 0x96, 0xD0, 0x7D, 0x1D, 0x47, 0xAA, 0x2A, 0xD9, 0x28, 0xAD, 0x6B, +        0xB4, 0x9D, 0x7A, 0xC4, 0xD5, 0xD1, 0x04, 0x8C, 0x7E, 0x56, 0x3A, 0x58, 0x5A, 0x0C, 0x46, 0x6E, +        0x1B, 0x53, 0xC2, 0x0C, 0x14, 0x00, 0xAB, 0x60, 0x05, 0x7B, 0x63, 0x8D, 0x77, 0x70, 0x75, 0xAC, +        0x2F, 0x8D, 0xB1, 0x4D, 0xA0, 0xFB, 0xF2, 0x40, 0xF7, 0xE5, 0x7F, 0xDF, 0xDD, 0xFD, 0xBB, 0x1B, +        0xB8, 0x75, 0x9B, 0x47, 0x8E, 0xB4, 0x0C, 0x9B, 0x3A, 0x73, 0x25, 0x61, 0x18, 0x92, 0xD1, 0xC2, +        0x2F, 0x3C, 0x31, 0x64, 0x96, 0x2A, 0xB9, 0xF9, 0x7C, 0xD9, 0xAF, 0x94, 0xC5, 0xE9, 0x1E, 0x63, +        0x24, 0x0C, 0x03, 0x7F, 0xD8, 0x5B, 0xB3, 0x1D, 0x49, 0x02, 0x00, 0xAB, 0xFD, 0xE9, 0xA0, 0xF3, +        0xBF, 0xC9, 0x40, 0x64, 0x0A, 0xC0, 0xC7, 0x00, 0x00, 0x60, 0x77, 0xCF, 0xA5, 0x49, 0xA9, 0x16, +        0xFD, 0xD7, 0x5C, 0xA7, 0x55, 0x00, 0x36, 0xCF, 0xB9, 0x3D, 0xAE, 0xFA, 0xD3, 0xA1, 0x85, 0x5B, +        0xFE, 0x60, 0x10, 0x11, 0xFF, 0xF7, 0x7D, 0x38, 0x59, 0x24, 0xFF, 0xFF, 0xDF, 0x13, 0x1C, 0x7B, +        0xCA, 0x1C, 0x1E, 0xF3, 0x04, 0xC0, 0x78, 0x07, 0x58, 0x7B, 0xA2, 0x54, 0xAA, 0xE3, 0xEA, 0x08, +        0x08, 0xC0, 0x74, 0x78, 0x78, 0x88, 0x50, 0x50, 0xD8, 0x0A, 0x0C, 0xC4, 0x56, 0x60, 0x20, 0xF6, +        0x1A, 0x1B, 0x33, 0x16, 0x15, 0xA5, 0xB8, 0xED, 0xED, 0x22, 0xF5, 0xF5, 0x09, 0xA1, 0xA2, 0x42, +        0x67, 0x62, 0x62, 0x3A, 0x13, 0x13, 0x0B, 0xA0, 0xA4, 0xF4, 0x0F, 0x06, 0x15, 0x35, 0x18, 0x54, +        0xD4, 0x35, 0x57, 0x45, 0xCB, 0x2F, 0x39, 0xF6, 0xEC, 0xBC, 0xBB, 0x53, 0x5F, 0x5E, 0x9E, 0xB1, +        0xA8, 0xA8, 0x28, 0xDF, 0xDE, 0x3E, 0x00, 0x00, 0x80, 0x5F, 0x75, 0x81, 0x81, 0x51, 0x1D, 0x1E, +        0xA2, 0x3A, 0x3C, 0x8C, 0xEA, 0xF0, 0x10, 0x51, 0x06, 0x67, 0xED, 0x85, 0x85, 0xA1, 0xBE, 0xBC, +        0x3C, 0x63, 0x51, 0x51, 0x51, 0xBE, 0xBD, 0xFD, 0xFF, 0xFD, 0xFE, 0xCE, 0x85, 0x76, 0x36, 0x73, +        0x10, 0x10, 0x10, 0x80, 0xEB, 0x3A, 0x38, 0xD8, 0xBE, 0xD4, 0x05, 0x06, 0xEE, 0x4F, 0x60, 0x59, +        0x59, 0x65, 0x84, 0x84, 0xC0, 0x46, 0xCB, 0x19, 0x7F, 0x4C, 0xFD, 0xC8, 0x9D, 0x8B, 0xB6, 0x31, +        0xAF, 0x86, 0x3A, 0xF0, 0x6D, 0x6D, 0x11, 0xDF, 0xDF, 0x5F, 0x79, 0x71, 0x71, 0x85, 0xD4, 0xD0, +        0x10, 0xB9, 0xB1, 0x11, 0x1A, 0x54, 0x54, 0xE9, 0x08, 0x08, 0x48, 0x39, 0x44, 0x04, 0x84, 0xAF, +        0xAF, 0x96, 0x99, 0x97, 0x71, 0xC5, 0x32, 0xF3, 0x32, 0xAE, 0x58, 0x66, 0x5E, 0xC6, 0x15, 0xCB, +        0xCC, 0xCB, 0xB8, 0x42, 0xD0, 0x45, 0xFF, 0x1C, 0x11, 0x85, 0xBE, 0x39, 0x08, 0x08, 0x08, 0x80, +        0x69, 0xC2, 0x47, 0x00, 0x80, 0x02, 0x00, 0x00, 0x91, 0xD3, 0xF4, 0x47, 0x01, 0x00, 0x80, 0x08, +        0x00, 0x00, 0x42, 0xD4, 0x29, 0x6F, 0x02, 0x00, 0x80, 0xB4, 0xE6, 0x6B, 0x9E, 0x34, 0x5C, 0x9A, +        0x94, 0xE2, 0xD2, 0xA4, 0x14, 0xA2, 0x0C, 0x4E, 0xEC, 0xA2, 0x3E, 0x7F, 0x39, 0x08, 0x08, 0x10, +        0x6E, 0x6F, 0x10, 0xD7, 0x79, 0xC7, 0xC9, 0x09, 0x4D, 0x4B, 0x73, 0x77, 0x84, 0x14, 0xAE, 0x52, +        0xE1, 0x7A, 0x44, 0x2A, 0x5C, 0x8F, 0x34, 0x93, 0xA8, 0xC4, 0x01, 0xF8, 0x3F, 0x3D, 0xC2, 0x29, +        0xE9, 0x11, 0x4E, 0xE9, 0x4F, 0x67, 0x62, 0x22, 0xB6, 0x02, 0x03, 0xA9, 0x2E, 0x30, 0x70, 0x75, +        0x04, 0x04, 0xC8, 0x38, 0x48, 0x08, 0x32, 0x53, 0x53, 0x29, 0x2F, 0x2E, 0xAE, 0x1C, 0x04, 0x04, +        0x50, 0x52, 0x50, 0xD0, 0x4F, 0x77, 0x68, 0x28, 0x99, 0x08, 0x0A, 0x4A, 0x60, 0x59, 0x59, 0xA9, +        0x0B, 0x0C, 0xAC, 0xC7, 0xC8, 0xC8, 0x8C, 0x45, 0x45, 0xA1, 0x1C, 0x22, 0x02, 0x5D, 0x79, 0x79, +        0xAB, 0x2E, 0x30, 0x70, 0xA7, 0x2C, 0x28, 0xE8, 0xB4, 0xF3, 0xEF, 0x26, 0x8F, 0x37, 0xB1, 0xFE, +        0xEE, 0x67, 0xA9, 0xA9, 0xAA, 0xAA, 0x6C, 0x79, 0x1E, 0xEC, 0xD7, 0x46, 0x44, 0xC4, 0xF7, 0xF8, +        0x24, 0x24, 0x00, 0x42, 0x40, 0xF8, 0x5A, 0x96, 0x38, 0x65, 0x91, 0xF1, 0x6A, 0x72, 0xFE, 0x68, +        0xC3, 0xE1, 0x37, 0x07, 0x01, 0x01, 0x01, 0xF0, 0x52, 0xE1, 0x7A, 0xE4, 0xB3, 0xD9, 0x20, 0x9C, +        0xE0, 0xD8, 0x53, 0x04, 0xC7, 0x9E, 0x82, 0x02, 0x27, 0x2B, 0x06, 0x00, 0x00, 0x9F, 0xDE, 0x1C, +        0x3E, 0xEE, 0xD7, 0x48, 0x20, 0x04, 0xD2, 0x35, 0x4C, 0x29, 0x43, 0x45, 0x23, 0x15, 0xEA, 0xE9, +        0x5E, 0xD7, 0xC1, 0xC1, 0xAA, 0x3B, 0x34, 0x34, 0x21, 0x49, 0x49, 0xE8, 0x8A, 0x8B, 0x13, 0x66, +        0x12, 0xE7, 0x31, 0x00, 0x00, 0x90, 0x84, 0x94, 0x69, 0x05, 0xD4, 0xD4, 0xF4, 0x13, 0x36, 0xE7, +        0x0C, 0x09, 0xEB, 0xBF, 0x90, 0x1A, 0x1A, 0xE6, 0x20, 0x20, 0x20, 0x00, 0x9E, 0x33, 0x18, 0x13, +        0xA6, 0x2F, 0x40, 0x0C, 0x00, 0x4E, 0xCF, 0x84, 0x36, 0x6A, 0xA0, 0xF2, 0xA9, 0x63, 0xD5, 0xCB, +        0x9E, 0x64, 0xEA, 0x3E, 0xF2, 0x14, 0xA0, 0x27, 0x29, 0x2B, 0xC6, 0xB2, 0x99, 0x99, 0xA9, 0x74, +        0x04, 0x04, 0x3C, 0x0A, 0xD0, 0xCF, 0x5C, 0x68, 0x67, 0xFB, 0xDF, 0x1C, 0x04, 0x04, 0x04, 0xC0, +        0x1C, 0x04, 0x04, 0x04, 0x40, 0x1B, 0x11, 0x11, 0x5F, 0xEA, 0x02, 0x03, 0xE1, 0x92, 0x94, 0x84, +        0x90, 0x88, 0xD9, 0xDD, 0x4F, 0x04, 0x56, 0x0E, 0xD1, 0x9F, 0x1A, 0x31, 0x3B, 0x37, 0x47, 0xA0, +        0x6C, 0x82, 0x40, 0xD9, 0x24, 0x9A, 0x02, 0x12, 0x62, 0xD3, 0x43, 0xFF, 0xBF, 0x8F, 0x84, 0xF5, +        0x1F, 0x51, 0x06, 0xE7, 0x0F, 0xDD, 0x89, 0x32, 0xFB, 0x60, 0x39, 0x0A, 0x71, 0x71, 0xB4, 0x36, +        0x33, 0x33, 0x3F, 0x8F, 0xD0, 0x4F, 0x79, 0x84, 0x7E, 0xBA, 0xC8, 0x0C, 0x0D, 0x4F, 0xBA, 0x86, +        0x29, 0x82, 0x54, 0x83, 0x7F, 0x77, 0x37, 0x07, 0x01, 0x01, 0x01, 0xA0, 0xFE, 0x97, 0x1B, 0x9D, +        0x16, 0xDC, 0x90, 0x58, 0xFE, 0x9B, 0x42, 0xB3, 0x4A, 0x00, 0x68, 0x73, 0x91, 0x20, 0x2B, 0xA8, +        0xC8, 0x29, 0x0B, 0x0A, 0xF2, 0xD3, 0x5D, 0x4B, 0x58, 0x5D, 0x20, 0x41, 0xD5, 0xBE, 0xAE, 0x70, +        0x88, 0x50, 0x50, 0x20, 0x4A, 0x44, 0xF4, 0x8F, 0xF7, 0x60, 0x22, 0x30, 0x9C, 0x24, 0xFE, 0x54, +        0x55, 0xD0, 0xD7, 0xD7, 0x37, 0x1A, 0xEF, 0x6E, 0xBC, 0x9B, 0x44, 0x39, 0xDD, 0x5D, 0xF2, 0xF2, +        0x7F, 0x20, 0x1A, 0x81, 0x9A, 0xCA, 0xBF, 0xC8, 0x8D, 0x8D, 0xC2, 0x83, 0x82, 0xA7, 0x2C, 0x28, +        0xC8, 0xFE, 0x08, 0xC2, 0x07, 0xC7, 0x27, 0x21, 0xE1, 0xBB, 0x3E, 0xC1, 0x59, 0x68, 0xAA, 0x78, +        0xC8, 0x57, 0x5D, 0x60, 0x20, 0xC6, 0x41, 0x42, 0xE8, 0x3A, 0x38, 0xD8, 0x9B, 0xFF, 0xFF, 0xFF, +        0xC4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; +    static const size_t CodedSize = Y_ARRAY_SIZE(CodedFactors); +    static const TStringBuf CodedFactorsBuf(reinterpret_cast<const char*>(CodedFactors), CodedSize); + +    void FillWithGarbage(float* factors, size_t count) { +        void* data = static_cast<void*>(factors); +        memset(data, 0xAA, sizeof(float) * count); +    } + +    // Helper for dumping compressed values +    void PrintCompressed(const TVector<ui8>& codedFactors) { +        for (size_t i = 0; i < codedFactors.size(); ++i) { +            if (i % 0x10 == 0) +                Cerr << Endl; +            Cerr << Hex(codedFactors[i]) << ", "; +        } +        Cerr << Endl; +    } + +    // Helper for dumping decompressed values +    void PrintDecompressed(const TVector<float>& factors) { +        TStringStream result; +        TStringStream line; + +        for (size_t i = 0; i < factors.size(); ++i) { +            line << factors[i] << ", "; +            if (line.Str().size() > 80) { +                result << line.Str() << Endl; +                line.Clear(); +            } +        } +        Cerr << result.Str() << Endl; +    } + +    Y_UNIT_TEST(TestCompress) { +        const auto codedFactors = fh::Encode(Factors); +        UNIT_ASSERT_VALUES_EQUAL(codedFactors.size(), CodedSize); +        for (size_t i = 0; i < Min(codedFactors.size(), CodedSize); ++i) +            UNIT_ASSERT_VALUES_EQUAL((ui8)codedFactors[i], CodedFactors[i]); +        //PrintCompressed(codedFactors); +    } + +    Y_UNIT_TEST(TestSimpleDecompress) { +        TVector<float> factors = fh::Decode(CodedFactorsBuf); +        UNIT_ASSERT_VALUES_EQUAL(factors.size(), FactorCount); +        for (size_t i = 0; i < Min(factors.size(), FactorCount); ++i) +            UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[i]); +        //PrintDecompressed(factors); +    } + +    Y_UNIT_TEST(TestDecompressInParts) { +        float factors[FactorCount]; +        FillWithGarbage(factors, FactorCount); +        fh::TDecoder decoder(CodedFactorsBuf); +        const size_t firstPack = 100; +        // unpack first pack +        UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors, firstPack}), firstPack); +        // unpack all the rest +        UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors + firstPack, FactorCount - firstPack}), FactorCount - firstPack); + +        for (size_t i = 0; i < FactorCount; ++i) +            UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[i]); +        //PrintDecompressed(factors); +    } + +    Y_UNIT_TEST(TestSkip) { +        float factors[FactorCount]; +        FillWithGarbage(factors, FactorCount); +        fh::TDecoder decoder(CodedFactorsBuf); +        const size_t firstPack = 100; +        // unpack first pack +        UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors, firstPack}), firstPack); +        // skip some factors +        const size_t skipCount = 60; +        UNIT_ASSERT_VALUES_EQUAL(decoder.Skip(skipCount / 2), skipCount / 2); +        // unpack all, except some factors in the end +        const auto toDecode = FactorCount - firstPack - skipCount; +        UNIT_ASSERT_VALUES_EQUAL(decoder.Decode({factors + firstPack, toDecode}), toDecode); +        UNIT_ASSERT_VALUES_EQUAL(decoder.Skip(skipCount / 2), skipCount / 2); +        for (size_t i = 0; i < FactorCount - skipCount; ++i) { +            size_t correctedI = i < firstPack ? i : i + skipCount / 2; +            UNIT_ASSERT_VALUES_EQUAL(factors[i], Factors[correctedI]); +        } +        //PrintDecompressed(factors); +    } + +    Y_UNIT_TEST(TestDecompressForgedData) { +        // this coredumps without end-of-coded-stream check, see SEARCH-1156 for details +        TString brokenBase64Encoded = +            "NLjYltUWs5pqnd3d3f05Li4OAwCAEqrP6mv06jDt7PiAUVu7Y+PiMpuZmdzeM" +            "ArqOLxS2q4FKCII52dktcVs7y0zL+OKgeO9SOzEkFj7uPfFqqoCAAAAAADAtZ" +            "mZ2fdmICAgANQXhi1WVRUAAAAAAAAGjvcWq6oKAAAAAAAAA8d7qe4rV3Nxcd3" +            "d4ZfQZrETm3B+OxxB8bbnTPM5+qtbQ92mJ3fHPGj+iH5+8tzcnJuamry1tWUw" +            "MBD693f07+9+DQQEkIGAgIgPetzN5yEbAGxWpbCNxXK/0JGTKRz2KkIoR7aM"; +        UNIT_ASSERT_EXCEPTION( +            fh::Decode(Base64Decode(brokenBase64Encoded)), +            yexception); +    } + +    Y_UNIT_TEST(TestDecompressEmpty) { +        UNIT_ASSERT_EXCEPTION(fh::Decode({}), yexception); +    } +}; diff --git a/library/cpp/codecs/ut/tls_cache_ut.cpp b/library/cpp/codecs/ut/tls_cache_ut.cpp new file mode 100644 index 00000000000..8101af761fe --- /dev/null +++ b/library/cpp/codecs/ut/tls_cache_ut.cpp @@ -0,0 +1,36 @@ +#include <library/cpp/testing/unittest/registar.h> +#include <library/cpp/codecs/tls_cache.h> + +Y_UNIT_TEST_SUITE(CodecsBufferFactoryTest){ +    void AssignToBuffer(TBuffer & buf, TStringBuf val){ +        buf.Assign(val.data(), val.size()); +} + +TStringBuf AsStringBuf(const TBuffer& b) { +    return TStringBuf(b.Data(), b.Size()); +} + +Y_UNIT_TEST(TestAcquireReleaseReuse) { +    NCodecs::TBufferTlsCache factory; +    // acquiring the first buffer +    auto buf1 = factory.Item(); +    AssignToBuffer(buf1.Get(), "Buffer_01"); +    { +        // acquiring the second buffer +        auto buf2 = factory.Item(); +        AssignToBuffer(buf2.Get(), "Buffer_02"); +    } +    // the first buffer should stay intact +    UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01"); +    { +        // reacquiring the last released buffer +        // expecting it zero sized but having the same memory +        auto buf2 = factory.Item(); +        UNIT_ASSERT_VALUES_EQUAL(buf2.Get().Size(), 0u); +        buf2.Get().Resize(TStringBuf("Buffer_02").Size()); +        UNIT_ASSERT_EQUAL(AsStringBuf(buf2.Get()), "Buffer_02"); +    } +    // when the factory dies we should see no leaks +} +} +; diff --git a/library/cpp/codecs/ut/ya.make b/library/cpp/codecs/ut/ya.make new file mode 100644 index 00000000000..90841b05ef6 --- /dev/null +++ b/library/cpp/codecs/ut/ya.make @@ -0,0 +1,20 @@ +UNITTEST() + +OWNER( +    g:base +    velavokr +) + +PEERDIR( +    library/cpp/string_utils/base64 +    library/cpp/codecs +    library/cpp/string_utils/relaxed_escaper +) + +SRCS( +    tls_cache_ut.cpp +    codecs_ut.cpp +    float_huffman_ut.cpp +) + +END() diff --git a/library/cpp/codecs/ya.make b/library/cpp/codecs/ya.make new file mode 100644 index 00000000000..7e76fb0c9ad --- /dev/null +++ b/library/cpp/codecs/ya.make @@ -0,0 +1,33 @@ +LIBRARY() + +OWNER( +    g:base +    velavokr +) + +SRCS( +    tls_cache.cpp +    codecs.cpp +    codecs_registry.cpp +    comptable_codec.cpp +    delta_codec.cpp +    float_huffman.cpp +    huffman_codec.cpp +    pfor_codec.cpp +    solar_codec.cpp +    zstd_dict_codec.cpp +) + +PEERDIR( +    contrib/libs/zstd +    library/cpp/bit_io +    library/cpp/blockcodecs +    library/cpp/codecs/greedy_dict +    library/cpp/comptable +    library/cpp/containers/comptrie +    library/cpp/deprecated/accessors +    library/cpp/packers +    library/cpp/string_utils/relaxed_escaper +) + +END() diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp new file mode 100644 index 00000000000..c42a2879e6c --- /dev/null +++ b/library/cpp/codecs/zstd_dict_codec.cpp @@ -0,0 +1,281 @@ +#include "zstd_dict_codec.h" + +#include <library/cpp/packers/packers.h> + +#include <util/generic/ptr.h> +#include <util/generic/refcount.h> +#include <util/generic/noncopyable.h> +#include <util/string/builder.h> +#include <util/system/src_location.h> +#include <util/ysaveload.h> + +#define ZDICT_STATIC_LINKING_ONLY + +#include <contrib/libs/zstd/include/zdict.h> +#include <contrib/libs/zstd/include/zstd.h> +#include <contrib/libs/zstd/include/zstd_errors.h> + +// See IGNIETFERRO-320 for possible bugs + +namespace NCodecs { +    class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> { +        template <class T, size_t Deleter(T*)> +        class TPtrHolder : TMoveOnly { +            T* Ptr = nullptr; + +        public: +            TPtrHolder() = default; + +            TPtrHolder(T* dict) +                : Ptr(dict) +            { +            } + +            T* Get() { +                return Ptr; +            } + +            const T* Get() const { +                return Ptr; +            } + +            void Reset(T* dict) { +                Dispose(); +                Ptr = dict; +            } + +            void Dispose() { +                if (Ptr) { +                    Deleter(Ptr); +                    Ptr = nullptr; +                } +            } + +            ~TPtrHolder() { +                Dispose(); +            } +        }; + +        using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>; +        using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>; +        using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>; +        using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>; + +        using TSizePacker = NPackers::TPacker<ui64>; + +    public: +        static const ui32 SampleSize = (1 << 22) * 5; + +        explicit TImpl(ui32 comprLevel) +            : CompressionLevel(comprLevel) +        { +            const size_t zeroSz = TSizePacker().MeasureLeaf(0); +            Zero.Resize(zeroSz); +            TSizePacker().PackLeaf(Zero.data(), 0, zeroSz); +        } + +        ui32 GetCompressionLevel() const { +            return CompressionLevel; +        } + +        ui8 Encode(TStringBuf in, TBuffer& outbuf) const { +            outbuf.Clear(); + +            if (in.empty()) { +                return 0; +            } + +            TSizePacker packer; + +            const char* rawBeg = in.data(); +            const size_t rawSz = in.size(); + +            const size_t szSz = packer.MeasureLeaf(rawSz); +            const size_t maxDatSz = ZSTD_compressBound(rawSz); + +            outbuf.Resize(szSz + maxDatSz); +            packer.PackLeaf(outbuf.data(), rawSz, szSz); + +            TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)}; +            const size_t resSz = CheckSize(ZSTD_compress_usingCDict( +                                               ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()), +                                           __LOCATION__); + +            if (resSz < rawSz) { +                outbuf.Resize(resSz + szSz); +            } else { +                outbuf.Resize(Zero.size() + rawSz); +                memcpy(outbuf.data(), Zero.data(), Zero.size()); +                memcpy(outbuf.data() + Zero.size(), rawBeg, rawSz); +            } +            return 0; +        } + +        void Decode(TStringBuf in, TBuffer& outbuf) const { +            outbuf.Clear(); + +            if (in.empty()) { +                return; +            } + +            TSizePacker packer; + +            const char* rawBeg = in.data(); +            size_t rawSz = in.size(); + +            const size_t szSz = packer.SkipLeaf(rawBeg); +            ui64 datSz = 0; +            packer.UnpackLeaf(rawBeg, datSz); + +            rawBeg += szSz; +            rawSz -= szSz; + +            if (!datSz) { +                outbuf.Resize(rawSz); +                memcpy(outbuf.data(), rawBeg, rawSz); +            } else { +                //                size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz); +                //                Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz); +                outbuf.Resize(datSz); +                TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)}; +                CheckSize(ZSTD_decompress_usingDDict( +                              ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()), +                          __LOCATION__); +                outbuf.Resize(datSz); +            } +        } + +        bool Learn(ISequenceReader& in, bool throwOnError) { +            TBuffer data; +            TVector<size_t> lens; + +            data.Reserve(2 * SampleSize); +            TStringBuf r; +            while (in.NextRegion(r)) { +                if (!r) { +                    continue; +                } +                data.Append(r.data(), r.size()); +                lens.push_back(r.size()); +            } + +            ZDICT_legacy_params_t params; +            memset(¶ms, 0, sizeof(params)); +            params.zParams.compressionLevel = 1; +            params.zParams.notificationLevel = 1; +            Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size())); + +            if (!lens) { +                Dict.Reset(); +            } else { +                size_t trainResult = ZDICT_trainFromBuffer_legacy( +                    Dict.data(), Dict.size(), data.Data(), const_cast<const size_t*>(&lens[0]), lens.size(), params); +                if (ZSTD_isError(trainResult)) { +                    if (!throwOnError) { +                        return false; +                    } +                    CheckSize(trainResult, __LOCATION__); +                } +                Dict.Resize(trainResult); +                Dict.ShrinkToFit(); +            } +            InitContexts(); +            return true; +        } + +        void Save(IOutputStream* out) const { +            ::Save(out, Dict); +        } + +        void Load(IInputStream* in) { +            ::Load(in, Dict); +            InitContexts(); +        } + +        void InitContexts() { +            CDict.Reset(CheckPtr(ZSTD_createCDict(Dict.data(), Dict.size(), CompressionLevel), __LOCATION__)); +            DDict.Reset(CheckPtr(ZSTD_createDDict(Dict.data(), Dict.size()), __LOCATION__)); +        } + +        static size_t CheckSize(size_t sz, TSourceLocation loc) { +            if (ZSTD_isError(sz)) { +                ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")"; +            } +            return sz; +        } + +        template <class T> +        static T* CheckPtr(T* t, TSourceLocation loc) { +            Y_ENSURE_EX(t, TCodecException() << loc << " " +                                             << "unexpected nullptr"); +            return t; +        } + +    private: +        ui32 CompressionLevel = 1; + +        TBuffer Zero; +        TBuffer Dict; + +        TCDict CDict; +        TDDict DDict; +    }; + +    TZStdDictCodec::TZStdDictCodec(ui32 comprLevel) +        : Impl(new TImpl(comprLevel)) +    { +        MyTraits.NeedsTraining = true; +        MyTraits.SizeOnEncodeMultiplier = 2; +        MyTraits.SizeOnDecodeMultiplier = 10; +        MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar +    } + +    TZStdDictCodec::~TZStdDictCodec() { +    } + +    TString TZStdDictCodec::GetName() const { +        return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel(); +    } + +    ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const { +        return Impl->Encode(in, out); +    } + +    void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const { +        Impl->Decode(in, out); +    } + +    void TZStdDictCodec::DoLearn(ISequenceReader& in) { +        Impl = new TImpl(Impl->GetCompressionLevel()); +        Impl->Learn(in, true/*throwOnError*/); +    } + +    bool TZStdDictCodec::DoTryToLearn(ISequenceReader& in) { +        Impl = new TImpl(Impl->GetCompressionLevel()); +        return Impl->Learn(in, false/*throwOnError*/); +    } + +    void TZStdDictCodec::Save(IOutputStream* out) const { +        Impl->Save(out); +    } + +    void TZStdDictCodec::Load(IInputStream* in) { +        Impl->Load(in); +    } + +    TVector<TString> TZStdDictCodec::ListCompressionNames() { +        TVector<TString> res; +        for (int i = 1; i <= ZSTD_maxCLevel(); ++i) { +            res.emplace_back(TStringBuilder() << MyName() << "-" << i); +        } +        return res; +    } + +    int TZStdDictCodec::ParseCompressionName(TStringBuf name) { +        int c = 0; +        TryFromString(name.After('-'), c); +        Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name); +        return c; +    } + +} diff --git a/library/cpp/codecs/zstd_dict_codec.h b/library/cpp/codecs/zstd_dict_codec.h new file mode 100644 index 00000000000..59c1ad6c606 --- /dev/null +++ b/library/cpp/codecs/zstd_dict_codec.h @@ -0,0 +1,38 @@ +#pragma once + +#include "codecs.h" + +#include <util/generic/ptr.h> + +namespace NCodecs { +    // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655 + +    class TZStdDictCodec: public ICodec { +        class TImpl; +        TIntrusivePtr<TImpl> Impl; + +    public: +        explicit TZStdDictCodec(ui32 comprLevel = 1); +        ~TZStdDictCodec() override; + +        static TStringBuf MyName() { +            return "zstd08d"; +        } + +        TString GetName() const override; + +        ui8 Encode(TStringBuf in, TBuffer& out) const override; + +        void Decode(TStringBuf in, TBuffer& out) const override; + +        static TVector<TString> ListCompressionNames(); +        static int ParseCompressionName(TStringBuf); + +    protected: +        void DoLearn(ISequenceReader& in) override; +        bool DoTryToLearn(ISequenceReader& in) final; +        void Save(IOutputStream* out) const override; +        void Load(IInputStream* in) override; +    }; + +}  | 
