diff options
| author | Ruslan Kovalev <[email protected]> | 2022-02-10 16:46:44 +0300 | 
|---|---|---|
| committer | Daniil Cherednik <[email protected]> | 2022-02-10 16:46:44 +0300 | 
| commit | 59e19371de37995fcb36beb16cd6ec030af960bc (patch) | |
| tree | fa68e36093ebff8b805462e9e6d331fe9d348214 /library/cpp/codecs | |
| parent | 89db6fe2fe2c32d2a832ddfeb04e8d078e301084 (diff) | |
Restoring authorship annotation for Ruslan Kovalev <[email protected]>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs')
58 files changed, 2702 insertions, 2702 deletions
diff --git a/library/cpp/codecs/README.md b/library/cpp/codecs/README.md index 42646ccd978..26fa96da59b 100644 --- a/library/cpp/codecs/README.md +++ b/library/cpp/codecs/README.md @@ -1,8 +1,8 @@  This is a library of compression algorithms with a unified interface and serialization.  See also library/cpp/codecs/static, where a support for statically compiled dictionaries is implemented. - +   All algorithms have a common `ICodec` interface (described in codecs.h). - +   The `ICodec` interface has the following methods:\      `virtual ui8 ICodec::Encode (TMemoryRegion, TBuffer&) const;`\              - Input - memory region. Output - filled buffer and the rest of the last byte, if it was not filled to the end.\ @@ -27,9 +27,9 @@ The `ICodec` interface has the following methods:\                      For example, it allows you to save information about which combination of codecs was in use (see below).\      `virtual void Learn(ISequenceReader*);`\              - The interface for teaching codecs that use information about the distribution of data. - +   In addition, the library has a number of utilities that allow a more flexible use of it. - +   In the `ICodec` class the following methods are available:\      `static TCodecPtr GetInstance(const TString& name);`\              - Creation of a codec instance by a symbolic name\ diff --git a/library/cpp/codecs/codecs.cpp b/library/cpp/codecs/codecs.cpp index b17a3156d21..d2265dd9f95 100644 --- a/library/cpp/codecs/codecs.cpp +++ b/library/cpp/codecs/codecs.cpp @@ -1,69 +1,69 @@ -#include "codecs.h" -#include "tls_cache.h" - -#include <util/stream/mem.h> - -namespace NCodecs { +#include "codecs.h"  +#include "tls_cache.h"  +  +#include <util/stream/mem.h>  +  +namespace NCodecs {       void ICodec::Store(IOutputStream* out, TCodecPtr p) {          if (!p.Get()) {              ::Save(out, (ui16)0);              return;          } - +           Y_ENSURE_EX(p->AlreadyTrained(), TCodecException() << "untrained codec " << p->GetName());          const TString& n = p->GetName();          Y_VERIFY(n.size() <= Max<ui16>());          ::Save(out, (ui16)n.size());          out->Write(n.data(), n.size());          p->Save(out); -    } - +    }  +       TCodecPtr ICodec::Restore(IInputStream* in) {          ui16 l = 0;          ::Load(in, l); - +           if (!l) {              return nullptr;          } - +           TString n;          n.resize(l); - +           Y_ENSURE_EX(in->Load(n.begin(), l) == l, TCodecException()); - +           TCodecPtr p = ICodec::GetInstance(n);          p->Load(in);          p->Trained = true;          return p;      } - +       TCodecPtr ICodec::RestoreFromString(TStringBuf s) {          TMemoryInput minp{s.data(), s.size()};          return Restore(&minp);      } - +       TString ICodec::GetNameSafe(TCodecPtr p) {          return !p ? TString("none") : p->GetName();      } - +       ui8 TPipelineCodec::Encode(TStringBuf in, TBuffer& out) const {          size_t res = Traits().ApproximateSizeOnEncode(in.size());          out.Reserve(res);          out.Clear(); - +           if (Pipeline.empty()) {              out.Append(in.data(), in.size());              return 0;          } else if (Pipeline.size() == 1) {              return Pipeline.front()->Encode(in, out);          } - +           ui8 freelastbits = 0; - +           auto buffer = TBufferTlsCache::TlsInstance().Item();          TBuffer& tmp = buffer.Get();          tmp.Reserve(res); - +           for (auto it = Pipeline.begin(); it != Pipeline.end(); ++it) {              if (it != Pipeline.begin()) {                  tmp.Clear(); @@ -72,15 +72,15 @@ namespace NCodecs {              }              freelastbits = (*it)->Encode(in, out);          } - +           return freelastbits; -    } - +    }  +       void TPipelineCodec::Decode(TStringBuf in, TBuffer& out) const {          size_t res = Traits().ApproximateSizeOnDecode(in.size());          out.Reserve(res);          out.Clear(); - +           if (Pipeline.empty()) {              out.Append(in.data(), in.size());              return; @@ -88,12 +88,12 @@ namespace NCodecs {              Pipeline.front()->Decode(in, out);              return;          } - +           auto buffer = TBufferTlsCache::TlsInstance().Item(); - +           TBuffer& tmp = buffer.Get();          tmp.Reserve(res); - +           for (TPipeline::const_reverse_iterator it = Pipeline.rbegin(); it != Pipeline.rend(); ++it) {              if (it != Pipeline.rbegin()) {                  tmp.Clear(); @@ -101,40 +101,40 @@ namespace NCodecs {                  in = TStringBuf{tmp.data(), tmp.size()};              }              (*it)->Decode(in, out); -        } -    } - +        }  +    }  +       void TPipelineCodec::Save(IOutputStream* out) const {          for (const auto& it : Pipeline)              it->Save(out); -    } - +    }  +       void TPipelineCodec::Load(IInputStream* in) {          for (const auto& it : Pipeline) {              it->Load(in);              it->SetTrained(true);          } -    } - +    }  +       void TPipelineCodec::SetTrained(bool t) {          for (const auto& it : Pipeline) {              it->SetTrained(t);          } -    } - +    }  +       TPipelineCodec& TPipelineCodec::AddCodec(TCodecPtr codec) {          if (!codec)              return *this; - +           TCodecTraits tr = codec->Traits(); - +           if (!MyName) {              MyTraits.AssumesStructuredInput = tr.AssumesStructuredInput;              MyTraits.SizeOfInputElement = tr.SizeOfInputElement;          } else {              MyName.append(':');          } - +           MyName.append(codec->GetName());          MyTraits.PreservesPrefixGrouping &= tr.PreservesPrefixGrouping;          MyTraits.PaddingBit = tr.PaddingBit; @@ -144,27 +144,27 @@ namespace NCodecs {          MyTraits.SizeOnEncodeMultiplier *= tr.SizeOnEncodeMultiplier;          MyTraits.SizeOnDecodeMultiplier *= tr.SizeOnDecodeMultiplier;          MyTraits.RecommendedSampleSize = Max(MyTraits.RecommendedSampleSize, tr.RecommendedSampleSize); - +           Pipeline.push_back(codec);          return *this; -    } - +    }  +       void TPipelineCodec::DoLearnX(ISequenceReader& in, double sampleSizeMult) {          if (!Traits().NeedsTraining) {              return;          } - +           if (Pipeline.size() == 1) {              Pipeline.back()->Learn(in);              return;          } - +           TVector<TBuffer> trainingInput; - +           TStringBuf r;          while (in.NextRegion(r)) {              trainingInput.emplace_back(r.data(), r.size()); -        } +        }           TBuffer buff;          for (const auto& it : Pipeline) { @@ -176,8 +176,8 @@ namespace NCodecs {                  buff.Swap(bit);              }          } -    } - +    }  +       bool TPipelineCodec::AlreadyTrained() const {          for (const auto& it : Pipeline) {              if (!it->AlreadyTrained()) @@ -185,6 +185,6 @@ namespace NCodecs {          }          return true; -    } - -} +    }  +  +}  diff --git a/library/cpp/codecs/codecs.h b/library/cpp/codecs/codecs.h index cc5e72b2850..aa7c24b4c6e 100644 --- a/library/cpp/codecs/codecs.h +++ b/library/cpp/codecs/codecs.h @@ -1,63 +1,63 @@ -#pragma once - -#include "sample.h" - -#include <util/generic/bt_exception.h> -#include <util/generic/hash.h> -#include <util/generic/ptr.h> -#include <util/generic/singleton.h> - -#include <util/stream/input.h> -#include <util/stream/output.h> - +#pragma once  +  +#include "sample.h"  +  +#include <util/generic/bt_exception.h>  +#include <util/generic/hash.h>  +#include <util/generic/ptr.h>  +#include <util/generic/singleton.h>  +  +#include <util/stream/input.h>  +#include <util/stream/output.h>  +   #include <util/string/cast.h> -#include <util/string/vector.h> -#include <util/system/tls.h> -#include <util/ysaveload.h> - -namespace NCodecs { +#include <util/string/vector.h>  +#include <util/system/tls.h>  +#include <util/ysaveload.h>  +  +namespace NCodecs {       class TCodecException: public TWithBackTrace<yexception> {}; - +       class ICodec; - +       using TCodecPtr = TIntrusivePtr<ICodec>;      using TCodecConstPtr = TIntrusiveConstPtr<ICodec>; - +       struct TCodecTraits {          ui32 RecommendedSampleSize = 0;          ui16 SizeOfInputElement = 1;          ui8 SizeOnEncodeMultiplier = 1;          ui8 SizeOnEncodeAddition = 0;          ui8 SizeOnDecodeMultiplier = 1; - +           bool NeedsTraining = false;          bool PreservesPrefixGrouping = false;          bool Irreversible = false;          bool PaddingBit = 0;          bool AssumesStructuredInput = false; - +           size_t ApproximateSizeOnEncode(size_t sz) const {              return sz * SizeOnEncodeMultiplier + SizeOnEncodeAddition;          } - +           size_t ApproximateSizeOnDecode(size_t sz) const {              return sz * SizeOnDecodeMultiplier;          }      }; - +       class ICodec: public TAtomicRefCount<ICodec> {      protected:          bool Trained = false;          TCodecTraits MyTraits; - +       public:          TCodecTraits Traits() const {              return MyTraits;          } - +           // the name of the codec (or its variant) to be used in the codec registry          virtual TString GetName() const = 0; - +           virtual ui8 /*free bits in last byte*/ Encode(TStringBuf, TBuffer&) const = 0;          virtual ui8 Encode(const TBuffer& input, TBuffer& output) const {              return Encode(TStringBuf(input.Data(), input.Data() + input.Size()), output); @@ -66,16 +66,16 @@ namespace NCodecs {          virtual void Decode(const TBuffer& input, TBuffer& output) const {              Decode(TStringBuf(input.Data(), input.Data() + input.Size()), output);          } - +           virtual ~ICodec() = default; - +           virtual bool AlreadyTrained() const {              return !Traits().NeedsTraining || Trained;          }          virtual void SetTrained(bool t) {              Trained = t;          } - +           bool TryToLearn(ISequenceReader& r) {              Trained = DoTryToLearn(r);              return Trained; @@ -84,32 +84,32 @@ namespace NCodecs {          void Learn(ISequenceReader& r) {              LearnX(r, 1);          } - +           template <class TIter>          void Learn(TIter beg, TIter end) {              Learn(beg, end, IterToStringBuf<TIter>);          } - +           template <class TIter, class TGetter>          void Learn(TIter beg, TIter end, TGetter getter) {              auto sample = GetSample(beg, end, Traits().RecommendedSampleSize, getter);              TSimpleSequenceReader<TBuffer> reader{sample};              Learn(reader);          } - +           static TCodecPtr GetInstance(TStringBuf name); - +           static TVector<TString> GetCodecsList(); - +           static TString GetNameSafe(TCodecPtr p); - +           static void Store(IOutputStream* out, TCodecPtr p);          static TCodecPtr Restore(IInputStream* in);          static TCodecPtr RestoreFromString(TStringBuf); - +       protected:          virtual void DoLearn(ISequenceReader&) = 0; - +           virtual bool DoTryToLearn(ISequenceReader& r) {              DoLearn(r);              return true; @@ -119,20 +119,20 @@ namespace NCodecs {          virtual void DoLearnX(ISequenceReader& r, double /*sampleSizeMultiplier*/) {              DoLearn(r);          } - +           virtual void Save(IOutputStream*) const {          }          virtual void Load(IInputStream*) {          }          friend class TPipelineCodec; - +       public:          // so the pipeline codec will know to adjust the sample for the subcodecs          void LearnX(ISequenceReader& r, double sampleSizeMult) {              DoLearnX(r, sampleSizeMult);              Trained = true;          } - +           template <class TIter>          void LearnX(TIter beg, TIter end, double sampleSizeMult) {              auto sample = GetSample(beg, end, Traits().RecommendedSampleSize * sampleSizeMult); @@ -140,54 +140,54 @@ namespace NCodecs {              LearnX(reader, sampleSizeMult);          }      }; - +       class TBasicTrivialCodec: public ICodec {      public:          ui8 Encode(TStringBuf in, TBuffer& out) const override {              out.Assign(in.data(), in.size());              return 0;          } - +           void Decode(TStringBuf in, TBuffer& out) const override {              Encode(in, out);          } - +       protected:          void DoLearn(ISequenceReader&) override {          }      }; - +       class TTrivialCodec: public TBasicTrivialCodec {      public:          TTrivialCodec() {              MyTraits.PreservesPrefixGrouping = true;          } - +           static TStringBuf MyName() {              return "trivial";          } - +           TString GetName() const override {              return ToString(MyName());          }      }; - +       class TTrivialTrainableCodec: public TBasicTrivialCodec {      public:          TTrivialTrainableCodec() {              MyTraits.PreservesPrefixGrouping = true;              MyTraits.NeedsTraining = true;          } - +           static TStringBuf MyName() {              return "trivial-trainable";          } - +           TString GetName() const override {              return ToString(MyName());          }      }; - +       class TNullCodec: public ICodec {      public:          TNullCodec() { @@ -195,31 +195,31 @@ namespace NCodecs {              MyTraits.SizeOnDecodeMultiplier = 0;              MyTraits.SizeOnEncodeMultiplier = 0;          } - +           TString GetName() const override {              return "null";          } - +           ui8 Encode(TStringBuf, TBuffer& out) const override {              out.Clear();              return 0;          } - +           void Decode(TStringBuf, TBuffer& out) const override {              out.Clear();          } - +       protected:          void DoLearn(ISequenceReader&) override {          }      }; - +       class TPipelineCodec: public ICodec {          typedef TVector<TCodecPtr> TPipeline; - +           TPipeline Pipeline;          TString MyName; - +       public:          explicit TPipelineCodec(TCodecPtr c0 = nullptr, TCodecPtr c1 = nullptr, TCodecPtr c2 = nullptr, TCodecPtr c3 = nullptr) {              MyTraits.PreservesPrefixGrouping = true; @@ -228,32 +228,32 @@ namespace NCodecs {              AddCodec(c2);              AddCodec(c3);          } - +           TString GetName() const override {              return MyName;          } - +           ui8 Encode(TStringBuf in, TBuffer& out) const override;          void Decode(TStringBuf in, TBuffer& out) const override; - +       public:          /* -     * Add codecs in the following order: -     * uncompressed -> codec0 | codec1 | ... | codecN -> compressed -     */ +     * Add codecs in the following order:  +     * uncompressed -> codec0 | codec1 | ... | codecN -> compressed  +     */           TPipelineCodec& AddCodec(TCodecPtr codec); - +           bool AlreadyTrained() const override;          void SetTrained(bool t) override; - +       protected:          void DoLearn(ISequenceReader& in) override {              DoLearnX(in, 1);          } - +           void DoLearnX(ISequenceReader& in, double sampleSizeMult) override;          void Save(IOutputStream* out) const override;          void Load(IInputStream* in) override;      }; - -} +  +}  diff --git a/library/cpp/codecs/codecs_registry.cpp b/library/cpp/codecs/codecs_registry.cpp index 17d07062ab4..7ccfd07a8a8 100644 --- a/library/cpp/codecs/codecs_registry.cpp +++ b/library/cpp/codecs/codecs_registry.cpp @@ -1,104 +1,104 @@ -#include "codecs_registry.h" -#include "delta_codec.h" -#include "huffman_codec.h" -#include "pfor_codec.h" -#include "solar_codec.h" -#include "comptable_codec.h" -#include "zstd_dict_codec.h" - +#include "codecs_registry.h"  +#include "delta_codec.h"  +#include "huffman_codec.h"  +#include "pfor_codec.h"  +#include "solar_codec.h"  +#include "comptable_codec.h"  +#include "zstd_dict_codec.h"  +   #include <library/cpp/blockcodecs/codecs.h> - -#include <util/string/builder.h> +  +#include <util/string/builder.h>   #include <util/string/cast.h> - -namespace NCodecs { -    TCodecPtr ICodec::GetInstance(TStringBuf name) { +  +namespace NCodecs {  +    TCodecPtr ICodec::GetInstance(TStringBuf name) {           return Singleton<NPrivate::TCodecRegistry>()->GetCodec(name); -    } - +    }  +       TVector<TString> ICodec::GetCodecsList() {          return Singleton<NPrivate::TCodecRegistry>()->GetCodecsList(); -    } - -    namespace NPrivate { -        void TCodecRegistry::RegisterFactory(TFactoryPtr fac) { +    }  +  +    namespace NPrivate {  +        void TCodecRegistry::RegisterFactory(TFactoryPtr fac) {               TVector<TString> names = fac->ListNames(); -            for (const auto& name : names) { +            for (const auto& name : names) {                   Y_VERIFY(!Registry.contains(name), "already has %s", name.data()); -                Registry[name] = fac; -            } +                Registry[name] = fac;  +            }           } -        TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const { -            using namespace NPrivate; - -            if (!name || "none" == name) { -                return nullptr; -            } - -            if (TStringBuf::npos == name.find(':')) { +        TCodecPtr TCodecRegistry::GetCodec(TStringBuf name) const {  +            using namespace NPrivate;  +  +            if (!name || "none" == name) {  +                return nullptr;  +            }  +  +            if (TStringBuf::npos == name.find(':')) {                   Y_ENSURE_EX(Registry.contains(name), TNoCodecException(name)); -                return Registry.find(name)->second->MakeCodec(name); -            } else { -                TPipelineCodec* pipe = new TPipelineCodec; - +                return Registry.find(name)->second->MakeCodec(name);  +            } else {  +                TPipelineCodec* pipe = new TPipelineCodec;  +                   do { -                    TStringBuf v = name.NextTok(':'); -                    pipe->AddCodec(GetCodec(v)); -                } while (name); - -                return pipe; -            } -        } - +                    TStringBuf v = name.NextTok(':');  +                    pipe->AddCodec(GetCodec(v));  +                } while (name);  +  +                return pipe;  +            }  +        }  +           TVector<TString> TCodecRegistry::GetCodecsList() const { -            using namespace NPrivate; +            using namespace NPrivate;               TVector<TString> vs; -            vs.push_back("none"); - -            for (const auto& it : Registry) { -                vs.push_back(it.first); -            } - -            Sort(vs.begin(), vs.end()); -            return vs; -        } - +            vs.push_back("none");  +  +            for (const auto& it : Registry) {  +                vs.push_back(it.first);  +            }  +  +            Sort(vs.begin(), vs.end());  +            return vs;  +        }  +           struct TSolarCodecFactory : ICodecFactory { -            TCodecPtr MakeCodec(TStringBuf name) const override { -                if (TSolarCodec::MyNameShortInt() == name) { -                    return new TSolarCodecShortInt(); -                } -                if (TSolarCodec::MyName() == name) { -                    return new TSolarCodec(); -                } +            TCodecPtr MakeCodec(TStringBuf name) const override {  +                if (TSolarCodec::MyNameShortInt() == name) {  +                    return new TSolarCodecShortInt();  +                }  +                if (TSolarCodec::MyName() == name) {  +                    return new TSolarCodec();  +                }                   if (name.EndsWith(TStringBuf("-a"))) {                      return MakeCodecImpl<TAdaptiveSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size()).Chop(2)); -                } else { +                } else {                       return MakeCodecImpl<TSolarCodec>(name, name.SubStr(TSolarCodec::MyName().size())); -                } -            } - +                }  +            }  +               template <class TCodecCls> -            TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const { +            TCodecPtr MakeCodecImpl(const TStringBuf& name, const TStringBuf& type) const {                   if (TStringBuf("-8k") == type) { -                    return new TCodecCls(1 << 13); -                } +                    return new TCodecCls(1 << 13);  +                }                   if (TStringBuf("-16k") == type) { -                    return new TCodecCls(1 << 14); -                } +                    return new TCodecCls(1 << 14);  +                }                   if (TStringBuf("-32k") == type) { -                    return new TCodecCls(1 << 15); -                } +                    return new TCodecCls(1 << 15);  +                }                   if (TStringBuf("-64k") == type) { -                    return new TCodecCls(1 << 16); -                } +                    return new TCodecCls(1 << 16);  +                }                   if (TStringBuf("-256k") == type) { -                    return new TCodecCls(1 << 18); -                } -                ythrow TNoCodecException(name); -            } - +                    return new TCodecCls(1 << 18);  +                }  +                ythrow TNoCodecException(name);  +            }  +               TVector<TString> ListNames() const override {                  TVector<TString> vs;                  vs.push_back(ToString(TSolarCodec::MyName())); @@ -113,114 +113,114 @@ namespace NCodecs {                  vs.push_back(ToString(TSolarCodec::MyName64kAdapt()));                  vs.push_back(ToString(TSolarCodec::MyName256kAdapt()));                  vs.push_back(ToString(TSolarCodec::MyNameShortInt())); -                return vs; -            } -        }; - +                return vs;  +            }  +        };  +           struct TZStdDictCodecFactory : ICodecFactory { -            TCodecPtr MakeCodec(TStringBuf name) const override { -                return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name)); -            } - +            TCodecPtr MakeCodec(TStringBuf name) const override {  +                return new TZStdDictCodec(TZStdDictCodec::ParseCompressionName(name));  +            }  +               TVector<TString> ListNames() const override { -                return TZStdDictCodec::ListCompressionNames(); -            } -        }; - +                return TZStdDictCodec::ListCompressionNames();  +            }  +        };  +           struct TCompTableCodecFactory : ICodecFactory { -            TCodecPtr MakeCodec(TStringBuf name) const override { -                if (TCompTableCodec::MyNameHQ() == name) { -                    return new TCompTableCodec(TCompTableCodec::Q_HIGH); -                } else if (TCompTableCodec::MyNameLQ() == name) { -                    return new TCompTableCodec(TCompTableCodec::Q_LOW); -                } else { -                    Y_ENSURE_EX(false, TNoCodecException(name)); -                    return nullptr; -                } -            } - +            TCodecPtr MakeCodec(TStringBuf name) const override {  +                if (TCompTableCodec::MyNameHQ() == name) {  +                    return new TCompTableCodec(TCompTableCodec::Q_HIGH);  +                } else if (TCompTableCodec::MyNameLQ() == name) {  +                    return new TCompTableCodec(TCompTableCodec::Q_LOW);  +                } else {  +                    Y_ENSURE_EX(false, TNoCodecException(name));  +                    return nullptr;  +                }  +            }  +               TVector<TString> ListNames() const override {                  TVector<TString> vs;                  vs.push_back(ToString(TCompTableCodec::MyNameHQ()));                  vs.push_back(ToString(TCompTableCodec::MyNameLQ())); -                return vs; -            } -        }; - +                return vs;  +            }  +        };  +           struct TBlockCodec : ICodec { -            const NBlockCodecs::ICodec* Codec; - -            TBlockCodec(TStringBuf name) +            const NBlockCodecs::ICodec* Codec;  +  +            TBlockCodec(TStringBuf name)                   : Codec(NBlockCodecs::Codec(name)) -            { -            } - +            {  +            }  +               TString GetName() const override {                  return ToString(Codec->Name()); -            } - -            ui8 Encode(TStringBuf r, TBuffer& b) const override { -                Codec->Encode(r, b); -                return 0; -            } - -            void Decode(TStringBuf r, TBuffer& b) const override { -                // TODO: throws exception that is not TCodecException -                Codec->Decode(r, b); -            } - -        protected: -            void DoLearn(ISequenceReader&) override { -            } -        }; - +            }  +  +            ui8 Encode(TStringBuf r, TBuffer& b) const override {  +                Codec->Encode(r, b);  +                return 0;  +            }  +  +            void Decode(TStringBuf r, TBuffer& b) const override {  +                // TODO: throws exception that is not TCodecException  +                Codec->Decode(r, b);  +            }  +  +        protected:  +            void DoLearn(ISequenceReader&) override {  +            }  +        };  +           struct TBlockCodecsFactory : ICodecFactory {              using TRegistry = THashMap<TString, TCodecPtr>; -            TRegistry Registry; - +            TRegistry Registry;  +               TBlockCodecsFactory() {                  for (TStringBuf codec : NBlockCodecs::ListAllCodecs()) {                      Register(codec);                  } -            } - -            void Register(TStringBuf name) { -                TCodecPtr p = Registry[name] = new TBlockCodec(name); -                Registry[p->GetName()] = p; -            } - -            TCodecPtr MakeCodec(TStringBuf name) const override { +            }  +  +            void Register(TStringBuf name) {  +                TCodecPtr p = Registry[name] = new TBlockCodec(name);  +                Registry[p->GetName()] = p;  +            }  +  +            TCodecPtr MakeCodec(TStringBuf name) const override {                   if (!Registry.contains(name)) { -                    ythrow TNoCodecException(name); -                } -                return Registry.find(name)->second; -            } - +                    ythrow TNoCodecException(name);  +                }  +                return Registry.find(name)->second;  +            }  +               TVector<TString> ListNames() const override {                  TVector<TString> res; -                for (const auto& it : Registry) { -                    res.push_back(it.first); -                } -                return res; -            } -        }; - +                for (const auto& it : Registry) {  +                    res.push_back(it.first);  +                }  +                return res;  +            }  +        };  +           TCodecRegistry::TCodecRegistry() { -            RegisterFactory(new TInstanceFactory<TTrivialCodec>); -            RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>); -            RegisterFactory(new TInstanceFactory<THuffmanCodec>); +            RegisterFactory(new TInstanceFactory<TTrivialCodec>);  +            RegisterFactory(new TInstanceFactory<TTrivialTrainableCodec>);  +            RegisterFactory(new TInstanceFactory<THuffmanCodec>);               RegisterFactory(new TInstanceFactory<TPForCodec<ui64, true>>);              RegisterFactory(new TInstanceFactory<TPForCodec<ui32, true>>); -            RegisterFactory(new TSolarCodecFactory); -            RegisterFactory(new TZStdDictCodecFactory); -            RegisterFactory(new TCompTableCodecFactory); -            RegisterFactory(new TBlockCodecsFactory); -        } - -    } - -    void RegisterCodecFactory(TCodecFactoryPtr fact) { -        Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact); -    } - -} +            RegisterFactory(new TSolarCodecFactory);  +            RegisterFactory(new TZStdDictCodecFactory);  +            RegisterFactory(new TCompTableCodecFactory);  +            RegisterFactory(new TBlockCodecsFactory);  +        }  +  +    }  +  +    void RegisterCodecFactory(TCodecFactoryPtr fact) {  +        Singleton<NPrivate::TCodecRegistry>()->RegisterFactory(fact);  +    }  +  +}  diff --git a/library/cpp/codecs/codecs_registry.h b/library/cpp/codecs/codecs_registry.h index 53710310d56..31170afd62a 100644 --- a/library/cpp/codecs/codecs_registry.h +++ b/library/cpp/codecs/codecs_registry.h @@ -1,60 +1,60 @@ -#pragma once - -#include "codecs.h" +#pragma once  +  +#include "codecs.h"   #include <util/string/cast.h> - -namespace NCodecs { +  +namespace NCodecs {       struct TNoCodecException : TCodecException {          TNoCodecException(TStringBuf name) { -            (*this) << "unknown codec: " << name; -        } -    }; - +            (*this) << "unknown codec: " << name;  +        }  +    };  +       struct ICodecFactory : TAtomicRefCount<ICodecFactory> { -        virtual ~ICodecFactory() = default; -        virtual TCodecPtr MakeCodec(TStringBuf name) const = 0; +        virtual ~ICodecFactory() = default;  +        virtual TCodecPtr MakeCodec(TStringBuf name) const = 0;           virtual TVector<TString> ListNames() const = 0; -    }; - -    typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr; - -    namespace NPrivate { +    };  +  +    typedef TIntrusivePtr<ICodecFactory> TCodecFactoryPtr;  +  +    namespace NPrivate {           template <typename TCodec>          struct TInstanceFactory : ICodecFactory { -            TCodecPtr MakeCodec(TStringBuf) const override { -                return new TCodec; -            } - +            TCodecPtr MakeCodec(TStringBuf) const override {  +                return new TCodec;  +            }  +               TVector<TString> ListNames() const override {                  TVector<TString> vs;                  vs.push_back(ToString(TCodec::MyName())); -                return vs; -            } -        }; - -        class TCodecRegistry { +                return vs;  +            }  +        };  +  +        class TCodecRegistry {               using TRegistry = THashMap<TString, TIntrusivePtr<ICodecFactory>>; -            TRegistry Registry; - -        public: -            using TFactoryPtr = TIntrusivePtr<ICodecFactory>; - -            TCodecRegistry(); - -            void RegisterFactory(TFactoryPtr fac); - -            TCodecPtr GetCodec(TStringBuf name) const; - +            TRegistry Registry;  +  +        public:  +            using TFactoryPtr = TIntrusivePtr<ICodecFactory>;  +  +            TCodecRegistry();  +  +            void RegisterFactory(TFactoryPtr fac);  +  +            TCodecPtr GetCodec(TStringBuf name) const;  +               TVector<TString> GetCodecsList() const; -        }; - -    } - -    void RegisterCodecFactory(TCodecFactoryPtr fact); - +        };  +  +    }  +  +    void RegisterCodecFactory(TCodecFactoryPtr fact);  +       template <typename TCodec> -    void RegisterCodec() { -        RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>()); -    } - -} +    void RegisterCodec() {  +        RegisterCodecFactory(new NPrivate::TInstanceFactory<TCodec>());  +    }  +  +}  diff --git a/library/cpp/codecs/comptable_codec.cpp b/library/cpp/codecs/comptable_codec.cpp index 476b8ada80c..1eca4354c6c 100644 --- a/library/cpp/codecs/comptable_codec.cpp +++ b/library/cpp/codecs/comptable_codec.cpp @@ -1,108 +1,108 @@ -#include "comptable_codec.h" - +#include "comptable_codec.h"  +   #include <library/cpp/comptable/comptable.h>  #include <util/string/cast.h> - -namespace NCodecs { +  +namespace NCodecs {       class TCompTableCodec::TImpl: public TAtomicRefCount<TImpl> { -    public: -        TImpl(EQuality q) -            : Quality(q) +    public:  +        TImpl(EQuality q)  +            : Quality(q)           {          } - -        void Init() { -            Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table}); -            Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table}); -        } - -        ui8 Encode(TStringBuf in, TBuffer& out) const { -            out.Clear(); -            if (!in) { -                return 0; -            } - +  +        void Init() {  +            Compressor.Reset(new NCompTable::TChunkCompressor{(bool)Quality, Table});  +            Decompressor.Reset(new NCompTable::TChunkDecompressor{(bool)Quality, Table});  +        }  +  +        ui8 Encode(TStringBuf in, TBuffer& out) const {  +            out.Clear();  +            if (!in) {  +                return 0;  +            }  +               TVector<char> result; -            Compressor->Compress(in, &result); -            out.Assign(&result[0], result.size()); -            return 0; -        } - -        void Decode(TStringBuf in, TBuffer& out) const { -            out.Clear(); -            if (!in) { -                return; -            } - +            Compressor->Compress(in, &result);  +            out.Assign(&result[0], result.size());  +            return 0;  +        }  +  +        void Decode(TStringBuf in, TBuffer& out) const {  +            out.Clear();  +            if (!in) {  +                return;  +            }  +               TVector<char> result; -            Decompressor->Decompress(in, &result); -            out.Assign(&result[0], result.size()); -        } - -        void DoLearn(ISequenceReader& in) { -            NCompTable::TDataSampler sampler; -            TStringBuf region; -            while (in.NextRegion(region)) { -                if (!region) { -                    continue; -                } - -                sampler.AddStat(region); -            } - -            sampler.BuildTable(Table); -            Init(); -        } - +            Decompressor->Decompress(in, &result);  +            out.Assign(&result[0], result.size());  +        }  +  +        void DoLearn(ISequenceReader& in) {  +            NCompTable::TDataSampler sampler;  +            TStringBuf region;  +            while (in.NextRegion(region)) {  +                if (!region) {  +                    continue;  +                }  +  +                sampler.AddStat(region);  +            }  +  +            sampler.BuildTable(Table);  +            Init();  +        }  +           void Save(IOutputStream* out) const { -            ::Save(out, Table); -        } - +            ::Save(out, Table);  +        }  +           void Load(IInputStream* in) { -            ::Load(in, Table); -            Init(); -        } - -        NCompTable::TCompressorTable Table; -        THolder<NCompTable::TChunkCompressor> Compressor; -        THolder<NCompTable::TChunkDecompressor> Decompressor; -        const EQuality Quality; -        static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5); -    }; - -    TCompTableCodec::TCompTableCodec(EQuality q) -        : Impl(new TImpl{q}) -    { -        MyTraits.NeedsTraining = true; -        MyTraits.SizeOnEncodeMultiplier = 2; -        MyTraits.SizeOnDecodeMultiplier = 10; -        MyTraits.RecommendedSampleSize = TImpl::SampleSize; -    } - -    TCompTableCodec::~TCompTableCodec() = default; - +            ::Load(in, Table);  +            Init();  +        }  +  +        NCompTable::TCompressorTable Table;  +        THolder<NCompTable::TChunkCompressor> Compressor;  +        THolder<NCompTable::TChunkDecompressor> Decompressor;  +        const EQuality Quality;  +        static const ui32 SampleSize = Max(NCompTable::TDataSampler::Size * 4, (1 << 22) * 5);  +    };  +  +    TCompTableCodec::TCompTableCodec(EQuality q)  +        : Impl(new TImpl{q})  +    {  +        MyTraits.NeedsTraining = true;  +        MyTraits.SizeOnEncodeMultiplier = 2;  +        MyTraits.SizeOnDecodeMultiplier = 10;  +        MyTraits.RecommendedSampleSize = TImpl::SampleSize;  +    }  +  +    TCompTableCodec::~TCompTableCodec() = default;  +       TString TCompTableCodec::GetName() const {          return ToString(Impl->Quality ? MyNameHQ() : MyNameLQ()); -    } - -    ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const { -        return Impl->Encode(in, out); -    } - -    void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const { -        Impl->Decode(in, out); -    } - -    void TCompTableCodec::DoLearn(ISequenceReader& in) { -        Impl->DoLearn(in); -    } - +    }  +  +    ui8 TCompTableCodec::Encode(TStringBuf in, TBuffer& out) const {  +        return Impl->Encode(in, out);  +    }  +  +    void TCompTableCodec::Decode(TStringBuf in, TBuffer& out) const {  +        Impl->Decode(in, out);  +    }  +  +    void TCompTableCodec::DoLearn(ISequenceReader& in) {  +        Impl->DoLearn(in);  +    }  +       void TCompTableCodec::Save(IOutputStream* out) const { -        Impl->Save(out); -    } - +        Impl->Save(out);  +    }  +       void TCompTableCodec::Load(IInputStream* in) { -        Impl->Load(in); -    } - -} +        Impl->Load(in);  +    }  +  +}  diff --git a/library/cpp/codecs/comptable_codec.h b/library/cpp/codecs/comptable_codec.h index 7ba4f4c5432..1a10c8241e8 100644 --- a/library/cpp/codecs/comptable_codec.h +++ b/library/cpp/codecs/comptable_codec.h @@ -1,40 +1,40 @@ -#pragma once - -#include "codecs.h" - -#include <util/generic/ptr.h> - -namespace NCodecs { +#pragma once  +  +#include "codecs.h"  +  +#include <util/generic/ptr.h>  +  +namespace NCodecs {       class TCompTableCodec: public ICodec {          class TImpl;          TIntrusivePtr<TImpl> Impl; - +       public:          enum EQuality {              Q_LOW = 0,              Q_HIGH = 1          }; - +           explicit TCompTableCodec(EQuality q = Q_HIGH);          ~TCompTableCodec() override; - +           static TStringBuf MyNameHQ() {              return "comptable-hq";          }          static TStringBuf MyNameLQ() {              return "comptable-lq";          } - +           TString GetName() const override; - +           ui8 Encode(TStringBuf in, TBuffer& out) const override; - +           void Decode(TStringBuf in, TBuffer& out) const override; - +       protected:          void DoLearn(ISequenceReader& in) override;          void Save(IOutputStream* out) const override;          void Load(IInputStream* in) override;      }; - -} +  +}  diff --git a/library/cpp/codecs/delta_codec.cpp b/library/cpp/codecs/delta_codec.cpp index 61606d6f6f6..28d6b6e3bb9 100644 --- a/library/cpp/codecs/delta_codec.cpp +++ b/library/cpp/codecs/delta_codec.cpp @@ -1,6 +1,6 @@ -#include "delta_codec.h" - -namespace NCodecs { +#include "delta_codec.h"  +  +namespace NCodecs {       template <>      TStringBuf TDeltaCodec<ui64, true>::MyName() {          return "delta64-unsigned"; @@ -17,5 +17,5 @@ namespace NCodecs {      TStringBuf TDeltaCodec<ui32, false>::MyName() {          return "delta32-signed";      } - -} +  +}  diff --git a/library/cpp/codecs/delta_codec.h b/library/cpp/codecs/delta_codec.h index 21325825e6a..7398b3ae80a 100644 --- a/library/cpp/codecs/delta_codec.h +++ b/library/cpp/codecs/delta_codec.h @@ -1,102 +1,102 @@ -#pragma once - -#include "codecs.h" - +#pragma once  +  +#include "codecs.h"  +   #include <util/generic/array_ref.h> -#include <util/generic/typetraits.h> +#include <util/generic/typetraits.h>   #include <util/generic/bitops.h>  #include <util/string/cast.h> - -namespace NCodecs { +  +namespace NCodecs {       template <typename T = ui64, bool UnsignedDelta = true>      class TDeltaCodec: public ICodec {          static_assert(std::is_integral<T>::value, "expect std::is_integral<T>::value"); - +       public:          using TUnsigned = std::make_unsigned_t<T>;          using TSigned = std::make_signed_t<T>;          using TDelta = std::conditional_t<UnsignedDelta, TUnsigned, TSigned>; - +       private:          const TDelta MinDelta{Min<TDelta>()};          const TDelta MaxDelta{Max<TDelta>() - 1};          const TDelta InvalidDelta{MaxDelta + 1}; - +           Y_FORCE_INLINE static TDelta AddSafe(TUnsigned a, TUnsigned b) {              return a + b;          } - +           Y_FORCE_INLINE static TDelta SubSafe(TUnsigned a, TUnsigned b) {              return a - b;          } - +       public:          struct TDecoder {              const TDelta InvalidDelta{Max<TDelta>()}; - +               T Last = 0;              T Result = 0; - +               bool First = true;              bool Invalid = false; - +               Y_FORCE_INLINE bool Decode(TDelta t) {                  if (Y_UNLIKELY(First)) {                      First = false;                      Result = Last = t;                      return true;                  } - +                   if (Y_UNLIKELY(Invalid)) {                      Invalid = false;                      Last = 0;                      Result = t;                      return true;                  } - +                   Result = (Last += t);                  Invalid = t == InvalidDelta;                  return !Invalid; -            } +            }           }; - +       public:          static TStringBuf MyName(); - +           TDeltaCodec() {              MyTraits.SizeOfInputElement = sizeof(T);              MyTraits.AssumesStructuredInput = true; -        } - +        }  +           TString GetName() const override {              return ToString(MyName());          } - +           template <class TItem>          static void AppendTo(TBuffer& b, TItem t) {              b.Append((char*)&t, sizeof(t));          } - +           ui8 Encode(TStringBuf s, TBuffer& b) const override {              b.Clear();              if (s.empty()) {                  return 0;              } - +               b.Reserve(s.size());              TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; - +               const T* it = tin.begin();              TDelta last = *(it++);              AppendTo(b, last); - +               TDelta maxt = SubSafe(MaxDelta, last);              TDelta mint = AddSafe(MinDelta, last); - +               for (; it != tin.end(); ++it) {                  TDelta t = *it; - +                   if (Y_LIKELY((t >= mint) & (t <= maxt))) {                      AppendTo(b, t - last);                      last = t; @@ -111,33 +111,33 @@ namespace NCodecs {                      mint = MinDelta;                  }              } - +               return 0;          } - +           void Decode(TStringBuf s, TBuffer& b) const override {              b.Clear();              if (s.empty()) {                  return; -            } - +            }  +               b.Reserve(s.size());              TArrayRef<const T> tin{(const T*)s.data(), s.size() / sizeof(T)}; - +               TDecoder dec; - +               for (const T* it = tin.begin(); it != tin.end(); ++it) {                  T tmp;                  memcpy(&tmp, it, sizeof(tmp));                  if (dec.Decode(tmp)) {                      AppendTo(b, dec.Result);                  } -            } -        } - +            }  +        }  +       protected:          void DoLearn(ISequenceReader&) override {          }      }; - -} +  +}  diff --git a/library/cpp/codecs/float_huffman.h b/library/cpp/codecs/float_huffman.h index 786a8eae1d0..f03fc240ceb 100644 --- a/library/cpp/codecs/float_huffman.h +++ b/library/cpp/codecs/float_huffman.h @@ -5,7 +5,7 @@  #include <util/generic/strbuf.h>  #include <array> - +   namespace NCodecs::NFloatHuff {      TString Encode(TArrayRef<const float> factors); diff --git a/library/cpp/codecs/greedy_dict/gd_builder.cpp b/library/cpp/codecs/greedy_dict/gd_builder.cpp index 561bfbca015..2fb46029bf8 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.cpp +++ b/library/cpp/codecs/greedy_dict/gd_builder.cpp @@ -1,85 +1,85 @@ -#include "gd_builder.h" - +#include "gd_builder.h"  +   #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> -#include <util/generic/algorithm.h> - -#include <util/random/shuffle.h> +#include <util/generic/algorithm.h>  +  +#include <util/random/shuffle.h>   #include <util/stream/output.h> -#include <util/string/printf.h> -#include <util/system/rusage.h> - -namespace NGreedyDict { +#include <util/string/printf.h>  +#include <util/system/rusage.h>  +  +namespace NGreedyDict {       void TDictBuilder::RebuildCounts(ui32 maxcand, bool final) {          if (!Current) {              Current = MakeHolder<TEntrySet>();              Current->InitWithAlpha();          } - +           TEntrySet& set = *Current; - +           for (auto& it : set)              it.Count = 0; - +           CompoundCounts = nullptr;          CompoundCountsPool.Clear(); - +           if (!final) {              CompoundCounts = MakeHolder<TCompoundCounts>(&CompoundCountsPool);              CompoundCounts->reserve(maxcand);          } - +           Shuffle(Input.begin(), Input.end(), Rng); - +           for (auto str : Input) {              if (!final && CompoundCounts->size() > maxcand)                  break; - +               i32 prev = -1; - +               while (!!str) {                  TEntry* e = set.FindPrefix(str);                  ui32 num = e->Number; - +                   e->Count += 1;                  if (!final && prev >= 0) {                      (*CompoundCounts)[Compose(prev, num)] += 1;                  } - +                   prev = num;                  ++set.TotalCount; -            } +            }           } - +           Current->SetModelP(); -    } - +    }  +       ui32 TDictBuilder::BuildNextGeneration(ui32 maxent) {          TAutoPtr<TEntrySet> newset = new TEntrySet;          newset->InitWithAlpha();          maxent -= newset->size(); - +           ui32 additions = 0;          ui32 deletions = 0; - +           {              const TEntrySet& set = *Current; - +               Candidates.clear();              const ui32 total = set.TotalCount;              const float minpval = Settings.MinPValue;              const EEntryStatTest test = Settings.StatTest;              const EEntryScore score = Settings.Score;              const ui32 mincnt = Settings.MinAbsCount; - +               for (const auto& it : set) {                  const TEntry& e = it;                  float modelp = e.ModelP;                  ui32 cnt = e.Count; - +                   if (e.HasPrefix() && e.Count > mincnt && StatTest(test, modelp, cnt, total) > minpval)                      Candidates.push_back(TCandidate(-Score(score, e.Len(), modelp, cnt, total), it.Number));              } - +               if (!!CompoundCounts) {                  for (TCompoundCounts::const_iterator it = CompoundCounts->begin(); it != CompoundCounts->end(); ++it) {                      const TEntry& prev = set.Get(Prev(it->first)); @@ -89,13 +89,13 @@ namespace NGreedyDict {                      if (cnt > mincnt && StatTest(test, modelp, cnt, total) > minpval)                          Candidates.push_back(TCandidate(-Score(score, prev.Len() + next.Len(), modelp, cnt, total), it->first));                  } -            } - +            }  +               Sort(Candidates.begin(), Candidates.end()); - +               if (Candidates.size() > maxent)                  Candidates.resize(maxent); - +               for (const auto& candidate : Candidates) {                  if (IsCompound(candidate.second)) {                      additions++; @@ -103,40 +103,40 @@ namespace NGreedyDict {                  } else {                      newset->Add(set.Get(candidate.second).Str);                  } -            } +            }               deletions = set.size() - (newset->size() - additions); -        } - +        }  +           Current = newset;          Current->BuildHierarchy();          return deletions + additions; -    } - +    }  +       ui32 TDictBuilder::Build(ui32 maxentries, ui32 maxiters, ui32 mindiff) {          size_t totalsz = 0;          for (auto it : Input)              totalsz += it.size(); - +           while (maxiters) {              maxiters--; - +               RebuildCounts(maxentries * Settings.GrowLimit, false); - +               if (Settings.Verbose) {                  TString mess = Sprintf("iter:%" PRIu32 " sz:%" PRIu32 " pend:%" PRIu32, maxiters, (ui32)Current->size(), (ui32)CompoundCounts->size());                  Clog << Sprintf("%-110s RSS=%" PRIu32 "M", mess.data(), (ui32)(TRusage::Get().MaxRss >> 20)) << Endl;              } - +               ui32 diff = BuildNextGeneration(maxentries);              if (Current->size() == maxentries && diff < mindiff)                  break; -        } - +        }  +           RebuildCounts(0, true);          Current->SetScores(Settings.Score);          return maxiters; -    } - -} +    }  +  +}  diff --git a/library/cpp/codecs/greedy_dict/gd_builder.h b/library/cpp/codecs/greedy_dict/gd_builder.h index b8e9a5e37be..7f3cea88cb4 100644 --- a/library/cpp/codecs/greedy_dict/gd_builder.h +++ b/library/cpp/codecs/greedy_dict/gd_builder.h @@ -1,94 +1,94 @@ -#pragma once - -#include "gd_entry.h" - -#include <util/generic/hash.h> -#include <util/random/fast.h> - -namespace NGreedyDict { +#pragma once  +  +#include "gd_entry.h"  +  +#include <util/generic/hash.h>  +#include <util/random/fast.h>  +  +namespace NGreedyDict {       struct TBuildSettings {          EEntryStatTest StatTest = EST_SIMPLE_NORM;          EEntryScore Score = ES_LEN_SIMPLE; - +           float MinPValue = 0.75;          ui32 MinAbsCount = 10;          ui32 GrowLimit = 10; // times of maxentries          bool Verbose = false;      }; - +       class TDictBuilder {          using TCompoundCounts = THashMap<ui64, ui32, THash<ui64>, TEqualTo<ui64>, TPoolAllocator>;          using TCandidate = std::pair<float, ui64>;          using TCandidates = TVector<TCandidate>; - +       private:          TFastRng64 Rng{0x1a5d0ac170565c1c, 0x0be7bc27, 0x6235f6f57820aa0d, 0xafdc7fb};          TStringBufs Input; - +           THolder<TEntrySet> Current; - +           TMemoryPool CompoundCountsPool;          THolder<TCompoundCounts> CompoundCounts; - +           TCandidates Candidates; - +           TBuildSettings Settings; - +       public:          TDictBuilder(const TBuildSettings& s = TBuildSettings())              : CompoundCountsPool(8112, TMemoryPool::TLinearGrow::Instance())              , Settings(s)          {          } - +           void SetInput(const TStringBufs& in) {              Input = in;          } - +           const TBuildSettings& GetSettings() const {              return Settings;          } - +           TBuildSettings& GetSettings() {              return Settings;          } - +           void SetSettings(const TBuildSettings& s) {              Settings = s;          } - +           TEntrySet& EntrySet() {              return *Current;          } - +           const TEntrySet& EntrySet() const {              return *Current;          } - +           THolder<TEntrySet> ReleaseEntrySet() {              return std::move(Current);          } - +           ui32 /*iters*/ Build(ui32 maxentries, ui32 maxiters = 16, ui32 mindiff = 10); - +       public:          void RebuildCounts(ui32 maxcand, bool final);          ui32 /*diff size*/ BuildNextGeneration(ui32 maxent); - +           static bool IsCompound(ui64 ent) {              return ent & 0xFFFFFFFF00000000ULL;          } - +           static ui32 Next(ui64 ent) {              return ent;          }          static ui32 Prev(ui64 ent) {              return (ent >> 32) - 1;          } - +           static ui64 Compose(ui32 prev, ui32 next) {              return ((prev + 1ULL) << 32) | next;          }      }; - -} +  +}  diff --git a/library/cpp/codecs/greedy_dict/gd_entry.cpp b/library/cpp/codecs/greedy_dict/gd_entry.cpp index 2c315c7f7cf..f23a754976a 100644 --- a/library/cpp/codecs/greedy_dict/gd_entry.cpp +++ b/library/cpp/codecs/greedy_dict/gd_entry.cpp @@ -1,98 +1,98 @@ -#include "gd_entry.h" -#include "gd_stats.h" - -#include <util/generic/algorithm.h> -#include <util/generic/singleton.h> - -namespace NGreedyDict { +#include "gd_entry.h"  +#include "gd_stats.h"  +  +#include <util/generic/algorithm.h>  +#include <util/generic/singleton.h>  +  +namespace NGreedyDict {       class TAlphas {          char Memory[512]; - +       public:          TStringBufs Alphas; - +           TAlphas() {              for (ui32 i = 0; i < 256; ++i) {                  Memory[2 * i] = (char)i;                  Memory[2 * i + 1] = 0; - +                   Alphas.push_back(TStringBuf(&Memory[2 * i], 1));              }          }      }; - +       void TEntrySet::InitWithAlpha() {          Pool.ClearKeepFirstChunk();          const TStringBufs& a = Singleton<TAlphas>()->Alphas;          for (auto it : a) {              Add(it); -        } +        }           BuildHierarchy(); -    } - +    }  +       void TEntrySet::BuildHierarchy() {          Sort(begin(), end(), TEntry::StrLess); - +           TCompactTrieBuilder<char, ui32, TAsIsPacker<ui32>> builder(CTBF_PREFIX_GROUPED); - +           for (iterator it = begin(); it != end(); ++it) {              it->Number = (it - begin());              TStringBuf suff = it->Str;              size_t len = 0;              ui32 val = 0; - +               if (builder.FindLongestPrefix(suff.data(), suff.size(), &len, &val) && len) {                  it->NearestPrefix = val;              } - +               builder.Add(suff.data(), suff.size(), it->Number); -        } - +        }  +           TBufferOutput bout;          builder.Save(bout);          Trie.Init(TBlob::FromBuffer(bout.Buffer())); -    } - +    }  +       TEntry* TEntrySet::FindPrefix(TStringBuf& str) {          size_t len = 0;          ui32 off = 0; - +           if (!Trie.FindLongestPrefix(str, &len, &off)) {              return nullptr;          } - +           str.Skip(len);          return &Get(off); -    } - +    }  +       void TEntrySet::SetModelP() {          for (iterator it = begin(); it != end(); ++it) {              TEntry& e = *it; - +               if (!e.HasPrefix()) {                  e.ModelP = 0;                  continue;              } - +               TStringBuf suff = e.Str;              const TEntry& p = Get(e.NearestPrefix);              suff.Skip(p.Len()); - +               float modelp = float(p.Count + e.Count) / TotalCount; - +               while (!!suff) {                  TEntry* pp = FindPrefix(suff);                  modelp *= float(pp->Count + e.Count) / TotalCount;              } - +               e.ModelP = modelp; -        } -    } - +        }  +    }  +       void TEntrySet::SetScores(EEntryScore s) {          for (auto& it : *this) {              it.Score = Score(s, it.Len(), it.ModelP, it.Count, TotalCount);          } -    } - -} +    }  +  +}  diff --git a/library/cpp/codecs/greedy_dict/gd_entry.h b/library/cpp/codecs/greedy_dict/gd_entry.h index 18b5be0e156..0362fd9f99b 100644 --- a/library/cpp/codecs/greedy_dict/gd_entry.h +++ b/library/cpp/codecs/greedy_dict/gd_entry.h @@ -1,42 +1,42 @@ -#pragma once - -#include "gd_stats.h" - +#pragma once  +  +#include "gd_stats.h"  +   #include <library/cpp/containers/comptrie/comptrie.h> - -#include <util/generic/ptr.h> -#include <util/generic/strbuf.h> -#include <util/generic/vector.h> - -#include <util/memory/pool.h> - -namespace NGreedyDict { +  +#include <util/generic/ptr.h>  +#include <util/generic/strbuf.h>  +#include <util/generic/vector.h>  +  +#include <util/memory/pool.h>  +  +namespace NGreedyDict {       using TStringBufs = TVector<TStringBuf>; - +       struct TEntry {          static const i32 NoPrefix = -1; - +           TStringBuf Str; - +           i32 NearestPrefix = NoPrefix;          ui32 Count = 0;          ui32 Number = 0;          float ModelP = 0;          float Score = 0; - +           TEntry(TStringBuf b = TStringBuf(), ui32 cnt = 0)              : Str(b)              , Count(cnt)          {          } - +           bool HasPrefix() const {              return NearestPrefix != NoPrefix;          }          ui32 Len() const {              return Str.size();          } - +           static bool StrLess(const TEntry& a, const TEntry& b) {              return a.Str < b.Str;          } @@ -47,20 +47,20 @@ namespace NGreedyDict {              return a.Score > b.Score;          }      }; - +       class TEntrySet: public TVector<TEntry>, TNonCopyable {          TMemoryPool Pool{8112};          TCompactTrie<char, ui32, TAsIsPacker<ui32>> Trie; - +       public:          ui32 TotalCount = 0; - +           void InitWithAlpha(); - +           void Add(TStringBuf a) {              push_back(TStringBuf(Pool.Append(a.data(), a.size()), a.size()));          } - +           void Add(TStringBuf a, TStringBuf b) {              size_t sz = a.size() + b.size();              char* p = (char*)Pool.Allocate(sz); @@ -68,36 +68,36 @@ namespace NGreedyDict {              memcpy(p + a.size(), b.data(), b.size());              push_back(TStringBuf(p, sz));          } - +           TEntry& Get(ui32 idx) {              return (*this)[idx];          } - +           const TEntry& Get(ui32 idx) const {              return (*this)[idx];          } - +           void BuildHierarchy(); - +           // longest prefix          TEntry* FindPrefix(TStringBuf& str); - +           const TEntry* FindPrefix(TStringBuf& str) const {              return ((TEntrySet*)this)->FindPrefix(str);          } - +           const TEntry* FirstPrefix(const TEntry& e, TStringBuf& suff) {              if (!e.HasPrefix())                  return nullptr; - +               const TEntry& p = Get(e.NearestPrefix);              suff = e.Str;              suff.Skip(p.Str.size());              return &p;          } - +           void SetModelP();          void SetScores(EEntryScore);      }; - -} +  +}  diff --git a/library/cpp/codecs/greedy_dict/gd_stats.h b/library/cpp/codecs/greedy_dict/gd_stats.h index b63c4c38d23..3c209fc67d2 100644 --- a/library/cpp/codecs/greedy_dict/gd_stats.h +++ b/library/cpp/codecs/greedy_dict/gd_stats.h @@ -1,10 +1,10 @@ -#pragma once - +#pragma once  +   #include <util/generic/ymath.h> -#include <util/generic/algorithm.h> -#include <util/generic/yexception.h> - -namespace NGreedyDict { +#include <util/generic/algorithm.h>  +#include <util/generic/yexception.h>  +  +namespace NGreedyDict {       enum EEntryScore {          ES_COUNT,          ES_LEN_COUNT, @@ -12,33 +12,33 @@ namespace NGreedyDict {          ES_LEN_SIMPLE,          ES_SOLAR      }; - +       enum EEntryStatTest {          EST_NONE = 0,          EST_SIMPLE_NORM = 2      }; - +       inline float ModelP(ui32 countA, ui32 countB, ui32 total) {          return float(countA) * countB / total / total;      } - +       // P (ab | dependent)      inline float SimpleTest(float modelp, ui32 countAB, ui32 total) {          float realp = float(countAB) / total;          return modelp >= realp ? 0 : (realp - modelp);      } - +       inline float SolarTest(float modelp, ui32 countAB, ui32 total) {          float realp = float(countAB) / total;          return modelp >= realp ? 0 : (modelp + realp * (log(realp / modelp) - 1));      } - +       // P (ab | dependent) / P (ab)      inline float SimpleTestNorm(float modelp, ui32 countAB, ui32 total) {          float realp = float(countAB) / total;          return modelp >= realp ? 0 : (realp - modelp) / realp;      } - +       inline float StatTest(EEntryStatTest test, float modelp, ui32 countAB, ui32 total) {          if (!total) {              return 0; @@ -50,9 +50,9 @@ namespace NGreedyDict {                  return SimpleTestNorm(modelp, countAB, total);          }          Y_FAIL("no way!"); -        return 0; -    } - +        return 0;  +    }  +       inline float Score(EEntryScore score, ui32 len, float modelp, ui32 count, ui32 total) {          if (!total) {              return 0; @@ -73,7 +73,7 @@ namespace NGreedyDict {                  return SolarTest(modelp, count, total);          }          Y_FAIL("no way!"); -        return 0; -    } - -} +        return 0;  +    }  +  +}  diff --git a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp index 679089a11be..60ab9f7c308 100644 --- a/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp +++ b/library/cpp/codecs/greedy_dict/ut/greedy_dict_ut.cpp @@ -1,282 +1,282 @@ -#include "gd_builder.h" - +#include "gd_builder.h"  +   #include <library/cpp/testing/unittest/registar.h>  #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> -#include <util/string/printf.h> +#include <util/string/printf.h>   #include <util/generic/ymath.h> - -class TGreedyDictTest: public TTestBase { +  +class TGreedyDictTest: public TTestBase {       UNIT_TEST_SUITE(TGreedyDictTest); -    UNIT_TEST(TestEntrySet) -    UNIT_TEST(TestBuilder0) -    UNIT_TEST(TestBuilder) +    UNIT_TEST(TestEntrySet)  +    UNIT_TEST(TestBuilder0)  +    UNIT_TEST(TestBuilder)       UNIT_TEST_SUITE_END(); - -    void TestEntrySet() { -        using namespace NGreedyDict; - -        { -            TEntrySet d; - -            d.InitWithAlpha(); - -            for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) { -                UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix)); -                UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin())); -            } - -            UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u); -            TStringBuf s = "aaabbb"; -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); -            UNIT_ASSERT_VALUES_EQUAL(s, "aabbb"); -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); -            UNIT_ASSERT_VALUES_EQUAL(s, "abbb"); -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a"); -            UNIT_ASSERT_VALUES_EQUAL(s, "bbb"); -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); -            UNIT_ASSERT_VALUES_EQUAL(s, "bb"); -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); -            UNIT_ASSERT_VALUES_EQUAL(s, "b"); -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b"); -            UNIT_ASSERT_VALUES_EQUAL(s, ""); -            s = TStringBuf("", 1); -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1)); -            UNIT_ASSERT_VALUES_EQUAL(s, ""); -            s = "\xFF"; -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF"); -            UNIT_ASSERT_VALUES_EQUAL(s, ""); -        } -        { -            TEntrySet d; -            d.Add("a"); -            d.Add("b"); -            d.Add("b", "a"); -            d.BuildHierarchy(); - -            UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u); - -            TStringBuf s = "bab"; -            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba"); -            UNIT_ASSERT_VALUES_EQUAL(s, "b"); -        } -        { -            TEntrySet d; - -            d.Add("a"); -            d.Add("aa"); -            d.Add("aaa"); -            d.Add("aab"); -            d.Add("b"); -            d.Add("ba"); - -            d.BuildHierarchy(); - -            UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u); -            { -                TStringBuf s = "aaaaa"; -                const TEntry* e = d.FindPrefix(s); -                UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa"); -                UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u); -                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1); -                UNIT_ASSERT_VALUES_EQUAL(s, "aa"); -            } - -            { -                TStringBuf s = "a"; -                const TEntry* e = d.FindPrefix(s); -                UNIT_ASSERT_VALUES_EQUAL(e->Str, "a"); -                UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u); -                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); -                UNIT_ASSERT_VALUES_EQUAL(s, ""); -            } - -            { -                TStringBuf s = "bab"; -                const TEntry* e = d.FindPrefix(s); -                UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba"); -                UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u); -                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4); -                UNIT_ASSERT_VALUES_EQUAL(s, "b"); -            } - -            { -                TStringBuf s = "bba"; -                const TEntry* e = d.FindPrefix(s); -                UNIT_ASSERT_VALUES_EQUAL(e->Str, "b"); -                UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u); -                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1); -                UNIT_ASSERT_VALUES_EQUAL(s, "ba"); -            } -        } -    } - -    void TestBuilder0() { -        using namespace NGreedyDict; -        ui32 a = 1, b = 11; -        ui64 ab = TDictBuilder::Compose(a, b); -        UNIT_ASSERT(TDictBuilder::IsCompound(ab)); -        UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a); -        UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b); -    } - -    void FillData(NGreedyDict::TStringBufs& data) { +  +    void TestEntrySet() {  +        using namespace NGreedyDict;  +  +        {  +            TEntrySet d;  +  +            d.InitWithAlpha();  +  +            for (TEntrySet::const_iterator it = d.begin(); it != d.end(); ++it) {  +                UNIT_ASSERT_C(!it->HasPrefix(), Sprintf("%u -> %u", it->Number, it->NearestPrefix));  +                UNIT_ASSERT_VALUES_EQUAL(it->Number, (ui32)(it - d.begin()));  +            }  +  +            UNIT_ASSERT_VALUES_EQUAL(d.size(), 256u);  +            TStringBuf s = "aaabbb";  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");  +            UNIT_ASSERT_VALUES_EQUAL(s, "aabbb");  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");  +            UNIT_ASSERT_VALUES_EQUAL(s, "abbb");  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "a");  +            UNIT_ASSERT_VALUES_EQUAL(s, "bbb");  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");  +            UNIT_ASSERT_VALUES_EQUAL(s, "bb");  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");  +            UNIT_ASSERT_VALUES_EQUAL(s, "b");  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "b");  +            UNIT_ASSERT_VALUES_EQUAL(s, "");  +            s = TStringBuf("", 1);  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, TStringBuf("", 1));  +            UNIT_ASSERT_VALUES_EQUAL(s, "");  +            s = "\xFF";  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "\xFF");  +            UNIT_ASSERT_VALUES_EQUAL(s, "");  +        }  +        {  +            TEntrySet d;  +            d.Add("a");  +            d.Add("b");  +            d.Add("b", "a");  +            d.BuildHierarchy();  +  +            UNIT_ASSERT_VALUES_EQUAL(d.size(), 3u);  +  +            TStringBuf s = "bab";  +            UNIT_ASSERT_VALUES_EQUAL(d.FindPrefix(s)->Str, "ba");  +            UNIT_ASSERT_VALUES_EQUAL(s, "b");  +        }  +        {  +            TEntrySet d;  +  +            d.Add("a");  +            d.Add("aa");  +            d.Add("aaa");  +            d.Add("aab");  +            d.Add("b");  +            d.Add("ba");  +  +            d.BuildHierarchy();  +  +            UNIT_ASSERT_VALUES_EQUAL(d.size(), 6u);  +            {  +                TStringBuf s = "aaaaa";  +                const TEntry* e = d.FindPrefix(s);  +                UNIT_ASSERT_VALUES_EQUAL(e->Str, "aaa");  +                UNIT_ASSERT_VALUES_EQUAL(e->Number, 2u);  +                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 1);  +                UNIT_ASSERT_VALUES_EQUAL(s, "aa");  +            }  +  +            {  +                TStringBuf s = "a";  +                const TEntry* e = d.FindPrefix(s);  +                UNIT_ASSERT_VALUES_EQUAL(e->Str, "a");  +                UNIT_ASSERT_VALUES_EQUAL(e->Number, 0u);  +                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1);  +                UNIT_ASSERT_VALUES_EQUAL(s, "");  +            }  +  +            {  +                TStringBuf s = "bab";  +                const TEntry* e = d.FindPrefix(s);  +                UNIT_ASSERT_VALUES_EQUAL(e->Str, "ba");  +                UNIT_ASSERT_VALUES_EQUAL(e->Number, 5u);  +                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, 4);  +                UNIT_ASSERT_VALUES_EQUAL(s, "b");  +            }  +  +            {  +                TStringBuf s = "bba";  +                const TEntry* e = d.FindPrefix(s);  +                UNIT_ASSERT_VALUES_EQUAL(e->Str, "b");  +                UNIT_ASSERT_VALUES_EQUAL(e->Number, 4u);  +                UNIT_ASSERT_VALUES_EQUAL(e->NearestPrefix, -1);  +                UNIT_ASSERT_VALUES_EQUAL(s, "ba");  +            }  +        }  +    }  +  +    void TestBuilder0() {  +        using namespace NGreedyDict;  +        ui32 a = 1, b = 11;  +        ui64 ab = TDictBuilder::Compose(a, b);  +        UNIT_ASSERT(TDictBuilder::IsCompound(ab));  +        UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Prev(ab), a);  +        UNIT_ASSERT_VALUES_EQUAL(TDictBuilder::Next(ab), b);  +    }  +  +    void FillData(NGreedyDict::TStringBufs& data) {           static const char* urls[] = {"http://53.ru/car/motors/foreign/opel/tigra/", "http://abakan.24au.ru/tender/85904/", "http://anm15.gulaig.com/", "http://avto-parts.com/mercedes-benz/mercedes-benz-w220-1998-2005/category-442/category-443/", "http://ballooncousin.co.uk/", "http://benzol.ru/equipment/?id=1211&parent=514", "http://blazingseorank.com/blazing-seo-rank-free-website-analysis-to-increase-rank-and-traffic-450.html", "http://blogblaugrana.contadorwebmasters.com/", "http://bristolhash.org.uk/bh3cntct.php", "http://broker.borovichi.ru/category/item/3/1/0/8/28/257", "http://canoncompactcamerax.blogspot.com/", "http://classifieds.smashits.com/p,107881,email-to-friend.htm", "http://conferences.ksde.org/Portals/132/FallAssessment/SAVETHEDAY-FA09.pdf", "http://eway.vn/raovat/325-dien-tu-gia-dung/337-dieu-hoa/98041-b1-sua-may-lanh-quan-binh-tan-sua-may-lanh-quan-binh-chanh-hh-979676119-toan-quoc.html", "http://gallery.e2bn.org/asset73204_8-.html", "http://goplay.nsw.gov.au/activities-for-kids/by/historic-houses-trust/?startdate=2012-07-10", "http://grichards19067.multiply.com/", "http://hotkovo.egent.ru/user/89262269084/", "http://howimetyourself.com/?redirect_to=http://gomiso.com/m/suits/seasons/2/episodes/2", "http://islamqa.com/hi/ref/9014/DEAD%20PEOPLE%20GOOD%20DEEDS", "http://lapras.rutube.ru/", "http://nceluiko.ya.ru/", "http://nyanyanyanyaa.beon.ru/", "http://ozbo.com/Leaf-River-DV-7SS-7-0-MP-Game-Camera-K1-32541.html", "http://sbantom.ru/catalog/chasy/632753.html", "http://shopingoff.com/index.php?option=com_virtuemart&Itemid=65&category_id=&page=shop.browse&manufacturer_id=122&limit=32&limitstart=96", "http://shopingoff.com/katalog-odezhdy/manufacturer/62-christian-audigier.html?limit=32&start=448", "https://webwinkel.ah.nl/process?fh_location=//ecommerce/nl_NL/categories%3C%7Becommerce_shoc1%7D/it_show_product_code_1384%3E%7B10%3B20%7D/pr_startdate%3C20120519/pr_enddate%3E20120519/pr_ltc_allowed%3E%7Bbowi%7D/categories%3C%7Becommerce_shoc1_1al%7D/categories%3C%7Becommerce_shoc1_1al_1ahal%7D&&action=albert_noscript.modules.build", "http://top100.rambler.ru/navi/?theme=208/210/371&rgn=17", "http://volgogradskaya-oblast.extra-m.ru/classifieds/rabota/vakansii/banki-investicii/901467/", "http://wikien4.appspot.com/wiki/Warburg_hypothesis", "http://wola_baranowska.kamerzysta24.com.pl/", "http://www.10dot0dot0dot1.com/", "http://www.anima-redux.ru/index.php?key=gifts+teenage+girls", "http://www.aquaticabyseaworld.com/Calendar.aspx/CP/CP/CP/sp-us/CP/CP/ParkMap/Tickets/Weather.aspx", "http://www.autousa.com/360spin/2012_cadillac_ctssportwagon_3.6awdpremiumcollection.htm", "http://www.booking.com/city/gb/paignton-aireborough.html?inac=0&lang=pl", "http://www.booking.com/city/it/vodo-cadore.en.html", "http://www.booking.com/district/us/new-york/rockefeller-center.html&lang=no", "http://www.booking.com/hotel/bg/crown-fort-club.lv.html", "http://www.booking.com/hotel/ca/gouverneur-rimouski.ar.html", "http://www.booking.com/hotel/ch/l-auberge-du-chalet-a-gobet.fi.html", "http://www.booking.com/hotel/de/mark-garni.ru.html?aid=337384;label=yandex-hotel-mark-garni-68157-%7Bparam1%7D", "http://www.booking.com/hotel/de/mercure-goldschmieding-castrop-rauxel.ro.html", "http://www.booking.com/hotel/de/zollenspieker-fahrhaus.fr.html", "http://www.booking.com/hotel/es/jardin-metropolitano.ca.html", "http://www.booking.com/hotel/fr/clim.fr.html", "http://www.booking.com/hotel/fr/radisson-sas-toulouse-airport.et.html", "http://www.booking.com/hotel/gb/stgileshotel.ro.html?srfid=68c7fe42a03653a8796c84435c5299e4X16?tab=4", "http://www.booking.com/hotel/gr/rodos-park-suites.ru.html", "http://www.booking.com/hotel/id/le-grande-suites-bali.ru.html", "http://www.booking.com/hotel/it/mozart.it.html?aid=321655", "http://www.booking.com/hotel/ni/bahia-del-sol-villas.ru.html?dcid=1;dva=0", "http://www.booking.com/hotel/nl/cpschiphol.ro.html.ro.html?tab=4", "http://www.booking.com/hotel/th/laem-din.en-gb.html", "http://www.booking.com/hotel/th/tinidee-ranong.en.html", "http://www.booking.com/hotel/us/best-western-plus-merrimack-valley.hu.html", "http://www.booking.com/hotel/vn/tan-hai-long.km.html", "http://www.booking.com/landmark/au/royal-brisbane-women-s-hospital.vi.html", "http://www.booking.com/landmark/hk/nam-cheong-station.html&lang=id", "http://www.booking.com/landmark/it/spanish-steps.ca.html", "http://www.booking.com/landmark/sg/asian-civilisations-museum.html&lang=fi", "http://www.booking.com/place/fi-1376029.pt.html", "http://www.booking.com/place/tn257337.pl.html", "http://www.booking.com/region/ca/niagarafalls.ar.html&selected_currency=PLN", "http://www.booking.com/region/mx/queretaro.pt-pt.html&selected_currency=AUD", "http://www.booking.com/searchresults.en.html?city=20063074", "http://www.booking.com/searchresults.et.html?checkin=;checkout=;city=-394632", "http://www.booking.com/searchresults.lv.html?region=3936", "http://www.cevredanismanlari.com/index.php/component/k2/index.php/mevzuat/genel-yazlar/item/dosyalar/index.php?option=com_k2&view=item&id=16:iso-14001-%C3%A7evre-y%C3%B6netim-sistemi&Itemid=132&limitstart=107120", "http://www.dh-wholesaler.com/MENS-POLO-RACING-TEE-RL-p-417.html", "http://www.employabilityonline.net/", "http://www.esso.inc.ru/board/tools.php?event=profile&pname=Invinerrq", "http://www.filesurgery.ru/searchfw/kids_clothes-3.html", "http://www.furnitureandcarpetsource.com/Item.aspx?ItemID=-2107311899&ItemNum=53-T3048", "http://www.gets.cn/product/Gold-Sand-Lampwork-Glass-Beads--Flat-round--28x28x13mm_p260717.html", "http://www.gets.cn/wholesale-Sterling-Silver-Pendant-Findings-3577_S--L-Star-P-1.html?view=1&by=1", "http://www.homeandgardenadvice.com/diy/Mortgages_Loans_and_Financing/9221.html", "http://www.hongkongairport.com/eng/index.html/passenger/passenger/transport/to-from-airport/business/about-the-airport/transport/shopping/entertainment/t2/passenger/interactive-map.html", "http://www.hongkongairport.com/eng/index.html/shopping/insideshopping/all/passenger/transfer-transit/all/airline-information/shopping/entertainment/t2/business/about-the-airport/welcome.html", "http://www.hongkongairport.com/eng/index.html/transport/business/about-the-airport/transport/business/airport-authority/passenger/shopping/dining/all/dining.html", "http://www.idedge.com/index.cfm/fuseaction/category.display/category_id/298/index.cfm", "http://www.istanbulburda.com/aramalar.php", "http://www.jewelryinthenet.com/ads/AdDetail.aspx?AdID=1-0311002490689&stid=22-0111001020877", "http://www.johnnydepp.ru/forum/index.php?showtopic=1629&mode=linearplus&view=findpost&p=186977", "http://www.johnnydepp.ru/forum/index.php?showtopic=476&st=60&p=87379&", "http://www.joseleano.com/joomla/index.php/audio", "http://www.kaplicarehberi.com/tag/sakar-ilicali-kaplicalari/feed", "http://www.khaber.com.tr/arama.html?key=%C3%A7avdar", "http://www.kiz-oyunlari1.com/1783/4437/4363/1056/4170/Bump-Copter2-.html", "http://www.kiz-oyunlari1.com/3752/2612/4175/1166/3649/1047/Angelina-Oyunu.html", "http://www.kiz-oyunlari1.com/4266/3630/3665/3286/4121/301/3274/Sinir-Sinekler-.html", "http://www.kuldiga.lv/index.php?f=8&cat=371", "http://www.kuldiga.lv/index.php/img/index.php?l=lv&art_id=1836&show_c=&cat=85", "http://www.patronessa.ru/remontiruemsya/kuzovnie30raboti.html", "http://www.rapdict.org/Nu_Money?title=Talk:Nu_Money&action=edit", "http://www.serafin-phu.tabor24.com/?page=8", "http://www.shoes-store.org/brand1/Kids/Minnetonka.html", "http://www.shoes-store.org/shoes-store.xml", "http://www.way2allah.com/khotab-download-34695.htm"}; -        data.clear(); +        data.clear();           data.insert(data.begin(), urls, urls + Y_ARRAY_SIZE(urls)); -    } - +    }  +       typedef THashMap<TStringBuf, NGreedyDict::TEntry> TDict; - -    TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s, +  +    TAutoPtr<NGreedyDict::TEntrySet> DoTestBuilder(const NGreedyDict::TBuildSettings& s,                                                      TDict& res) { -        using namespace NGreedyDict; - -        TStringBufs data; -        FillData(data); - -        TDictBuilder b(s); -        b.SetInput(data); -        b.Build(256 + 128); - -        TEntrySet& set = b.EntrySet(); - +        using namespace NGreedyDict;  +  +        TStringBufs data;  +        FillData(data);  +  +        TDictBuilder b(s);  +        b.SetInput(data);  +        b.Build(256 + 128);  +  +        TEntrySet& set = b.EntrySet();  +           for (const auto& it : set) {              if (it.Score) {                  res[it.Str] = it; -            } -        } - -        return b.ReleaseEntrySet(); -    } - -    void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) { -        TDict::const_iterator it = dict.find(entry); -        UNIT_ASSERT_C(it != dict.end(), entry); -        UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry); -        UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry); -        UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry); -    } - -    void TestBuilder() { -        TAutoPtr<NGreedyDict::TEntrySet> set; +            }  +        }  +  +        return b.ReleaseEntrySet();  +    }  +  +    void DoAssertEntry(TStringBuf entry, ui32 number, i32 parent, float score, const TDict& dict) {  +        TDict::const_iterator it = dict.find(entry);  +        UNIT_ASSERT_C(it != dict.end(), entry);  +        UNIT_ASSERT_VALUES_EQUAL_C(it->second.Number, number, entry);  +        UNIT_ASSERT_VALUES_EQUAL_C(it->second.NearestPrefix, parent, entry);  +        UNIT_ASSERT_VALUES_EQUAL_C(round(it->second.Score * 10000), round(score * 10000), entry);  +    }  +  +    void TestBuilder() {  +        TAutoPtr<NGreedyDict::TEntrySet> set;           THashMap<TStringBuf, NGreedyDict::TEntry> res; -        NGreedyDict::TBuildSettings s; -        set = DoTestBuilder(s, res); - -        UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u); -        UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u); - -        DoAssertEntry("%", 37, -1, 0.00375193, res); -        DoAssertEntry("%7", 38, 37, 0.00513299, res); -        DoAssertEntry("&", 39, -1, 0.00794527, res); -        DoAssertEntry("+", 44, -1, 0.000441404, res); -        DoAssertEntry(",", 45, -1, 0.000441404, res); -        DoAssertEntry("-", 46, -1, 0.0417126, res); -        DoAssertEntry(".", 47, -1, 0.0196425, res); -        DoAssertEntry(".com/", 48, 47, 0.0374482, res); -        DoAssertEntry(".html", 49, 47, 0.0496577, res); -        DoAssertEntry(".html?", 50, 49, 0.0153908, res); -        DoAssertEntry(".php", 51, 47, 0.0123585, res); -        DoAssertEntry(".ru/", 52, 47, 0.0150027, res); -        DoAssertEntry("/", 53, -1, 0.0452439, res); -        DoAssertEntry("/index", 54, 53, 0.0158905, res); -        DoAssertEntry("0", 55, -1, 0.00816597, res); -        DoAssertEntry("1", 56, -1, 0.0167733, res); -        DoAssertEntry("10", 57, 56, 0.00530474, res); -        DoAssertEntry("2", 58, -1, 0.0101523, res); -        DoAssertEntry("20", 59, 58, 0.00674234, res); -        DoAssertEntry("3", 60, -1, 0.01258, res); -        DoAssertEntry("32", 61, 60, 0.00490697, res); -        DoAssertEntry("4", 62, -1, 0.00993158, res); -        DoAssertEntry("5", 63, -1, 0.00617965, res); -        DoAssertEntry("6", 64, -1, 0.00971088, res); -        DoAssertEntry("7", 65, -1, 0.0101523, res); -        DoAssertEntry("8", 66, -1, 0.00728316, res); -        DoAssertEntry("9", 67, -1, 0.00728316, res); -        DoAssertEntry(":", 68, -1, 0.000662106, res); -        DoAssertEntry(";", 69, -1, 0.000882807, res); -        DoAssertEntry("=", 71, -1, 0.01258, res); -        DoAssertEntry("?", 73, -1, 0.00397263, res); -        DoAssertEntry("A", 75, -1, 0.00264842, res); -        DoAssertEntry("B", 76, -1, 0.00220702, res); -        DoAssertEntry("C", 77, -1, 0.00353123, res); -        DoAssertEntry("D", 78, -1, 0.00375193, res); -        DoAssertEntry("E", 79, -1, 0.00286912, res); -        DoAssertEntry("F", 80, -1, 0.00110351, res); -        DoAssertEntry("G", 81, -1, 0.00110351, res); -        DoAssertEntry("H", 82, -1, 0.000220702, res); -        DoAssertEntry("I", 83, -1, 0.00198632, res); -        DoAssertEntry("K", 85, -1, 0.000441404, res); -        DoAssertEntry("L", 86, -1, 0.00198632, res); -        DoAssertEntry("M", 87, -1, 0.00154491, res); -        DoAssertEntry("N", 88, -1, 0.00154491, res); -        DoAssertEntry("O", 89, -1, 0.00132421, res); -        DoAssertEntry("P", 90, -1, 0.00308983, res); -        DoAssertEntry("R", 92, -1, 0.000662106, res); -        DoAssertEntry("S", 93, -1, 0.00264842, res); -        DoAssertEntry("T", 94, -1, 0.00110351, res); -        DoAssertEntry("U", 95, -1, 0.000220702, res); -        DoAssertEntry("V", 96, -1, 0.000441404, res); -        DoAssertEntry("W", 97, -1, 0.000441404, res); -        DoAssertEntry("X", 98, -1, 0.000220702, res); -        DoAssertEntry("Y", 99, -1, 0.000220702, res); -        DoAssertEntry("_", 105, -1, 0.00904877, res); -        DoAssertEntry("a", 107, -1, 0.0505407, res); -        DoAssertEntry("an", 108, 107, 0.018273, res); -        DoAssertEntry("ar", 109, 107, 0.0169385, res); -        DoAssertEntry("b", 110, -1, 0.0156698, res); -        DoAssertEntry("c", 111, -1, 0.018539, res); -        DoAssertEntry("cat", 112, 111, 0.00846732, res); -        DoAssertEntry("ch", 113, 111, 0.00644872, res); -        DoAssertEntry("com", 114, 111, 0.00724235, res); -        DoAssertEntry("ct", 115, 111, 0.00605729, res); -        DoAssertEntry("d", 116, -1, 0.020746, res); -        DoAssertEntry("di", 117, 116, 0.00730659, res); -        DoAssertEntry("e", 118, -1, 0.0624586, res); -        DoAssertEntry("en", 119, 118, 0.0108999, res); -        DoAssertEntry("ent", 120, 119, 0.00616002, res); -        DoAssertEntry("f", 121, -1, 0.00860737, res); -        DoAssertEntry("fi", 122, 121, 0.00423196, res); -        DoAssertEntry("g", 123, -1, 0.0180975, res); -        DoAssertEntry("go", 124, 123, 0.00601862, res); -        DoAssertEntry("h", 125, -1, 0.010373, res); -        DoAssertEntry("ho", 126, 125, 0.00570298, res); -        DoAssertEntry("http://", 127, 125, 0.0494372, res); -        DoAssertEntry("http://www.", 128, 127, 0.0849702, res); -        DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res); -        DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res); -        DoAssertEntry("i", 131, -1, 0.0258221, res); -        DoAssertEntry("id=", 132, 131, 0.00725369, res); -        DoAssertEntry("im", 133, 131, 0.00373318, res); -        DoAssertEntry("in", 134, 131, 0.013625, res); -        DoAssertEntry("ing", 135, 134, 0.00795491, res); -        DoAssertEntry("ion", 136, 131, 0.00796149, res); -        DoAssertEntry("it", 137, 131, 0.00953416, res); -        DoAssertEntry("j", 138, -1, 0.00132421, res); -        DoAssertEntry("k", 139, -1, 0.0134628, res); -        DoAssertEntry("l", 140, -1, 0.0381814, res); -        DoAssertEntry("m", 141, -1, 0.0174354, res); -        DoAssertEntry("mer", 142, 141, 0.00711846, res); -        DoAssertEntry("n", 143, -1, 0.0132421, res); -        DoAssertEntry("o", 144, -1, 0.0302362, res); -        DoAssertEntry("on", 145, 144, 0.00802271, res); -        DoAssertEntry("ou", 146, 144, 0.00414545, res); -        DoAssertEntry("p", 147, -1, 0.0225116, res); -        DoAssertEntry("port", 148, 147, 0.0123532, res); -        DoAssertEntry("q", 149, -1, 0.00176561, res); -        DoAssertEntry("r", 150, -1, 0.0401677, res); -        DoAssertEntry("ran", 151, 150, 0.00686918, res); -        DoAssertEntry("s", 152, -1, 0.0487751, res); -        DoAssertEntry("sho", 153, 152, 0.0113876, res); -        DoAssertEntry("t", 154, -1, 0.0379607, res); -        DoAssertEntry("u", 155, -1, 0.0211874, res); -        DoAssertEntry("v", 156, -1, 0.00595895, res); -        DoAssertEntry("vi", 157, 156, 0.00480673, res); -        DoAssertEntry("w", 158, -1, 0.00816597, res); -        DoAssertEntry("x", 159, -1, 0.00375193, res); -        DoAssertEntry("y", 160, -1, 0.0130214, res); -        DoAssertEntry("z", 161, -1, 0.00353123, res); -    } -}; - -UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest); +        NGreedyDict::TBuildSettings s;  +        set = DoTestBuilder(s, res);  +  +        UNIT_ASSERT_VALUES_EQUAL(set->size(), 295u);  +        UNIT_ASSERT_VALUES_EQUAL(res.size(), 110u);  +  +        DoAssertEntry("%", 37, -1, 0.00375193, res);  +        DoAssertEntry("%7", 38, 37, 0.00513299, res);  +        DoAssertEntry("&", 39, -1, 0.00794527, res);  +        DoAssertEntry("+", 44, -1, 0.000441404, res);  +        DoAssertEntry(",", 45, -1, 0.000441404, res);  +        DoAssertEntry("-", 46, -1, 0.0417126, res);  +        DoAssertEntry(".", 47, -1, 0.0196425, res);  +        DoAssertEntry(".com/", 48, 47, 0.0374482, res);  +        DoAssertEntry(".html", 49, 47, 0.0496577, res);  +        DoAssertEntry(".html?", 50, 49, 0.0153908, res);  +        DoAssertEntry(".php", 51, 47, 0.0123585, res);  +        DoAssertEntry(".ru/", 52, 47, 0.0150027, res);  +        DoAssertEntry("/", 53, -1, 0.0452439, res);  +        DoAssertEntry("/index", 54, 53, 0.0158905, res);  +        DoAssertEntry("0", 55, -1, 0.00816597, res);  +        DoAssertEntry("1", 56, -1, 0.0167733, res);  +        DoAssertEntry("10", 57, 56, 0.00530474, res);  +        DoAssertEntry("2", 58, -1, 0.0101523, res);  +        DoAssertEntry("20", 59, 58, 0.00674234, res);  +        DoAssertEntry("3", 60, -1, 0.01258, res);  +        DoAssertEntry("32", 61, 60, 0.00490697, res);  +        DoAssertEntry("4", 62, -1, 0.00993158, res);  +        DoAssertEntry("5", 63, -1, 0.00617965, res);  +        DoAssertEntry("6", 64, -1, 0.00971088, res);  +        DoAssertEntry("7", 65, -1, 0.0101523, res);  +        DoAssertEntry("8", 66, -1, 0.00728316, res);  +        DoAssertEntry("9", 67, -1, 0.00728316, res);  +        DoAssertEntry(":", 68, -1, 0.000662106, res);  +        DoAssertEntry(";", 69, -1, 0.000882807, res);  +        DoAssertEntry("=", 71, -1, 0.01258, res);  +        DoAssertEntry("?", 73, -1, 0.00397263, res);  +        DoAssertEntry("A", 75, -1, 0.00264842, res);  +        DoAssertEntry("B", 76, -1, 0.00220702, res);  +        DoAssertEntry("C", 77, -1, 0.00353123, res);  +        DoAssertEntry("D", 78, -1, 0.00375193, res);  +        DoAssertEntry("E", 79, -1, 0.00286912, res);  +        DoAssertEntry("F", 80, -1, 0.00110351, res);  +        DoAssertEntry("G", 81, -1, 0.00110351, res);  +        DoAssertEntry("H", 82, -1, 0.000220702, res);  +        DoAssertEntry("I", 83, -1, 0.00198632, res);  +        DoAssertEntry("K", 85, -1, 0.000441404, res);  +        DoAssertEntry("L", 86, -1, 0.00198632, res);  +        DoAssertEntry("M", 87, -1, 0.00154491, res);  +        DoAssertEntry("N", 88, -1, 0.00154491, res);  +        DoAssertEntry("O", 89, -1, 0.00132421, res);  +        DoAssertEntry("P", 90, -1, 0.00308983, res);  +        DoAssertEntry("R", 92, -1, 0.000662106, res);  +        DoAssertEntry("S", 93, -1, 0.00264842, res);  +        DoAssertEntry("T", 94, -1, 0.00110351, res);  +        DoAssertEntry("U", 95, -1, 0.000220702, res);  +        DoAssertEntry("V", 96, -1, 0.000441404, res);  +        DoAssertEntry("W", 97, -1, 0.000441404, res);  +        DoAssertEntry("X", 98, -1, 0.000220702, res);  +        DoAssertEntry("Y", 99, -1, 0.000220702, res);  +        DoAssertEntry("_", 105, -1, 0.00904877, res);  +        DoAssertEntry("a", 107, -1, 0.0505407, res);  +        DoAssertEntry("an", 108, 107, 0.018273, res);  +        DoAssertEntry("ar", 109, 107, 0.0169385, res);  +        DoAssertEntry("b", 110, -1, 0.0156698, res);  +        DoAssertEntry("c", 111, -1, 0.018539, res);  +        DoAssertEntry("cat", 112, 111, 0.00846732, res);  +        DoAssertEntry("ch", 113, 111, 0.00644872, res);  +        DoAssertEntry("com", 114, 111, 0.00724235, res);  +        DoAssertEntry("ct", 115, 111, 0.00605729, res);  +        DoAssertEntry("d", 116, -1, 0.020746, res);  +        DoAssertEntry("di", 117, 116, 0.00730659, res);  +        DoAssertEntry("e", 118, -1, 0.0624586, res);  +        DoAssertEntry("en", 119, 118, 0.0108999, res);  +        DoAssertEntry("ent", 120, 119, 0.00616002, res);  +        DoAssertEntry("f", 121, -1, 0.00860737, res);  +        DoAssertEntry("fi", 122, 121, 0.00423196, res);  +        DoAssertEntry("g", 123, -1, 0.0180975, res);  +        DoAssertEntry("go", 124, 123, 0.00601862, res);  +        DoAssertEntry("h", 125, -1, 0.010373, res);  +        DoAssertEntry("ho", 126, 125, 0.00570298, res);  +        DoAssertEntry("http://", 127, 125, 0.0494372, res);  +        DoAssertEntry("http://www.", 128, 127, 0.0849702, res);  +        DoAssertEntry("http://www.booking.com/", 129, 128, 0.071066, res);  +        DoAssertEntry("http://www.booking.com/hotel/", 130, 129, 0.121607, res);  +        DoAssertEntry("i", 131, -1, 0.0258221, res);  +        DoAssertEntry("id=", 132, 131, 0.00725369, res);  +        DoAssertEntry("im", 133, 131, 0.00373318, res);  +        DoAssertEntry("in", 134, 131, 0.013625, res);  +        DoAssertEntry("ing", 135, 134, 0.00795491, res);  +        DoAssertEntry("ion", 136, 131, 0.00796149, res);  +        DoAssertEntry("it", 137, 131, 0.00953416, res);  +        DoAssertEntry("j", 138, -1, 0.00132421, res);  +        DoAssertEntry("k", 139, -1, 0.0134628, res);  +        DoAssertEntry("l", 140, -1, 0.0381814, res);  +        DoAssertEntry("m", 141, -1, 0.0174354, res);  +        DoAssertEntry("mer", 142, 141, 0.00711846, res);  +        DoAssertEntry("n", 143, -1, 0.0132421, res);  +        DoAssertEntry("o", 144, -1, 0.0302362, res);  +        DoAssertEntry("on", 145, 144, 0.00802271, res);  +        DoAssertEntry("ou", 146, 144, 0.00414545, res);  +        DoAssertEntry("p", 147, -1, 0.0225116, res);  +        DoAssertEntry("port", 148, 147, 0.0123532, res);  +        DoAssertEntry("q", 149, -1, 0.00176561, res);  +        DoAssertEntry("r", 150, -1, 0.0401677, res);  +        DoAssertEntry("ran", 151, 150, 0.00686918, res);  +        DoAssertEntry("s", 152, -1, 0.0487751, res);  +        DoAssertEntry("sho", 153, 152, 0.0113876, res);  +        DoAssertEntry("t", 154, -1, 0.0379607, res);  +        DoAssertEntry("u", 155, -1, 0.0211874, res);  +        DoAssertEntry("v", 156, -1, 0.00595895, res);  +        DoAssertEntry("vi", 157, 156, 0.00480673, res);  +        DoAssertEntry("w", 158, -1, 0.00816597, res);  +        DoAssertEntry("x", 159, -1, 0.00375193, res);  +        DoAssertEntry("y", 160, -1, 0.0130214, res);  +        DoAssertEntry("z", 161, -1, 0.00353123, res);  +    }  +};  +  +UNIT_TEST_SUITE_REGISTRATION(TGreedyDictTest);  diff --git a/library/cpp/codecs/greedy_dict/ut/ya.make b/library/cpp/codecs/greedy_dict/ut/ya.make index bd67d1a4522..e5d597a0834 100644 --- a/library/cpp/codecs/greedy_dict/ut/ya.make +++ b/library/cpp/codecs/greedy_dict/ut/ya.make @@ -1,5 +1,5 @@  UNITTEST_FOR(library/cpp/codecs/greedy_dict) - +   OWNER(velavokr)  SRCS( diff --git a/library/cpp/codecs/greedy_dict/ya.make b/library/cpp/codecs/greedy_dict/ya.make index 2a57224f7e1..6904a354de6 100644 --- a/library/cpp/codecs/greedy_dict/ya.make +++ b/library/cpp/codecs/greedy_dict/ya.make @@ -1,15 +1,15 @@  OWNER(velavokr) -LIBRARY() - -SRCS( -    gd_builder.cpp -    gd_entry.cpp -) - -PEERDIR( +LIBRARY()  +  +SRCS(  +    gd_builder.cpp  +    gd_entry.cpp  +)  +  +PEERDIR(       library/cpp/containers/comptrie      library/cpp/string_utils/relaxed_escaper -) - +)  +   END() diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp index 650fe7cdfdd..391662fb0d5 100644 --- a/library/cpp/codecs/huffman_codec.cpp +++ b/library/cpp/codecs/huffman_codec.cpp @@ -1,14 +1,14 @@ -#include "huffman_codec.h" +#include "huffman_codec.h"   #include <library/cpp/bit_io/bitinput.h>  #include <library/cpp/bit_io/bitoutput.h> - -#include <util/generic/algorithm.h> +  +#include <util/generic/algorithm.h>   #include <util/generic/bitops.h> -#include <util/stream/buffer.h> -#include <util/stream/length.h> -#include <util/string/printf.h> - -namespace NCodecs { +#include <util/stream/buffer.h>  +#include <util/stream/length.h>  +#include <util/string/printf.h>  +  +namespace NCodecs {       template <typename T>      struct TCanonicalCmp {          bool operator()(const T& a, const T& b) const { @@ -19,40 +19,40 @@ namespace NCodecs {              }          }      }; - +       template <typename T>      struct TByCharCmp {          bool operator()(const T& a, const T& b) const { -            return a.Char < b.Char; -        } +            return a.Char < b.Char;  +        }       }; - +       struct TTreeEntry {          static const ui32 InvalidBranch = (ui32)-1; - +           ui64 Freq = 0;          ui32 Branches[2]{InvalidBranch, InvalidBranch}; - +           ui32 CodeLength = 0;          ui8 Char = 0;          bool Invalid = false; - +           TTreeEntry() = default; - +           static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) {              return a.Freq < b.Freq;          } - +           static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) {              return a.Freq > b.Freq;          }      }; - +       using TCodeTree = TVector<TTreeEntry>; - +       void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) {          tree.reserve(255 * 256 / 2); // worst case - balanced tree - +           for (ui32 i = 0; i < 256; ++i) {              tree.emplace_back();              tree.back().Char = i; @@ -72,24 +72,24 @@ namespace NCodecs {              for (ui64 i = 0; i < r.size(); ++i)                  ++freqs[(ui8)r[i]];          } - +           InitTreeByFreqs(tree, freqs); -    } - +    }  +       void CalculateCodeLengths(TCodeTree& tree) {          Y_ENSURE(tree.size() == 256, " ");          const ui32 firstbranch = tree.size(); - +           ui32 curleaf = 0;          ui32 curbranch = firstbranch; - +           // building code tree. two priority queues are combined in one.          while (firstbranch - curleaf + tree.size() - curbranch >= 2) {              TTreeEntry e; - +               for (auto& branche : e.Branches) {                  ui32 br; - +                   if (curleaf >= firstbranch)                      br = curbranch++;                  else if (curbranch >= tree.size()) @@ -98,84 +98,84 @@ namespace NCodecs {                      br = curleaf++;                  else                      br = curbranch++; - +                   Y_ENSURE(br < tree.size(), " ");                  branche = br;                  e.Freq += tree[br].Freq;              } - +               tree.push_back(e);              PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev); -        } - +        }  +           // computing code lengths          for (ui64 i = tree.size() - 1; i >= firstbranch; --i) {              TTreeEntry e = tree[i]; - +               for (auto branche : e.Branches)                  tree[branche].CodeLength = e.CodeLength + 1;          } - +           // chopping off the branches          tree.resize(firstbranch); - +           Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>()); - +           // simplification: we are stripping codes longer than 64 bits          while (!tree.empty() && tree.back().CodeLength > 64)              tree.pop_back(); - +           // will not compress          if (tree.empty())              return; - +           // special invalid code word          tree.back().Invalid = true;      } - +       struct TEncoderEntry {          ui64 Code = 0; - +           ui8 CodeLength = 0;          ui8 Char = 0;          ui8 Invalid = true; - +           explicit TEncoderEntry(TTreeEntry e)              : CodeLength(e.CodeLength)              , Char(e.Char)              , Invalid(e.Invalid)          {          } - +           TEncoderEntry() = default;      }; - +       struct TEncoderTable {          TEncoderEntry Entries[256]; - +           void Save(IOutputStream* out) const {              ui16 nval = 0; - +               for (auto entrie : Entries)                  nval += !entrie.Invalid; - +               ::Save(out, nval); - +               for (auto entrie : Entries) {                  if (!entrie.Invalid) {                      ::Save(out, entrie.Char);                      ::Save(out, entrie.CodeLength);                  } -            } -        } - +            }  +        }  +           void Load(IInputStream* in) {              ui16 nval = 0;              ::Load(in, nval); - +               for (ui32 i = 0; i < 256; ++i)                  Entries[i].Char = i; - +               for (ui32 i = 0; i < nval; ++i) {                  ui8 ch = 0;                  ui8 len = 0; @@ -184,15 +184,15 @@ namespace NCodecs {                  Entries[ch].CodeLength = len;                  Entries[ch].Invalid = false;              } -        } +        }       }; - +       struct TDecoderEntry {          ui32 NextTable : 10;          ui32 Char : 8;          ui32 Invalid : 1;          ui32 Bad : 1; - +           TDecoderEntry()              : NextTable()              , Char() @@ -201,27 +201,27 @@ namespace NCodecs {          {          }      }; - +       struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> {          ui64 Length = 0;          ui64 BaseCode = 0; - +           TDecoderEntry Entries[256]; - +           TDecoderTable() {              Zero(Entries);          }      }; - +       const int CACHE_BITS_COUNT = 16;      class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> {          TEncoderTable Encoder;          TDecoderTable Decoder[256]; - +           TEncoderEntry Invalid; - +           ui32 SubTablesNum; - +           class THuffmanCache {              struct TCacheEntry {                  int EndOffset : 24; @@ -230,7 +230,7 @@ namespace NCodecs {              TVector<char> DecodeCache;              TVector<TCacheEntry> CacheEntries;              const TImpl& Original; - +           public:              THuffmanCache(const THuffmanCodec::TImpl& encoder); @@ -252,51 +252,51 @@ namespace NCodecs {              if (in.empty()) {                  return 0;              } - +               out.Reserve(in.size() * 2); - +               {                  NBitIO::TBitOutputVector<TBuffer> bout(&out);                  TStringBuf tin = in; - +                   // data is under compression                  bout.Write(1, 1); - +                   for (auto t : tin) {                      const TEncoderEntry& ce = Encoder.Entries[(ui8)t]; - +                       bout.Write(ce.Code, ce.CodeLength); - +                       if (ce.Invalid) {                          bout.Write(t, 8);                      }                  } - +                   // in canonical huffman coding there cannot be a code having no 0 in the suffix                  // and shorter than 8 bits.                  bout.Write((ui64)-1, bout.GetByteReminder());                  return bout.GetByteReminder(); -            } -        } - +            }  +        }  +           void Decode(TStringBuf in, TBuffer& out) const {              out.Clear(); - +               if (in.empty()) {                  return;              } - +               NBitIO::TBitInput bin(in);              ui64 f = 0;              bin.ReadK<1>(f); - +               // if data is uncompressed              if (!f) {                  in.Skip(1);                  out.Append(in.data(), in.size());              } else {                  out.Reserve(in.size() * 8); - +                   if (Cache.Get()) {                      Cache->Decode(bin, out);                  } else { @@ -304,36 +304,36 @@ namespace NCodecs {                      }                  }              } -        } - +        }  +           Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const {              const TDecoderTable* table = Decoder;              TDecoderEntry e; - +               int bitsRead = 0;              while (true) {                  ui64 code = 0; - +                   if (Y_UNLIKELY(!bin.Read(code, table->Length)))                      return 0;                  bitsRead += table->Length; - +                   if (Y_UNLIKELY(code < table->BaseCode))                      return 0; - +                   code -= table->BaseCode; - +                   if (Y_UNLIKELY(code > 255))                      return 0; - +                   e = table->Entries[code]; - +                   if (Y_UNLIKELY(e.Bad))                      return 0; - +                   if (e.NextTable) {                      table = Decoder + e.NextTable; -                } else { +                } else {                       if (e.Invalid) {                          code = 0;                          bin.ReadK<8>(code); @@ -344,77 +344,77 @@ namespace NCodecs {                      }                      return bitsRead; -                } +                }               } - +               Y_ENSURE(false, " could not decode input");              return 0; -        } - +        }  +           void GenerateEncoder(TCodeTree& tree) {              const ui64 sz = tree.size(); - +               TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]); - +               for (ui32 i = 1; i < sz; ++i) {                  const TTreeEntry& te = tree[i];                  TEncoderEntry& e = Encoder.Entries[te.Char];                  e = TEncoderEntry(te); - +                   e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);                  lastcode = e; - +                   e.Code = ReverseBits(e.Code, e.CodeLength); - +                   if (e.Invalid)                      Invalid = e;              } - +               for (auto& e : Encoder.Entries) {                  if (e.Invalid)                      e = Invalid;                  Y_ENSURE(e.CodeLength, " ");              } -        } - +        }  +           void RegenerateEncoder() {              for (auto& entrie : Encoder.Entries) {                  if (entrie.Invalid)                      entrie.CodeLength = Invalid.CodeLength;              } - +               Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>()); - +               TEncoderEntry lastcode = Encoder.Entries[0]; - +               for (ui32 i = 1; i < 256; ++i) {                  TEncoderEntry& e = Encoder.Entries[i];                  e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);                  lastcode = e; - +                   e.Code = ReverseBits(e.Code, e.CodeLength);              } - +               for (auto& entrie : Encoder.Entries) {                  if (entrie.Invalid) {                      Invalid = entrie;                      break;                  }              } - +               Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>()); - +               for (auto& entrie : Encoder.Entries) {                  if (entrie.Invalid)                      entrie = Invalid; -            } -        } - +            }  +        }  +           void BuildDecoder() {              TEncoderTable enc = Encoder;              Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>()); - +               TEncoderEntry& e1 = enc.Entries[0];              Decoder[0].BaseCode = e1.Code;              Decoder[0].Length = e1.CodeLength; @@ -423,22 +423,22 @@ namespace NCodecs {                  SetEntry(Decoder, e2.Code, e2.CodeLength, e2);              }              Cache.Reset(new THuffmanCache(*this)); -        } - +        }  +           void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) {              Y_ENSURE(len >= t->Length, len << " < " << t->Length); - +               ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode;              TDecoderEntry& d = t->Entries[idx]; - +               if (len == t->Length) {                  Y_ENSURE(!d.NextTable, " "); - +                   d.Char = e.Char;                  d.Invalid = e.Invalid;                  return;              } - +               if (!d.NextTable) {                  Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " ");                  d.NextTable = SubTablesNum++; @@ -446,10 +446,10 @@ namespace NCodecs {                  nt->Length = Min<ui64>(8, len - t->Length);                  nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length);              } - +               SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e); -        } - +        }  +           void Learn(ISequenceReader* in) {              {                  TCodeTree tree; @@ -459,11 +459,11 @@ namespace NCodecs {                  GenerateEncoder(tree);              }              BuildDecoder(); -        } - +        }  +           void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { -            TCodeTree tree; - +            TCodeTree tree;  +               ui64 freqsArray[256];              Zero(freqsArray); @@ -491,7 +491,7 @@ namespace NCodecs {              BuildDecoder();          }      }; - +       THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec)          : Original(codec)      { @@ -512,7 +512,7 @@ namespace NCodecs {                      CacheEntries[i] = e;                      break;                  } - +                   for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) {                      DecodeCache.push_back(*it);                  } @@ -558,32 +558,32 @@ namespace NCodecs {          MyTraits.SizeOnDecodeMultiplier = 8;          MyTraits.RecommendedSampleSize = 1 << 21;      } - +       THuffmanCodec::~THuffmanCodec() = default; - +       ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const {          if (Y_UNLIKELY(!Trained))              ythrow TCodecException() << " not trained"; - +           return Impl->Encode(in, bbb);      } - +       void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const {          Impl->Decode(in, bbb);      } - +       void THuffmanCodec::Save(IOutputStream* out) const {          Impl->Save(out);      } - +       void THuffmanCodec::Load(IInputStream* in) {          Impl->Load(in);      } - +       void THuffmanCodec::DoLearn(ISequenceReader& in) {          Impl->Learn(&in);      } - +       void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {          Impl->LearnByFreqs(freqs);          Trained = true; diff --git a/library/cpp/codecs/huffman_codec.h b/library/cpp/codecs/huffman_codec.h index 559545b90d9..1c00a806375 100644 --- a/library/cpp/codecs/huffman_codec.h +++ b/library/cpp/codecs/huffman_codec.h @@ -1,33 +1,33 @@ -#pragma once - -#include "codecs.h" - -#include <util/generic/ptr.h> +#pragma once  +  +#include "codecs.h"  +  +#include <util/generic/ptr.h>   #include <util/string/cast.h> - -namespace NCodecs { +  +namespace NCodecs {       // for types greater than char, pipeline with TFreqCodec. - +       class THuffmanCodec: public ICodec {          class TImpl;          TIntrusivePtr<TImpl> Impl; - +       public:          THuffmanCodec();          ~THuffmanCodec() override; - +           static TStringBuf MyName() {              return "huffman";          } - +           TString GetName() const override {              return ToString(MyName());          } - +           ui8 Encode(TStringBuf in, TBuffer& bbb) const override; - +           void Decode(TStringBuf in, TBuffer& bbb) const override; - +           void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs);      protected: @@ -35,5 +35,5 @@ namespace NCodecs {          void Save(IOutputStream* out) const override;          void Load(IInputStream* in) override;      }; - -} +  +}  diff --git a/library/cpp/codecs/pfor_codec.cpp b/library/cpp/codecs/pfor_codec.cpp index f6b3b0920bd..3b51c99afad 100644 --- a/library/cpp/codecs/pfor_codec.cpp +++ b/library/cpp/codecs/pfor_codec.cpp @@ -1,6 +1,6 @@ -#include "pfor_codec.h" - -namespace NCodecs { +#include "pfor_codec.h"  +  +namespace NCodecs {       template <>      TStringBuf TPForCodec<ui64, true>::MyName() {          return "pfor-delta64-sorted"; @@ -9,7 +9,7 @@ namespace NCodecs {      TStringBuf TPForCodec<ui32, true>::MyName() {          return "pfor-delta32-sorted";      } - +       template <>      TStringBuf TPForCodec<ui64, false>::MyName() {          return "pfor-ui64"; @@ -18,5 +18,5 @@ namespace NCodecs {      TStringBuf TPForCodec<ui32, false>::MyName() {          return "pfor-ui32";      } - -} +  +}  diff --git a/library/cpp/codecs/pfor_codec.h b/library/cpp/codecs/pfor_codec.h index d7d4bb8bf48..b0207512acb 100644 --- a/library/cpp/codecs/pfor_codec.h +++ b/library/cpp/codecs/pfor_codec.h @@ -1,48 +1,48 @@ -#pragma once - -#include "codecs.h" - -#include "delta_codec.h" -#include "tls_cache.h" - +#pragma once  +  +#include "codecs.h"  +  +#include "delta_codec.h"  +#include "tls_cache.h"  +   #include <library/cpp/bit_io/bitinput.h>  #include <library/cpp/bit_io/bitoutput.h>  #include <util/string/cast.h> - -namespace NCodecs { +  +namespace NCodecs {       template <typename T, bool WithDelta = false>      class TPForCodec: public ICodec {          using TUnsigned = std::make_unsigned_t<T>;          typedef TDeltaCodec<TUnsigned> TDCodec; - +           typedef std::conditional_t<WithDelta, typename TDCodec::TDelta, T> TValue;          static_assert(std::is_unsigned<TValue>::value, "expect std:is_unsigned<TValue>::value"); - +           static const ui64 BitsInT = sizeof(TUnsigned) * 8; - +           TDCodec DeltaCodec; - +       public:          static TStringBuf MyName(); - +           TPForCodec() {              MyTraits.AssumesStructuredInput = true;              MyTraits.SizeOfInputElement = sizeof(T);              MyTraits.SizeOnDecodeMultiplier = sizeof(T);          } - +           TString GetName() const override {              return ToString(MyName());          } - +           ui8 Encode(TStringBuf s, TBuffer& b) const override {              b.Clear();              if (s.empty()) {                  return 0;              } - +               b.Reserve(2 * s.size() + b.Size()); - +               if (WithDelta) {                  auto buffer = TBufferTlsCache::TlsInstance().Item();                  TBuffer& db = buffer.Get(); @@ -51,50 +51,50 @@ namespace NCodecs {                  DeltaCodec.Encode(s, db);                  s = TStringBuf{db.data(), db.size()};              } - +               TArrayRef<const TValue> tin{(const TValue*)s.data(), s.size() / sizeof(TValue)}; - +               const ui64 sz = tin.size();              ui64 bitcounts[BitsInT + 1];              Zero(bitcounts); - +               ui32 zeros = 0; - +               for (const TValue* it = tin.begin(); it != tin.end(); ++it) {                  TUnsigned v = 1 + (TUnsigned)*it;                  ui64 l = MostSignificantBit(v) + 1;                  ++bitcounts[l]; - +                   if (!v) {                      ++zeros;                  }              } - +               // cumulative bit counts              for (ui64 i = 0; i < BitsInT; ++i) {                  bitcounts[i + 1] += bitcounts[i]; -            } - +            }  +               bool hasexceptions = zeros;              ui64 optimalbits = BitsInT; - +               {                  ui64 excsize = 0;                  ui64 minsize = sz * BitsInT; - +                   for (ui64 current = BitsInT; current; --current) {                      ui64 size = bitcounts[current] * current + (sz - bitcounts[current]) * (current + 6 + excsize) + zeros * (current + 6); - +                       excsize += current * bitcounts[current]; - +                       if (size < minsize) {                          minsize = size;                          optimalbits = current;                          hasexceptions = zeros || sz - bitcounts[current];                      } -                } -            } - +                }  +            }  +               if (!optimalbits || BitsInT == optimalbits) {                  b.Append((ui8)-1);                  b.Append(s.data(), s.size()); @@ -104,7 +104,7 @@ namespace NCodecs {                  bout.Write(0, 1);                  bout.Write(hasexceptions, 1);                  bout.Write(optimalbits, 6); - +                   for (const TValue* it = tin.begin(); it != tin.end(); ++it) {                      TUnsigned word = 1 + (TUnsigned)*it;                      ui64 len = MostSignificantBit(word) + 1; @@ -116,29 +116,29 @@ namespace NCodecs {                      } else {                          bout.Write(word, optimalbits);                      } -                } - +                }  +                   return bout.GetByteReminder();              } // the rest of the last byte is zero padded. BitsInT is always > 7. -        } - +        }  +           void Decode(TStringBuf s, TBuffer& b) const override {              b.Clear();              if (s.empty()) {                  return;              } - +               b.Reserve(s.size() * sizeof(T) + b.Size()); - +               ui64 isplain = 0;              ui64 hasexceptions = 0;              ui64 bits = 0; - +               NBitIO::TBitInput bin(s);              bin.ReadK<1>(isplain);              bin.ReadK<1>(hasexceptions);              bin.ReadK<6>(bits); - +               if (Y_UNLIKELY(isplain)) {                  s.Skip(1); @@ -147,17 +147,17 @@ namespace NCodecs {                  } else {                      b.Append(s.data(), s.size());                  } -            } else { +            } else {                   typename TDCodec::TDecoder decoder; - +                   if (hasexceptions) {                      ui64 word = 0;                      while (bin.Read(word, bits)) {                          if (word || (bin.ReadK<6>(word) && bin.Read(word, word))) {                              --word; - +                               TValue t = word; - +                               if (WithDelta) {                                  if (decoder.Decode(t)) {                                      TStringBuf r{(char*)&decoder.Result, sizeof(decoder.Result)}; @@ -166,46 +166,46 @@ namespace NCodecs {                              } else {                                  TStringBuf r{(char*)&t, sizeof(t)};                                  b.Append(r.data(), r.size()); -                            } -                        } -                    } +                            }  +                        }  +                    }                   } else {                      ui64 word = 0;                      T outarr[256 / sizeof(T)];                      ui32 cnt = 0;                      while (true) {                          ui64 v = bin.Read(word, bits); - +                           if ((!v) | (!word))                              break; - +                           --word;                          TValue t = word; - +                           if (WithDelta) {                              if (decoder.Decode(t)) {                                  outarr[cnt++] = decoder.Result;                              }                          } else {                              outarr[cnt++] = t; -                        } +                        }                           if (cnt == Y_ARRAY_SIZE(outarr)) {                              b.Append((const char*)outarr, sizeof(outarr));                              cnt = 0;                          } -                    } - +                    }  +                       if (cnt) {                          b.Append((const char*)outarr, cnt * sizeof(T)); -                    } -                } -            } -        } - +                    }  +                }  +            }  +        }  +       protected:          void DoLearn(ISequenceReader&) override {          }      }; - -} +  +}  diff --git a/library/cpp/codecs/sample.h b/library/cpp/codecs/sample.h index 15f03afcc5d..bce37e6a2c2 100644 --- a/library/cpp/codecs/sample.h +++ b/library/cpp/codecs/sample.h @@ -1,89 +1,89 @@ -#pragma once - +#pragma once  +   #include <library/cpp/deprecated/accessors/accessors.h> - -#include <util/generic/buffer.h> -#include <util/generic/vector.h> -#include <util/random/fast.h> -#include <util/random/shuffle.h> - -#include <functional> -#include <type_traits> - -namespace NCodecs { -    class ISequenceReader { -    public: -        virtual bool NextRegion(TStringBuf& s) = 0; - -        virtual ~ISequenceReader() = default; -    }; - -    template <class TValue> -    TStringBuf ValueToStringBuf(TValue&& t) { -        return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)}; -    } - -    template <class TIter> +  +#include <util/generic/buffer.h>  +#include <util/generic/vector.h>  +#include <util/random/fast.h>  +#include <util/random/shuffle.h>  +  +#include <functional>  +#include <type_traits>  +  +namespace NCodecs {  +    class ISequenceReader {  +    public:  +        virtual bool NextRegion(TStringBuf& s) = 0;  +  +        virtual ~ISequenceReader() = default;  +    };  +  +    template <class TValue>  +    TStringBuf ValueToStringBuf(TValue&& t) {  +        return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)};  +    }  +  +    template <class TIter>       TStringBuf IterToStringBuf(TIter iter) { -        return ValueToStringBuf(*iter); -    } - -    template <class TItem> +        return ValueToStringBuf(*iter);  +    }  +  +    template <class TItem>       class TSimpleSequenceReader: public ISequenceReader {          const TVector<TItem>& Items; -        size_t Idx = 0; - -    public: +        size_t Idx = 0;  +  +    public:           TSimpleSequenceReader(const TVector<TItem>& items) -            : Items(items) +            : Items(items)           {          } - -        bool NextRegion(TStringBuf& s) override { -            if (Idx >= Items.size()) { -                return false; -            } - -            s = ValueToStringBuf(Items[Idx++]); -            return true; -        } -    }; - -    template <class TIter, class TGetter> -    size_t GetInputSize(TIter begin, TIter end, TGetter getter) { -        size_t totalBytes = 0; -        for (TIter iter = begin; iter != end; ++iter) { -            totalBytes += getter(iter).size(); -        } -        return totalBytes; -    } - -    template <class TIter> -    size_t GetInputSize(TIter begin, TIter end) { -        return GetInputSize(begin, end, IterToStringBuf<TIter>); -    } - -    template <class TIter, class TGetter> +  +        bool NextRegion(TStringBuf& s) override {  +            if (Idx >= Items.size()) {  +                return false;  +            }  +  +            s = ValueToStringBuf(Items[Idx++]);  +            return true;  +        }  +    };  +  +    template <class TIter, class TGetter>  +    size_t GetInputSize(TIter begin, TIter end, TGetter getter) {  +        size_t totalBytes = 0;  +        for (TIter iter = begin; iter != end; ++iter) {  +            totalBytes += getter(iter).size();  +        }  +        return totalBytes;  +    }  +  +    template <class TIter>  +    size_t GetInputSize(TIter begin, TIter end) {  +        return GetInputSize(begin, end, IterToStringBuf<TIter>);  +    }  +  +    template <class TIter, class TGetter>       TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes, TGetter getter) { -        TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce}; - -        size_t totalBytes = GetInputSize(begin, end, getter); -        double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes); - +        TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce};  +  +        size_t totalBytes = GetInputSize(begin, end, getter);  +        double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes);  +           TVector<TBuffer> result; -        for (TIter iter = begin; iter != end; ++iter) { -            if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) { -                TStringBuf reg = getter(iter); +        for (TIter iter = begin; iter != end; ++iter) {  +            if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) {  +                TStringBuf reg = getter(iter);                   result.emplace_back(reg.data(), reg.size()); -            } -        } -        Shuffle(result.begin(), result.end(), rng); -        return result; -    } - -    template <class TIter> +            }  +        }  +        Shuffle(result.begin(), result.end(), rng);  +        return result;  +    }  +  +    template <class TIter>       TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes) { -        return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>); -    } - -} +        return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>);  +    }  +  +}  diff --git a/library/cpp/codecs/solar_codec.cpp b/library/cpp/codecs/solar_codec.cpp index d0692fe2a46..088bdead192 100644 --- a/library/cpp/codecs/solar_codec.cpp +++ b/library/cpp/codecs/solar_codec.cpp @@ -1,36 +1,36 @@ -#include "solar_codec.h" - +#include "solar_codec.h"  +   #include <library/cpp/codecs/greedy_dict/gd_builder.h> - +   #include <library/cpp/containers/comptrie/comptrie_builder.h>  #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> -#include <util/stream/length.h> -#include <util/string/printf.h> -#include <util/ysaveload.h> - -namespace NCodecs { +#include <util/stream/length.h>  +#include <util/string/printf.h>  +#include <util/ysaveload.h>  +  +namespace NCodecs {       static inline ui32 Append(TBuffer& pool, TStringBuf data) {          pool.Append(data.data(), data.size());          return pool.Size();      } - +       void TSolarCodec::DoLearn(ISequenceReader& r) {          using namespace NGreedyDict; - +           Decoder.clear();          Pool.Clear(); - +           THolder<TEntrySet> set; - +           {              TMemoryPool pool(8112, TMemoryPool::TLinearGrow::Instance());              TStringBufs bufs; - +               TStringBuf m;              while (r.NextRegion(m)) {                  bufs.push_back(pool.AppendString(m));              } - +               {                  TDictBuilder b(Settings);                  b.SetInput(bufs); @@ -38,66 +38,66 @@ namespace NCodecs {                  set = b.ReleaseEntrySet();              } -        } - +        }  +           set->SetScores(ES_LEN_COUNT); -        { +        {               TVector<std::pair<float, TStringBuf>> tmp;              tmp.reserve(set->size()); - +               for (const auto& it : *set) {                  tmp.push_back(std::make_pair(-it.Score, TStringBuf(it.Str).Trunc(Max<ui32>() / Max<ui32>(MaxEntries, 1))));              } - +               Sort(tmp.begin(), tmp.end()); - +               Decoder.reserve(tmp.size() + 1);              Decoder.push_back(0); - +               for (const auto& it : tmp) {                  Y_ENSURE(Decoder.back() == Pool.Size(), "learning invariant failed");                  ui32 endoff = Append(Pool, it.second);                  Decoder.push_back(endoff);              } -        } - +        }  +           Pool.ShrinkToFit();          Decoder.shrink_to_fit(); - +           TBufferOutput bout; - +           {              TVector<std::pair<TStringBuf, ui32>> tmp2;              tmp2.reserve(Decoder.size()); - +               for (ui32 i = 1, sz = Decoder.size(); i < sz; ++i) {                  TStringBuf s = DoDecode(i);                  tmp2.push_back(std::make_pair(s, i - 1));                  Y_ENSURE(s.size() == (Decoder[i] - Decoder[i - 1]), "learning invariant failed");              } - +               Sort(tmp2.begin(), tmp2.end()); - +               {                  TEncoder::TBuilder builder(CTBF_PREFIX_GROUPED);                  for (const auto& it : tmp2) {                      builder.Add(it.first.data(), it.first.size(), it.second);                  } - +                   builder.Save(bout); -            } -        } - +            }  +        }  +           Encoder.Init(TBlob::FromBuffer(bout.Buffer())); -    } - +    }  +       void TSolarCodec::Save(IOutputStream* out) const {          TBlob b = Encoder.Data();          ::Save(out, (ui32)b.Size());          out->Write(b.Data(), b.Size());      } - +       void TSolarCodec::Load(IInputStream* in) {          ui32 sz;          ::Load(in, sz); @@ -105,29 +105,29 @@ namespace NCodecs {          Encoder.Init(TBlob::FromStream(lin));          Pool.Clear();          Decoder.clear(); - +           TVector<std::pair<ui32, TString>> tmp; - +           ui32 poolsz = 0;          for (TEncoder::TConstIterator it = Encoder.Begin(); it != Encoder.End(); ++it) {              const TString& s = it.GetKey();              tmp.push_back(std::make_pair(it.GetValue(), !s ? TString("\0", 1) : s));              poolsz += Max<ui32>(s.size(), 1);          } - +           Sort(tmp.begin(), tmp.end()); - +           Pool.Reserve(poolsz);          Decoder.reserve(tmp.size() + 1);          Decoder.push_back(0); - +           for (ui32 i = 0, sz2 = tmp.size(); i < sz2; ++i) {              Y_ENSURE(i == tmp[i].first, "oops! " << i << " " << tmp[i].first);              Decoder.push_back(Append(Pool, tmp[i].second));          } - +           Pool.ShrinkToFit();          Decoder.shrink_to_fit(); -    } - -} +    }  +  +}  diff --git a/library/cpp/codecs/solar_codec.h b/library/cpp/codecs/solar_codec.h index 7158ae79262..e6c0b891ad7 100644 --- a/library/cpp/codecs/solar_codec.h +++ b/library/cpp/codecs/solar_codec.h @@ -1,16 +1,16 @@ -#pragma once - -#include "codecs.h" +#pragma once  +  +#include "codecs.h"   #include <library/cpp/containers/comptrie/comptrie_trie.h>  #include <library/cpp/codecs/greedy_dict/gd_builder.h> - +   #include <util/string/cast.h> -#include <util/string/escape.h> - -namespace NCodecs { -    // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы. -    // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе. - +#include <util/string/escape.h>  +  +namespace NCodecs {  +    // TODO: Попробовать добавлять в словарь вместе с намайненными словами также их суффиксы.  +    // TODO: Возможно удастся, не слишком потеряв в сжатии, выиграть в робастности к небольшим изменениям в корпусе.  +       struct TVarIntTraits {          static const size_t MAX_VARINT32_BYTES = 5; @@ -52,7 +52,7 @@ namespace NCodecs {          Y_FORCE_INLINE static void Read(TStringBuf& r, ui32& value) {              ui32 result = static_cast<ui8>(r[0]); -            r.Skip(1); +            r.Skip(1);               if (result >= 0x80) {                  Y_ENSURE_EX(!r.empty(), TCodecException() << "Bad data");                  result = ((result << 8) & 0x7FFF) | static_cast<ui8>(r[0]); @@ -100,7 +100,7 @@ namespace NCodecs {          static TStringBuf MyNameShortInt() {              return TStringBuf("solar-si");          } - +           explicit TSolarCodec(ui32 maxentries = 1 << 14, ui32 maxiter = 16, const NGreedyDict::TBuildSettings& s = NGreedyDict::TBuildSettings())              : Settings(s)              , MaxEntries(maxentries) @@ -110,7 +110,7 @@ namespace NCodecs {              MyTraits.SizeOnDecodeMultiplier = 2;              MyTraits.RecommendedSampleSize = maxentries * s.GrowLimit * maxiter * 8;          } - +           ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {              EncodeImpl<TVarIntTraits>(r, b);              return 0; @@ -148,8 +148,8 @@ namespace NCodecs {                  TTraits::Write(val + 1, b);                  r.Skip(Max<size_t>(sz, 1));              } -        } - +        }  +           template <class TTraits>          Y_FORCE_INLINE void DecodeImpl(TStringBuf r, TBuffer& b) const {              b.Clear(); @@ -160,25 +160,25 @@ namespace NCodecs {                  TStringBuf s = DoDecode(v);                  b.Append(s.data(), s.size());              } -        } - +        }  +           inline bool CanUseShortInt() const {              return Decoder.size() < TShortIntTraits::SHORTINT_SIZE_LIMIT;          } - +       private:          typedef TCompactTrie<char, ui32> TEncoder;          typedef TVector<ui32> TDecoder; - +           TBuffer Pool;          TEncoder Encoder;          TDecoder Decoder; - +           NGreedyDict::TBuildSettings Settings;          ui32 MaxEntries;          ui32 MaxIterations;      }; - +       // Uses varints or shortints depending on the decoder size      class TAdaptiveSolarCodec: public TSolarCodec {      public: @@ -186,7 +186,7 @@ namespace NCodecs {              : TSolarCodec(maxentries, maxiter, s)          {          } - +           ui8 /*free bits in last byte*/ Encode(TStringBuf r, TBuffer& b) const override {              if (CanUseShortInt()) {                  EncodeImpl<TShortIntTraits>(r, b); @@ -225,7 +225,7 @@ namespace NCodecs {              EncodeImpl<TShortIntTraits>(r, b);              return 0;          } - +           void Decode(TStringBuf r, TBuffer& b) const override {              DecodeImpl<TShortIntTraits>(r, b);          } @@ -241,4 +241,4 @@ namespace NCodecs {          }      }; -} +}  diff --git a/library/cpp/codecs/static/builder.cpp b/library/cpp/codecs/static/builder.cpp index 93e34a3edbb..083f0fc6f6c 100644 --- a/library/cpp/codecs/static/builder.cpp +++ b/library/cpp/codecs/static/builder.cpp @@ -1,39 +1,39 @@ -#include "builder.h" -#include "common.h" - +#include "builder.h"  +#include "common.h"  +   #include <library/cpp/codecs/static/static_codec_info.pb.h> - +   #include <library/cpp/codecs/codecs.h> - -#include <util/generic/yexception.h> -#include <util/string/subst.h> - -namespace NCodecs { +  +#include <util/generic/yexception.h>  +#include <util/string/subst.h>  +  +namespace NCodecs {       TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo& info) { -        TStaticCodecInfo result; -        TCodecPtr codec = ICodec::GetInstance(info.CodecName); -        Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed"); - -        codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier); -        { -            TStringOutput sout{*result.MutableStoredCodec()}; -            ICodec::Store(&sout, codec); -        } - -        auto& debugInfo = *result.MutableDebugInfo(); -        debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec())); -        debugInfo.SetCodecName(info.CodecName); -        debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier); -        debugInfo.SetTimestamp(info.Timestamp); -        debugInfo.SetRevisionInfo(info.RevisionInfo); -        debugInfo.SetTrainingSetComment(info.TrainingSetComment); -        debugInfo.SetTrainingSetResId(info.TrainingSetResId); -        return result; -    } - +        TStaticCodecInfo result;  +        TCodecPtr codec = ICodec::GetInstance(info.CodecName);  +        Y_ENSURE_EX(codec, TCodecException() << "empty codec is not allowed");  +  +        codec->LearnX(trainingData.begin(), trainingData.end(), info.SampleSizeMultiplier);  +        {  +            TStringOutput sout{*result.MutableStoredCodec()};  +            ICodec::Store(&sout, codec);  +        }  +  +        auto& debugInfo = *result.MutableDebugInfo();  +        debugInfo.SetStoredCodecHash(DataSignature(result.GetStoredCodec()));  +        debugInfo.SetCodecName(info.CodecName);  +        debugInfo.SetSampleSizeMultiplier(info.SampleSizeMultiplier);  +        debugInfo.SetTimestamp(info.Timestamp);  +        debugInfo.SetRevisionInfo(info.RevisionInfo);  +        debugInfo.SetTrainingSetComment(info.TrainingSetComment);  +        debugInfo.SetTrainingSetResId(info.TrainingSetResId);  +        return result;  +    }  +       TString GetStandardFileName(const TStaticCodecInfo& info) {          TString cName = info.GetDebugInfo().GetCodecName(); -        SubstGlobal(cName, ':', '.'); -        return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info"; -    } -} +        SubstGlobal(cName, ':', '.');  +        return TStringBuilder() << cName << "." << info.GetDebugInfo().GetTimestamp() << ".codec_info";  +    }  +}  diff --git a/library/cpp/codecs/static/builder.h b/library/cpp/codecs/static/builder.h index d7533be4d58..234ad42dffa 100644 --- a/library/cpp/codecs/static/builder.h +++ b/library/cpp/codecs/static/builder.h @@ -1,29 +1,29 @@ -#pragma once - -#include "static.h" - +#pragma once  +  +#include "static.h"  +   #include <library/cpp/svnversion/svnversion.h> - -#include <util/datetime/base.h> +  +#include <util/datetime/base.h>   #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/string/builder.h> - -namespace NCodecs { -    struct TCodecBuildInfo { -        // optimal values from SEARCH-1655 +#include <util/generic/vector.h>  +#include <util/string/builder.h>  +  +namespace NCodecs {  +    struct TCodecBuildInfo {  +        // optimal values from SEARCH-1655           TString CodecName = "solar-8k-a:zstd08d-1"; -        float SampleSizeMultiplier = 1; - -        // debug info: -        time_t Timestamp = TInstant::Now().TimeT(); +        float SampleSizeMultiplier = 1;  +  +        // debug info:  +        time_t Timestamp = TInstant::Now().TimeT();           TString RevisionInfo = (TStringBuilder() << "r" << ToString(GetProgramSvnRevision()));          TString TrainingSetComment; // a human comment on the training data          TString TrainingSetResId;   // sandbox resid of the training set -    }; - +    };  +       TStaticCodecInfo BuildStaticCodec(const TVector<TString>& trainingData, const TCodecBuildInfo&); - +       TString GetStandardFileName(const TStaticCodecInfo&); - -} +  +}  diff --git a/library/cpp/codecs/static/common.h b/library/cpp/codecs/static/common.h index 211de2a27d2..84b0349d82b 100644 --- a/library/cpp/codecs/static/common.h +++ b/library/cpp/codecs/static/common.h @@ -1,32 +1,32 @@ -#pragma once - -#include <util/string/hex.h> -#include <util/digest/city.h> -#include <util/system/byteorder.h> - -namespace NCodecs { -    template <class T> -    ui64 DataSignature(const T& t) { -        static_assert(!std::is_scalar<T>::value, "no scalars"); +#pragma once  +  +#include <util/string/hex.h>  +#include <util/digest/city.h>  +#include <util/system/byteorder.h>  +  +namespace NCodecs {  +    template <class T>  +    ui64 DataSignature(const T& t) {  +        static_assert(!std::is_scalar<T>::value, "no scalars");           return CityHash64(t.data(), t.size()); -    } - -    template <class T> +    }  +  +    template <class T>       TString HexWriteScalar(T t) { -        static_assert(std::is_scalar<T>::value, "scalars only"); -        t = LittleToBig(t); +        static_assert(std::is_scalar<T>::value, "scalars only");  +        t = LittleToBig(t);           TString res = HexEncode(&t, sizeof(t)); -        res.to_lower(); -        return res; -    } - -    template <class T> -    T HexReadScalar(TStringBuf s) { -        static_assert(std::is_scalar<T>::value, "scalars only"); -        T t = 0; +        res.to_lower();  +        return res;  +    }  +  +    template <class T>  +    T HexReadScalar(TStringBuf s) {  +        static_assert(std::is_scalar<T>::value, "scalars only");  +        T t = 0;           HexDecode(s.data(), Min(s.size(), sizeof(T)), &t); -        t = BigToLittle(t); -        return t; -    } - -} +        t = BigToLittle(t);  +        return t;  +    }  +  +}  diff --git a/library/cpp/codecs/static/example/example.cpp b/library/cpp/codecs/static/example/example.cpp index 5b750b717e1..0c50a1a5be8 100644 --- a/library/cpp/codecs/static/example/example.cpp +++ b/library/cpp/codecs/static/example/example.cpp @@ -1,43 +1,43 @@ -#include "example.h" - +#include "example.h"  +   #include <library/cpp/codecs/static/static.h> - -#include <util/generic/yexception.h> - -extern "C" { +  +#include <util/generic/yexception.h>  +  +extern "C" {   extern const ui8 codec_info_huff_20160707[];  extern const ui32 codec_info_huff_20160707Size;  extern const ui8 codec_info_sa_huff_20160707[];  extern const ui32 codec_info_sa_huff_20160707Size; -}; - -namespace NStaticCodecExample { -    static const NCodecs::TCodecConstPtr CODECS[] = { -        nullptr, -        NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size), -        NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size), -    }; - -    static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size"); - -    void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) { -        Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); -        out.Clear(); -        if (!in) { -            return; -        } -        CODECS[dv]->Encode(in, out); -        out.Append((char)dv); -    } - -    void Decode(TBuffer& out, TStringBuf in) { -        out.Clear(); -        if (!in) { -            return; -        } -        EDictVersion dv = (EDictVersion)in.back(); -        Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv); -        in.Chop(1); -        CODECS[dv]->Decode(in, out); -    } -} +};  +  +namespace NStaticCodecExample {  +    static const NCodecs::TCodecConstPtr CODECS[] = {  +        nullptr,  +        NCodecs::RestoreCodecFromArchive(codec_info_huff_20160707, codec_info_huff_20160707Size),  +        NCodecs::RestoreCodecFromArchive(codec_info_sa_huff_20160707, codec_info_sa_huff_20160707Size),  +    };  +  +    static_assert(Y_ARRAY_SIZE(CODECS) == DV_COUNT, "bad array size");  +  +    void Encode(TBuffer& out, TStringBuf in, EDictVersion dv) {  +        Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv);  +        out.Clear();  +        if (!in) {  +            return;  +        }  +        CODECS[dv]->Encode(in, out);  +        out.Append((char)dv);  +    }  +  +    void Decode(TBuffer& out, TStringBuf in) {  +        out.Clear();  +        if (!in) {  +            return;  +        }  +        EDictVersion dv = (EDictVersion)in.back();  +        Y_ENSURE(dv > DV_NULL && dv < DV_COUNT, "invalid dict version: " << (int)dv);  +        in.Chop(1);  +        CODECS[dv]->Decode(in, out);  +    }  +}  diff --git a/library/cpp/codecs/static/example/example.h b/library/cpp/codecs/static/example/example.h index f9b3a7324b7..070ca90f02c 100644 --- a/library/cpp/codecs/static/example/example.h +++ b/library/cpp/codecs/static/example/example.h @@ -1,17 +1,17 @@ -#pragma once - -#include <util/generic/strbuf.h> -#include <util/generic/buffer.h> - -namespace NStaticCodecExample { +#pragma once  +  +#include <util/generic/strbuf.h>  +#include <util/generic/buffer.h>  +  +namespace NStaticCodecExample {       enum EDictVersion : ui8 {          DV_NULL = 0,          DV_HUFF_20160707,          DV_SA_HUFF_20160707,          DV_COUNT -    }; - -    void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707); - -    void Decode(TBuffer&, TStringBuf); -} +    };  +  +    void Encode(TBuffer&, TStringBuf, EDictVersion dv = DV_SA_HUFF_20160707);  +  +    void Decode(TBuffer&, TStringBuf);  +}  diff --git a/library/cpp/codecs/static/example/ya.make b/library/cpp/codecs/static/example/ya.make index ca6c5fd900a..85dc2226243 100644 --- a/library/cpp/codecs/static/example/ya.make +++ b/library/cpp/codecs/static/example/ya.make @@ -1,24 +1,24 @@ -LIBRARY() - -OWNER(velavokr) - -SRCS( -    GLOBAL example.cpp -) - -PEERDIR( +LIBRARY()  +  +OWNER(velavokr)  +  +SRCS(  +    GLOBAL example.cpp  +)  +  +PEERDIR(       library/cpp/codecs      library/cpp/codecs/static -) - -ARCHIVE_ASM( +)  +  +ARCHIVE_ASM(       "solar-8k-a.huffman.1467494385.codec_info"      NAME codec_info_sa_huff_20160707 -) - -ARCHIVE_ASM( +)  +  +ARCHIVE_ASM(       "huffman.1467494385.codec_info"      NAME codec_info_huff_20160707 -) - -END() +)  +  +END()  diff --git a/library/cpp/codecs/static/static.cpp b/library/cpp/codecs/static/static.cpp index 44a07dd73a2..d2c99a15ee5 100644 --- a/library/cpp/codecs/static/static.cpp +++ b/library/cpp/codecs/static/static.cpp @@ -1,98 +1,98 @@ -#include "static.h" -#include "common.h" - +#include "static.h"  +#include "common.h"  +   #include <library/cpp/codecs/static/static_codec_info.pb.h>  #include <library/cpp/archive/yarchive.h> - -#include <util/draft/datetime.h> - -#include <util/string/builder.h> -#include <util/stream/buffer.h> -#include <util/stream/mem.h> -#include <util/string/hex.h> -#include <util/ysaveload.h> - -namespace NCodecs { +  +#include <util/draft/datetime.h>  +  +#include <util/string/builder.h>  +#include <util/stream/buffer.h>  +#include <util/stream/mem.h>  +#include <util/string/hex.h>  +#include <util/ysaveload.h>  +  +namespace NCodecs {       static constexpr TStringBuf STATIC_CODEC_INFO_MAGIC = "CodecInf"; - -    static TStringBuf GetStaticCodecInfoMagic() { +  +    static TStringBuf GetStaticCodecInfoMagic() {           return STATIC_CODEC_INFO_MAGIC; -    } - +    }  +       void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo& info) { -        TBufferOutput bout; +        TBufferOutput bout;           info.SerializeToArcadiaStream(&bout); -        ui64 hash = DataSignature(bout.Buffer()); -        out.Write(GetStaticCodecInfoMagic()); -        ::Save(&out, hash); -        ::Save(&out, bout.Buffer()); -    } - +        ui64 hash = DataSignature(bout.Buffer());  +        out.Write(GetStaticCodecInfoMagic());  +        ::Save(&out, hash);  +        ::Save(&out, bout.Buffer());  +    }  +       TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in) { -        { -            TBuffer magic; +        {  +            TBuffer magic;               magic.Resize(GetStaticCodecInfoMagic().size());              Y_ENSURE_EX(in.Read(magic.Data(), GetStaticCodecInfoMagic().size()) == GetStaticCodecInfoMagic().size(), -                        TCodecException() << "bad codec info"); +                        TCodecException() << "bad codec info");               Y_ENSURE_EX(TStringBuf(magic.data(), magic.size()) == GetStaticCodecInfoMagic(), -                        TCodecException() << "bad codec info"); -        } - -        ui64 hash; -        ::Load(&in, hash); -        TBuffer info; -        ::Load(&in, info); -        Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info"); - -        TStaticCodecInfo result; +                        TCodecException() << "bad codec info");  +        }  +  +        ui64 hash;  +        ::Load(&in, hash);  +        TBuffer info;  +        ::Load(&in, info);  +        Y_ENSURE_EX(hash == DataSignature(info), TCodecException() << "bad codec info");  +  +        TStaticCodecInfo result;           Y_ENSURE_EX(result.ParseFromArray(info.data(), info.size()), TCodecException() << "bad codec info"); - -        return result; -    } - +  +        return result;  +    }  +       TString SaveCodecInfoToString(const TStaticCodecInfo& info) { -        TStringStream s; -        SaveCodecInfoToStream(s, info); -        return s.Str(); -    } - -    TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) { +        TStringStream s;  +        SaveCodecInfoToStream(s, info);  +        return s.Str();  +    }  +  +    TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data) {           TMemoryInput m{data.data(), data.size()}; -        return LoadCodecInfoFromStream(m); -    } - +        return LoadCodecInfoFromStream(m);  +    }  +       TString FormatCodecInfo(const TStaticCodecInfo& ci) { -        TStringBuilder s; -        s << "codec name:      " << ci.GetDebugInfo().GetCodecName() << Endl; -        s << "codec hash:      " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl; -        s << "dict size:       " << ci.GetStoredCodec().Size() << Endl; -        s << "sample mult:     " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl; -        s << "orig.compress:   " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl; -        s << "timestamp:       " << ci.GetDebugInfo().GetTimestamp() << " (" +        TStringBuilder s;  +        s << "codec name:      " << ci.GetDebugInfo().GetCodecName() << Endl;  +        s << "codec hash:      " << HexWriteScalar(ci.GetDebugInfo().GetStoredCodecHash()) << Endl;  +        s << "dict size:       " << ci.GetStoredCodec().Size() << Endl;  +        s << "sample mult:     " << ci.GetDebugInfo().GetSampleSizeMultiplier() << Endl;  +        s << "orig.compress:   " << ci.GetDebugInfo().GetCompression() * 100 << " %" << Endl;  +        s << "timestamp:       " << ci.GetDebugInfo().GetTimestamp() << " ("             << NDatetime::TSimpleTM::NewLocal(ci.GetDebugInfo().GetTimestamp()).ToString()            << ")" << Endl; -        s << "revision:        " << ci.GetDebugInfo().GetRevisionInfo() << Endl; -        s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl; -        s << "training set resId:   " << ci.GetDebugInfo().GetTrainingSetResId() << Endl; -        return s; -    } - +        s << "revision:        " << ci.GetDebugInfo().GetRevisionInfo() << Endl;  +        s << "training set comment: " << ci.GetDebugInfo().GetTrainingSetComment() << Endl;  +        s << "training set resId:   " << ci.GetDebugInfo().GetTrainingSetResId() << Endl;  +        return s;  +    }  +       TString LoadStringFromArchive(const ui8* begin, size_t size) { -        TArchiveReader ar(TBlob::NoCopy(begin, size)); -        Y_VERIFY(ar.Count() == 1, "invalid number of entries"); -        auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0)); +        TArchiveReader ar(TBlob::NoCopy(begin, size));  +        Y_VERIFY(ar.Count() == 1, "invalid number of entries");  +        auto blob = ar.ObjectBlobByKey(ar.KeyByIndex(0));           return TString{blob.AsCharPtr(), blob.Size()}; -    } - -    TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) { -        return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec()); -    } - -    TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) { -        const auto& data = LoadStringFromArchive(begin, size); -        const auto& info = LoadCodecInfoFromString(data); -        const auto& codec = RestoreCodecFromCodecInfo(info); -        Y_ENSURE_EX(codec, TCodecException() << "null codec"); -        return codec; -    } -} +    }  +  +    TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo& info) {  +        return NCodecs::ICodec::RestoreFromString(info.GetStoredCodec());  +    }  +  +    TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size) {  +        const auto& data = LoadStringFromArchive(begin, size);  +        const auto& info = LoadCodecInfoFromString(data);  +        const auto& codec = RestoreCodecFromCodecInfo(info);  +        Y_ENSURE_EX(codec, TCodecException() << "null codec");  +        return codec;  +    }  +}  diff --git a/library/cpp/codecs/static/static.h b/library/cpp/codecs/static/static.h index c1eaed2a742..efa9c60c225 100644 --- a/library/cpp/codecs/static/static.h +++ b/library/cpp/codecs/static/static.h @@ -1,34 +1,34 @@ -#pragma once - +#pragma once  +   #include <library/cpp/codecs/codecs.h> - -#include <util/generic/strbuf.h> +  +#include <util/generic/strbuf.h>   #include <util/generic/string.h>  #include <util/stream/output.h> - -namespace NCodecs { -    class TStaticCodecInfo; - -    // load - -    TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&); - -    TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data); - +  +namespace NCodecs {  +    class TStaticCodecInfo;  +  +    // load  +  +    TCodecConstPtr RestoreCodecFromCodecInfo(const TStaticCodecInfo&);  +  +    TStaticCodecInfo LoadCodecInfoFromString(TStringBuf data);  +       TString LoadStringFromArchive(const ui8* begin, size_t size); - -    TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size); - -    // save - +  +    TCodecConstPtr RestoreCodecFromArchive(const ui8* begin, size_t size);  +  +    // save  +       TString SaveCodecInfoToString(const TStaticCodecInfo&); - +       void SaveCodecInfoToStream(IOutputStream& out, const TStaticCodecInfo&); - -    // misc - +  +    // misc  +       TStaticCodecInfo LoadCodecInfoFromStream(IInputStream& in); - +       TString FormatCodecInfo(const TStaticCodecInfo&); - -} +  +}  diff --git a/library/cpp/codecs/static/static_codec_info.proto b/library/cpp/codecs/static/static_codec_info.proto index 362abb4dadf..178459784b6 100644 --- a/library/cpp/codecs/static/static_codec_info.proto +++ b/library/cpp/codecs/static/static_codec_info.proto @@ -1,17 +1,17 @@ -package NCodecs; - -message TStaticCodecInfo { -    message TDebugInfo { -        optional string CodecName = 1;           // the exact codec variant name -        optional uint64 Timestamp = 2;           // when the codec was built -        optional string RevisionInfo = 3;        // the arcadia revision info -        optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression -        optional float Compression = 5;          // the compression on the training set ((raw_size - coded_size) / raw_size) -        optional string TrainingSetComment = 6;  // a human readable description of the training set -        optional string TrainingSetResId = 7;    // the training set sandbox resource id -        optional uint64 StoredCodecHash = 8;     // cityhash64(data) -    } -     -    optional bytes StoredCodec = 1;           // the data of the codec -    optional TDebugInfo DebugInfo = 2;        // misc debug info which could be useful in finding whereabouts later -} +package NCodecs;  +  +message TStaticCodecInfo {  +    message TDebugInfo {  +        optional string CodecName = 1;           // the exact codec variant name  +        optional uint64 Timestamp = 2;           // when the codec was built  +        optional string RevisionInfo = 3;        // the arcadia revision info  +        optional float SampleSizeMultiplier = 4; // how the default sample size was modified to improve compression  +        optional float Compression = 5;          // the compression on the training set ((raw_size - coded_size) / raw_size)  +        optional string TrainingSetComment = 6;  // a human readable description of the training set  +        optional string TrainingSetResId = 7;    // the training set sandbox resource id  +        optional uint64 StoredCodecHash = 8;     // cityhash64(data)  +    }  +      +    optional bytes StoredCodec = 1;           // the data of the codec  +    optional TDebugInfo DebugInfo = 2;        // misc debug info which could be useful in finding whereabouts later  +}  diff --git a/library/cpp/codecs/static/tools/common/ct_common.cpp b/library/cpp/codecs/static/tools/common/ct_common.cpp index fe776912805..cea40506e13 100644 --- a/library/cpp/codecs/static/tools/common/ct_common.cpp +++ b/library/cpp/codecs/static/tools/common/ct_common.cpp @@ -1,74 +1,74 @@ -#include "ct_common.h" - +#include "ct_common.h"  +   #include <library/cpp/codecs/codecs.h>  #include <library/cpp/codecs/static/static_codec_info.pb.h>  #include <library/cpp/string_utils/base64/base64.h> - +   #include <util/stream/output.h> -#include <util/string/builder.h> -#include <util/system/hp_timer.h> - -namespace NCodecs { +#include <util/string/builder.h>  +#include <util/system/hp_timer.h>  +  +namespace NCodecs {       TString TComprStats::Format(const TStaticCodecInfo& info, bool checkMode) const { -        TStringBuilder s; -        s << "raw size/item:      " << RawSizePerRecord() << Endl; -        s << "enc.size/item:      " << EncSizePerRecord() << Endl; -        if (checkMode) { -            s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl; -        } -        s << "enc time us/item:   " << EncTimePerRecordUS() << Endl; -        s << "dec time us/item:   " << DecTimePerRecordUS() << Endl; -        s << "dict size:          " << info.GetStoredCodec().Size() << Endl; -        s << "compression:        " << AsPercent(Compression()) << " %" << Endl; -        if (checkMode) { -            s << "orig.compression:   " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl; -        } -        return s; -    } - +        TStringBuilder s;  +        s << "raw size/item:      " << RawSizePerRecord() << Endl;  +        s << "enc.size/item:      " << EncSizePerRecord() << Endl;  +        if (checkMode) {  +            s << "orig.enc.size/item: " << OldEncSizePerRecord(info.GetDebugInfo().GetCompression()) << Endl;  +        }  +        s << "enc time us/item:   " << EncTimePerRecordUS() << Endl;  +        s << "dec time us/item:   " << DecTimePerRecordUS() << Endl;  +        s << "dict size:          " << info.GetStoredCodec().Size() << Endl;  +        s << "compression:        " << AsPercent(Compression()) << " %" << Endl;  +        if (checkMode) {  +            s << "orig.compression:   " << AsPercent(info.GetDebugInfo().GetCompression()) << " %" << Endl;  +        }  +        return s;  +    }  +       TComprStats TestCodec(const ICodec& c, const TVector<TString>& input) { -        TComprStats stats; - -        TBuffer encodeBuffer; -        TBuffer decodeBuffer; -        for (const auto& data : input) { -            encodeBuffer.Clear(); -            decodeBuffer.Clear(); - -            stats.Records += 1; +        TComprStats stats;  +  +        TBuffer encodeBuffer;  +        TBuffer decodeBuffer;  +        for (const auto& data : input) {  +            encodeBuffer.Clear();  +            decodeBuffer.Clear();  +  +            stats.Records += 1;               stats.RawSize += data.size(); - -            THPTimer timer; -            c.Encode(data, encodeBuffer); +  +            THPTimer timer;  +            c.Encode(data, encodeBuffer);               stats.EncSize += encodeBuffer.size(); -            stats.EncSeconds += timer.PassedReset(); - +            stats.EncSeconds += timer.PassedReset();  +               c.Decode(TStringBuf{encodeBuffer.data(), encodeBuffer.size()}, decodeBuffer); -            stats.DecSeconds += timer.PassedReset(); +            stats.DecSeconds += timer.PassedReset();               Y_ENSURE(data == TStringBuf(decodeBuffer.data(), decodeBuffer.size()), "invalid encoding at record " << stats.Records); -        } - -        return stats; -    } - +        }  +  +        return stats;  +    }  +       void ParseBlob(TVector<TString>& result, EDataStreamFormat fmt, const TBlob& blob) {          TStringBuf bin(blob.AsCharPtr(), blob.Size()); -        TStringBuf line; +        TStringBuf line;           TString buffer; -        while (bin.ReadLine(line)) { -            if (DSF_BASE64_LF == fmt) { -                Base64Decode(line, buffer); -                line = buffer; -            } -            if (!line) { -                continue; -            } +        while (bin.ReadLine(line)) {  +            if (DSF_BASE64_LF == fmt) {  +                Base64Decode(line, buffer);  +                line = buffer;  +            }  +            if (!line) {  +                continue;  +            }               result.emplace_back(line.data(), line.size()); -        } -    } - +        }  +    }  +       TBlob GetInputBlob(const TString& dataFile) { -        return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin); -    } - -} +        return dataFile && dataFile != "-" ? TBlob::FromFile(dataFile) : TBlob::FromStream(Cin);  +    }  +  +}  diff --git a/library/cpp/codecs/static/tools/common/ct_common.h b/library/cpp/codecs/static/tools/common/ct_common.h index 9d3dcbda934..de531b27e6b 100644 --- a/library/cpp/codecs/static/tools/common/ct_common.h +++ b/library/cpp/codecs/static/tools/common/ct_common.h @@ -1,75 +1,75 @@ -#pragma once - +#pragma once  +   #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/memory/blob.h> -#include <cmath> - -namespace NCodecs { -    class TStaticCodecInfo; -    class ICodec; - -    struct TComprStats { -        double EncSeconds = 0; -        double DecSeconds = 0; -        size_t Records = 0; -        size_t RawSize = 0; -        size_t EncSize = 0; - -        static double Round(double n, size_t decPlaces = 2) { -            double p = pow(10, decPlaces); -            return round(n * p) / p; -        } - -        static double AsPercent(double n) { -            return Round(n * 100); -        } - -        static double AsMicroSecond(double s) { -            return s * 1000000; -        } - -        double PerRecord(double n) const { -            return Round((double)(Records ? n / Records : 0)); -        } - -        double Compression() const { -            return ((double)RawSize - (double)EncSize) / RawSize; -        } - -        double EncTimePerRecordUS() const { -            return PerRecord(AsMicroSecond(EncSeconds)); -        } - -        double DecTimePerRecordUS() const { -            return PerRecord(AsMicroSecond(DecSeconds)); -        } - -        double RawSizePerRecord() const { -            return PerRecord(RawSize); -        } - -        double EncSizePerRecord() const { -            return PerRecord(EncSize); -        } - -        double OldEncSizePerRecord(double compr) const { -            return PerRecord((1 - compr) * RawSize); -        } - +#include <util/generic/vector.h>  +#include <util/memory/blob.h>  +#include <cmath>  +  +namespace NCodecs {  +    class TStaticCodecInfo;  +    class ICodec;  +  +    struct TComprStats {  +        double EncSeconds = 0;  +        double DecSeconds = 0;  +        size_t Records = 0;  +        size_t RawSize = 0;  +        size_t EncSize = 0;  +  +        static double Round(double n, size_t decPlaces = 2) {  +            double p = pow(10, decPlaces);  +            return round(n * p) / p;  +        }  +  +        static double AsPercent(double n) {  +            return Round(n * 100);  +        }  +  +        static double AsMicroSecond(double s) {  +            return s * 1000000;  +        }  +  +        double PerRecord(double n) const {  +            return Round((double)(Records ? n / Records : 0));  +        }  +  +        double Compression() const {  +            return ((double)RawSize - (double)EncSize) / RawSize;  +        }  +  +        double EncTimePerRecordUS() const {  +            return PerRecord(AsMicroSecond(EncSeconds));  +        }  +  +        double DecTimePerRecordUS() const {  +            return PerRecord(AsMicroSecond(DecSeconds));  +        }  +  +        double RawSizePerRecord() const {  +            return PerRecord(RawSize);  +        }  +  +        double EncSizePerRecord() const {  +            return PerRecord(EncSize);  +        }  +  +        double OldEncSizePerRecord(double compr) const {  +            return PerRecord((1 - compr) * RawSize);  +        }  +           TString Format(const TStaticCodecInfo&, bool checkMode) const; -    }; - +    };  +       TComprStats TestCodec(const ICodec&, const TVector<TString>& data); - -    enum EDataStreamFormat { -        DSF_NONE, -        DSF_PLAIN_LF /* "plain" */, -        DSF_BASE64_LF /* "base64" */, -    }; - +  +    enum EDataStreamFormat {  +        DSF_NONE,  +        DSF_PLAIN_LF /* "plain" */,  +        DSF_BASE64_LF /* "base64" */,  +    };  +       void ParseBlob(TVector<TString>&, EDataStreamFormat, const TBlob&); - +       TBlob GetInputBlob(const TString& dataFile); - -} +  +}  diff --git a/library/cpp/codecs/static/tools/common/ya.make b/library/cpp/codecs/static/tools/common/ya.make index d624222dad0..5f575a2f283 100644 --- a/library/cpp/codecs/static/tools/common/ya.make +++ b/library/cpp/codecs/static/tools/common/ya.make @@ -1,19 +1,19 @@ -LIBRARY() - +LIBRARY()  +   OWNER(velavokr) - -SRCS( -    ct_common.cpp -) - -PEERDIR( +  +SRCS(  +    ct_common.cpp  +)  +  +PEERDIR(       library/cpp/codecs      library/cpp/codecs/static      library/cpp/getopt/small      library/cpp/string_utils/base64 -    util/draft -) - +    util/draft  +)  +   GENERATE_ENUM_SERIALIZATION(ct_common.h) - -END() +  +END()  diff --git a/library/cpp/codecs/static/tools/static_codec_checker/README b/library/cpp/codecs/static/tools/static_codec_checker/README index 723a68300b0..c66703227d1 100644 --- a/library/cpp/codecs/static/tools/static_codec_checker/README +++ b/library/cpp/codecs/static/tools/static_codec_checker/README @@ -1,4 +1,4 @@  This is a viewer for generated codec and utility for verification of the compression quality on a new data. - +   Usage: -static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt +static_codec_checker -t -c 029b29ff64a74927.codec_info -f plain samples.txt  diff --git a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp index 9c8d568d823..5ae901d8f83 100644 --- a/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp +++ b/library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker.cpp @@ -3,25 +3,25 @@  #include <library/cpp/codecs/static/static_codec_info.pb.h>  #include <library/cpp/codecs/codecs.h>  #include <library/cpp/getopt/small/last_getopt.h> - -#include <util/digest/city.h> -#include <util/generic/yexception.h> -#include <util/stream/file.h> -#include <util/stream/buffer.h> -#include <util/stream/format.h> -#include <util/string/builder.h> - -int main(int argc, char** argv) { -    NCodecs::TCodecPtr codecPtr; -    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; +  +#include <util/digest/city.h>  +#include <util/generic/yexception.h>  +#include <util/stream/file.h>  +#include <util/stream/buffer.h>  +#include <util/stream/format.h>  +#include <util/string/builder.h>  +  +int main(int argc, char** argv) {  +    NCodecs::TCodecPtr codecPtr;  +    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;       TString codecFile; -    bool testCompression = false; - -    auto opts = NLastGetopt::TOpts::Default(); -    opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator."); -    opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt"); -    NCodecs::TStaticCodecInfo codec; - +    bool testCompression = false;  +  +    auto opts = NLastGetopt::TOpts::Default();  +    opts.SetTitle("Prints a .codec_info file and optionally checks its performance on new data. See also static_codec_generator.");  +    opts.SetCmdLineDescr("-c 9089f3e9b7a0f0d4.codec_info -t -f base64 qtrees.sample.txt");  +    NCodecs::TStaticCodecInfo codec;  +       opts.AddLongOption('c', "codec-info").RequiredArgument("codec_info").Handler1T<TString>([&codecFile, &codec, &codecPtr](TString name) {                                                                              codecFile = name;                                                                              codec.CopyFrom(NCodecs::LoadCodecInfoFromString(TUnbufferedFileInput(name).ReadAll())); @@ -29,45 +29,45 @@ int main(int argc, char** argv) {                                                                          })          .Required()          .Help(".codec_info file with serialized static data for codec"); - +       opts.AddLongOption('t', "test").NoArgument().StoreValue(&testCompression, true).Optional().Help("test current performance"); - +       opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Optional().Help("test set input file format"); - -    opts.SetFreeArgsMin(0); -    opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files"); - -    NLastGetopt::TOptsParseResult res(&opts, argc, argv); - -    Cout << codecFile << Endl; -    Cout << NCodecs::FormatCodecInfo(codec) << Endl; - -    if (testCompression) { -        if (NCodecs::DSF_NONE == fmt) { -            Cerr << "Specify format (-f|--format) for testing set input" << Endl; -            exit(1); -        } - -        Cout << "Reading testing set data ... " << Flush; - +  +    opts.SetFreeArgsMin(0);  +    opts.SetFreeArgTitle(0, "testing_set_input_file", "testing set input files");  +  +    NLastGetopt::TOptsParseResult res(&opts, argc, argv);  +  +    Cout << codecFile << Endl;  +    Cout << NCodecs::FormatCodecInfo(codec) << Endl;  +  +    if (testCompression) {  +        if (NCodecs::DSF_NONE == fmt) {  +            Cerr << "Specify format (-f|--format) for testing set input" << Endl;  +            exit(1);  +        }  +  +        Cout << "Reading testing set data ... " << Flush;  +           TVector<TString> allData; -        for (const auto& freeArg : res.GetFreeArgs()) { -            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); -        } - -        if (!res.GetFreeArgs()) { -            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); -        } - -        Cout << "Done" << Endl << Endl; - -        Cout << "records:  " << allData.size() << Endl; -        Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; - -        Cout << "Testing compression ... " << Flush; -        auto stats = NCodecs::TestCodec(*codecPtr, allData); -        Cout << "Done" << Endl << Endl; - -        Cout << stats.Format(codec, true) << Endl; -    } -} +        for (const auto& freeArg : res.GetFreeArgs()) {  +            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));  +        }  +  +        if (!res.GetFreeArgs()) {  +            NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));  +        }  +  +        Cout << "Done" << Endl << Endl;  +  +        Cout << "records:  " << allData.size() << Endl;  +        Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;  +  +        Cout << "Testing compression ... " << Flush;  +        auto stats = NCodecs::TestCodec(*codecPtr, allData);  +        Cout << "Done" << Endl << Endl;  +  +        Cout << stats.Format(codec, true) << Endl;  +    }  +}  diff --git a/library/cpp/codecs/static/tools/static_codec_checker/ya.make b/library/cpp/codecs/static/tools/static_codec_checker/ya.make index 90e06ca448d..86b73dff6c4 100644 --- a/library/cpp/codecs/static/tools/static_codec_checker/ya.make +++ b/library/cpp/codecs/static/tools/static_codec_checker/ya.make @@ -1,16 +1,16 @@ -PROGRAM() - +PROGRAM()  +   OWNER(velavokr) - -SRCS( -    static_codec_checker.cpp -) - -PEERDIR( +  +SRCS(  +    static_codec_checker.cpp  +)  +  +PEERDIR(       library/cpp/codecs      library/cpp/codecs/static      library/cpp/codecs/static/tools/common      library/cpp/getopt/small -) - -END() +)  +  +END()  diff --git a/library/cpp/codecs/static/tools/static_codec_generator/README b/library/cpp/codecs/static/tools/static_codec_generator/README index e6bb52b9591..f0fffd745ad 100644 --- a/library/cpp/codecs/static/tools/static_codec_generator/README +++ b/library/cpp/codecs/static/tools/static_codec_generator/README @@ -1,4 +1,4 @@  This is a utility for reproducible  teaching of a codec. And also for saving it into a file with a unique name for a static compilation as a resource. - +   Usage: -static_codec_generator -t -m 'the training data description' -f plain samples.txt +static_codec_generator -t -m 'the training data description' -f plain samples.txt  diff --git a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp index 45fdb5c5fe8..b37a0f686d5 100644 --- a/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp +++ b/library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator.cpp @@ -2,81 +2,81 @@  #include <library/cpp/codecs/static/static_codec_info.pb.h>  #include <library/cpp/codecs/static/builder.h>  #include <library/cpp/codecs/codecs.h> - +   #include <library/cpp/getopt/small/last_getopt.h> - -#include <util/generic/yexception.h> -#include <util/stream/file.h> -#include <util/string/builder.h> - -int main(int argc, char** argv) { -    NCodecs::TCodecBuildInfo info; -    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE; - -    auto opts = NLastGetopt::TOpts::Default(); -    opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt"); -    opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin"); - +  +#include <util/generic/yexception.h>  +#include <util/stream/file.h>  +#include <util/string/builder.h>  +  +int main(int argc, char** argv) {  +    NCodecs::TCodecBuildInfo info;  +    NCodecs::EDataStreamFormat fmt = NCodecs::DSF_NONE;  +  +    auto opts = NLastGetopt::TOpts::Default();  +    opts.SetCmdLineDescr("-m 'Training set: 100000 qtrees taken from web mmeta logs' -f base64 qtrees.sample.txt");  +    opts.SetTitle("Teaches the codec and serializes it as a file named CODECNAME.hash(CODECDATA).bin");  +       opts.AddLongOption('m', "message").RequiredArgument("training_set_comment").StoreResult(&info.TrainingSetComment).Required().Help("a human description for the training set"); - +       opts.AddLongOption('r', "resource").RequiredArgument("training_set_res_id").StoreResult(&info.TrainingSetResId).Optional().Help("sandbox resource id for the training set"); - +       opts.AddLongOption('c', "codec").RequiredArgument("codec_name").StoreResult(&info.CodecName).Optional().DefaultValue(info.CodecName); - +       opts.AddLongOption('s', "sample-multiplier").RequiredArgument("multiplier").StoreResult(&info.SampleSizeMultiplier).Optional().DefaultValue(ToString(info.SampleSizeMultiplier)).Help("multiplier for default sample size"); - +       opts.AddLongOption('f', "format").RequiredArgument(TStringBuilder() << "(" << NCodecs::DSF_PLAIN_LF << "|" << NCodecs::DSF_BASE64_LF << ")").StoreResult(&fmt).Required().Help("training set input file format"); - +       opts.AddLongOption("list-codecs").NoArgument().Handler0([]() {                                                        Cout << JoinStrings(NCodecs::ICodec::GetCodecsList(), "\n") << Endl;                                                        exit(0);                                                    })          .Optional()          .Help("list available codecs"); - +       opts.AddLongOption("fake-revision").RequiredArgument("revision").StoreResult(&info.RevisionInfo).Optional().Hidden(); // replace static_codec_generator revision in debug info - +       opts.AddLongOption("fake-timestamp").RequiredArgument("timestamp").StoreResult(&info.Timestamp).Optional().Hidden(); // replace generating timestamp in debug info - -    opts.SetFreeArgsMin(0); -    opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files"); - -    NLastGetopt::TOptsParseResult res(&opts, argc, argv); - -    Cout << "Reading training set data ... " << Flush; +  +    opts.SetFreeArgsMin(0);  +    opts.SetFreeArgTitle(0, "training_set_input_file", "training set input files");  +  +    NLastGetopt::TOptsParseResult res(&opts, argc, argv);  +  +    Cout << "Reading training set data ... " << Flush;       TVector<TString> allData; -    for (const auto& freeArg : res.GetFreeArgs()) { -        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg)); -    } - -    if (!res.GetFreeArgs()) { -        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-")); -    } -    Cout << "Done" << Endl << Endl; - -    Cout << "records:  " << allData.size() << Endl; -    Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl; - -    Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush; -    auto codec = NCodecs::BuildStaticCodec(allData, info); -    Cout << "Done" << Endl; - +    for (const auto& freeArg : res.GetFreeArgs()) {  +        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob(freeArg));  +    }  +  +    if (!res.GetFreeArgs()) {  +        NCodecs::ParseBlob(allData, fmt, NCodecs::GetInputBlob("-"));  +    }  +    Cout << "Done" << Endl << Endl;  +  +    Cout << "records:  " << allData.size() << Endl;  +    Cout << "raw size: " << NCodecs::GetInputSize(allData.begin(), allData.end()) << " bytes" << Endl << Endl;  +  +    Cout << "Training " << info.CodecName << " , sample size multiplier is " << info.SampleSizeMultiplier << " ... " << Flush;  +    auto codec = NCodecs::BuildStaticCodec(allData, info);  +    Cout << "Done" << Endl;  +       TString codecName = NCodecs::GetStandardFileName(codec); -    NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec()); - -    Cout << "Testing compression ... " << Flush; -    auto stats = NCodecs::TestCodec(*codecPtr, allData); -    Cout << "Done" << Endl << Endl; - -    codec.MutableDebugInfo()->SetCompression(stats.Compression()); - -    Cout << stats.Format(codec, false) << Endl; - -    Cout << "Saving as " << codecName << " ... " << Flush; -    { +    NCodecs::TCodecPtr codecPtr = NCodecs::ICodec::RestoreFromString(codec.GetStoredCodec());  +  +    Cout << "Testing compression ... " << Flush;  +    auto stats = NCodecs::TestCodec(*codecPtr, allData);  +    Cout << "Done" << Endl << Endl;  +  +    codec.MutableDebugInfo()->SetCompression(stats.Compression());  +  +    Cout << stats.Format(codec, false) << Endl;  +  +    Cout << "Saving as " << codecName << " ... " << Flush;  +    {           TUnbufferedFileOutput fout{codecName}; -        NCodecs::SaveCodecInfoToStream(fout, codec); -        fout.Finish(); -    } -    Cout << "Done" << Endl << Endl; -} +        NCodecs::SaveCodecInfoToStream(fout, codec);  +        fout.Finish();  +    }  +    Cout << "Done" << Endl << Endl;  +}  diff --git a/library/cpp/codecs/static/tools/static_codec_generator/ya.make b/library/cpp/codecs/static/tools/static_codec_generator/ya.make index efbc440dd18..21750dde49b 100644 --- a/library/cpp/codecs/static/tools/static_codec_generator/ya.make +++ b/library/cpp/codecs/static/tools/static_codec_generator/ya.make @@ -1,17 +1,17 @@ -PROGRAM() - +PROGRAM()  +   OWNER(velavokr) - -SRCS( -    static_codec_generator.cpp -) - -PEERDIR( +  +SRCS(  +    static_codec_generator.cpp  +)  +  +PEERDIR(       library/cpp/codecs      library/cpp/codecs/static      library/cpp/codecs/static/tools/common      library/cpp/digest/md5      library/cpp/getopt/small -) - -END() +)  +  +END()  diff --git a/library/cpp/codecs/static/tools/tests/static_codec_tools.py b/library/cpp/codecs/static/tools/tests/static_codec_tools.py index db4140e3703..a5baa262f79 100644 --- a/library/cpp/codecs/static/tools/tests/static_codec_tools.py +++ b/library/cpp/codecs/static/tools/tests/static_codec_tools.py @@ -1,18 +1,18 @@ -#!/usr/bin/env python - -import yatest.common as tt -import os.path as op - -def test_static_codec_tools(): +#!/usr/bin/env python  +  +import yatest.common as tt  +import os.path as op  +  +def test_static_codec_tools():       tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_generator/static_codec_generator")] -        + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1", -            "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"], -        timeout=60) -    assert(op.exists("solar-8k-a.huffman.1467494385.codec_info")) +        + ["-m", "test codec", "-r", "sbr://143310406", "-f", "plain", "-c", "solar-8k-a:huffman", "-s", "1",  +            "--fake-revision", "r2385905", "--fake-timestamp", "1467494385", "sample.txt"],  +        timeout=60)  +    assert(op.exists("solar-8k-a.huffman.1467494385.codec_info"))       tt.canonical_execute(tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker"), -        args=["-c", "solar-8k-a.huffman.1467494385.codec_info"], -        timeout=60) +        args=["-c", "solar-8k-a.huffman.1467494385.codec_info"],  +        timeout=60)       tt.execute([tt.binary_path("library/cpp/codecs/static/tools/static_codec_checker/static_codec_checker")] -        + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"], -        timeout=60) -    return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info") +        + ["-c", "solar-8k-a.huffman.1467494385.codec_info", "-f", "plain", "-t", "sample.txt"],  +        timeout=60)  +    return tt.canonical_file("solar-8k-a.huffman.1467494385.codec_info")  diff --git a/library/cpp/codecs/static/tools/tests/ya.make b/library/cpp/codecs/static/tools/tests/ya.make index c5324eaf53b..5555d90caed 100644 --- a/library/cpp/codecs/static/tools/tests/ya.make +++ b/library/cpp/codecs/static/tools/tests/ya.make @@ -1,20 +1,20 @@  PY2TEST() - -OWNER(velavokr) - -TEST_SRCS(static_codec_tools.py) - -DATA(sbr://143310406) - -TIMEOUT(4200) - +  +OWNER(velavokr)  +  +TEST_SRCS(static_codec_tools.py)  +  +DATA(sbr://143310406)  +  +TIMEOUT(4200)  +   TAG(ya:not_autocheck) -DEPENDS( +DEPENDS(       library/cpp/codecs/static/tools/static_codec_checker      library/cpp/codecs/static/tools/static_codec_generator -) - - +)  +  +  -END() +END()  diff --git a/library/cpp/codecs/static/tools/ya.make b/library/cpp/codecs/static/tools/ya.make index dd3e8437aa4..ab727691537 100644 --- a/library/cpp/codecs/static/tools/ya.make +++ b/library/cpp/codecs/static/tools/ya.make @@ -1,5 +1,5 @@ -RECURSE( -    common -    static_codec_generator -    static_codec_checker -) +RECURSE(  +    common  +    static_codec_generator  +    static_codec_checker  +)  diff --git a/library/cpp/codecs/static/ut/builder_ut.cpp b/library/cpp/codecs/static/ut/builder_ut.cpp index b47c279ed14..48d5c98d5d1 100644 --- a/library/cpp/codecs/static/ut/builder_ut.cpp +++ b/library/cpp/codecs/static/ut/builder_ut.cpp @@ -1,57 +1,57 @@  #include <library/cpp/testing/unittest/registar.h>  #include <library/cpp/codecs/static/builder.h>  #include <library/cpp/codecs/static/static_codec_info.pb.h> -#include <util/string/vector.h> - +#include <util/string/vector.h>  +   class TStaticCodecInfoBuilderTest: public NUnitTest::TTestBase { -    UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest) +    UNIT_TEST_SUITE(TStaticCodecInfoBuilderTest)       UNIT_TEST(TestBuild) -    UNIT_TEST_SUITE_END(); +    UNIT_TEST_SUITE_END();  -private: +private:       TVector<TString> PrepareData() {          TVector<TString> data; -        for (ui32 i = 'a'; i <= 'z'; ++i) { +        for (ui32 i = 'a'; i <= 'z'; ++i) {               data.push_back(TString(1, (char)i)); -        } -        return data; -    } - -    void TestBuild() { +        }  +        return data;  +    }  +  +    void TestBuild() {           TVector<TString> data; -        NCodecs::TCodecBuildInfo info; -        info.CodecName = "huffman"; -        info.SampleSizeMultiplier = 2; -        info.Timestamp = 1467494385; -        info.RevisionInfo = "r2385905"; -        info.TrainingSetComment = "some dummy data"; -        info.TrainingSetResId = "sbr://1234"; -        auto res = NCodecs::BuildStaticCodec(PrepareData(), info); -        UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(), -                                 "StoredCodec: \"\\007\\000huffman@S\\000a" -                                 "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o" -                                 "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>" -                                 "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8." -                                 "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7" -                                 "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0" -                                 "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9" -                                 "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" " -                                 "DebugInfo { " -                                 "CodecName: \"huffman\" " -                                 "Timestamp: 1467494385 " -                                 "RevisionInfo: \"r2385905\" " -                                 "SampleSizeMultiplier: 2 " -                                 "TrainingSetComment: \"some dummy data\" " -                                 "TrainingSetResId: \"sbr://1234\" " -                                 "StoredCodecHash: 2509195835471488613 " -                                 "}"); - -        UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info"); -        UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL); - -        auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res)); -        UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString()); -    } -}; - -UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest); +        NCodecs::TCodecBuildInfo info;  +        info.CodecName = "huffman";  +        info.SampleSizeMultiplier = 2;  +        info.Timestamp = 1467494385;  +        info.RevisionInfo = "r2385905";  +        info.TrainingSetComment = "some dummy data";  +        info.TrainingSetResId = "sbr://1234";  +        auto res = NCodecs::BuildStaticCodec(PrepareData(), info);  +        UNIT_ASSERT_VALUES_EQUAL(res.ShortUtf8DebugString(),  +                                 "StoredCodec: \"\\007\\000huffman@S\\000a"  +                                 "\\006b\\005c\\005d\\005e\\005f\\005g\\005h\\005i\\005j\\005k\\005l\\005m\\005n\\005o"  +                                 "\\005p\\005q\\005r\\005s\\005t\\005u\\004v\\004w\\004x\\004y\\004z\\004\xC7?\xC8>"  +                                 "\xC9=\xCA<\xCB;\xCC:\3159\3168\3177\3206\3215\3224\3233\3242\3251\3260\xD7/\xD8."  +                                 "\xD9-\xDA,\xDB+\xDC*\xDD)\xDE(\xDF\\'\xE0&\xE1%\xE2$\xE3#\xE4\\\"\xE5!\xE6 \xE7"  +                                 "\\037\xE8\\036\xE9\\035\xEA\\034\xEB\\033\xEC\\032\xED\\031\xEE\\030\xEF\\027\xF0"  +                                 "\\026\xF1\\025\xF2\\024\xF3\\023\xF4\\022\xF5\\021\xF6\\020\xF7\\017\xF8\\016\xF9"  +                                 "\\r\xFA\\014\xFB\\013\xFC\\n\xFD\\t\xFE\\010\xFF\\007\" "  +                                 "DebugInfo { "  +                                 "CodecName: \"huffman\" "  +                                 "Timestamp: 1467494385 "  +                                 "RevisionInfo: \"r2385905\" "  +                                 "SampleSizeMultiplier: 2 "  +                                 "TrainingSetComment: \"some dummy data\" "  +                                 "TrainingSetResId: \"sbr://1234\" "  +                                 "StoredCodecHash: 2509195835471488613 "  +                                 "}");  +  +        UNIT_ASSERT_VALUES_EQUAL(NCodecs::GetStandardFileName(res), "huffman.1467494385.codec_info");  +        UNIT_ASSERT_VALUES_EQUAL(res.GetDebugInfo().GetStoredCodecHash(), 2509195835471488613ULL);  +  +        auto res1 = NCodecs::LoadCodecInfoFromString(NCodecs::SaveCodecInfoToString(res));  +        UNIT_ASSERT_VALUES_EQUAL(res1.ShortUtf8DebugString(), res.ShortUtf8DebugString());  +    }  +};  +  +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecInfoBuilderTest);  diff --git a/library/cpp/codecs/static/ut/static_ut.cpp b/library/cpp/codecs/static/ut/static_ut.cpp index 57e1e628874..fda9ffcccb5 100644 --- a/library/cpp/codecs/static/ut/static_ut.cpp +++ b/library/cpp/codecs/static/ut/static_ut.cpp @@ -1,27 +1,27 @@  #include <library/cpp/testing/unittest/registar.h>  #include <library/cpp/codecs/static/example/example.h> - +   class TStaticCodecUsageTest: public NUnitTest::TTestBase { -    UNIT_TEST_SUITE(TStaticCodecUsageTest) +    UNIT_TEST_SUITE(TStaticCodecUsageTest)       UNIT_TEST(TestUsage) -    UNIT_TEST_SUITE_END(); +    UNIT_TEST_SUITE_END();  -private: -    void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) { +private:  +    void DoTestUsage(NStaticCodecExample::EDictVersion dv, size_t expectedSize) {           const TStringBuf letov = "Всё идёт по плану"; - -        TBuffer outEnc, outDec; -        NStaticCodecExample::Encode(outEnc, letov, dv); +  +        TBuffer outEnc, outDec;  +        NStaticCodecExample::Encode(outEnc, letov, dv);           NStaticCodecExample::Decode(outDec, TStringBuf{outEnc.data(), outEnc.size()}); - -        UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize); +  +        UNIT_ASSERT_VALUES_EQUAL(outEnc.Size(), expectedSize);           UNIT_ASSERT_EQUAL(TStringBuf(outDec.data(), outDec.size()), letov); -    } - -    void TestUsage() { -        DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u); -        DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u); -    } -}; - -UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest) +    }  +  +    void TestUsage() {  +        DoTestUsage(NStaticCodecExample::DV_HUFF_20160707, 18u);  +        DoTestUsage(NStaticCodecExample::DV_SA_HUFF_20160707, 22u);  +    }  +};  +  +UNIT_TEST_SUITE_REGISTRATION(TStaticCodecUsageTest)  diff --git a/library/cpp/codecs/static/ut/ya.make b/library/cpp/codecs/static/ut/ya.make index b9116097d87..5bb2017fac0 100644 --- a/library/cpp/codecs/static/ut/ya.make +++ b/library/cpp/codecs/static/ut/ya.make @@ -1,14 +1,14 @@  UNITTEST_FOR(library/cpp/codecs/static) - -OWNER(velavokr) - -SRCS( -    builder_ut.cpp -    static_ut.cpp -) - -PEERDIR( +  +OWNER(velavokr)  +  +SRCS(  +    builder_ut.cpp  +    static_ut.cpp  +)  +  +PEERDIR(       library/cpp/codecs/static/example -) - -END() +)  +  +END()  diff --git a/library/cpp/codecs/static/ya.make b/library/cpp/codecs/static/ya.make index 00e00fd8d43..a2698b9432f 100644 --- a/library/cpp/codecs/static/ya.make +++ b/library/cpp/codecs/static/ya.make @@ -1,18 +1,18 @@ -LIBRARY() - -OWNER(velavokr) - -SRCS( -    builder.cpp -    static_codec_info.proto -    static.cpp -) - -PEERDIR( +LIBRARY()  +  +OWNER(velavokr)  +  +SRCS(  +    builder.cpp  +    static_codec_info.proto  +    static.cpp  +)  +  +PEERDIR(       library/cpp/codecs      library/cpp/archive      library/cpp/svnversion -    util/draft -) - -END() +    util/draft  +)  +  +END()  diff --git a/library/cpp/codecs/tls_cache.cpp b/library/cpp/codecs/tls_cache.cpp index 0a1b32bda14..d54339d8693 100644 --- a/library/cpp/codecs/tls_cache.cpp +++ b/library/cpp/codecs/tls_cache.cpp @@ -1,4 +1,4 @@ -#include "tls_cache.h" - -namespace NCodecs { -} +#include "tls_cache.h"  +  +namespace NCodecs {  +}  diff --git a/library/cpp/codecs/tls_cache.h b/library/cpp/codecs/tls_cache.h index 0184e4bb6c2..fa166729c52 100644 --- a/library/cpp/codecs/tls_cache.h +++ b/library/cpp/codecs/tls_cache.h @@ -1,100 +1,100 @@ -#pragma once - -#include <util/generic/buffer.h> -#include <util/generic/deque.h> -#include <util/generic/noncopyable.h> -#include <util/generic/strbuf.h> -#include <util/system/tls.h> -#include <util/thread/singleton.h> - -namespace NCodecs { -    template <class TItem> -    struct TClear { -        void operator()(TItem& item) const { -            item.Clear(); -        } -    }; - +#pragma once  +  +#include <util/generic/buffer.h>  +#include <util/generic/deque.h>  +#include <util/generic/noncopyable.h>  +#include <util/generic/strbuf.h>  +#include <util/system/tls.h>  +#include <util/thread/singleton.h>  +  +namespace NCodecs {  +    template <class TItem>  +    struct TClear {  +        void operator()(TItem& item) const {  +            item.Clear();  +        }  +    };  +       template <class TItem, class TCleaner = TClear<TItem>> -    class TTlsCache { -        using TSelf = TTlsCache<TItem, TCleaner>; - +    class TTlsCache {  +        using TSelf = TTlsCache<TItem, TCleaner>;  +           struct TItemHolder: public TIntrusiveListItem<TItemHolder> { -            TItemHolder(TSelf& factory) -                : Factory(factory) +            TItemHolder(TSelf& factory)  +                : Factory(factory)               {              } - -            void Release() { -                Factory.Release(*this); -            } - -            TSelf& Factory; -            TItem Item; -        }; - -        class TItemGuard { -        public: -            explicit TItemGuard(TSelf& fact) -                : Holder(fact.Acquire()) +  +            void Release() {  +                Factory.Release(*this);  +            }  +  +            TSelf& Factory;  +            TItem Item;  +        };  +  +        class TItemGuard {  +        public:  +            explicit TItemGuard(TSelf& fact)  +                : Holder(fact.Acquire())               {              } - +               TItemGuard(TItemGuard&& other) noexcept { -                *this = std::move(other); -            } - +                *this = std::move(other);  +            }  +               TItemGuard& operator=(TItemGuard&& other) noexcept { -                if (&other != this) { -                    std::swap(Holder, other.Holder); -                } -                return *this; -            } - -            ~TItemGuard() { -                if (Holder) { -                    Holder->Release(); -                } -            } - -            TItem& Get() & { -                Y_ASSERT(Holder); -                return Holder->Item; -            } - -            TItem& Get() && = delete; - -        private: -            TItemHolder* Holder = nullptr; -        }; - -    public: -        TItemGuard Item() { -            return TItemGuard(*this); -        } - -        static TSelf& TlsInstance() { -            return *FastTlsSingleton<TSelf>(); -        } - -    private: -        TItemHolder* Acquire() { -            if (Free.Empty()) { -                return new TItemHolder(*this); -            } else { -                return Free.PopBack(); -            } -        } - -        void Release(TItemHolder& item) { -            Cleaner(item.Item); -            Free.PushBack(&item); -        } - -    private: -        TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free; -        TCleaner Cleaner; -    }; - -    using TBufferTlsCache = TTlsCache<TBuffer>; -} +                if (&other != this) {  +                    std::swap(Holder, other.Holder);  +                }  +                return *this;  +            }  +  +            ~TItemGuard() {  +                if (Holder) {  +                    Holder->Release();  +                }  +            }  +  +            TItem& Get() & {  +                Y_ASSERT(Holder);  +                return Holder->Item;  +            }  +  +            TItem& Get() && = delete;  +  +        private:  +            TItemHolder* Holder = nullptr;  +        };  +  +    public:  +        TItemGuard Item() {  +            return TItemGuard(*this);  +        }  +  +        static TSelf& TlsInstance() {  +            return *FastTlsSingleton<TSelf>();  +        }  +  +    private:  +        TItemHolder* Acquire() {  +            if (Free.Empty()) {  +                return new TItemHolder(*this);  +            } else {  +                return Free.PopBack();  +            }  +        }  +  +        void Release(TItemHolder& item) {  +            Cleaner(item.Item);  +            Free.PushBack(&item);  +        }  +  +    private:  +        TIntrusiveListWithAutoDelete<TItemHolder, TDelete> Free;  +        TCleaner Cleaner;  +    };  +  +    using TBufferTlsCache = TTlsCache<TBuffer>;  +}  diff --git a/library/cpp/codecs/ut/codecs_ut.cpp b/library/cpp/codecs/ut/codecs_ut.cpp index caf6089aef7..19382024009 100644 --- a/library/cpp/codecs/ut/codecs_ut.cpp +++ b/library/cpp/codecs/ut/codecs_ut.cpp @@ -4,15 +4,15 @@  #include <library/cpp/codecs/solar_codec.h>  #include <library/cpp/codecs/zstd_dict_codec.h>  #include <library/cpp/codecs/comptable_codec.h> - +   #include <library/cpp/testing/unittest/registar.h> - -#include <util/generic/buffer.h> -#include <util/string/util.h> -#include <util/string/hex.h> +  +#include <util/generic/buffer.h>  +#include <util/string/util.h>  +#include <util/string/hex.h>   #include <library/cpp/string_utils/relaxed_escaper/relaxed_escaper.h> - -namespace { +  +namespace {       const char* TextValues[] = {          "! сентября газета",          "!(возмездие это)!", @@ -855,328 +855,328 @@ namespace {          "lymphomatoid papulosis",          "sez.com",      }; -} - -class TCodecsTest: public TTestBase { +}  +  +class TCodecsTest: public TTestBase {       UNIT_TEST_SUITE(TCodecsTest); -    UNIT_TEST(TestPipeline) -    UNIT_TEST(TestDelta) -    UNIT_TEST(TestHuffman) -    UNIT_TEST(TestZStdDict) -    UNIT_TEST(TestCompTable) +    UNIT_TEST(TestPipeline)  +    UNIT_TEST(TestDelta)  +    UNIT_TEST(TestHuffman)  +    UNIT_TEST(TestZStdDict)  +    UNIT_TEST(TestCompTable)       UNIT_TEST(TestHuffmanLearnByFreqs) -    UNIT_TEST(TestSolar) -    UNIT_TEST(TestPFor) -    UNIT_TEST(TestRegistry) - +    UNIT_TEST(TestSolar)  +    UNIT_TEST(TestPFor)  +    UNIT_TEST(TestRegistry)  +       UNIT_TEST_SUITE_END(); - -private: +  +private:       TString PrintError(TStringBuf learn, TStringBuf test, TStringBuf codec, ui32 i) {          TString s; -        TStringOutput sout(s); +        TStringOutput sout(s);           sout << codec << ": " << i << ", "               << "\n";          sout << HexEncode(learn.data(), learn.size()); //NEscJ::EscapeJ<true>(learn, sout); -        sout << " != \n"; +        sout << " != \n";           sout << HexEncode(test.data(), test.size()); //NEscJ::EscapeJ<true>(test, sout); - -        if (s.Size() > 1536) { +  +        if (s.Size() > 1536) {               TString res = s.substr(0, 512); -            res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">..."); -            res.append(s.substr(s.size() - 512)); -        } - -        return s; -    } - -    TStringBuf AsStrBuf(const TBuffer& b) { +            res.append("...<skipped ").append(ToString(s.size() - 1024)).append(">...");  +            res.append(s.substr(s.size() - 512));  +        }  +  +        return s;  +    }  +  +    TStringBuf AsStrBuf(const TBuffer& b) {           return TStringBuf(b.data(), b.size()); -    } - -    template <typename TCodec, bool testsaveload> +    }  +  +    template <typename TCodec, bool testsaveload>       void TestCodec(const TVector<TBuffer>& inlearn = TVector<TBuffer>(), const TVector<TBuffer>& in = TVector<TBuffer>(), NCodecs::TCodecPtr c = new TCodec) { -        using namespace NCodecs; - -        TBuffer buff; - -        { +        using namespace NCodecs;  +  +        TBuffer buff;  +  +        {               TVector<TBuffer> out; - -            c->Learn(inlearn.begin(), inlearn.end()); - -            if (testsaveload) { -                { -                    TBufferOutput bout(buff); -                    ICodec::Store(&bout, c); -                } - -                { -                    TBufferInput bin(buff); -                    c = ICodec::Restore(&bin); +  +            c->Learn(inlearn.begin(), inlearn.end());  +  +            if (testsaveload) {  +                {  +                    TBufferOutput bout(buff);  +                    ICodec::Store(&bout, c);  +                }  +  +                {  +                    TBufferInput bin(buff);  +                    c = ICodec::Restore(&bin);                       UNIT_ASSERT(c->AlreadyTrained()); -                } -            } - -            { -                size_t insz = 0; -                size_t outsz = buff.Size(); - -                for (ui32 i = 0; i < inlearn.size(); ++i) { +                }  +            }  +  +            {  +                size_t insz = 0;  +                size_t outsz = buff.Size();  +  +                for (ui32 i = 0; i < inlearn.size(); ++i) {                       out.emplace_back(); -                    c->Encode(AsStrBuf(inlearn[i]), out[i]); - -                    insz += inlearn[i].Size(); -                    outsz += out[i].Size(); -                } - -                TBuffer vecl; -                for (ui32 i = 0; i < out.size(); ++i) { -                    vecl.Clear(); -                    c->Decode(AsStrBuf(out[i]), vecl); - -                    UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl), +                    c->Encode(AsStrBuf(inlearn[i]), out[i]);  +  +                    insz += inlearn[i].Size();  +                    outsz += out[i].Size();  +                }  +  +                TBuffer vecl;  +                for (ui32 i = 0; i < out.size(); ++i) {  +                    vecl.Clear();  +                    c->Decode(AsStrBuf(out[i]), vecl);  +  +                    UNIT_ASSERT_EQUAL_C(AsStrBuf(inlearn[i]), AsStrBuf(vecl),                                           PrintError(TStringBuf(inlearn[i].data(), inlearn[i].size()),                                                     TStringBuf(vecl.data(), vecl.size()), c->GetName(), i)); -                } -            } -        } - -        { -            if (testsaveload) { -                TBufferInput bin(buff); -                c = ICodec::Restore(&bin); -            } - -            size_t insz = 0; -            size_t outsz = buff.Size(); - -            TBuffer out, in1; -            for (ui32 i = 0; i < in.size(); ++i) { -                out.Clear(); -                in1.Clear(); -                c->Encode(AsStrBuf(in[i]), out); -                insz += in[i].Size(); -                outsz += out.Size(); -                c->Decode(AsStrBuf(out), in1); -                UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1), +                }  +            }  +        }  +  +        {  +            if (testsaveload) {  +                TBufferInput bin(buff);  +                c = ICodec::Restore(&bin);  +            }  +  +            size_t insz = 0;  +            size_t outsz = buff.Size();  +  +            TBuffer out, in1;  +            for (ui32 i = 0; i < in.size(); ++i) {  +                out.Clear();  +                in1.Clear();  +                c->Encode(AsStrBuf(in[i]), out);  +                insz += in[i].Size();  +                outsz += out.Size();  +                c->Decode(AsStrBuf(out), in1);  +                UNIT_ASSERT_EQUAL_C(AsStrBuf(in[i]), AsStrBuf(in1),                                       PrintError(TStringBuf(in[i].data(), in[i].size()),                                                 TStringBuf(in1.data(), in1.size()), c->GetName(), i)); -            } -        } -    } - -    template <class T> -    void AppendTo(TBuffer& b, T t) { -        b.Append((char*)&t, sizeof(t)); -    } - -    void TestDelta() { -        using namespace NCodecs; +            }  +        }  +    }  +  +    template <class T>  +    void AppendTo(TBuffer& b, T t) {  +        b.Append((char*)&t, sizeof(t));  +    }  +  +    void TestDelta() {  +        using namespace NCodecs;           TVector<TBuffer> d; - -        // 1. common case +  +        // 1. common case           d.emplace_back(); -        AppendTo(d.back(), 1ULL); -        AppendTo(d.back(), 10ULL); -        AppendTo(d.back(), 100ULL); -        AppendTo(d.back(), 1000ULL); -        AppendTo(d.back(), 10000ULL); -        AppendTo(d.back(), 100000ULL); - -        // 2. delta overflow +        AppendTo(d.back(), 1ULL);  +        AppendTo(d.back(), 10ULL);  +        AppendTo(d.back(), 100ULL);  +        AppendTo(d.back(), 1000ULL);  +        AppendTo(d.back(), 10000ULL);  +        AppendTo(d.back(), 100000ULL);  +  +        // 2. delta overflow           d.emplace_back(); -        AppendTo(d.back(), 1ULL); -        AppendTo(d.back(), 10ULL); -        AppendTo(d.back(), 100ULL); -        AppendTo(d.back(), 1000ULL); -        AppendTo(d.back(), (ui64)-100LL); -        AppendTo(d.back(), (ui64)-10ULL); - -        // 3. bad sorting +        AppendTo(d.back(), 1ULL);  +        AppendTo(d.back(), 10ULL);  +        AppendTo(d.back(), 100ULL);  +        AppendTo(d.back(), 1000ULL);  +        AppendTo(d.back(), (ui64)-100LL);  +        AppendTo(d.back(), (ui64)-10ULL);  +  +        // 3. bad sorting           d.emplace_back(); -        AppendTo(d.back(), 1ULL); -        AppendTo(d.back(), 10ULL); -        AppendTo(d.back(), 1000ULL); -        AppendTo(d.back(), 100ULL); -        AppendTo(d.back(), 10000ULL); -        AppendTo(d.back(), 100000ULL); - -        // all bad +        AppendTo(d.back(), 1ULL);  +        AppendTo(d.back(), 10ULL);  +        AppendTo(d.back(), 1000ULL);  +        AppendTo(d.back(), 100ULL);  +        AppendTo(d.back(), 10000ULL);  +        AppendTo(d.back(), 100000ULL);  +  +        // all bad           d.emplace_back(); -        AppendTo(d.back(), -1LL); -        AppendTo(d.back(), -1LL); -        AppendTo(d.back(), -1LL); -        AppendTo(d.back(), -1LL); - +        AppendTo(d.back(), -1LL);  +        AppendTo(d.back(), -1LL);  +        AppendTo(d.back(), -1LL);  +        AppendTo(d.back(), -1LL);  +           TestCodec<TDeltaCodec<ui64, true>, false>(d);          TestCodec<TDeltaCodec<ui64, false>, false>(d); -    } - -    void TestPFor() { -        using namespace NCodecs; -        { +    }  +  +    void TestPFor() {  +        using namespace NCodecs;  +        {               TVector<TBuffer> d;              d.emplace_back(); -            AppendTo(d.back(), -1LL); -            AppendTo(d.back(), -1LL); -            AppendTo(d.back(), -1LL); -            AppendTo(d.back(), -1LL); +            AppendTo(d.back(), -1LL);  +            AppendTo(d.back(), -1LL);  +            AppendTo(d.back(), -1LL);  +            AppendTo(d.back(), -1LL);               d.emplace_back(); -            AppendTo(d.back(), 0LL); -            AppendTo(d.back(), 1LL); -            AppendTo(d.back(), 2LL); -            AppendTo(d.back(), 1LL); -            AppendTo(d.back(), 0LL); -            AppendTo(d.back(), 1LL); -            AppendTo(d.back(), 2LL); +            AppendTo(d.back(), 0LL);  +            AppendTo(d.back(), 1LL);  +            AppendTo(d.back(), 2LL);  +            AppendTo(d.back(), 1LL);  +            AppendTo(d.back(), 0LL);  +            AppendTo(d.back(), 1LL);  +            AppendTo(d.back(), 2LL);               d.emplace_back(); -            AppendTo(d.back(), 0LL); -            AppendTo(d.back(), 1LL); -            AppendTo(d.back(), 2LL); -            AppendTo(d.back(), 1LL); -            AppendTo(d.back(), -1LL); -            AppendTo(d.back(), 0LL); -            AppendTo(d.back(), 1LL); -            AppendTo(d.back(), 2LL); +            AppendTo(d.back(), 0LL);  +            AppendTo(d.back(), 1LL);  +            AppendTo(d.back(), 2LL);  +            AppendTo(d.back(), 1LL);  +            AppendTo(d.back(), -1LL);  +            AppendTo(d.back(), 0LL);  +            AppendTo(d.back(), 1LL);  +            AppendTo(d.back(), 2LL);               d.emplace_back(); -            AppendTo(d.back(), 0LL); -            AppendTo(d.back(), -1LL); -            AppendTo(d.back(), -2LL); -            AppendTo(d.back(), -1LL); -            AppendTo(d.back(), -2LL); -            AppendTo(d.back(), -1LL); -            AppendTo(d.back(), 0LL); -            AppendTo(d.back(), -1LL); -            AppendTo(d.back(), -2LL); - +            AppendTo(d.back(), 0LL);  +            AppendTo(d.back(), -1LL);  +            AppendTo(d.back(), -2LL);  +            AppendTo(d.back(), -1LL);  +            AppendTo(d.back(), -2LL);  +            AppendTo(d.back(), -1LL);  +            AppendTo(d.back(), 0LL);  +            AppendTo(d.back(), -1LL);  +            AppendTo(d.back(), -2LL);  +               TestCodec<TPForCodec<ui64>, false>(d); -            TestCodec<TPForCodec<ui64, true>, true>(d); -        } -        { +            TestCodec<TPForCodec<ui64, true>, true>(d);  +        }  +        {               TVector<TBuffer> d;              d.emplace_back(); -            AppendTo(d.back(), -1); -            AppendTo(d.back(), -1); -            AppendTo(d.back(), -1); -            AppendTo(d.back(), -1); +            AppendTo(d.back(), -1);  +            AppendTo(d.back(), -1);  +            AppendTo(d.back(), -1);  +            AppendTo(d.back(), -1);               d.emplace_back(); -            AppendTo(d.back(), 0); -            AppendTo(d.back(), 1); -            AppendTo(d.back(), 2); -            AppendTo(d.back(), 1); -            AppendTo(d.back(), -1); -            AppendTo(d.back(), 0); -            AppendTo(d.back(), 1); -            AppendTo(d.back(), 2); +            AppendTo(d.back(), 0);  +            AppendTo(d.back(), 1);  +            AppendTo(d.back(), 2);  +            AppendTo(d.back(), 1);  +            AppendTo(d.back(), -1);  +            AppendTo(d.back(), 0);  +            AppendTo(d.back(), 1);  +            AppendTo(d.back(), 2);               d.emplace_back(); -            AppendTo(d.back(), 0); -            AppendTo(d.back(), -1); -            AppendTo(d.back(), -2); -            AppendTo(d.back(), -1); -            AppendTo(d.back(), -2); -            AppendTo(d.back(), -1); -            AppendTo(d.back(), 0); -            AppendTo(d.back(), -1); -            AppendTo(d.back(), -2); - +            AppendTo(d.back(), 0);  +            AppendTo(d.back(), -1);  +            AppendTo(d.back(), -2);  +            AppendTo(d.back(), -1);  +            AppendTo(d.back(), -2);  +            AppendTo(d.back(), -1);  +            AppendTo(d.back(), 0);  +            AppendTo(d.back(), -1);  +            AppendTo(d.back(), -2);  +               TestCodec<TPForCodec<ui32>, false>(d); -            TestCodec<TPForCodec<ui32, true>, false>(d); -        } -        { +            TestCodec<TPForCodec<ui32, true>, false>(d);  +        }  +        {               TVector<TBuffer> d;              d.emplace_back(); -            for (auto& textValue : TextValues) { -                AppendTo(d.back(), (ui32)strlen(textValue)); -            } - -            TestCodec<TPForCodec<ui32>, false>(d); -            TestCodec<TPForCodec<ui32, true>, false>(d); -        } -        { +            for (auto& textValue : TextValues) {  +                AppendTo(d.back(), (ui32)strlen(textValue));  +            }  +  +            TestCodec<TPForCodec<ui32>, false>(d);  +            TestCodec<TPForCodec<ui32, true>, false>(d);  +        }  +        {               TVector<TBuffer> d;              d.emplace_back(); -            for (auto& textValue : TextValues) { -                AppendTo(d.back(), (ui64)strlen(textValue)); -            } - -            TestCodec<TPForCodec<ui64>, false>(d); -            TestCodec<TPForCodec<ui64, true>, false>(d); -        } -    } - -    template <class TCodec> -    void DoTestSimpleCodec() { -        using namespace NCodecs; -        { +            for (auto& textValue : TextValues) {  +                AppendTo(d.back(), (ui64)strlen(textValue));  +            }  +  +            TestCodec<TPForCodec<ui64>, false>(d);  +            TestCodec<TPForCodec<ui64, true>, false>(d);  +        }  +    }  +  +    template <class TCodec>  +    void DoTestSimpleCodec() {  +        using namespace NCodecs;  +        {               TVector<TBuffer> learn; - +               for (auto& textValue : TextValues) {                  learn.emplace_back(textValue, strlen(textValue)); -            } - -            TestCodec<TCodec, true>(learn); -        } -        { -            TestCodec<TCodec, true>(); -        } - -        { +            }  +  +            TestCodec<TCodec, true>(learn);  +        }  +        {  +            TestCodec<TCodec, true>();  +        }  +  +        {               TVector<TBuffer> learn;              learn.emplace_back(); -            learn.back().Append('a'); - +            learn.back().Append('a');  +               TVector<TBuffer> test;              test.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                test.back().Append((ui8)i); -            } - -            TestCodec<TCodec, true>(learn, test); -        } - -        { +            for (ui32 i = 0; i < 256; ++i) {  +                test.back().Append((ui8)i);  +            }  +  +            TestCodec<TCodec, true>(learn, test);  +        }  +  +        {               TVector<TBuffer> learn;              learn.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                for (ui32 j = 0; j < i; ++j) { +            for (ui32 i = 0; i < 256; ++i) {  +                for (ui32 j = 0; j < i; ++j) {                       learn.back().Append((ui8)i); -                } -            } - +                }  +            }  +               TVector<TBuffer> test;              test.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                test.back().Append((ui8)i); -            } - -            TestCodec<TCodec, true>(learn, test); -        } - -        { +            for (ui32 i = 0; i < 256; ++i) {  +                test.back().Append((ui8)i);  +            }  +  +            TestCodec<TCodec, true>(learn, test);  +        }  +  +        {               TVector<TBuffer> learn;              learn.emplace_back(); -            for (ui32 i = 0; i < 128; ++i) { -                for (ui32 j = 0; j < i; ++j) { -                    learn.back().Append((ui8)i); -                } -            } - +            for (ui32 i = 0; i < 128; ++i) {  +                for (ui32 j = 0; j < i; ++j) {  +                    learn.back().Append((ui8)i);  +                }  +            }  +               TVector<TBuffer> test;              test.emplace_back(); -            for (ui32 i = 128; i < 256; ++i) { -                test.back().Append((ui8)i); -            } - -            TestCodec<TCodec, true>(learn, test); -        } -    } - -    void TestHuffman() { -        DoTestSimpleCodec<NCodecs::THuffmanCodec>(); -    } - -    void TestZStdDict() { +            for (ui32 i = 128; i < 256; ++i) {  +                test.back().Append((ui8)i);  +            }  +  +            TestCodec<TCodec, true>(learn, test);  +        }  +    }  +  +    void TestHuffman() {  +        DoTestSimpleCodec<NCodecs::THuffmanCodec>();  +    }  +  +    void TestZStdDict() {           using namespace NCodecs;          {              TVector<TBuffer> learn; @@ -1188,12 +1188,12 @@ private:              TestCodec<TZStdDictCodec, true>(learn);          } -    } - -    void TestCompTable() { -        DoTestSimpleCodec<NCodecs::TCompTableCodec>(); -    } - +    }  +  +    void TestCompTable() {  +        DoTestSimpleCodec<NCodecs::TCompTableCodec>();  +    }  +       void TestHuffmanLearnByFreqs() {          using namespace NCodecs; @@ -1211,7 +1211,7 @@ private:              for (ui32 i = 0; i < data.size(); ++i) {                  outLearn.emplace_back(); -                codec.Encode(AsStrBuf(data[i]), outLearn[i]); +                codec.Encode(AsStrBuf(data[i]), outLearn[i]);               }          } @@ -1228,133 +1228,133 @@ private:              for (auto& textValue : TextValues) {                  size_t len = strlen(textValue); -                for (size_t j = 0; j < len; ++j) { +                for (size_t j = 0; j < len; ++j) {                       ++freqs[(ui32)(0xFF & textValue[j])].second; -                } +                }               }              codec.LearnByFreqs(TArrayRef<std::pair<char, ui64>>(freqs, Y_ARRAY_SIZE(freqs)));              for (ui32 i = 0; i < data.size(); ++i) {                  outLearnByFreqs.emplace_back(); -                codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]); +                codec.Encode(AsStrBuf(data[i]), outLearnByFreqs[i]);               }          } -        UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size()); -        const size_t sz = outLearn.size(); -        for (size_t n = 0; n < sz; ++n) { -            UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n])); -        } +        UNIT_ASSERT_EQUAL(outLearn.size(), outLearnByFreqs.size());  +        const size_t sz = outLearn.size();  +        for (size_t n = 0; n < sz; ++n) {  +            UNIT_ASSERT_EQUAL(AsStrBuf(outLearn[n]), AsStrBuf(outLearnByFreqs[n]));  +        }       } -    void TestSolar() { -        using namespace NCodecs; -        { +    void TestSolar() {  +        using namespace NCodecs;  +        {               TVector<TBuffer> learn; - +               for (auto& textValue : TextValues) {                  learn.emplace_back(textValue, strlen(textValue)); -            } - +            }  +               TestCodec<TSolarCodec, true>(learn, TVector<TBuffer>(), new TSolarCodec(512, 8));              TestCodec<TAdaptiveSolarCodec, false>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));              TestCodec<TAdaptiveSolarCodec, true>(learn, TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));              TestCodec<TSolarCodecShortInt, true>(learn, TVector<TBuffer>(), new TSolarCodecShortInt(512, 8)); -        } -        { +        }  +        {               TestCodec<TSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodec(512, 8));              TestCodec<TAdaptiveSolarCodec, false>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));              TestCodec<TAdaptiveSolarCodec, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TAdaptiveSolarCodec(512, 8));              TestCodec<TSolarCodecShortInt, true>(TVector<TBuffer>(), TVector<TBuffer>(), new TSolarCodecShortInt(512, 8)); -        } - -        { +        }  +  +        {               TVector<TBuffer> learn;              learn.emplace_back(); -            learn.back().Append('a'); - +            learn.back().Append('a');  +               TVector<TBuffer> test;              test.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                test.back().Append((ui8)i); -            } - -            TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); +            for (ui32 i = 0; i < 256; ++i) {  +                test.back().Append((ui8)i);  +            }  +  +            TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8));               TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8));              TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8));              TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8)); -        } - -        { +        }  +  +        {               TVector<TBuffer> learn;              learn.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                for (ui32 j = 0; j < i; ++j) { -                    learn.back().Append((ui8)i); -                } -            } - +            for (ui32 i = 0; i < 256; ++i) {  +                for (ui32 j = 0; j < i; ++j) {  +                    learn.back().Append((ui8)i);  +                }  +            }  +               TVector<TBuffer> test;              test.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                test.back().Append((ui8)i); -            } - -            TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8)); +            for (ui32 i = 0; i < 256; ++i) {  +                test.back().Append((ui8)i);  +            }  +  +            TestCodec<TSolarCodec, true>(learn, test, new TSolarCodec(512, 8));               TestCodec<TAdaptiveSolarCodec, false>(learn, test, new TAdaptiveSolarCodec(512, 8));              TestCodec<TAdaptiveSolarCodec, true>(learn, test, new TAdaptiveSolarCodec(512, 8));              TestCodec<TSolarCodecShortInt, true>(learn, test, new TSolarCodecShortInt(512, 8)); -        } -    } - -    void TestPipeline() { -        using namespace NCodecs; -        { +        }  +    }  +  +    void TestPipeline() {  +        using namespace NCodecs;  +        {               TVector<TBuffer> learn;              learn.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                for (i32 j = i; j >= 0; --j) { -                    learn.back().Append((ui8)j); -                } -            } - +            for (ui32 i = 0; i < 256; ++i) {  +                for (i32 j = i; j >= 0; --j) {  +                    learn.back().Append((ui8)j);  +                }  +            }  +               TVector<TBuffer> test;              test.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                test.back().Append((ui8)i); -            } - -            TestCodec<TPipelineCodec, true>(learn, test, +            for (ui32 i = 0; i < 256; ++i) {  +                test.back().Append((ui8)i);  +            }  +  +            TestCodec<TPipelineCodec, true>(learn, test,                                               new TPipelineCodec(new TSolarCodec(512, 8), new TSolarCodec(512, 8), new THuffmanCodec)); -        } -        { +        }  +        {               TVector<TBuffer> d;              d.emplace_back(); -            for (ui32 i = 0; i < 256; ++i) { -                for (i32 j = i; j >= 0; --j) { -                    d.back().Append(i * i); -                } -            } - +            for (ui32 i = 0; i < 256; ++i) {  +                for (i32 j = i; j >= 0; --j) {  +                    d.back().Append(i * i);  +                }  +            }  +               TestCodec<TPipelineCodec, false>(d, TVector<TBuffer>(),                                               new TPipelineCodec(new TDeltaCodec<ui32, false>, new TPForCodec<ui32>)); -        } -    } - -    void TestRegistry() { -        using namespace NCodecs; +        }  +    }  +  +    void TestRegistry() {  +        using namespace NCodecs;           TVector<TString> vs = ICodec::GetCodecsList();          for (const auto& v : vs) {              TCodecPtr p = ICodec::GetInstance(v);              if (v == "none") { -                UNIT_ASSERT(!p); -                continue; -            } +                UNIT_ASSERT(!p);  +                continue;  +            }               UNIT_ASSERT_C(!!p, v);              UNIT_ASSERT_C(TStringBuf(v).Head(3) == TStringBuf(p->GetName()).Head(3), v + " " + p->GetName()); -        } -    } -}; - -UNIT_TEST_SUITE_REGISTRATION(TCodecsTest) +        }  +    }  +};  +  +UNIT_TEST_SUITE_REGISTRATION(TCodecsTest)  diff --git a/library/cpp/codecs/ut/tls_cache_ut.cpp b/library/cpp/codecs/ut/tls_cache_ut.cpp index 8101af761fe..11dd5da53c2 100644 --- a/library/cpp/codecs/ut/tls_cache_ut.cpp +++ b/library/cpp/codecs/ut/tls_cache_ut.cpp @@ -1,15 +1,15 @@  #include <library/cpp/testing/unittest/registar.h>  #include <library/cpp/codecs/tls_cache.h> - +   Y_UNIT_TEST_SUITE(CodecsBufferFactoryTest){      void AssignToBuffer(TBuffer & buf, TStringBuf val){          buf.Assign(val.data(), val.size());  } - +   TStringBuf AsStringBuf(const TBuffer& b) {      return TStringBuf(b.Data(), b.Size());  } - +   Y_UNIT_TEST(TestAcquireReleaseReuse) {      NCodecs::TBufferTlsCache factory;      // acquiring the first buffer @@ -19,7 +19,7 @@ Y_UNIT_TEST(TestAcquireReleaseReuse) {          // acquiring the second buffer          auto buf2 = factory.Item();          AssignToBuffer(buf2.Get(), "Buffer_02"); -    } +    }       // the first buffer should stay intact      UNIT_ASSERT_EQUAL(AsStringBuf(buf1.Get()), "Buffer_01");      { diff --git a/library/cpp/codecs/ut/ya.make b/library/cpp/codecs/ut/ya.make index 90841b05ef6..0b53eba9e51 100644 --- a/library/cpp/codecs/ut/ya.make +++ b/library/cpp/codecs/ut/ya.make @@ -12,7 +12,7 @@ PEERDIR(  )  SRCS( -    tls_cache_ut.cpp +    tls_cache_ut.cpp       codecs_ut.cpp      float_huffman_ut.cpp  ) diff --git a/library/cpp/codecs/ya.make b/library/cpp/codecs/ya.make index 7e76fb0c9ad..d105d6925e7 100644 --- a/library/cpp/codecs/ya.make +++ b/library/cpp/codecs/ya.make @@ -1,24 +1,24 @@ -LIBRARY() - +LIBRARY()  +   OWNER(      g:base      velavokr  ) -SRCS( -    tls_cache.cpp -    codecs.cpp -    codecs_registry.cpp -    comptable_codec.cpp -    delta_codec.cpp +SRCS(  +    tls_cache.cpp  +    codecs.cpp  +    codecs_registry.cpp  +    comptable_codec.cpp  +    delta_codec.cpp       float_huffman.cpp -    huffman_codec.cpp -    pfor_codec.cpp -    solar_codec.cpp -    zstd_dict_codec.cpp -) - -PEERDIR( +    huffman_codec.cpp  +    pfor_codec.cpp  +    solar_codec.cpp  +    zstd_dict_codec.cpp  +)  +  +PEERDIR(       contrib/libs/zstd      library/cpp/bit_io      library/cpp/blockcodecs @@ -28,6 +28,6 @@ PEERDIR(      library/cpp/deprecated/accessors      library/cpp/packers      library/cpp/string_utils/relaxed_escaper -) - +)  +   END() diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp index c42a2879e6c..d543736b3dc 100644 --- a/library/cpp/codecs/zstd_dict_codec.cpp +++ b/library/cpp/codecs/zstd_dict_codec.cpp @@ -1,173 +1,173 @@ -#include "zstd_dict_codec.h" - +#include "zstd_dict_codec.h"  +   #include <library/cpp/packers/packers.h> - -#include <util/generic/ptr.h> -#include <util/generic/refcount.h> -#include <util/generic/noncopyable.h> -#include <util/string/builder.h> -#include <util/system/src_location.h> -#include <util/ysaveload.h> - -#define ZDICT_STATIC_LINKING_ONLY - +  +#include <util/generic/ptr.h>  +#include <util/generic/refcount.h>  +#include <util/generic/noncopyable.h>  +#include <util/string/builder.h>  +#include <util/system/src_location.h>  +#include <util/ysaveload.h>  +  +#define ZDICT_STATIC_LINKING_ONLY  +   #include <contrib/libs/zstd/include/zdict.h>  #include <contrib/libs/zstd/include/zstd.h>  #include <contrib/libs/zstd/include/zstd_errors.h> - -// See IGNIETFERRO-320 for possible bugs - -namespace NCodecs { -    class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> { -        template <class T, size_t Deleter(T*)> -        class TPtrHolder : TMoveOnly { -            T* Ptr = nullptr; - -        public: -            TPtrHolder() = default; - -            TPtrHolder(T* dict) -                : Ptr(dict) +  +// See IGNIETFERRO-320 for possible bugs  +  +namespace NCodecs {  +    class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> {  +        template <class T, size_t Deleter(T*)>  +        class TPtrHolder : TMoveOnly {  +            T* Ptr = nullptr;  +  +        public:  +            TPtrHolder() = default;  +  +            TPtrHolder(T* dict)  +                : Ptr(dict)               {              } - -            T* Get() { -                return Ptr; -            } - -            const T* Get() const { -                return Ptr; -            } - -            void Reset(T* dict) { -                Dispose(); -                Ptr = dict; -            } - -            void Dispose() { -                if (Ptr) { -                    Deleter(Ptr); -                    Ptr = nullptr; -                } -            } - -            ~TPtrHolder() { -                Dispose(); -            } -        }; - -        using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>; -        using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>; -        using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>; -        using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>; - -        using TSizePacker = NPackers::TPacker<ui64>; - -    public: -        static const ui32 SampleSize = (1 << 22) * 5; - -        explicit TImpl(ui32 comprLevel) -            : CompressionLevel(comprLevel) -        { -            const size_t zeroSz = TSizePacker().MeasureLeaf(0); -            Zero.Resize(zeroSz); +  +            T* Get() {  +                return Ptr;  +            }  +  +            const T* Get() const {  +                return Ptr;  +            }  +  +            void Reset(T* dict) {  +                Dispose();  +                Ptr = dict;  +            }  +  +            void Dispose() {  +                if (Ptr) {  +                    Deleter(Ptr);  +                    Ptr = nullptr;  +                }  +            }  +  +            ~TPtrHolder() {  +                Dispose();  +            }  +        };  +  +        using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>;  +        using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>;  +        using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>;  +        using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>;  +  +        using TSizePacker = NPackers::TPacker<ui64>;  +  +    public:  +        static const ui32 SampleSize = (1 << 22) * 5;  +  +        explicit TImpl(ui32 comprLevel)  +            : CompressionLevel(comprLevel)  +        {  +            const size_t zeroSz = TSizePacker().MeasureLeaf(0);  +            Zero.Resize(zeroSz);               TSizePacker().PackLeaf(Zero.data(), 0, zeroSz); -        } - -        ui32 GetCompressionLevel() const { -            return CompressionLevel; -        } - -        ui8 Encode(TStringBuf in, TBuffer& outbuf) const { -            outbuf.Clear(); - +        }  +  +        ui32 GetCompressionLevel() const {  +            return CompressionLevel;  +        }  +  +        ui8 Encode(TStringBuf in, TBuffer& outbuf) const {  +            outbuf.Clear();  +               if (in.empty()) { -                return 0; -            } - -            TSizePacker packer; - +                return 0;  +            }  +  +            TSizePacker packer;  +               const char* rawBeg = in.data();              const size_t rawSz = in.size(); - -            const size_t szSz = packer.MeasureLeaf(rawSz); -            const size_t maxDatSz = ZSTD_compressBound(rawSz); - -            outbuf.Resize(szSz + maxDatSz); +  +            const size_t szSz = packer.MeasureLeaf(rawSz);  +            const size_t maxDatSz = ZSTD_compressBound(rawSz);  +  +            outbuf.Resize(szSz + maxDatSz);               packer.PackLeaf(outbuf.data(), rawSz, szSz); - -            TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)}; -            const size_t resSz = CheckSize(ZSTD_compress_usingCDict( +  +            TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)};  +            const size_t resSz = CheckSize(ZSTD_compress_usingCDict(                                                  ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()),                                             __LOCATION__); - -            if (resSz < rawSz) { -                outbuf.Resize(resSz + szSz); -            } else { +  +            if (resSz < rawSz) {  +                outbuf.Resize(resSz + szSz);  +            } else {                   outbuf.Resize(Zero.size() + rawSz);                  memcpy(outbuf.data(), Zero.data(), Zero.size());                  memcpy(outbuf.data() + Zero.size(), rawBeg, rawSz); -            } -            return 0; -        } - -        void Decode(TStringBuf in, TBuffer& outbuf) const { -            outbuf.Clear(); - +            }  +            return 0;  +        }  +  +        void Decode(TStringBuf in, TBuffer& outbuf) const {  +            outbuf.Clear();  +               if (in.empty()) { -                return; -            } - -            TSizePacker packer; - +                return;  +            }  +  +            TSizePacker packer;  +               const char* rawBeg = in.data();              size_t rawSz = in.size(); - -            const size_t szSz = packer.SkipLeaf(rawBeg); -            ui64 datSz = 0; -            packer.UnpackLeaf(rawBeg, datSz); - -            rawBeg += szSz; -            rawSz -= szSz; - -            if (!datSz) { -                outbuf.Resize(rawSz); +  +            const size_t szSz = packer.SkipLeaf(rawBeg);  +            ui64 datSz = 0;  +            packer.UnpackLeaf(rawBeg, datSz);  +  +            rawBeg += szSz;  +            rawSz -= szSz;  +  +            if (!datSz) {  +                outbuf.Resize(rawSz);                   memcpy(outbuf.data(), rawBeg, rawSz); -            } else { +            } else {                   //                size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz);                  //                Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz); -                outbuf.Resize(datSz); -                TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)}; -                CheckSize(ZSTD_decompress_usingDDict( +                outbuf.Resize(datSz);  +                TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)};  +                CheckSize(ZSTD_decompress_usingDDict(                                 ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()),                            __LOCATION__); -                outbuf.Resize(datSz); -            } -        } - +                outbuf.Resize(datSz);  +            }  +        }  +           bool Learn(ISequenceReader& in, bool throwOnError) { -            TBuffer data; +            TBuffer data;               TVector<size_t> lens; - -            data.Reserve(2 * SampleSize); -            TStringBuf r; -            while (in.NextRegion(r)) { -                if (!r) { -                    continue; -                } +  +            data.Reserve(2 * SampleSize);  +            TStringBuf r;  +            while (in.NextRegion(r)) {  +                if (!r) {  +                    continue;  +                }                   data.Append(r.data(), r.size());                  lens.push_back(r.size()); -            } - +            }  +               ZDICT_legacy_params_t params; -            memset(¶ms, 0, sizeof(params)); +            memset(¶ms, 0, sizeof(params));               params.zParams.compressionLevel = 1;              params.zParams.notificationLevel = 1; -            Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size())); - -            if (!lens) { -                Dict.Reset(); -            } else { +            Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size()));  +  +            if (!lens) {  +                Dict.Reset();  +            } else {                   size_t trainResult = ZDICT_trainFromBuffer_legacy(                      Dict.data(), Dict.size(), data.Data(), const_cast<const size_t*>(&lens[0]), lens.size(), params);                  if (ZSTD_isError(trainResult)) { @@ -177,105 +177,105 @@ namespace NCodecs {                      CheckSize(trainResult, __LOCATION__);                  }                  Dict.Resize(trainResult); -                Dict.ShrinkToFit(); -            } -            InitContexts(); +                Dict.ShrinkToFit();  +            }  +            InitContexts();               return true; -        } - +        }  +           void Save(IOutputStream* out) const { -            ::Save(out, Dict); -        } - +            ::Save(out, Dict);  +        }  +           void Load(IInputStream* in) { -            ::Load(in, Dict); -            InitContexts(); -        } - -        void InitContexts() { +            ::Load(in, Dict);  +            InitContexts();  +        }  +  +        void InitContexts() {               CDict.Reset(CheckPtr(ZSTD_createCDict(Dict.data(), Dict.size(), CompressionLevel), __LOCATION__));              DDict.Reset(CheckPtr(ZSTD_createDDict(Dict.data(), Dict.size()), __LOCATION__)); -        } - -        static size_t CheckSize(size_t sz, TSourceLocation loc) { -            if (ZSTD_isError(sz)) { -                ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")"; -            } -            return sz; -        } - -        template <class T> -        static T* CheckPtr(T* t, TSourceLocation loc) { +        }  +  +        static size_t CheckSize(size_t sz, TSourceLocation loc) {  +            if (ZSTD_isError(sz)) {  +                ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")";  +            }  +            return sz;  +        }  +  +        template <class T>  +        static T* CheckPtr(T* t, TSourceLocation loc) {               Y_ENSURE_EX(t, TCodecException() << loc << " "                                               << "unexpected nullptr"); -            return t; -        } - -    private: -        ui32 CompressionLevel = 1; - -        TBuffer Zero; -        TBuffer Dict; - -        TCDict CDict; -        TDDict DDict; -    }; - -    TZStdDictCodec::TZStdDictCodec(ui32 comprLevel) -        : Impl(new TImpl(comprLevel)) -    { -        MyTraits.NeedsTraining = true; -        MyTraits.SizeOnEncodeMultiplier = 2; -        MyTraits.SizeOnDecodeMultiplier = 10; -        MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar -    } - +            return t;  +        }  +  +    private:  +        ui32 CompressionLevel = 1;  +  +        TBuffer Zero;  +        TBuffer Dict;  +  +        TCDict CDict;  +        TDDict DDict;  +    };  +  +    TZStdDictCodec::TZStdDictCodec(ui32 comprLevel)  +        : Impl(new TImpl(comprLevel))  +    {  +        MyTraits.NeedsTraining = true;  +        MyTraits.SizeOnEncodeMultiplier = 2;  +        MyTraits.SizeOnDecodeMultiplier = 10;  +        MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar  +    }  +       TZStdDictCodec::~TZStdDictCodec() {      } - +       TString TZStdDictCodec::GetName() const {          return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel(); -    } - -    ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const { -        return Impl->Encode(in, out); -    } - -    void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const { -        Impl->Decode(in, out); -    } - -    void TZStdDictCodec::DoLearn(ISequenceReader& in) { -        Impl = new TImpl(Impl->GetCompressionLevel()); +    }  +  +    ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const {  +        return Impl->Encode(in, out);  +    }  +  +    void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const {  +        Impl->Decode(in, out);  +    }  +  +    void TZStdDictCodec::DoLearn(ISequenceReader& in) {  +        Impl = new TImpl(Impl->GetCompressionLevel());           Impl->Learn(in, true/*throwOnError*/); -    } - +    }  +       bool TZStdDictCodec::DoTryToLearn(ISequenceReader& in) {          Impl = new TImpl(Impl->GetCompressionLevel());          return Impl->Learn(in, false/*throwOnError*/);      }      void TZStdDictCodec::Save(IOutputStream* out) const { -        Impl->Save(out); -    } - +        Impl->Save(out);  +    }  +       void TZStdDictCodec::Load(IInputStream* in) { -        Impl->Load(in); -    } - +        Impl->Load(in);  +    }  +       TVector<TString> TZStdDictCodec::ListCompressionNames() {          TVector<TString> res; -        for (int i = 1; i <= ZSTD_maxCLevel(); ++i) { -            res.emplace_back(TStringBuilder() << MyName() << "-" << i); -        } -        return res; -    } - -    int TZStdDictCodec::ParseCompressionName(TStringBuf name) { -        int c = 0; -        TryFromString(name.After('-'), c); -        Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name); -        return c; -    } - -} +        for (int i = 1; i <= ZSTD_maxCLevel(); ++i) {  +            res.emplace_back(TStringBuilder() << MyName() << "-" << i);  +        }  +        return res;  +    }  +  +    int TZStdDictCodec::ParseCompressionName(TStringBuf name) {  +        int c = 0;  +        TryFromString(name.After('-'), c);  +        Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name);  +        return c;  +    }  +  +}  diff --git a/library/cpp/codecs/zstd_dict_codec.h b/library/cpp/codecs/zstd_dict_codec.h index 59c1ad6c606..cdfc5c82859 100644 --- a/library/cpp/codecs/zstd_dict_codec.h +++ b/library/cpp/codecs/zstd_dict_codec.h @@ -1,38 +1,38 @@ -#pragma once - -#include "codecs.h" - -#include <util/generic/ptr.h> - -namespace NCodecs { +#pragma once  +  +#include "codecs.h"  +  +#include <util/generic/ptr.h>  +  +namespace NCodecs {       // benchmarks are here: https://st.yandex-team.ru/SEARCH-1655 - +       class TZStdDictCodec: public ICodec {          class TImpl;          TIntrusivePtr<TImpl> Impl; - +       public:          explicit TZStdDictCodec(ui32 comprLevel = 1);          ~TZStdDictCodec() override; - +           static TStringBuf MyName() {              return "zstd08d";          } - +           TString GetName() const override; - +           ui8 Encode(TStringBuf in, TBuffer& out) const override; - +           void Decode(TStringBuf in, TBuffer& out) const override; - +           static TVector<TString> ListCompressionNames();          static int ParseCompressionName(TStringBuf); - +       protected:          void DoLearn(ISequenceReader& in) override;          bool DoTryToLearn(ISequenceReader& in) final;          void Save(IOutputStream* out) const override;          void Load(IInputStream* in) override;      }; - -} +  +}   | 
