diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:17 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:17 +0300 |
commit | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (patch) | |
tree | dd4bd3ca0f36b817e96812825ffaf10d645803f2 /library/cpp/compproto | |
parent | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (diff) | |
download | ydb-d3a398281c6fd1d3672036cb2d63f842d2cb28c5.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/compproto')
-rw-r--r-- | library/cpp/compproto/bit.h | 654 | ||||
-rw-r--r-- | library/cpp/compproto/compproto_ut.cpp | 86 | ||||
-rw-r--r-- | library/cpp/compproto/compressor.h | 114 | ||||
-rw-r--r-- | library/cpp/compproto/huff.h | 702 | ||||
-rw-r--r-- | library/cpp/compproto/metainfo.h | 502 |
5 files changed, 1029 insertions, 1029 deletions
diff --git a/library/cpp/compproto/bit.h b/library/cpp/compproto/bit.h index 7d2800843f..6a421b65f7 100644 --- a/library/cpp/compproto/bit.h +++ b/library/cpp/compproto/bit.h @@ -3,346 +3,346 @@ #include <util/generic/array_ref.h> #include <util/generic/vector.h> #include <util/stream/output.h> -#include <util/stream/input.h> +#include <util/stream/input.h> #include "huff.h" #include "compressor.h" -#include "metainfo.h" +#include "metainfo.h" namespace NCompProto { - struct TBitBuffer { - TVector<ui8> Out; - ui64 Position; - ui64 Size; - ui8 Counter; - TBitBuffer() { - Clear(); - } - - static ui64 Read(const ui8* out, ui64 position, size_t size) { - const ui8* dst = out + (size_t(position >> 3)); - ui64 outCode = *reinterpret_cast<const ui64*>(dst); - ui8 shift = position & 7; - return ((1ULL << size) - 1) & (outCode >> shift); - } - - ui64 Read(ui64 position, size_t size) { - return Read(&Out[0], position, size); - } - - void Code(ui64 value, size_t size) { - if (++Counter == 0) { - Junk(257 * 64); - } - Position = Code(value, size, Position); - } - ui64 Code(ui64 value, size_t size, ui64 position) { - ui8* dst = &Out[size_t(position >> 3)]; - ui64& outCode = *(ui64*)dst; - ui8 shift = position & 7; - ui64 mask = ((1ULL << size) - 1) << shift; - outCode = ((value << shift) & mask) | (outCode & ~mask); - return position + size; - } - void Junk(size_t junk = 1024) { - size_t need = size_t(Position >> 3); - if (Out.size() * 8 < Position + junk) { - Out.resize(((need + junk) * 3) / 2); - Size = Out.size(); - } - } - void Insert(ui64 value, ui64 position, size_t size) { - ui64 newVal = Read(position, 56); - position = Code(value, size, position); - value = newVal; - - while (position < Position + 64) { - newVal = Read(position + 56 - size, 56); - position = Code(value, 56, position); - value = newVal; - } - - Position += size; - } - - size_t ByteLength() const { - return (Position + 7) / 8; - } - - void ResetPosition() { - Position = 0; - Size = 0; - } - - void Clear() { - Counter = 0; - Position = 0; - Size = 0; - Out.clear(); - Junk(); - } + struct TBitBuffer { + TVector<ui8> Out; + ui64 Position; + ui64 Size; + ui8 Counter; + TBitBuffer() { + Clear(); + } + + static ui64 Read(const ui8* out, ui64 position, size_t size) { + const ui8* dst = out + (size_t(position >> 3)); + ui64 outCode = *reinterpret_cast<const ui64*>(dst); + ui8 shift = position & 7; + return ((1ULL << size) - 1) & (outCode >> shift); + } + + ui64 Read(ui64 position, size_t size) { + return Read(&Out[0], position, size); + } + + void Code(ui64 value, size_t size) { + if (++Counter == 0) { + Junk(257 * 64); + } + Position = Code(value, size, Position); + } + ui64 Code(ui64 value, size_t size, ui64 position) { + ui8* dst = &Out[size_t(position >> 3)]; + ui64& outCode = *(ui64*)dst; + ui8 shift = position & 7; + ui64 mask = ((1ULL << size) - 1) << shift; + outCode = ((value << shift) & mask) | (outCode & ~mask); + return position + size; + } + void Junk(size_t junk = 1024) { + size_t need = size_t(Position >> 3); + if (Out.size() * 8 < Position + junk) { + Out.resize(((need + junk) * 3) / 2); + Size = Out.size(); + } + } + void Insert(ui64 value, ui64 position, size_t size) { + ui64 newVal = Read(position, 56); + position = Code(value, size, position); + value = newVal; + + while (position < Position + 64) { + newVal = Read(position + 56 - size, 56); + position = Code(value, 56, position); + value = newVal; + } + + Position += size; + } + + size_t ByteLength() const { + return (Position + 7) / 8; + } + + void ResetPosition() { + Position = 0; + Size = 0; + } + + void Clear() { + Counter = 0; + Position = 0; + Size = 0; + Out.clear(); + Junk(); + } TArrayRef<const char> AsDataRegion() const { return TArrayRef<const char>(reinterpret_cast<const char*>(Out.data()), ByteLength()); - } - }; - - struct THuff { - TCoder Coder; - ui64 Position; - THuff() { - Position = (ui64)(-1); - } - void Load(IInputStream& stream) { - TString name; - Coder.InitDefault(); - Coder.Entries.clear(); - while (1) { - stream >> name; - if (name == "end") - return; - else if (name == "entry") { - TCoderEntry entry; - ui32 value; - stream >> value; - entry.MinValue = value; - stream >> value; - entry.Prefix = value; - stream >> value; - entry.PrefixBits = value; - stream >> value; - entry.AllBits = value; - Coder.Entries.push_back(entry); - } - } - } - - void Save(IOutputStream& stream, TString offset) { - TString step = " "; - for (size_t i = 0; i < Coder.Entries.size(); ++i) { - stream << offset << step << "entry "; - stream << (ui32)Coder.Entries[i].MinValue << " "; - stream << (ui32)Coder.Entries[i].Prefix << " "; - stream << (ui32)Coder.Entries[i].PrefixBits << " "; - stream << (ui32)Coder.Entries[i].AllBits << " "; - stream << Endl; + } + }; + + struct THuff { + TCoder Coder; + ui64 Position; + THuff() { + Position = (ui64)(-1); + } + void Load(IInputStream& stream) { + TString name; + Coder.InitDefault(); + Coder.Entries.clear(); + while (1) { + stream >> name; + if (name == "end") + return; + else if (name == "entry") { + TCoderEntry entry; + ui32 value; + stream >> value; + entry.MinValue = value; + stream >> value; + entry.Prefix = value; + stream >> value; + entry.PrefixBits = value; + stream >> value; + entry.AllBits = value; + Coder.Entries.push_back(entry); + } + } + } + + void Save(IOutputStream& stream, TString offset) { + TString step = " "; + for (size_t i = 0; i < Coder.Entries.size(); ++i) { + stream << offset << step << "entry "; + stream << (ui32)Coder.Entries[i].MinValue << " "; + stream << (ui32)Coder.Entries[i].Prefix << " "; + stream << (ui32)Coder.Entries[i].PrefixBits << " "; + stream << (ui32)Coder.Entries[i].AllBits << " "; + stream << Endl; + } + stream << offset << "end" << Endl; + } + + void BeginElement(TBitBuffer& out) { + Position = out.Position; + } + void Add(ui32 value, TBitBuffer& out) { + size_t val = 0; + ui64 code = Coder.Code(value, val); + out.Code(code, val); + } + void AddDelayed(ui32 value, TBitBuffer& out) { + size_t val = 0; + ui64 code = Coder.Code(value, val); + if (Position == (ui64)(-1)) { + ythrow yexception() << "Position == (ui64)(-1)"; + } + out.Insert(code, Position, val); + out.Junk(); + } + }; + + struct THist { + TAccum Accum; + THist() { + } + + void Load(IInputStream& stream) { + TString name; + while (1) { + stream >> name; + if (name == "end") + return; } - stream << offset << "end" << Endl; - } - - void BeginElement(TBitBuffer& out) { - Position = out.Position; - } - void Add(ui32 value, TBitBuffer& out) { - size_t val = 0; - ui64 code = Coder.Code(value, val); - out.Code(code, val); - } - void AddDelayed(ui32 value, TBitBuffer& out) { - size_t val = 0; - ui64 code = Coder.Code(value, val); - if (Position == (ui64)(-1)) { - ythrow yexception() << "Position == (ui64)(-1)"; - } - out.Insert(code, Position, val); - out.Junk(); - } - }; - - struct THist { - TAccum Accum; - THist() { - } - - void Load(IInputStream& stream) { - TString name; - while (1) { - stream >> name; - if (name == "end") - return; - } - } - // TODO: why not const TString& ??? - void Save(IOutputStream& /*stream*/, TString /*offset*/) { - } - - void Add(ui32 value, TEmpty& /*empty*/) { - Accum.Add(value); - } - void AddDelayed(ui32 value, TEmpty& /*empty*/) { - Accum.Add(value); - } - void BeginElement(TEmpty& /*empty*/) { - } - }; - - struct THistToHuff { - static THistToHuff Instance() { - return THistToHuff(); - } - TEmpty Build() const { - return TEmpty(); - } - void Build(THuff& info, const THist& hist) const { - Analyze(hist.Accum, info.Coder.Entries); - info.Coder.Normalize(); - } - }; - - struct IDecompressor { - // sequentially decompresses whole structure according to metainfo, starts at position offset - virtual void Decompress(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) = 0; - // decompresses one record of outer repeated structure starting at position offset - virtual void DecompressOne(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) = 0; - virtual ~IDecompressor() = default; - }; - - template <class X> - struct TMetaIterator: public IDecompressor { - X Self; - TMetaIterator() { - Self.Parent = this; - } - - private: - inline void DecompressSingle(ui32 repeatedIndex, const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) { - Self.BeginElement(repeatedIndex); - ui32 mask = table->Mask.Decompress(codes, offset); - size_t index = 0; - ui32 scalarMask = table->ScalarMask; - while (mask || scalarMask) { - if (mask & 1) { - if (scalarMask & 1) { - ui32 val = table->Scalar[index].Decompress(codes, offset); - Self.SetScalar(index, val); - } else { - Self.GetDecompressor(index).Decompress(table->Repeated[index].Get(), codes, offset); - } - } else if ((scalarMask & 1) && table->Default[index].Type == TScalarDefaultValue::Fixed) { - Self.SetScalar(index, table->Default[index].Value); - } - scalarMask = scalarMask >> 1; - mask = mask >> 1; - ++index; - } - Self.EndElement(); - } - - inline void DecompressSingleScalarsOnly(ui32 repeatedIndex, const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) { - Self.BeginElement(repeatedIndex); - ui32 mask = table->Mask.Decompress(codes, offset); - ui32 scalarMask = table->ScalarMask; - size_t index = 0; - while (scalarMask) { - if (mask & 1) { + } + // TODO: why not const TString& ??? + void Save(IOutputStream& /*stream*/, TString /*offset*/) { + } + + void Add(ui32 value, TEmpty& /*empty*/) { + Accum.Add(value); + } + void AddDelayed(ui32 value, TEmpty& /*empty*/) { + Accum.Add(value); + } + void BeginElement(TEmpty& /*empty*/) { + } + }; + + struct THistToHuff { + static THistToHuff Instance() { + return THistToHuff(); + } + TEmpty Build() const { + return TEmpty(); + } + void Build(THuff& info, const THist& hist) const { + Analyze(hist.Accum, info.Coder.Entries); + info.Coder.Normalize(); + } + }; + + struct IDecompressor { + // sequentially decompresses whole structure according to metainfo, starts at position offset + virtual void Decompress(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) = 0; + // decompresses one record of outer repeated structure starting at position offset + virtual void DecompressOne(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) = 0; + virtual ~IDecompressor() = default; + }; + + template <class X> + struct TMetaIterator: public IDecompressor { + X Self; + TMetaIterator() { + Self.Parent = this; + } + + private: + inline void DecompressSingle(ui32 repeatedIndex, const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) { + Self.BeginElement(repeatedIndex); + ui32 mask = table->Mask.Decompress(codes, offset); + size_t index = 0; + ui32 scalarMask = table->ScalarMask; + while (mask || scalarMask) { + if (mask & 1) { + if (scalarMask & 1) { + ui32 val = table->Scalar[index].Decompress(codes, offset); + Self.SetScalar(index, val); + } else { + Self.GetDecompressor(index).Decompress(table->Repeated[index].Get(), codes, offset); + } + } else if ((scalarMask & 1) && table->Default[index].Type == TScalarDefaultValue::Fixed) { + Self.SetScalar(index, table->Default[index].Value); + } + scalarMask = scalarMask >> 1; + mask = mask >> 1; + ++index; + } + Self.EndElement(); + } + + inline void DecompressSingleScalarsOnly(ui32 repeatedIndex, const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) { + Self.BeginElement(repeatedIndex); + ui32 mask = table->Mask.Decompress(codes, offset); + ui32 scalarMask = table->ScalarMask; + size_t index = 0; + while (scalarMask) { + if (mask & 1) { ui32 val = table->Scalar[index].Decompress(codes, offset); Self.SetScalar(index, val); - } else if (table->Default[index].Type == TScalarDefaultValue::Fixed) { - Self.SetScalar(index, table->Default[index].Value); + } else if (table->Default[index].Type == TScalarDefaultValue::Fixed) { + Self.SetScalar(index, table->Default[index].Value); } - mask = mask >> 1; - scalarMask = scalarMask >> 1; - ++index; + mask = mask >> 1; + scalarMask = scalarMask >> 1; + ++index; } - Self.EndElement(); - } - - public: - void Decompress(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) override { - ui64 locOffset = offset; - ui32 count = table->Count.Decompress(codes, locOffset); - ui32 repeatedIndex = (ui32)(-1); - Self.BeginSelf(count, table->Id); - if (table->RepeatedMask) { - for (ui32 i = 0; i < count; ++i) { - repeatedIndex += table->Index.Decompress(codes, locOffset); - DecompressSingle(repeatedIndex, table, codes, locOffset); - } - } else { - for (ui32 i = 0; i < count; ++i) { - repeatedIndex += table->Index.Decompress(codes, locOffset); - DecompressSingleScalarsOnly(repeatedIndex, table, codes, locOffset); - } + Self.EndElement(); + } + + public: + void Decompress(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) override { + ui64 locOffset = offset; + ui32 count = table->Count.Decompress(codes, locOffset); + ui32 repeatedIndex = (ui32)(-1); + Self.BeginSelf(count, table->Id); + if (table->RepeatedMask) { + for (ui32 i = 0; i < count; ++i) { + repeatedIndex += table->Index.Decompress(codes, locOffset); + DecompressSingle(repeatedIndex, table, codes, locOffset); + } + } else { + for (ui32 i = 0; i < count; ++i) { + repeatedIndex += table->Index.Decompress(codes, locOffset); + DecompressSingleScalarsOnly(repeatedIndex, table, codes, locOffset); + } } - offset = locOffset; - Self.EndSelf(); - } - - // XXX: iterator needed? - // - // Following two functions serves the purpose of decompressing outer repeated-structure(such structure is mandatory now). - // They can work for inner repeated structures too, if you supply correct table, codes and offset parameters. - void DecompressOne(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) override { - table->Index.Decompress(codes, offset); - DecompressSingle(prevIndex, table, codes, offset); - } - - ui32 DecompressCount(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) { - return table->Count.Decompress(codes, offset); - } - }; - - template <class X> - struct TParentHold { - TMetaIterator<X>* Parent; - TParentHold() - : Parent(nullptr) - { - } - }; - - struct TEmptyDecompressor: public TParentHold<TEmptyDecompressor> { - void BeginSelf(ui32 /*count*/, ui32 /*id*/) { - } - void EndSelf() { - } - void BeginElement(ui32 /*element*/) { - } - void EndElement() { - } - void SetScalar(size_t /*index*/, ui32 /*val*/) { - } - TMetaIterator<TEmptyDecompressor>& GetDecompressor(size_t index) { - Y_UNUSED(index); - return *Parent; - } - }; - - inline TMetaIterator<TEmptyDecompressor>& GetEmptyDecompressor() { - static TMetaIterator<TEmptyDecompressor> empty; - return empty; + offset = locOffset; + Self.EndSelf(); + } + + // XXX: iterator needed? + // + // Following two functions serves the purpose of decompressing outer repeated-structure(such structure is mandatory now). + // They can work for inner repeated structures too, if you supply correct table, codes and offset parameters. + void DecompressOne(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset, ui32 prevIndex = -1) override { + table->Index.Decompress(codes, offset); + DecompressSingle(prevIndex, table, codes, offset); + } + + ui32 DecompressCount(const TMetaInfo<TTable>* table, const ui8* codes, ui64& offset) { + return table->Count.Decompress(codes, offset); + } + }; + + template <class X> + struct TParentHold { + TMetaIterator<X>* Parent; + TParentHold() + : Parent(nullptr) + { + } + }; + + struct TEmptyDecompressor: public TParentHold<TEmptyDecompressor> { + void BeginSelf(ui32 /*count*/, ui32 /*id*/) { + } + void EndSelf() { + } + void BeginElement(ui32 /*element*/) { + } + void EndElement() { + } + void SetScalar(size_t /*index*/, ui32 /*val*/) { + } + TMetaIterator<TEmptyDecompressor>& GetDecompressor(size_t index) { + Y_UNUSED(index); + return *Parent; + } + }; + + inline TMetaIterator<TEmptyDecompressor>& GetEmptyDecompressor() { + static TMetaIterator<TEmptyDecompressor> empty; + return empty; } - struct THuffToTable { - static THuffToTable Instance() { - return THuffToTable(); - } - void Build(TTable& table, const THuff& huff) const { - // can happen if a field was never serialized across whole structure - if (huff.Coder.Entries.empty()) - return; - for (ui32 i = 0; i < 64; ++i) { - const TCoderEntry& entry = huff.Coder.GetEntry(i, table.Id[i]); - table.CodeBase[i] = entry.MinValue; - table.PrefLength[i] = entry.PrefixBits; - table.CodeMask[i] = (1ULL << (entry.AllBits - entry.PrefixBits)) - 1ULL; - table.Length[i] = entry.AllBits; - } - } - }; - - struct THuffToTableWithDecompressor: private THuffToTable { - TSimpleSharedPtr<IDecompressor> Decompressor; - THuffToTableWithDecompressor(TSimpleSharedPtr<IDecompressor> decompressor) - : Decompressor(decompressor) - { - } - TSimpleSharedPtr<IDecompressor> Build() const { - return Decompressor; - } - void Build(TTable& table, const THuff& huff) const { - THuffToTable::Build(table, huff); - } - }; - -} + struct THuffToTable { + static THuffToTable Instance() { + return THuffToTable(); + } + void Build(TTable& table, const THuff& huff) const { + // can happen if a field was never serialized across whole structure + if (huff.Coder.Entries.empty()) + return; + for (ui32 i = 0; i < 64; ++i) { + const TCoderEntry& entry = huff.Coder.GetEntry(i, table.Id[i]); + table.CodeBase[i] = entry.MinValue; + table.PrefLength[i] = entry.PrefixBits; + table.CodeMask[i] = (1ULL << (entry.AllBits - entry.PrefixBits)) - 1ULL; + table.Length[i] = entry.AllBits; + } + } + }; + + struct THuffToTableWithDecompressor: private THuffToTable { + TSimpleSharedPtr<IDecompressor> Decompressor; + THuffToTableWithDecompressor(TSimpleSharedPtr<IDecompressor> decompressor) + : Decompressor(decompressor) + { + } + TSimpleSharedPtr<IDecompressor> Build() const { + return Decompressor; + } + void Build(TTable& table, const THuff& huff) const { + THuffToTable::Build(table, huff); + } + }; + +} diff --git a/library/cpp/compproto/compproto_ut.cpp b/library/cpp/compproto/compproto_ut.cpp index e0cfcfaf6b..9393be967a 100644 --- a/library/cpp/compproto/compproto_ut.cpp +++ b/library/cpp/compproto/compproto_ut.cpp @@ -32,8 +32,8 @@ struct TTestParams { ui32 ValueArraySize; }; -template <typename X> -void TestSaveLoadMeta(NCompProto::TMetaInfo<X>& src) { +template <typename X> +void TestSaveLoadMeta(NCompProto::TMetaInfo<X>& src) { TStringStream ss; src.Save(ss); TString data = ss.Str(); @@ -50,7 +50,7 @@ void TestWithParams(const TString& metainfo, const ECompMode mode, const TTestPa TStringInput stream(metainfo); - THolder<TMetaInfo<THuff>> meta; + THolder<TMetaInfo<THuff>> meta; if (mode == CM_TWOPASS) { TMetaInfo<THist> hist(stream); TEmpty empty; @@ -70,19 +70,19 @@ void TestWithParams(const TString& metainfo, const ECompMode mode, const TTestPa // verify that no memory read beyond buffer occurs const size_t byteSize = buffer.ByteLength(); - const size_t PAGESIZEX = 4096; - const size_t busyPages = (byteSize + (PAGESIZEX - 1)) / PAGESIZEX; + const size_t PAGESIZEX = 4096; + const size_t busyPages = (byteSize + (PAGESIZEX - 1)) / PAGESIZEX; const size_t allPages = busyPages + 1; - const size_t allocSize = (allPages + 1) * PAGESIZEX; + const size_t allocSize = (allPages + 1) * PAGESIZEX; TVector<ui8> readBuffer(allocSize); ui8* start = &readBuffer[0]; - ui8* pageStart = reinterpret_cast<ui8*>((size_t(start) + PAGESIZEX) & ~(PAGESIZEX - 1)); + ui8* pageStart = reinterpret_cast<ui8*>((size_t(start) + PAGESIZEX) & ~(PAGESIZEX - 1)); // XX DATA DATA DATA DATA PROT // | | | | | pages // calculate dataStart so that data ends exactly at the page end - ui8* dataStart = pageStart + busyPages * PAGESIZEX - byteSize; - ui8* dataEnd = pageStart + busyPages * PAGESIZEX; - ProtectMemory(dataEnd, PAGESIZEX, PM_NONE); + ui8* dataStart = pageStart + busyPages * PAGESIZEX - byteSize; + ui8* dataEnd = pageStart + busyPages * PAGESIZEX; + ProtectMemory(dataEnd, PAGESIZEX, PM_NONE); // memory copying should be performed without any problems memcpy(dataStart, buffer.Out.data(), byteSize); @@ -93,7 +93,7 @@ void TestWithParams(const TString& metainfo, const ECompMode mode, const TTestPa const ui64 decodedSize = position; UNIT_ASSERT_EQUAL(codedSize, decodedSize); // unprotect memory - ProtectMemory(dataEnd, PAGESIZEX, PM_READ | PM_WRITE | PM_EXEC); + ProtectMemory(dataEnd, PAGESIZEX, PM_READ | PM_WRITE | PM_EXEC); } template <typename TDecompressor, template <typename, typename> class TSerialize> @@ -112,7 +112,7 @@ Y_UNIT_TEST_SUITE(CompProtoTestBasic) { using namespace NCompProto; const TString metainfo = - "\n\ + "\n\ repeated data id 0\n\ scalar clicks id 0 default const 0\n\ scalar shows id 1 default const 0\n\ @@ -137,9 +137,9 @@ Y_UNIT_TEST_SUITE(CompProtoTestBasic) { TVector<TData> data; - template <class TMeta, class TFunctor> + template <class TMeta, class TFunctor> struct TSerialize { - static void Serialize(TMetaInfo<TMeta>& meta, TFunctor& functor, const TTestParams& params) { + static void Serialize(TMetaInfo<TMeta>& meta, TFunctor& functor, const TTestParams& params) { FlushPseudoRandom(); meta.BeginSelf(functor); data.clear(); @@ -153,7 +153,7 @@ Y_UNIT_TEST_SUITE(CompProtoTestBasic) { meta.SetScalar(0, data[i].Clicks, functor); meta.SetScalar(1, data[i].Shows, functor); - TMetaInfo<TMeta>& regClicks = meta.BeginRepeated(2, functor); + TMetaInfo<TMeta>& regClicks = meta.BeginRepeated(2, functor); for (ui32 j = 0; j < PseudoRandom(200); j += 1 + PseudoRandom(10)) { regClicks.BeginElement(j, functor); TRegInfo& r = data[i].RegClicks[j]; @@ -173,10 +173,10 @@ Y_UNIT_TEST_SUITE(CompProtoTestBasic) { } }; - struct TMultiDecompressor: public TParentHold<TMultiDecompressor> { - struct TRegClicks: public TParentHold<TRegClicks> { - const TData* Data; - const TRegInfo* Elem; + struct TMultiDecompressor: public TParentHold<TMultiDecompressor> { + struct TRegClicks: public TParentHold<TRegClicks> { + const TData* Data; + const TRegInfo* Elem; TRegClicks() : Data(nullptr) , Elem(nullptr) @@ -201,13 +201,13 @@ Y_UNIT_TEST_SUITE(CompProtoTestBasic) { if (index == 1) UNIT_ASSERT_EQUAL(val, Elem->Shows); } - IDecompressor& GetDecompressor(size_t) { + IDecompressor& GetDecompressor(size_t) { UNIT_ASSERT(0); return GetEmptyDecompressor(); } }; - const TData* Elem; + const TData* Elem; TMetaIterator<TRegClicks> RegClicks; void BeginSelf(ui32 /*count*/, ui32 /*id*/) { } @@ -227,7 +227,7 @@ Y_UNIT_TEST_SUITE(CompProtoTestBasic) { if (index == 31) UNIT_ASSERT_EQUAL(val, Elem->Extra); } - IDecompressor& GetDecompressor(size_t index) { + IDecompressor& GetDecompressor(size_t index) { if (index == 2) { RegClicks.Self.Data = Elem; return RegClicks; @@ -241,19 +241,19 @@ Y_UNIT_TEST_SUITE(CompProtoTestBasic) { } }; - struct TVerifyingDecompressor: public TParentHold<TVerifyingDecompressor> { + struct TVerifyingDecompressor: public TParentHold<TVerifyingDecompressor> { enum EState { - Startstop, - OutDataElem, - InDataElem, - InRegClicks, + Startstop, + OutDataElem, + InDataElem, + InRegClicks, }; EState State; ui32 DataInd; TMap<ui32, TRegInfo>::iterator RegIter; - TMetaIterator<TVerifyingDecompressor>& GetDecompressor(size_t index) { + TMetaIterator<TVerifyingDecompressor>& GetDecompressor(size_t index) { Y_UNUSED(index); return *Parent; } @@ -374,7 +374,7 @@ Y_UNIT_TEST_SUITE(CompProtoTestBasic) { Y_UNIT_TEST_SUITE(CompProtoTestExtended) { using namespace NCompProto; const TString metainfo = - "\n\ + "\n\ repeated data id 0\n\ repeated second id 3\n\ scalar inner2 id 0 default const 0\n\ @@ -385,21 +385,21 @@ Y_UNIT_TEST_SUITE(CompProtoTestExtended) { end\n"; TVector<std::pair<TVector<ui32>, TVector<ui32>>> data; - template <class TMeta, class TFunctor> + template <class TMeta, class TFunctor> struct TSerialize { - static void Serialize(TMetaInfo<TMeta>& meta, TFunctor& functor, const TTestParams& params) { + static void Serialize(TMetaInfo<TMeta>& meta, TFunctor& functor, const TTestParams& params) { FlushPseudoRandom(); meta.BeginSelf(functor); data.clear(); data.resize(params.DataSize); for (size_t i = 0; i < params.DataSize; ++i) { meta.BeginElement(i, functor); - TMetaInfo<TMeta>& first = meta.BeginRepeated(2, functor); + TMetaInfo<TMeta>& first = meta.BeginRepeated(2, functor); data[i].first.resize(params.ValueArraySize); for (ui32 j = 0; j < params.ValueArraySize; j++) { first.BeginElement(j, functor); - ui32 val = PseudoRandom(42 * 42 * 42); + ui32 val = PseudoRandom(42 * 42 * 42); first.SetScalar(0, val, functor); data[i].first[j] = val; @@ -407,12 +407,12 @@ Y_UNIT_TEST_SUITE(CompProtoTestExtended) { } first.EndRepeated(functor); - TMetaInfo<TMeta>& second = meta.BeginRepeated(3, functor); + TMetaInfo<TMeta>& second = meta.BeginRepeated(3, functor); data[i].second.resize(params.ValueArraySize); for (ui32 j = 0; j < params.ValueArraySize; j++) { second.BeginElement(j, functor); - ui32 val = PseudoRandom(42 * 42 * 42); + ui32 val = PseudoRandom(42 * 42 * 42); second.SetScalar(0, val, functor); data[i].second[j] = val; @@ -425,14 +425,14 @@ Y_UNIT_TEST_SUITE(CompProtoTestExtended) { } }; - struct TVerifyingDecompressor: public TParentHold<TVerifyingDecompressor> { + struct TVerifyingDecompressor: public TParentHold<TVerifyingDecompressor> { enum EState { - Startstop, - OutDataElem, - InDataElemBeforeSecond, - InDataElemSecond, - InFirst, - InSecond, + Startstop, + OutDataElem, + InDataElemBeforeSecond, + InDataElemSecond, + InFirst, + InSecond, }; EState State; @@ -446,7 +446,7 @@ Y_UNIT_TEST_SUITE(CompProtoTestExtended) { { } - TMetaIterator<TVerifyingDecompressor>& GetDecompressor(size_t index) { + TMetaIterator<TVerifyingDecompressor>& GetDecompressor(size_t index) { Y_UNUSED(index); return *Parent; } diff --git a/library/cpp/compproto/compressor.h b/library/cpp/compproto/compressor.h index 0e3a27afd0..14b335e13c 100644 --- a/library/cpp/compproto/compressor.h +++ b/library/cpp/compproto/compressor.h @@ -3,72 +3,72 @@ #include <util/system/defaults.h> namespace NCompProto { - struct TEmpty { - }; + struct TEmpty { + }; - struct TTable { - TTable() { - for (size_t i = 0; i < 64; ++i) { - CodeBase[i] = 0; - CodeMask[i] = 0; - Length[i] = 0; - PrefLength[i] = 0; - Id[i] = 0; - } + struct TTable { + TTable() { + for (size_t i = 0; i < 64; ++i) { + CodeBase[i] = 0; + CodeMask[i] = 0; + Length[i] = 0; + PrefLength[i] = 0; + Id[i] = 0; + } } - ui32 CodeBase[64]; - ui32 CodeMask[64]; - ui8 Length[64]; - ui8 PrefLength[64]; - ui8 Id[64]; - enum { - PAGE_BOUND = 4096, + ui32 CodeBase[64]; + ui32 CodeMask[64]; + ui8 Length[64]; + ui8 PrefLength[64]; + ui8 Id[64]; + enum { + PAGE_BOUND = 4096, #ifdef WITH_VALGRIND - SAFE_MODE = 1, + SAFE_MODE = 1, #else -#if defined(__has_feature) -#if __has_feature(address_sanitizer) +#if defined(__has_feature) +#if __has_feature(address_sanitizer) SAFE_MODE = 1, -#else +#else SAFE_MODE = 0, #endif -#else - SAFE_MODE = 0, -#endif -#endif - }; - ui32 inline Decompress(const ui8* codes, ui64& offset) const { - codes += (offset >> 3); - size_t pageOff = size_t(codes) % PAGE_BOUND; - size_t readOff = offset & 7; - if (pageOff > PAGE_BOUND - 8 || SAFE_MODE) { - size_t off = 8; - ui64 res = codes[0]; +#else + SAFE_MODE = 0, +#endif +#endif + }; + ui32 inline Decompress(const ui8* codes, ui64& offset) const { + codes += (offset >> 3); + size_t pageOff = size_t(codes) % PAGE_BOUND; + size_t readOff = offset & 7; + if (pageOff > PAGE_BOUND - 8 || SAFE_MODE) { + size_t off = 8; + ui64 res = codes[0]; ++codes; - ui64 indexCur = ((res + 0x0000) >> readOff) & 63; - ui64 indexAlt = ((res + 0xff00) >> readOff) & 63; - if (Id[indexCur] != Id[indexAlt]) { - res += (ui64(codes[0]) << off); - ++codes; - off += 8; - indexCur = (res >> readOff) & 63; - } - ui64 index = indexCur; - ui64 length = Length[index]; - while (off < readOff + length) { - res += (ui64(codes[0]) << off); - ++codes; - off += 8; - } - offset += length; - ui64 code = res >> readOff; - return (((ui32)(code >> PrefLength[index])) & CodeMask[index]) + CodeBase[index]; + ui64 indexCur = ((res + 0x0000) >> readOff) & 63; + ui64 indexAlt = ((res + 0xff00) >> readOff) & 63; + if (Id[indexCur] != Id[indexAlt]) { + res += (ui64(codes[0]) << off); + ++codes; + off += 8; + indexCur = (res >> readOff) & 63; + } + ui64 index = indexCur; + ui64 length = Length[index]; + while (off < readOff + length) { + res += (ui64(codes[0]) << off); + ++codes; + off += 8; + } + offset += length; + ui64 code = res >> readOff; + return (((ui32)(code >> PrefLength[index])) & CodeMask[index]) + CodeBase[index]; } - ui64 code = ((const ui64*)(codes))[0] >> readOff; - ui64 index = code & 63; - offset += Length[index]; + ui64 code = ((const ui64*)(codes))[0] >> readOff; + ui64 index = code & 63; + offset += Length[index]; return (((ui32)(code >> PrefLength[index])) & CodeMask[index]) + CodeBase[index]; } - }; + }; -} +} diff --git a/library/cpp/compproto/huff.h b/library/cpp/compproto/huff.h index a27c014bbb..fa5c139189 100644 --- a/library/cpp/compproto/huff.h +++ b/library/cpp/compproto/huff.h @@ -7,396 +7,396 @@ #include <util/generic/algorithm.h> #include <utility> -#include <queue> +#include <queue> #include "compressor.h" namespace NCompProto { - template <size_t CacheSize, typename TEntry> - struct TCache { - ui32 CacheKey[CacheSize]; - TEntry CacheVal[CacheSize]; - size_t Hits; - size_t Misses; - ui32 Hash(ui32 key) { - return key % CacheSize; - } - TCache() { - Hits = 0; - Misses = 0; - Clear(); - } - void Clear() { - for (size_t i = 0; i < CacheSize; ++i) { - ui32 j = 0; - for (; Hash(j) == i; ++j) - ; - CacheKey[i] = j; - } - } - }; - - struct TCode { - i64 Probability; - ui32 Start; - ui32 Bits; - ui32 Prefix; - ui32 PrefLength; - TCode(i64 probability = 0, ui32 start = 0, ui32 bits = 0) - : Probability(probability) - , Start(start) - , Bits(bits) - { + template <size_t CacheSize, typename TEntry> + struct TCache { + ui32 CacheKey[CacheSize]; + TEntry CacheVal[CacheSize]; + size_t Hits; + size_t Misses; + ui32 Hash(ui32 key) { + return key % CacheSize; + } + TCache() { + Hits = 0; + Misses = 0; + Clear(); + } + void Clear() { + for (size_t i = 0; i < CacheSize; ++i) { + ui32 j = 0; + for (; Hash(j) == i; ++j) + ; + CacheKey[i] = j; + } + } + }; + + struct TCode { + i64 Probability; + ui32 Start; + ui32 Bits; + ui32 Prefix; + ui32 PrefLength; + TCode(i64 probability = 0, ui32 start = 0, ui32 bits = 0) + : Probability(probability) + , Start(start) + , Bits(bits) + { + } + + bool operator<(const TCode& code) const { + return Probability < code.Probability; } - bool operator<(const TCode& code) const { - return Probability < code.Probability; - } - - bool operator>(const TCode& code) const { - return Probability > code.Probability; - } - }; - - struct TAccum { - struct TTable { - TAutoPtr<TTable> Tables[16]; - i64 Counts[16]; - TTable(const TTable& other) { - for (size_t i = 0; i < 16; ++i) { - Counts[i] = other.Counts[i]; - if (other.Tables[i].Get()) { - Tables[i].Reset(new TTable(*other.Tables[i].Get())); - } - } - } - TTable() { - for (auto& count : Counts) - count = 0; - } - - i64 GetCellCount(size_t i) { - i64 count = Counts[i]; - if (Tables[i].Get()) { - for (size_t j = 0; j < 16; ++j) { - count += Tables[i]->GetCellCount(j); - } + bool operator>(const TCode& code) const { + return Probability > code.Probability; + } + }; + + struct TAccum { + struct TTable { + TAutoPtr<TTable> Tables[16]; + i64 Counts[16]; + TTable(const TTable& other) { + for (size_t i = 0; i < 16; ++i) { + Counts[i] = other.Counts[i]; + if (other.Tables[i].Get()) { + Tables[i].Reset(new TTable(*other.Tables[i].Get())); + } } - return count; + } + TTable() { + for (auto& count : Counts) + count = 0; } - i64 GetCount() { - i64 count = 0; + i64 GetCellCount(size_t i) { + i64 count = Counts[i]; + if (Tables[i].Get()) { + for (size_t j = 0; j < 16; ++j) { + count += Tables[i]->GetCellCount(j); + } + } + return count; + } + + i64 GetCount() { + i64 count = 0; for (size_t j = 0; j < 16; ++j) { - count += GetCellCount(j); + count += GetCellCount(j); } - return count; + return count; } - void GenerateFreqs(TVector<std::pair<i64, TCode>>& codes, int depth, int termDepth, ui32 code, i64 cnt) { - if (depth == termDepth) { - for (size_t i = 0; i < 16; ++i) { - i64 iCount = GetCellCount(i); - if (Tables[i].Get()) { - Counts[i] = iCount; - Tables[i].Reset(nullptr); - } - - if (iCount > cnt || (termDepth == 0 && iCount > 0)) { - std::pair<i64, TCode> codep; - codep.first = iCount; - codep.second.Probability = iCount; - codep.second.Start = code + (i << (28 - depth)); - codep.second.Bits = 28 - depth; - codes.push_back(codep); - Counts[i] = 0; - } - } - } + void GenerateFreqs(TVector<std::pair<i64, TCode>>& codes, int depth, int termDepth, ui32 code, i64 cnt) { + if (depth == termDepth) { + for (size_t i = 0; i < 16; ++i) { + i64 iCount = GetCellCount(i); + if (Tables[i].Get()) { + Counts[i] = iCount; + Tables[i].Reset(nullptr); + } + + if (iCount > cnt || (termDepth == 0 && iCount > 0)) { + std::pair<i64, TCode> codep; + codep.first = iCount; + codep.second.Probability = iCount; + codep.second.Start = code + (i << (28 - depth)); + codep.second.Bits = 28 - depth; + codes.push_back(codep); + Counts[i] = 0; + } + } + } for (size_t i = 0; i < 16; ++i) { if (Tables[i].Get()) { - Tables[i]->GenerateFreqs(codes, depth + 4, termDepth, code + (i << (28 - depth)), cnt); + Tables[i]->GenerateFreqs(codes, depth + 4, termDepth, code + (i << (28 - depth)), cnt); } } } - }; - - TTable Root; - int TableCount; - i64 Total; - ui64 Max; - - TAccum() { - TableCount = 0; - Total = 0; - Max = 0; - } - - void GenerateFreqs(TVector<std::pair<i64, TCode>>& codes, int mul) const { - TTable root(Root); - - for (int i = 28; i > 0; i -= 4) { - root.GenerateFreqs(codes, 0, i, 0, Total / mul); - } - - i64 iCount = root.GetCount(); - if (iCount == 0) - return; - std::pair<i64, TCode> codep; - codep.first = iCount; - codep.second.Probability = iCount; - codep.second.Start = 0; - ui32 bits = 0; - while (1) { - if ((1ULL << bits) > Max) - break; - ++bits; - } - codep.second.Bits = bits; - codes.push_back(codep); + }; + + TTable Root; + int TableCount; + i64 Total; + ui64 Max; + + TAccum() { + TableCount = 0; + Total = 0; + Max = 0; + } + + void GenerateFreqs(TVector<std::pair<i64, TCode>>& codes, int mul) const { + TTable root(Root); + + for (int i = 28; i > 0; i -= 4) { + root.GenerateFreqs(codes, 0, i, 0, Total / mul); + } + + i64 iCount = root.GetCount(); + if (iCount == 0) + return; + std::pair<i64, TCode> codep; + codep.first = iCount; + codep.second.Probability = iCount; + codep.second.Start = 0; + ui32 bits = 0; + while (1) { + if ((1ULL << bits) > Max) + break; + ++bits; + } + codep.second.Bits = bits; + codes.push_back(codep); } - TCache<256, i64*> Cache; - - void AddMap(ui32 value, i64 weight = 1) { - ui32 index = Cache.Hash(value); - if (Cache.CacheKey[index] == value) { - Cache.CacheVal[index][0] += weight; - return; - } - TTable* root = &Root; - for (size_t i = 0; i < 15; ++i) { - ui32 index2 = (value >> (28 - i * 4)) & 0xf; - if (!root->Tables[index2].Get()) { - if (TableCount < 1024) { - ++TableCount; - root->Tables[index2].Reset(new TTable); - } else { - Cache.CacheKey[index2] = value; - Cache.CacheVal[index2] = &root->Counts[index2]; - root->Counts[index2] += weight; - return; - } + TCache<256, i64*> Cache; + + void AddMap(ui32 value, i64 weight = 1) { + ui32 index = Cache.Hash(value); + if (Cache.CacheKey[index] == value) { + Cache.CacheVal[index][0] += weight; + return; + } + TTable* root = &Root; + for (size_t i = 0; i < 15; ++i) { + ui32 index2 = (value >> (28 - i * 4)) & 0xf; + if (!root->Tables[index2].Get()) { + if (TableCount < 1024) { + ++TableCount; + root->Tables[index2].Reset(new TTable); + } else { + Cache.CacheKey[index2] = value; + Cache.CacheVal[index2] = &root->Counts[index2]; + root->Counts[index2] += weight; + return; + } } - root = root->Tables[index2].Get(); + root = root->Tables[index2].Get(); } - - Cache.CacheKey[index] = value; - Cache.CacheVal[index] = &root->Counts[value & 0xf]; - root->Counts[value & 0xf] += weight; + + Cache.CacheKey[index] = value; + Cache.CacheVal[index] = &root->Counts[value & 0xf]; + root->Counts[value & 0xf] += weight; } - void Add(ui32 value, i64 weight = 1) { - Max = ::Max(Max, (ui64)value); - Total += weight; - AddMap(value, weight); - }; + void Add(ui32 value, i64 weight = 1) { + Max = ::Max(Max, (ui64)value); + Total += weight; + AddMap(value, weight); + }; }; - struct THuffNode { - i64 Weight; - i64 Priority; - THuffNode* Nodes[2]; - TCode* Code; - THuffNode(i64 weight, i64 priority, TCode* code) - : Weight(weight) - , Priority(priority) - , Code(code) - { - Nodes[0] = nullptr; - Nodes[1] = nullptr; - } - - void BuildPrefixes(ui32 depth, ui32 prefix) { - if (Code) { - Code->Prefix = prefix; - Code->PrefLength = depth; - return; - } - Nodes[0]->BuildPrefixes(depth + 1, prefix + (0UL << depth)); - Nodes[1]->BuildPrefixes(depth + 1, prefix + (1UL << depth)); - } - - i64 Iterate(size_t depth) const { - if (Code) { - return (depth + Code->Bits) * Code->Probability; - } - return Nodes[0]->Iterate(depth + 1) + Nodes[1]->Iterate(depth + 1); + struct THuffNode { + i64 Weight; + i64 Priority; + THuffNode* Nodes[2]; + TCode* Code; + THuffNode(i64 weight, i64 priority, TCode* code) + : Weight(weight) + , Priority(priority) + , Code(code) + { + Nodes[0] = nullptr; + Nodes[1] = nullptr; + } + + void BuildPrefixes(ui32 depth, ui32 prefix) { + if (Code) { + Code->Prefix = prefix; + Code->PrefLength = depth; + return; + } + Nodes[0]->BuildPrefixes(depth + 1, prefix + (0UL << depth)); + Nodes[1]->BuildPrefixes(depth + 1, prefix + (1UL << depth)); + } + + i64 Iterate(size_t depth) const { + if (Code) { + return (depth + Code->Bits) * Code->Probability; + } + return Nodes[0]->Iterate(depth + 1) + Nodes[1]->Iterate(depth + 1); } - size_t Depth() const { - if (Code) { - return 0; - } - return Max(Nodes[0]->Depth(), Nodes[1]->Depth()) + 1; + size_t Depth() const { + if (Code) { + return 0; + } + return Max(Nodes[0]->Depth(), Nodes[1]->Depth()) + 1; } - }; - - struct THLess { - bool operator()(const THuffNode* a, const THuffNode* b) { - if (a->Weight > b->Weight) - return 1; - if (a->Weight == b->Weight && a->Priority > b->Priority) - return 1; + }; + + struct THLess { + bool operator()(const THuffNode* a, const THuffNode* b) { + if (a->Weight > b->Weight) + return 1; + if (a->Weight == b->Weight && a->Priority > b->Priority) + return 1; return 0; } - }; - - inline i64 BuildHuff(TVector<TCode>& codes) { - TVector<TSimpleSharedPtr<THuffNode>> hold; - std::priority_queue<THuffNode*, TVector<THuffNode*>, THLess> nodes; - i64 ret = 0; - - int priority = 0; - for (size_t i = 0; i < codes.size(); ++i) { - TSimpleSharedPtr<THuffNode> node(new THuffNode(codes[i].Probability, priority++, &codes[i])); - hold.push_back(node); - nodes.push(node.Get()); - } - - while (nodes.size() > 1) { - THuffNode* nodea = nodes.top(); - nodes.pop(); - THuffNode* nodeb = nodes.top(); - nodes.pop(); - TSimpleSharedPtr<THuffNode> node(new THuffNode(nodea->Weight + nodeb->Weight, priority++, nullptr)); - node->Nodes[0] = nodea; - node->Nodes[1] = nodeb; - hold.push_back(node); - nodes.push(node.Get()); - } - - if (nodes.size()) { - THuffNode* node = nodes.top(); - node->BuildPrefixes(0, 0); - ret = node->Iterate(0); - } - - return ret; - }; - - struct TCoderEntry { - ui32 MinValue; - ui16 Prefix; - ui8 PrefixBits; - ui8 AllBits; - - ui64 MaxValue() const { - return MinValue + (1ULL << (AllBits - PrefixBits)); - } - }; - - inline i64 Analyze(const TAccum& acc, TVector<TCoderEntry>& retCodes) { - i64 ret; - for (int k = 256; k > 0; --k) { - retCodes.clear(); - TVector<std::pair<i64, TCode>> pairs; - acc.GenerateFreqs(pairs, k); - TVector<TCode> codes; - for (size_t i = 0; i < pairs.size(); ++i) { - codes.push_back(pairs[i].second); - } - - StableSort(codes.begin(), codes.end(), std::greater<TCode>()); - - ret = BuildHuff(codes); - bool valid = true; - for (size_t i = 0; i < codes.size(); ++i) { - TCoderEntry code; - code.MinValue = codes[i].Start; - code.Prefix = codes[i].Prefix; - code.PrefixBits = codes[i].PrefLength; - if (code.PrefixBits > 6) - valid = false; - code.AllBits = code.PrefixBits + codes[i].Bits; - retCodes.push_back(code); - } - if (valid) - return ret; + }; + + inline i64 BuildHuff(TVector<TCode>& codes) { + TVector<TSimpleSharedPtr<THuffNode>> hold; + std::priority_queue<THuffNode*, TVector<THuffNode*>, THLess> nodes; + i64 ret = 0; + + int priority = 0; + for (size_t i = 0; i < codes.size(); ++i) { + TSimpleSharedPtr<THuffNode> node(new THuffNode(codes[i].Probability, priority++, &codes[i])); + hold.push_back(node); + nodes.push(node.Get()); + } + + while (nodes.size() > 1) { + THuffNode* nodea = nodes.top(); + nodes.pop(); + THuffNode* nodeb = nodes.top(); + nodes.pop(); + TSimpleSharedPtr<THuffNode> node(new THuffNode(nodea->Weight + nodeb->Weight, priority++, nullptr)); + node->Nodes[0] = nodea; + node->Nodes[1] = nodeb; + hold.push_back(node); + nodes.push(node.Get()); + } + + if (nodes.size()) { + THuffNode* node = nodes.top(); + node->BuildPrefixes(0, 0); + ret = node->Iterate(0); + } + + return ret; + }; + + struct TCoderEntry { + ui32 MinValue; + ui16 Prefix; + ui8 PrefixBits; + ui8 AllBits; + + ui64 MaxValue() const { + return MinValue + (1ULL << (AllBits - PrefixBits)); + } + }; + + inline i64 Analyze(const TAccum& acc, TVector<TCoderEntry>& retCodes) { + i64 ret; + for (int k = 256; k > 0; --k) { + retCodes.clear(); + TVector<std::pair<i64, TCode>> pairs; + acc.GenerateFreqs(pairs, k); + TVector<TCode> codes; + for (size_t i = 0; i < pairs.size(); ++i) { + codes.push_back(pairs[i].second); + } + + StableSort(codes.begin(), codes.end(), std::greater<TCode>()); + + ret = BuildHuff(codes); + bool valid = true; + for (size_t i = 0; i < codes.size(); ++i) { + TCoderEntry code; + code.MinValue = codes[i].Start; + code.Prefix = codes[i].Prefix; + code.PrefixBits = codes[i].PrefLength; + if (code.PrefixBits > 6) + valid = false; + code.AllBits = code.PrefixBits + codes[i].Bits; + retCodes.push_back(code); + } + if (valid) + return ret; + } + + return ret; + } + + struct TComparer { + bool operator()(const TCoderEntry& e0, const TCoderEntry& e1) const { + return e0.AllBits < e1.AllBits; } + }; - return ret; - } + struct TCoder { + TVector<TCoderEntry> Entries; + void Normalize() { + TComparer comp; + StableSort(Entries.begin(), Entries.end(), comp); + } + TCoder() { + InitDefault(); + } + void InitDefault() { + ui64 cum = 0; + Cache.Clear(); + Entries.clear(); + ui16 b = 1; + for (ui16 i = 0; i < 40; ++i) { + ui16 bits = Min(b, (ui16)(32)); + b = (b * 16) / 10 + 1; + if (b > 32) + b = 32; + TCoderEntry entry; + entry.PrefixBits = i + 1; + entry.AllBits = entry.PrefixBits + bits; + entry.MinValue = (ui32)Min(cum, (ui64)(ui32)(-1)); + cum += (1ULL << bits); + entry.Prefix = ((1UL << i) - 1); + Entries.push_back(entry); + if (cum > (ui32)(-1)) { + return; + } + }; + } - struct TComparer { - bool operator()(const TCoderEntry& e0, const TCoderEntry& e1) const { - return e0.AllBits < e1.AllBits; + TCache<1024, TCoderEntry> Cache; + + ui64 RealCode(ui32 value, const TCoderEntry& entry, size_t& length) { + length = entry.AllBits; + return (ui64(value - entry.MinValue) << entry.PrefixBits) + entry.Prefix; + } + + bool Empty() const { + return Entries.empty(); } - }; - - struct TCoder { - TVector<TCoderEntry> Entries; - void Normalize() { - TComparer comp; - StableSort(Entries.begin(), Entries.end(), comp); - } - TCoder() { - InitDefault(); - } - void InitDefault() { - ui64 cum = 0; - Cache.Clear(); - Entries.clear(); - ui16 b = 1; - for (ui16 i = 0; i < 40; ++i) { - ui16 bits = Min(b, (ui16)(32)); - b = (b * 16) / 10 + 1; - if (b > 32) - b = 32; - TCoderEntry entry; - entry.PrefixBits = i + 1; - entry.AllBits = entry.PrefixBits + bits; - entry.MinValue = (ui32)Min(cum, (ui64)(ui32)(-1)); - cum += (1ULL << bits); - entry.Prefix = ((1UL << i) - 1); - Entries.push_back(entry); - if (cum > (ui32)(-1)) { - return; - } - }; - } - - TCache<1024, TCoderEntry> Cache; - - ui64 RealCode(ui32 value, const TCoderEntry& entry, size_t& length) { - length = entry.AllBits; - return (ui64(value - entry.MinValue) << entry.PrefixBits) + entry.Prefix; - } - - bool Empty() const { - return Entries.empty(); - } - const TCoderEntry& GetEntry(ui32 code, ui8& id) const { - for (size_t i = 0; i < Entries.size(); ++i) { - const TCoderEntry& entry = Entries[i]; - ui32 prefMask = (1UL << entry.PrefixBits) - 1UL; - if (entry.Prefix == (code & prefMask)) { - id = ui8(i); - return entry; - } + const TCoderEntry& GetEntry(ui32 code, ui8& id) const { + for (size_t i = 0; i < Entries.size(); ++i) { + const TCoderEntry& entry = Entries[i]; + ui32 prefMask = (1UL << entry.PrefixBits) - 1UL; + if (entry.Prefix == (code & prefMask)) { + id = ui8(i); + return entry; + } } - ythrow yexception() << "bad entry"; - return Entries[0]; + ythrow yexception() << "bad entry"; + return Entries[0]; } - ui64 Code(ui32 entry, size_t& length) { - ui32 index = Cache.Hash(entry); - if (Cache.CacheKey[index] == entry) { - ++Cache.Hits; + ui64 Code(ui32 entry, size_t& length) { + ui32 index = Cache.Hash(entry); + if (Cache.CacheKey[index] == entry) { + ++Cache.Hits; return RealCode(entry, Cache.CacheVal[index], length); } - ++Cache.Misses; - for (size_t i = 0; i < Entries.size(); ++i) { - if (entry >= Entries[i].MinValue && entry < Entries[i].MaxValue()) { - Cache.CacheKey[index] = entry; - Cache.CacheVal[index] = Entries[i]; - return RealCode(entry, Cache.CacheVal[index], length); - } - } - - ythrow yexception() << "bad huff tree"; - return 0; + ++Cache.Misses; + for (size_t i = 0; i < Entries.size(); ++i) { + if (entry >= Entries[i].MinValue && entry < Entries[i].MaxValue()) { + Cache.CacheKey[index] = entry; + Cache.CacheVal[index] = Entries[i]; + return RealCode(entry, Cache.CacheVal[index], length); + } + } + + ythrow yexception() << "bad huff tree"; + return 0; } - }; + }; -} +} diff --git a/library/cpp/compproto/metainfo.h b/library/cpp/compproto/metainfo.h index 5675de76bc..6e68f86e12 100644 --- a/library/cpp/compproto/metainfo.h +++ b/library/cpp/compproto/metainfo.h @@ -10,295 +10,295 @@ #include "compressor.h" namespace NCompProto { - const size_t MAX_ELEMENTS = 32; + const size_t MAX_ELEMENTS = 32; - struct TScalarDefaultValue { - TScalarDefaultValue() - : Type(None) - , Value(0) - { - } - enum EType { - None = 0, - Fixed = 1, - }; - EType Type; - ui32 Value; + struct TScalarDefaultValue { + TScalarDefaultValue() + : Type(None) + , Value(0) + { + } + enum EType { + None = 0, + Fixed = 1, + }; + EType Type; + ui32 Value; }; - template <class X> - struct TMetaInfo { - X Index; - X Count; - X Mask; - X Scalar[MAX_ELEMENTS]; - TAtomicSharedPtr<TMetaInfo> Repeated[MAX_ELEMENTS]; - TScalarDefaultValue Default[MAX_ELEMENTS]; + template <class X> + struct TMetaInfo { + X Index; + X Count; + X Mask; + X Scalar[MAX_ELEMENTS]; + TAtomicSharedPtr<TMetaInfo> Repeated[MAX_ELEMENTS]; + TScalarDefaultValue Default[MAX_ELEMENTS]; + + ui32 ScalarMask; + ui32 RepeatedMask; + size_t Size; + TString Name; + TString ChildName[MAX_ELEMENTS]; + size_t Id; + TMetaInfo* Parent; + + struct TCheck { + ui32 Count; + ui32 Mask; + ui32 LastIndex; + ui32 Check(size_t id, size_t rule) { + ui32 mask = 1UL << id; + if (Mask & mask) { + ythrow yexception() << "Mask & mask"; + } + Mask |= mask; + Y_ENSURE(rule & mask, "!(rule & mask)"); + return mask; + } + void ClearCount() { + Count = 0; + } + void ClearMask() { + Mask = 0; + } + void ClearIndex() { + LastIndex = ui32(-1); + } + void ClearAll() { + ClearCount(); + ClearIndex(); + ClearMask(); + } + TCheck() { + ClearAll(); + } + }; - ui32 ScalarMask; - ui32 RepeatedMask; - size_t Size; - TString Name; - TString ChildName[MAX_ELEMENTS]; - size_t Id; - TMetaInfo* Parent; + TCheck Serializer; - struct TCheck { - ui32 Count; - ui32 Mask; - ui32 LastIndex; - ui32 Check(size_t id, size_t rule) { - ui32 mask = 1UL << id; - if (Mask & mask) { - ythrow yexception() << "Mask & mask"; - } - Mask |= mask; - Y_ENSURE(rule & mask, "!(rule & mask)"); - return mask; - } - void ClearCount() { - Count = 0; - } - void ClearMask() { - Mask = 0; - } - void ClearIndex() { - LastIndex = ui32(-1); - } - void ClearAll() { - ClearCount(); - ClearIndex(); - ClearMask(); - } - TCheck() { - ClearAll(); - } - }; + void SetDefaults(TMetaInfo* parent) { + ScalarMask = 0; + RepeatedMask = 0; + Size = 0; + Id = 0; + Parent = parent; + } - TCheck Serializer; - - void SetDefaults(TMetaInfo* parent) { - ScalarMask = 0; - RepeatedMask = 0; - Size = 0; - Id = 0; - Parent = parent; - } - - template <class T, class TBuilder> - TMetaInfo(const TMetaInfo<T>& meta, const TBuilder& builder) { - SetDefaults(nullptr); - Name = meta.Name; - builder.Build(Index, meta.Index); - builder.Build(Count, meta.Count); - builder.Build(Mask, meta.Mask); - Size = meta.Size; - Id = meta.Id; - ScalarMask = meta.ScalarMask; - RepeatedMask = meta.RepeatedMask; - for (size_t i = 0; i < Size; ++i) { - ui32 index = (1UL << i); - if (ScalarMask & index) { - builder.Build(Scalar[i], meta.Scalar[i]); - ChildName[i] = meta.ChildName[i]; - Default[i] = meta.Default[i]; - } - if (RepeatedMask & index) { - THolder<TMetaInfo> info(new TMetaInfo(*meta.Repeated[i], builder)); - info->Parent = this; - Repeated[i].Reset(info.Release()); - } + template <class T, class TBuilder> + TMetaInfo(const TMetaInfo<T>& meta, const TBuilder& builder) { + SetDefaults(nullptr); + Name = meta.Name; + builder.Build(Index, meta.Index); + builder.Build(Count, meta.Count); + builder.Build(Mask, meta.Mask); + Size = meta.Size; + Id = meta.Id; + ScalarMask = meta.ScalarMask; + RepeatedMask = meta.RepeatedMask; + for (size_t i = 0; i < Size; ++i) { + ui32 index = (1UL << i); + if (ScalarMask & index) { + builder.Build(Scalar[i], meta.Scalar[i]); + ChildName[i] = meta.ChildName[i]; + Default[i] = meta.Default[i]; + } + if (RepeatedMask & index) { + THolder<TMetaInfo> info(new TMetaInfo(*meta.Repeated[i], builder)); + info->Parent = this; + Repeated[i].Reset(info.Release()); + } } } - - TMetaInfo(TMetaInfo* parent) { - SetDefaults(parent); + + TMetaInfo(TMetaInfo* parent) { + SetDefaults(parent); } - - template <class T> - TMetaInfo& BeginRepeated(size_t id, T& functor) { - Serializer.Check(id, RepeatedMask); - TMetaInfo& res = *Repeated[id].Get(); - res.Count.BeginElement(functor); - return res; + + template <class T> + TMetaInfo& BeginRepeated(size_t id, T& functor) { + Serializer.Check(id, RepeatedMask); + TMetaInfo& res = *Repeated[id].Get(); + res.Count.BeginElement(functor); + return res; } - - template <class T> - void BeginSelf(T& functor) { - Serializer.ClearAll(); - Count.BeginElement(functor); + + template <class T> + void BeginSelf(T& functor) { + Serializer.ClearAll(); + Count.BeginElement(functor); } - - template <class T> - void EndRepeated(T& functor) { - Count.AddDelayed(Serializer.Count, functor); - Serializer.ClearCount(); - Serializer.ClearIndex(); - Y_ENSURE(Serializer.Mask == 0, "Serializer.Mask != 0"); + + template <class T> + void EndRepeated(T& functor) { + Count.AddDelayed(Serializer.Count, functor); + Serializer.ClearCount(); + Serializer.ClearIndex(); + Y_ENSURE(Serializer.Mask == 0, "Serializer.Mask != 0"); } - - template <class T> - void BeginElement(ui32 index, T& functor) { - Y_ENSURE(!(index < Serializer.LastIndex + 1), - "Name: " << Name.Quote() << " error: index < Serializer.LastIndex + 1; " - << "index: " << index << " LastIndex: " << Serializer.LastIndex); - Index.Add(index - Serializer.LastIndex, functor); - Mask.BeginElement(functor); - Serializer.LastIndex = index; - ++Serializer.Count; + + template <class T> + void BeginElement(ui32 index, T& functor) { + Y_ENSURE(!(index < Serializer.LastIndex + 1), + "Name: " << Name.Quote() << " error: index < Serializer.LastIndex + 1; " + << "index: " << index << " LastIndex: " << Serializer.LastIndex); + Index.Add(index - Serializer.LastIndex, functor); + Mask.BeginElement(functor); + Serializer.LastIndex = index; + ++Serializer.Count; } - template <class TFunctor> - void Iterate(TFunctor& functor) { - Cout << Name << " " - << "Index" << Endl; - functor.Process(Index); - Cout << "Count" << Endl; - functor.Process(Count); - Cout << "Mask" << Endl; - functor.Process(Mask); + template <class TFunctor> + void Iterate(TFunctor& functor) { + Cout << Name << " " + << "Index" << Endl; + functor.Process(Index); + Cout << "Count" << Endl; + functor.Process(Count); + Cout << "Mask" << Endl; + functor.Process(Mask); - for (size_t i = 0; i < Size; ++i) { - ui32 index = (1UL << i); - if (ScalarMask & index) { - Cout << "scalar" - << " " << i << Endl; - functor.Process(Scalar[i]); - } - if (RepeatedMask & index) { - Cout << "repeated" - << " " << i << Endl; - Repeated[i]->Iterate(functor); - } + for (size_t i = 0; i < Size; ++i) { + ui32 index = (1UL << i); + if (ScalarMask & index) { + Cout << "scalar" + << " " << i << Endl; + functor.Process(Scalar[i]); + } + if (RepeatedMask & index) { + Cout << "repeated" + << " " << i << Endl; + Repeated[i]->Iterate(functor); + } } } - template <class T> - void EndElement(T& functor) { - Mask.AddDelayed(Serializer.Mask, functor); - Serializer.ClearMask(); - } + template <class T> + void EndElement(T& functor) { + Mask.AddDelayed(Serializer.Mask, functor); + Serializer.ClearMask(); + } - template <class T> - void SetScalar(size_t id, ui32 value, T& functor) { - if (Default[id].Type != TScalarDefaultValue::Fixed || value != Default[id].Value) { - Serializer.Check(id, ScalarMask); - Scalar[id].Add(value, functor); - } - } + template <class T> + void SetScalar(size_t id, ui32 value, T& functor) { + if (Default[id].Type != TScalarDefaultValue::Fixed || value != Default[id].Value) { + Serializer.Check(id, ScalarMask); + Scalar[id].Add(value, functor); + } + } - ui32 Check(size_t id) { - Y_ENSURE(id < MAX_ELEMENTS, "id >= MAX_ELEMENTS"); + ui32 Check(size_t id) { + Y_ENSURE(id < MAX_ELEMENTS, "id >= MAX_ELEMENTS"); - ui32 mask = 1UL << id; - if (ScalarMask & mask) { - ythrow yexception() << "ScalarMask & mask"; + ui32 mask = 1UL << id; + if (ScalarMask & mask) { + ythrow yexception() << "ScalarMask & mask"; } - if (RepeatedMask & mask) { - ythrow yexception() << "RepeatedMask & mask"; + if (RepeatedMask & mask) { + ythrow yexception() << "RepeatedMask & mask"; } - Size = Max(id + 1, Size); - return mask; + Size = Max(id + 1, Size); + return mask; } - TMetaInfo(IInputStream& stream) { - SetDefaults(nullptr); - Load(stream); + TMetaInfo(IInputStream& stream) { + SetDefaults(nullptr); + Load(stream); } - TMetaInfo(const TString& str) { - SetDefaults(nullptr); - TStringInput stream(str); - Load(stream); + TMetaInfo(const TString& str) { + SetDefaults(nullptr); + TStringInput stream(str); + Load(stream); } - void Save(IOutputStream& stream, const TString& offset = TString()) { - stream << offset << "repeated " << Name << " id " << Id << Endl; - TString step = " "; - stream << step << offset << "index" << Endl; - Index.Save(stream, step + offset); - stream << step << offset << "count" << Endl; - Count.Save(stream, step + offset); - stream << step << offset << "mask" << Endl; - Mask.Save(stream, step + offset); + void Save(IOutputStream& stream, const TString& offset = TString()) { + stream << offset << "repeated " << Name << " id " << Id << Endl; + TString step = " "; + stream << step << offset << "index" << Endl; + Index.Save(stream, step + offset); + stream << step << offset << "count" << Endl; + Count.Save(stream, step + offset); + stream << step << offset << "mask" << Endl; + Mask.Save(stream, step + offset); - for (size_t i = 0; i < MAX_ELEMENTS; ++i) { - ui32 mask = 1UL << i; - if (mask & RepeatedMask) { - Repeated[i]->Save(stream, step + offset); - } else if (mask & ScalarMask) { - stream << step << offset << "scalar " << ChildName[i] << " id " << i; - stream << " default " << (Default[i].Type == TScalarDefaultValue::None ? " no " : " const "); - if (Default[i].Type == TScalarDefaultValue::Fixed) { - stream << Default[i].Value; - } - stream << Endl; - stream << step << offset << "table " - << "id " << i << Endl; - Scalar[i].Save(stream, step + offset); + for (size_t i = 0; i < MAX_ELEMENTS; ++i) { + ui32 mask = 1UL << i; + if (mask & RepeatedMask) { + Repeated[i]->Save(stream, step + offset); + } else if (mask & ScalarMask) { + stream << step << offset << "scalar " << ChildName[i] << " id " << i; + stream << " default " << (Default[i].Type == TScalarDefaultValue::None ? " no " : " const "); + if (Default[i].Type == TScalarDefaultValue::Fixed) { + stream << Default[i].Value; + } + stream << Endl; + stream << step << offset << "table " + << "id " << i << Endl; + Scalar[i].Save(stream, step + offset); } } - stream << offset << "end" << Endl; + stream << offset << "end" << Endl; } - void Load(IInputStream& stream) { - TString name; + void Load(IInputStream& stream) { + TString name; + stream >> name; + if (name == "repeated") { + stream >> name; + } + Name = name; stream >> name; - if (name == "repeated") { - stream >> name; - } - Name = name; - stream >> name; - Y_ENSURE(name == "id", "Name mismatch: " << name.Quote() << " != id. "); - stream >> Id; + Y_ENSURE(name == "id", "Name mismatch: " << name.Quote() << " != id. "); + stream >> Id; - while (1) { + while (1) { stream >> name; - if (name == "index") { - Index.Load(stream); - } else if (name == "count") { - Count.Load(stream); - } else if (name == "mask") { - Mask.Load(stream); - } else if (name == "table") { - stream >> name; - Y_ENSURE(name == "id", "Name mismatch: " << name.Quote() << " != id. "); - size_t id; - stream >> id; - Scalar[id].Load(stream); - } else if (name == "repeated") { - THolder<TMetaInfo> info(new TMetaInfo(this)); - info->Load(stream); - size_t id = info->Id; - RepeatedMask |= Check(id); - Repeated[id].Reset(info.Release()); - } else if (name == "scalar") { - stream >> name; - TString childName = name; - stream >> name; - Y_ENSURE(name == "id", "Name mismatch: " << name.Quote() << " != id. "); - size_t id; - stream >> id; - ScalarMask |= Check(id); - ChildName[id] = childName; - stream >> name; - Y_ENSURE(name == "default", "Name mismatch: " << name.Quote() << " != default. "); - stream >> name; - if (name == "no") { - Default[id].Type = TScalarDefaultValue::None; - } else if (name == "const") { - ui32 def; - stream >> def; - Default[id].Type = TScalarDefaultValue::Fixed; - Default[id].Value = def; - } else { - ythrow yexception() << "Unsupported default value specification: " << name.Quote(); - } - } else if (name == "end" /* || stream.IsEOF()*/) { - return; + if (name == "index") { + Index.Load(stream); + } else if (name == "count") { + Count.Load(stream); + } else if (name == "mask") { + Mask.Load(stream); + } else if (name == "table") { + stream >> name; + Y_ENSURE(name == "id", "Name mismatch: " << name.Quote() << " != id. "); + size_t id; + stream >> id; + Scalar[id].Load(stream); + } else if (name == "repeated") { + THolder<TMetaInfo> info(new TMetaInfo(this)); + info->Load(stream); + size_t id = info->Id; + RepeatedMask |= Check(id); + Repeated[id].Reset(info.Release()); + } else if (name == "scalar") { + stream >> name; + TString childName = name; + stream >> name; + Y_ENSURE(name == "id", "Name mismatch: " << name.Quote() << " != id. "); + size_t id; + stream >> id; + ScalarMask |= Check(id); + ChildName[id] = childName; + stream >> name; + Y_ENSURE(name == "default", "Name mismatch: " << name.Quote() << " != default. "); + stream >> name; + if (name == "no") { + Default[id].Type = TScalarDefaultValue::None; + } else if (name == "const") { + ui32 def; + stream >> def; + Default[id].Type = TScalarDefaultValue::Fixed; + Default[id].Value = def; + } else { + ythrow yexception() << "Unsupported default value specification: " << name.Quote(); + } + } else if (name == "end" /* || stream.IsEOF()*/) { + return; } } } - }; + }; -} +} |