diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:15 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:15 +0300 |
commit | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch) | |
tree | da2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/codecs/huffman_codec.cpp | |
parent | 778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff) | |
download | ydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs/huffman_codec.cpp')
-rw-r--r-- | library/cpp/codecs/huffman_codec.cpp | 870 |
1 files changed, 435 insertions, 435 deletions
diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp index 650fe7cdfd..c8b126ccd0 100644 --- a/library/cpp/codecs/huffman_codec.cpp +++ b/library/cpp/codecs/huffman_codec.cpp @@ -9,584 +9,584 @@ #include <util/string/printf.h> namespace NCodecs { - template <typename T> - struct TCanonicalCmp { - bool operator()(const T& a, const T& b) const { - if (a.CodeLength == b.CodeLength) { - return a.Char < b.Char; - } else { - return a.CodeLength < b.CodeLength; - } - } - }; - - template <typename T> - struct TByCharCmp { - bool operator()(const T& a, const T& b) const { + template <typename T> + struct TCanonicalCmp { + bool operator()(const T& a, const T& b) const { + if (a.CodeLength == b.CodeLength) { + return a.Char < b.Char; + } else { + return a.CodeLength < b.CodeLength; + } + } + }; + + template <typename T> + struct TByCharCmp { + bool operator()(const T& a, const T& b) const { return a.Char < b.Char; } - }; + }; - struct TTreeEntry { - static const ui32 InvalidBranch = (ui32)-1; + struct TTreeEntry { + static const ui32 InvalidBranch = (ui32)-1; - ui64 Freq = 0; - ui32 Branches[2]{InvalidBranch, InvalidBranch}; + ui64 Freq = 0; + ui32 Branches[2]{InvalidBranch, InvalidBranch}; - ui32 CodeLength = 0; - ui8 Char = 0; - bool Invalid = false; + ui32 CodeLength = 0; + ui8 Char = 0; + bool Invalid = false; - TTreeEntry() = default; + TTreeEntry() = default; - static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) { - return a.Freq < b.Freq; - } + static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) { + return a.Freq < b.Freq; + } - static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) { - return a.Freq > b.Freq; - } - }; + static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) { + return a.Freq > b.Freq; + } + }; - using TCodeTree = TVector<TTreeEntry>; + using TCodeTree = TVector<TTreeEntry>; - void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) { - tree.reserve(255 * 256 / 2); // worst case - balanced tree + void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) { + tree.reserve(255 * 256 / 2); // worst case - balanced tree - for (ui32 i = 0; i < 256; ++i) { - tree.emplace_back(); - tree.back().Char = i; - tree.back().Freq = freqs[i]; - } + for (ui32 i = 0; i < 256; ++i) { + tree.emplace_back(); + tree.back().Char = i; + tree.back().Freq = freqs[i]; + } - StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq); + StableSort(tree.begin(), tree.end(), TTreeEntry::ByFreq); } - void InitTree(TCodeTree& tree, ISequenceReader* in) { - using namespace NPrivate; - ui64 freqs[256]; - Zero(freqs); + void InitTree(TCodeTree& tree, ISequenceReader* in) { + using namespace NPrivate; + ui64 freqs[256]; + Zero(freqs); - TStringBuf r; - while (in->NextRegion(r)) { + TStringBuf r; + while (in->NextRegion(r)) { for (ui64 i = 0; i < r.size(); ++i) - ++freqs[(ui8)r[i]]; - } + ++freqs[(ui8)r[i]]; + } - InitTreeByFreqs(tree, freqs); + InitTreeByFreqs(tree, freqs); } - void CalculateCodeLengths(TCodeTree& tree) { - Y_ENSURE(tree.size() == 256, " "); - const ui32 firstbranch = tree.size(); + void CalculateCodeLengths(TCodeTree& tree) { + Y_ENSURE(tree.size() == 256, " "); + const ui32 firstbranch = tree.size(); - ui32 curleaf = 0; - ui32 curbranch = firstbranch; + ui32 curleaf = 0; + ui32 curbranch = firstbranch; - // building code tree. two priority queues are combined in one. - while (firstbranch - curleaf + tree.size() - curbranch >= 2) { - TTreeEntry e; + // building code tree. two priority queues are combined in one. + while (firstbranch - curleaf + tree.size() - curbranch >= 2) { + TTreeEntry e; - for (auto& branche : e.Branches) { - ui32 br; + for (auto& branche : e.Branches) { + ui32 br; - if (curleaf >= firstbranch) - br = curbranch++; - else if (curbranch >= tree.size()) - br = curleaf++; - else if (tree[curleaf].Freq < tree[curbranch].Freq) - br = curleaf++; - else - br = curbranch++; + if (curleaf >= firstbranch) + br = curbranch++; + else if (curbranch >= tree.size()) + br = curleaf++; + else if (tree[curleaf].Freq < tree[curbranch].Freq) + br = curleaf++; + else + br = curbranch++; - Y_ENSURE(br < tree.size(), " "); - branche = br; - e.Freq += tree[br].Freq; - } + Y_ENSURE(br < tree.size(), " "); + branche = br; + e.Freq += tree[br].Freq; + } - tree.push_back(e); - PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev); + tree.push_back(e); + PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev); } - // computing code lengths - for (ui64 i = tree.size() - 1; i >= firstbranch; --i) { - TTreeEntry e = tree[i]; + // computing code lengths + for (ui64 i = tree.size() - 1; i >= firstbranch; --i) { + TTreeEntry e = tree[i]; - for (auto branche : e.Branches) - tree[branche].CodeLength = e.CodeLength + 1; - } - - // chopping off the branches - tree.resize(firstbranch); + for (auto branche : e.Branches) + tree[branche].CodeLength = e.CodeLength + 1; + } - Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>()); + // chopping off the branches + tree.resize(firstbranch); - // simplification: we are stripping codes longer than 64 bits - while (!tree.empty() && tree.back().CodeLength > 64) - tree.pop_back(); + Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>()); - // will not compress - if (tree.empty()) - return; + // simplification: we are stripping codes longer than 64 bits + while (!tree.empty() && tree.back().CodeLength > 64) + tree.pop_back(); - // special invalid code word - tree.back().Invalid = true; - } + // will not compress + if (tree.empty()) + return; - struct TEncoderEntry { - ui64 Code = 0; + // special invalid code word + tree.back().Invalid = true; + } - ui8 CodeLength = 0; - ui8 Char = 0; - ui8 Invalid = true; + struct TEncoderEntry { + ui64 Code = 0; - explicit TEncoderEntry(TTreeEntry e) - : CodeLength(e.CodeLength) - , Char(e.Char) - , Invalid(e.Invalid) - { - } + ui8 CodeLength = 0; + ui8 Char = 0; + ui8 Invalid = true; - TEncoderEntry() = default; - }; + explicit TEncoderEntry(TTreeEntry e) + : CodeLength(e.CodeLength) + , Char(e.Char) + , Invalid(e.Invalid) + { + } - struct TEncoderTable { - TEncoderEntry Entries[256]; + TEncoderEntry() = default; + }; - void Save(IOutputStream* out) const { - ui16 nval = 0; + struct TEncoderTable { + TEncoderEntry Entries[256]; - for (auto entrie : Entries) - nval += !entrie.Invalid; - - ::Save(out, nval); - - for (auto entrie : Entries) { - if (!entrie.Invalid) { - ::Save(out, entrie.Char); - ::Save(out, entrie.CodeLength); - } - } - } + void Save(IOutputStream* out) const { + ui16 nval = 0; - void Load(IInputStream* in) { - ui16 nval = 0; - ::Load(in, nval); + for (auto entrie : Entries) + nval += !entrie.Invalid; - for (ui32 i = 0; i < 256; ++i) - Entries[i].Char = i; + ::Save(out, nval); - for (ui32 i = 0; i < nval; ++i) { - ui8 ch = 0; - ui8 len = 0; - ::Load(in, ch); - ::Load(in, len); - Entries[ch].CodeLength = len; - Entries[ch].Invalid = false; + for (auto entrie : Entries) { + if (!entrie.Invalid) { + ::Save(out, entrie.Char); + ::Save(out, entrie.CodeLength); + } } } - }; - - struct TDecoderEntry { - ui32 NextTable : 10; - ui32 Char : 8; - ui32 Invalid : 1; - ui32 Bad : 1; - - TDecoderEntry() - : NextTable() - , Char() - , Invalid() - , Bad() - { - } - }; - - struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> { - ui64 Length = 0; - ui64 BaseCode = 0; - TDecoderEntry Entries[256]; - - TDecoderTable() { - Zero(Entries); + void Load(IInputStream* in) { + ui16 nval = 0; + ::Load(in, nval); + + for (ui32 i = 0; i < 256; ++i) + Entries[i].Char = i; + + for (ui32 i = 0; i < nval; ++i) { + ui8 ch = 0; + ui8 len = 0; + ::Load(in, ch); + ::Load(in, len); + Entries[ch].CodeLength = len; + Entries[ch].Invalid = false; + } } - }; - - const int CACHE_BITS_COUNT = 16; - class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> { - TEncoderTable Encoder; - TDecoderTable Decoder[256]; - - TEncoderEntry Invalid; - - ui32 SubTablesNum; - - class THuffmanCache { - struct TCacheEntry { - int EndOffset : 24; - int BitsLeft : 8; - }; - TVector<char> DecodeCache; - TVector<TCacheEntry> CacheEntries; - const TImpl& Original; - - public: - THuffmanCache(const THuffmanCodec::TImpl& encoder); - - void Decode(NBitIO::TBitInput& in, TBuffer& out) const; + }; + + struct TDecoderEntry { + ui32 NextTable : 10; + ui32 Char : 8; + ui32 Invalid : 1; + ui32 Bad : 1; + + TDecoderEntry() + : NextTable() + , Char() + , Invalid() + , Bad() + { + } + }; + + struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> { + ui64 Length = 0; + ui64 BaseCode = 0; + + TDecoderEntry Entries[256]; + + TDecoderTable() { + Zero(Entries); + } + }; + + const int CACHE_BITS_COUNT = 16; + class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> { + TEncoderTable Encoder; + TDecoderTable Decoder[256]; + + TEncoderEntry Invalid; + + ui32 SubTablesNum; + + class THuffmanCache { + struct TCacheEntry { + int EndOffset : 24; + int BitsLeft : 8; + }; + TVector<char> DecodeCache; + TVector<TCacheEntry> CacheEntries; + const TImpl& Original; + + public: + THuffmanCache(const THuffmanCodec::TImpl& encoder); + + void Decode(NBitIO::TBitInput& in, TBuffer& out) const; }; - THolder<THuffmanCache> Cache; + THolder<THuffmanCache> Cache; - public: - TImpl() - : SubTablesNum(1) - { - Invalid.CodeLength = 255; - } + public: + TImpl() + : SubTablesNum(1) + { + Invalid.CodeLength = 255; + } - ui8 Encode(TStringBuf in, TBuffer& out) const { - out.Clear(); + ui8 Encode(TStringBuf in, TBuffer& out) const { + out.Clear(); if (in.empty()) { - return 0; - } + return 0; + } out.Reserve(in.size() * 2); - { - NBitIO::TBitOutputVector<TBuffer> bout(&out); - TStringBuf tin = in; + { + NBitIO::TBitOutputVector<TBuffer> bout(&out); + TStringBuf tin = in; - // data is under compression - bout.Write(1, 1); + // data is under compression + bout.Write(1, 1); - for (auto t : tin) { - const TEncoderEntry& ce = Encoder.Entries[(ui8)t]; + for (auto t : tin) { + const TEncoderEntry& ce = Encoder.Entries[(ui8)t]; - bout.Write(ce.Code, ce.CodeLength); + bout.Write(ce.Code, ce.CodeLength); - if (ce.Invalid) { - bout.Write(t, 8); - } - } + if (ce.Invalid) { + bout.Write(t, 8); + } + } - // in canonical huffman coding there cannot be a code having no 0 in the suffix - // and shorter than 8 bits. - bout.Write((ui64)-1, bout.GetByteReminder()); - return bout.GetByteReminder(); + // in canonical huffman coding there cannot be a code having no 0 in the suffix + // and shorter than 8 bits. + bout.Write((ui64)-1, bout.GetByteReminder()); + return bout.GetByteReminder(); } } - void Decode(TStringBuf in, TBuffer& out) const { - out.Clear(); + void Decode(TStringBuf in, TBuffer& out) const { + out.Clear(); if (in.empty()) { - return; - } + return; + } - NBitIO::TBitInput bin(in); - ui64 f = 0; - bin.ReadK<1>(f); + NBitIO::TBitInput bin(in); + ui64 f = 0; + bin.ReadK<1>(f); - // if data is uncompressed - if (!f) { - in.Skip(1); + // if data is uncompressed + if (!f) { + in.Skip(1); out.Append(in.data(), in.size()); - } else { + } else { out.Reserve(in.size() * 8); - if (Cache.Get()) { - Cache->Decode(bin, out); - } else { - while (ReadNextChar(bin, out)) { - } + if (Cache.Get()) { + Cache->Decode(bin, out); + } else { + while (ReadNextChar(bin, out)) { + } } } } - Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const { - const TDecoderTable* table = Decoder; - TDecoderEntry e; + Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const { + const TDecoderTable* table = Decoder; + TDecoderEntry e; - int bitsRead = 0; - while (true) { - ui64 code = 0; + int bitsRead = 0; + while (true) { + ui64 code = 0; - if (Y_UNLIKELY(!bin.Read(code, table->Length))) - return 0; - bitsRead += table->Length; + if (Y_UNLIKELY(!bin.Read(code, table->Length))) + return 0; + bitsRead += table->Length; - if (Y_UNLIKELY(code < table->BaseCode)) - return 0; + if (Y_UNLIKELY(code < table->BaseCode)) + return 0; - code -= table->BaseCode; + code -= table->BaseCode; - if (Y_UNLIKELY(code > 255)) - return 0; + if (Y_UNLIKELY(code > 255)) + return 0; - e = table->Entries[code]; + e = table->Entries[code]; - if (Y_UNLIKELY(e.Bad)) - return 0; + if (Y_UNLIKELY(e.Bad)) + return 0; - if (e.NextTable) { - table = Decoder + e.NextTable; + if (e.NextTable) { + table = Decoder + e.NextTable; } else { - if (e.Invalid) { - code = 0; - bin.ReadK<8>(code); - bitsRead += 8; - out.Append((ui8)code); - } else { - out.Append((ui8)e.Char); - } - - return bitsRead; + if (e.Invalid) { + code = 0; + bin.ReadK<8>(code); + bitsRead += 8; + out.Append((ui8)code); + } else { + out.Append((ui8)e.Char); + } + + return bitsRead; } - } + } - Y_ENSURE(false, " could not decode input"); - return 0; + Y_ENSURE(false, " could not decode input"); + return 0; } - void GenerateEncoder(TCodeTree& tree) { - const ui64 sz = tree.size(); + void GenerateEncoder(TCodeTree& tree) { + const ui64 sz = tree.size(); - TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]); + TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]); - for (ui32 i = 1; i < sz; ++i) { - const TTreeEntry& te = tree[i]; - TEncoderEntry& e = Encoder.Entries[te.Char]; - e = TEncoderEntry(te); + for (ui32 i = 1; i < sz; ++i) { + const TTreeEntry& te = tree[i]; + TEncoderEntry& e = Encoder.Entries[te.Char]; + e = TEncoderEntry(te); - e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); - lastcode = e; + e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); + lastcode = e; - e.Code = ReverseBits(e.Code, e.CodeLength); - - if (e.Invalid) - Invalid = e; - } + e.Code = ReverseBits(e.Code, e.CodeLength); - for (auto& e : Encoder.Entries) { - if (e.Invalid) - e = Invalid; + if (e.Invalid) + Invalid = e; + } - Y_ENSURE(e.CodeLength, " "); - } + for (auto& e : Encoder.Entries) { + if (e.Invalid) + e = Invalid; + + Y_ENSURE(e.CodeLength, " "); + } } - void RegenerateEncoder() { - for (auto& entrie : Encoder.Entries) { - if (entrie.Invalid) - entrie.CodeLength = Invalid.CodeLength; - } + void RegenerateEncoder() { + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) + entrie.CodeLength = Invalid.CodeLength; + } - Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>()); + Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>()); - TEncoderEntry lastcode = Encoder.Entries[0]; + TEncoderEntry lastcode = Encoder.Entries[0]; - for (ui32 i = 1; i < 256; ++i) { - TEncoderEntry& e = Encoder.Entries[i]; - e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); - lastcode = e; + for (ui32 i = 1; i < 256; ++i) { + TEncoderEntry& e = Encoder.Entries[i]; + e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength); + lastcode = e; - e.Code = ReverseBits(e.Code, e.CodeLength); - } + e.Code = ReverseBits(e.Code, e.CodeLength); + } - for (auto& entrie : Encoder.Entries) { - if (entrie.Invalid) { - Invalid = entrie; - break; - } - } + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) { + Invalid = entrie; + break; + } + } - Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>()); + Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>()); - for (auto& entrie : Encoder.Entries) { - if (entrie.Invalid) - entrie = Invalid; + for (auto& entrie : Encoder.Entries) { + if (entrie.Invalid) + entrie = Invalid; } } - void BuildDecoder() { - TEncoderTable enc = Encoder; - Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>()); - - TEncoderEntry& e1 = enc.Entries[0]; - Decoder[0].BaseCode = e1.Code; - Decoder[0].Length = e1.CodeLength; - - for (auto e2 : enc.Entries) { - SetEntry(Decoder, e2.Code, e2.CodeLength, e2); - } - Cache.Reset(new THuffmanCache(*this)); + void BuildDecoder() { + TEncoderTable enc = Encoder; + Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>()); + + TEncoderEntry& e1 = enc.Entries[0]; + Decoder[0].BaseCode = e1.Code; + Decoder[0].Length = e1.CodeLength; + + for (auto e2 : enc.Entries) { + SetEntry(Decoder, e2.Code, e2.CodeLength, e2); + } + Cache.Reset(new THuffmanCache(*this)); } - void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) { - Y_ENSURE(len >= t->Length, len << " < " << t->Length); + void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) { + Y_ENSURE(len >= t->Length, len << " < " << t->Length); - ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode; - TDecoderEntry& d = t->Entries[idx]; + ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode; + TDecoderEntry& d = t->Entries[idx]; - if (len == t->Length) { - Y_ENSURE(!d.NextTable, " "); + if (len == t->Length) { + Y_ENSURE(!d.NextTable, " "); - d.Char = e.Char; - d.Invalid = e.Invalid; - return; - } + d.Char = e.Char; + d.Invalid = e.Invalid; + return; + } - if (!d.NextTable) { - Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " "); - d.NextTable = SubTablesNum++; - TDecoderTable* nt = Decoder + d.NextTable; - nt->Length = Min<ui64>(8, len - t->Length); - nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length); - } + if (!d.NextTable) { + Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " "); + d.NextTable = SubTablesNum++; + TDecoderTable* nt = Decoder + d.NextTable; + nt->Length = Min<ui64>(8, len - t->Length); + nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length); + } - SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e); + SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e); } - void Learn(ISequenceReader* in) { - { - TCodeTree tree; - InitTree(tree, in); - CalculateCodeLengths(tree); - Y_ENSURE(!tree.empty(), " "); - GenerateEncoder(tree); - } - BuildDecoder(); + void Learn(ISequenceReader* in) { + { + TCodeTree tree; + InitTree(tree, in); + CalculateCodeLengths(tree); + Y_ENSURE(!tree.empty(), " "); + GenerateEncoder(tree); + } + BuildDecoder(); } void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { TCodeTree tree; - ui64 freqsArray[256]; - Zero(freqsArray); + ui64 freqsArray[256]; + Zero(freqsArray); - for (const auto& freq : freqs) - freqsArray[static_cast<ui8>(freq.first)] += freq.second; + for (const auto& freq : freqs) + freqsArray[static_cast<ui8>(freq.first)] += freq.second; - InitTreeByFreqs(tree, freqsArray); - CalculateCodeLengths(tree); + InitTreeByFreqs(tree, freqsArray); + CalculateCodeLengths(tree); - Y_ENSURE(!tree.empty(), " "); + Y_ENSURE(!tree.empty(), " "); - GenerateEncoder(tree); - BuildDecoder(); - } + GenerateEncoder(tree); + BuildDecoder(); + } - void Save(IOutputStream* out) { - ::Save(out, Invalid.CodeLength); - Encoder.Save(out); - } + void Save(IOutputStream* out) { + ::Save(out, Invalid.CodeLength); + Encoder.Save(out); + } - void Load(IInputStream* in) { - ::Load(in, Invalid.CodeLength); - Encoder.Load(in); - RegenerateEncoder(); - BuildDecoder(); - } - }; + void Load(IInputStream* in) { + ::Load(in, Invalid.CodeLength); + Encoder.Load(in); + RegenerateEncoder(); + BuildDecoder(); + } + }; - THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec) - : Original(codec) - { - CacheEntries.resize(1 << CACHE_BITS_COUNT); + THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec) + : Original(codec) + { + CacheEntries.resize(1 << CACHE_BITS_COUNT); DecodeCache.reserve(CacheEntries.size() * 2); - char buffer[2]; - TBuffer decoded; + char buffer[2]; + TBuffer decoded; for (size_t i = 0; i < CacheEntries.size(); i++) { - buffer[1] = i >> 8; - buffer[0] = i; - NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer)); - int totalBits = 0; - while (true) { - decoded.Resize(0); - int bits = codec.ReadNextChar(bin, decoded); - if (totalBits + bits > 16 || !bits) { - TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits}; - CacheEntries[i] = e; - break; - } - - for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) { - DecodeCache.push_back(*it); - } - totalBits += bits; + buffer[1] = i >> 8; + buffer[0] = i; + NBitIO::TBitInput bin(buffer, buffer + sizeof(buffer)); + int totalBits = 0; + while (true) { + decoded.Resize(0); + int bits = codec.ReadNextChar(bin, decoded); + if (totalBits + bits > 16 || !bits) { + TCacheEntry e = {static_cast<int>(DecodeCache.size()), 16 - totalBits}; + CacheEntries[i] = e; + break; + } + + for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) { + DecodeCache.push_back(*it); + } + totalBits += bits; } } - DecodeCache.push_back(0); - CacheEntries.shrink_to_fit(); - DecodeCache.shrink_to_fit(); + DecodeCache.push_back(0); + CacheEntries.shrink_to_fit(); + DecodeCache.shrink_to_fit(); } - void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const { - int bits = 0; - ui64 code = 0; - while (!bin.Eof()) { - ui64 f = 0; - const int toRead = 16 - bits; - if (toRead > 0 && bin.Read(f, toRead)) { - code = (code >> (16 - bits)) | (f << bits); - code &= 0xFFFF; - TCacheEntry entry = CacheEntries[code]; - int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0; - out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]); - bits = entry.BitsLeft; - } else { // should never happen until there are exceptions or unaligned input - bin.Back(bits); - if (!Original.ReadNextChar(bin, out)) - break; - - code = 0; - bits = 0; - } + void THuffmanCodec::TImpl::THuffmanCache::Decode(NBitIO::TBitInput& bin, TBuffer& out) const { + int bits = 0; + ui64 code = 0; + while (!bin.Eof()) { + ui64 f = 0; + const int toRead = 16 - bits; + if (toRead > 0 && bin.Read(f, toRead)) { + code = (code >> (16 - bits)) | (f << bits); + code &= 0xFFFF; + TCacheEntry entry = CacheEntries[code]; + int start = code > 0 ? CacheEntries[code - 1].EndOffset : 0; + out.Append((const char*)&DecodeCache[start], (const char*)&DecodeCache[entry.EndOffset]); + bits = entry.BitsLeft; + } else { // should never happen until there are exceptions or unaligned input + bin.Back(bits); + if (!Original.ReadNextChar(bin, out)) + break; + + code = 0; + bits = 0; + } } } - THuffmanCodec::THuffmanCodec() - : Impl(new TImpl) - { - MyTraits.NeedsTraining = true; - MyTraits.PreservesPrefixGrouping = true; - MyTraits.PaddingBit = 1; - MyTraits.SizeOnEncodeMultiplier = 2; - MyTraits.SizeOnDecodeMultiplier = 8; - MyTraits.RecommendedSampleSize = 1 << 21; - } + THuffmanCodec::THuffmanCodec() + : Impl(new TImpl) + { + MyTraits.NeedsTraining = true; + MyTraits.PreservesPrefixGrouping = true; + MyTraits.PaddingBit = 1; + MyTraits.SizeOnEncodeMultiplier = 2; + MyTraits.SizeOnDecodeMultiplier = 8; + MyTraits.RecommendedSampleSize = 1 << 21; + } - THuffmanCodec::~THuffmanCodec() = default; + THuffmanCodec::~THuffmanCodec() = default; - ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const { - if (Y_UNLIKELY(!Trained)) - ythrow TCodecException() << " not trained"; + ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const { + if (Y_UNLIKELY(!Trained)) + ythrow TCodecException() << " not trained"; - return Impl->Encode(in, bbb); - } + return Impl->Encode(in, bbb); + } - void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const { - Impl->Decode(in, bbb); - } + void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const { + Impl->Decode(in, bbb); + } - void THuffmanCodec::Save(IOutputStream* out) const { - Impl->Save(out); - } + void THuffmanCodec::Save(IOutputStream* out) const { + Impl->Save(out); + } - void THuffmanCodec::Load(IInputStream* in) { - Impl->Load(in); - } + void THuffmanCodec::Load(IInputStream* in) { + Impl->Load(in); + } - void THuffmanCodec::DoLearn(ISequenceReader& in) { - Impl->Learn(&in); - } + void THuffmanCodec::DoLearn(ISequenceReader& in) { + Impl->Learn(&in); + } void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) { - Impl->LearnByFreqs(freqs); - Trained = true; - } + Impl->LearnByFreqs(freqs); + Trained = true; + } } |