aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/huffman_codec.cpp
diff options
context:
space:
mode:
authorRuslan Kovalev <ruslan.a.kovalev@gmail.com>2022-02-10 16:46:44 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:46:44 +0300
commit59e19371de37995fcb36beb16cd6ec030af960bc (patch)
treefa68e36093ebff8b805462e9e6d331fe9d348214 /library/cpp/codecs/huffman_codec.cpp
parent89db6fe2fe2c32d2a832ddfeb04e8d078e301084 (diff)
downloadydb-59e19371de37995fcb36beb16cd6ec030af960bc.tar.gz
Restoring authorship annotation for Ruslan Kovalev <ruslan.a.kovalev@gmail.com>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/codecs/huffman_codec.cpp')
-rw-r--r--library/cpp/codecs/huffman_codec.cpp266
1 files changed, 133 insertions, 133 deletions
diff --git a/library/cpp/codecs/huffman_codec.cpp b/library/cpp/codecs/huffman_codec.cpp
index 650fe7cdfd..391662fb0d 100644
--- a/library/cpp/codecs/huffman_codec.cpp
+++ b/library/cpp/codecs/huffman_codec.cpp
@@ -1,14 +1,14 @@
-#include "huffman_codec.h"
+#include "huffman_codec.h"
#include <library/cpp/bit_io/bitinput.h>
#include <library/cpp/bit_io/bitoutput.h>
-
-#include <util/generic/algorithm.h>
+
+#include <util/generic/algorithm.h>
#include <util/generic/bitops.h>
-#include <util/stream/buffer.h>
-#include <util/stream/length.h>
-#include <util/string/printf.h>
-
-namespace NCodecs {
+#include <util/stream/buffer.h>
+#include <util/stream/length.h>
+#include <util/string/printf.h>
+
+namespace NCodecs {
template <typename T>
struct TCanonicalCmp {
bool operator()(const T& a, const T& b) const {
@@ -19,40 +19,40 @@ namespace NCodecs {
}
}
};
-
+
template <typename T>
struct TByCharCmp {
bool operator()(const T& a, const T& b) const {
- return a.Char < b.Char;
- }
+ return a.Char < b.Char;
+ }
};
-
+
struct TTreeEntry {
static const ui32 InvalidBranch = (ui32)-1;
-
+
ui64 Freq = 0;
ui32 Branches[2]{InvalidBranch, InvalidBranch};
-
+
ui32 CodeLength = 0;
ui8 Char = 0;
bool Invalid = false;
-
+
TTreeEntry() = default;
-
+
static bool ByFreq(const TTreeEntry& a, const TTreeEntry& b) {
return a.Freq < b.Freq;
}
-
+
static bool ByFreqRev(const TTreeEntry& a, const TTreeEntry& b) {
return a.Freq > b.Freq;
}
};
-
+
using TCodeTree = TVector<TTreeEntry>;
-
+
void InitTreeByFreqs(TCodeTree& tree, const ui64 freqs[256]) {
tree.reserve(255 * 256 / 2); // worst case - balanced tree
-
+
for (ui32 i = 0; i < 256; ++i) {
tree.emplace_back();
tree.back().Char = i;
@@ -72,24 +72,24 @@ namespace NCodecs {
for (ui64 i = 0; i < r.size(); ++i)
++freqs[(ui8)r[i]];
}
-
+
InitTreeByFreqs(tree, freqs);
- }
-
+ }
+
void CalculateCodeLengths(TCodeTree& tree) {
Y_ENSURE(tree.size() == 256, " ");
const ui32 firstbranch = tree.size();
-
+
ui32 curleaf = 0;
ui32 curbranch = firstbranch;
-
+
// building code tree. two priority queues are combined in one.
while (firstbranch - curleaf + tree.size() - curbranch >= 2) {
TTreeEntry e;
-
+
for (auto& branche : e.Branches) {
ui32 br;
-
+
if (curleaf >= firstbranch)
br = curbranch++;
else if (curbranch >= tree.size())
@@ -98,84 +98,84 @@ namespace NCodecs {
br = curleaf++;
else
br = curbranch++;
-
+
Y_ENSURE(br < tree.size(), " ");
branche = br;
e.Freq += tree[br].Freq;
}
-
+
tree.push_back(e);
PushHeap(tree.begin() + curbranch, tree.end(), TTreeEntry::ByFreqRev);
- }
-
+ }
+
// computing code lengths
for (ui64 i = tree.size() - 1; i >= firstbranch; --i) {
TTreeEntry e = tree[i];
-
+
for (auto branche : e.Branches)
tree[branche].CodeLength = e.CodeLength + 1;
}
-
+
// chopping off the branches
tree.resize(firstbranch);
-
+
Sort(tree.begin(), tree.end(), TCanonicalCmp<TTreeEntry>());
-
+
// simplification: we are stripping codes longer than 64 bits
while (!tree.empty() && tree.back().CodeLength > 64)
tree.pop_back();
-
+
// will not compress
if (tree.empty())
return;
-
+
// special invalid code word
tree.back().Invalid = true;
}
-
+
struct TEncoderEntry {
ui64 Code = 0;
-
+
ui8 CodeLength = 0;
ui8 Char = 0;
ui8 Invalid = true;
-
+
explicit TEncoderEntry(TTreeEntry e)
: CodeLength(e.CodeLength)
, Char(e.Char)
, Invalid(e.Invalid)
{
}
-
+
TEncoderEntry() = default;
};
-
+
struct TEncoderTable {
TEncoderEntry Entries[256];
-
+
void Save(IOutputStream* out) const {
ui16 nval = 0;
-
+
for (auto entrie : Entries)
nval += !entrie.Invalid;
-
+
::Save(out, nval);
-
+
for (auto entrie : Entries) {
if (!entrie.Invalid) {
::Save(out, entrie.Char);
::Save(out, entrie.CodeLength);
}
- }
- }
-
+ }
+ }
+
void Load(IInputStream* in) {
ui16 nval = 0;
::Load(in, nval);
-
+
for (ui32 i = 0; i < 256; ++i)
Entries[i].Char = i;
-
+
for (ui32 i = 0; i < nval; ++i) {
ui8 ch = 0;
ui8 len = 0;
@@ -184,15 +184,15 @@ namespace NCodecs {
Entries[ch].CodeLength = len;
Entries[ch].Invalid = false;
}
- }
+ }
};
-
+
struct TDecoderEntry {
ui32 NextTable : 10;
ui32 Char : 8;
ui32 Invalid : 1;
ui32 Bad : 1;
-
+
TDecoderEntry()
: NextTable()
, Char()
@@ -201,27 +201,27 @@ namespace NCodecs {
{
}
};
-
+
struct TDecoderTable: public TIntrusiveListItem<TDecoderTable> {
ui64 Length = 0;
ui64 BaseCode = 0;
-
+
TDecoderEntry Entries[256];
-
+
TDecoderTable() {
Zero(Entries);
}
};
-
+
const int CACHE_BITS_COUNT = 16;
class THuffmanCodec::TImpl: public TAtomicRefCount<TImpl> {
TEncoderTable Encoder;
TDecoderTable Decoder[256];
-
+
TEncoderEntry Invalid;
-
+
ui32 SubTablesNum;
-
+
class THuffmanCache {
struct TCacheEntry {
int EndOffset : 24;
@@ -230,7 +230,7 @@ namespace NCodecs {
TVector<char> DecodeCache;
TVector<TCacheEntry> CacheEntries;
const TImpl& Original;
-
+
public:
THuffmanCache(const THuffmanCodec::TImpl& encoder);
@@ -252,51 +252,51 @@ namespace NCodecs {
if (in.empty()) {
return 0;
}
-
+
out.Reserve(in.size() * 2);
-
+
{
NBitIO::TBitOutputVector<TBuffer> bout(&out);
TStringBuf tin = in;
-
+
// data is under compression
bout.Write(1, 1);
-
+
for (auto t : tin) {
const TEncoderEntry& ce = Encoder.Entries[(ui8)t];
-
+
bout.Write(ce.Code, ce.CodeLength);
-
+
if (ce.Invalid) {
bout.Write(t, 8);
}
}
-
+
// in canonical huffman coding there cannot be a code having no 0 in the suffix
// and shorter than 8 bits.
bout.Write((ui64)-1, bout.GetByteReminder());
return bout.GetByteReminder();
- }
- }
-
+ }
+ }
+
void Decode(TStringBuf in, TBuffer& out) const {
out.Clear();
-
+
if (in.empty()) {
return;
}
-
+
NBitIO::TBitInput bin(in);
ui64 f = 0;
bin.ReadK<1>(f);
-
+
// if data is uncompressed
if (!f) {
in.Skip(1);
out.Append(in.data(), in.size());
} else {
out.Reserve(in.size() * 8);
-
+
if (Cache.Get()) {
Cache->Decode(bin, out);
} else {
@@ -304,36 +304,36 @@ namespace NCodecs {
}
}
}
- }
-
+ }
+
Y_FORCE_INLINE int ReadNextChar(NBitIO::TBitInput& bin, TBuffer& out) const {
const TDecoderTable* table = Decoder;
TDecoderEntry e;
-
+
int bitsRead = 0;
while (true) {
ui64 code = 0;
-
+
if (Y_UNLIKELY(!bin.Read(code, table->Length)))
return 0;
bitsRead += table->Length;
-
+
if (Y_UNLIKELY(code < table->BaseCode))
return 0;
-
+
code -= table->BaseCode;
-
+
if (Y_UNLIKELY(code > 255))
return 0;
-
+
e = table->Entries[code];
-
+
if (Y_UNLIKELY(e.Bad))
return 0;
-
+
if (e.NextTable) {
table = Decoder + e.NextTable;
- } else {
+ } else {
if (e.Invalid) {
code = 0;
bin.ReadK<8>(code);
@@ -344,77 +344,77 @@ namespace NCodecs {
}
return bitsRead;
- }
+ }
}
-
+
Y_ENSURE(false, " could not decode input");
return 0;
- }
-
+ }
+
void GenerateEncoder(TCodeTree& tree) {
const ui64 sz = tree.size();
-
+
TEncoderEntry lastcode = Encoder.Entries[tree[0].Char] = TEncoderEntry(tree[0]);
-
+
for (ui32 i = 1; i < sz; ++i) {
const TTreeEntry& te = tree[i];
TEncoderEntry& e = Encoder.Entries[te.Char];
e = TEncoderEntry(te);
-
+
e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
lastcode = e;
-
+
e.Code = ReverseBits(e.Code, e.CodeLength);
-
+
if (e.Invalid)
Invalid = e;
}
-
+
for (auto& e : Encoder.Entries) {
if (e.Invalid)
e = Invalid;
Y_ENSURE(e.CodeLength, " ");
}
- }
-
+ }
+
void RegenerateEncoder() {
for (auto& entrie : Encoder.Entries) {
if (entrie.Invalid)
entrie.CodeLength = Invalid.CodeLength;
}
-
+
Sort(Encoder.Entries, Encoder.Entries + 256, TCanonicalCmp<TEncoderEntry>());
-
+
TEncoderEntry lastcode = Encoder.Entries[0];
-
+
for (ui32 i = 1; i < 256; ++i) {
TEncoderEntry& e = Encoder.Entries[i];
e.Code = (lastcode.Code + 1) << (e.CodeLength - lastcode.CodeLength);
lastcode = e;
-
+
e.Code = ReverseBits(e.Code, e.CodeLength);
}
-
+
for (auto& entrie : Encoder.Entries) {
if (entrie.Invalid) {
Invalid = entrie;
break;
}
}
-
+
Sort(Encoder.Entries, Encoder.Entries + 256, TByCharCmp<TEncoderEntry>());
-
+
for (auto& entrie : Encoder.Entries) {
if (entrie.Invalid)
entrie = Invalid;
- }
- }
-
+ }
+ }
+
void BuildDecoder() {
TEncoderTable enc = Encoder;
Sort(enc.Entries, enc.Entries + 256, TCanonicalCmp<TEncoderEntry>());
-
+
TEncoderEntry& e1 = enc.Entries[0];
Decoder[0].BaseCode = e1.Code;
Decoder[0].Length = e1.CodeLength;
@@ -423,22 +423,22 @@ namespace NCodecs {
SetEntry(Decoder, e2.Code, e2.CodeLength, e2);
}
Cache.Reset(new THuffmanCache(*this));
- }
-
+ }
+
void SetEntry(TDecoderTable* t, ui64 code, ui64 len, TEncoderEntry e) {
Y_ENSURE(len >= t->Length, len << " < " << t->Length);
-
+
ui64 idx = (code & MaskLowerBits(t->Length)) - t->BaseCode;
TDecoderEntry& d = t->Entries[idx];
-
+
if (len == t->Length) {
Y_ENSURE(!d.NextTable, " ");
-
+
d.Char = e.Char;
d.Invalid = e.Invalid;
return;
}
-
+
if (!d.NextTable) {
Y_ENSURE(SubTablesNum < Y_ARRAY_SIZE(Decoder), " ");
d.NextTable = SubTablesNum++;
@@ -446,10 +446,10 @@ namespace NCodecs {
nt->Length = Min<ui64>(8, len - t->Length);
nt->BaseCode = (code >> t->Length) & MaskLowerBits(nt->Length);
}
-
+
SetEntry(Decoder + d.NextTable, code >> t->Length, len - t->Length, e);
- }
-
+ }
+
void Learn(ISequenceReader* in) {
{
TCodeTree tree;
@@ -459,11 +459,11 @@ namespace NCodecs {
GenerateEncoder(tree);
}
BuildDecoder();
- }
-
+ }
+
void LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {
- TCodeTree tree;
-
+ TCodeTree tree;
+
ui64 freqsArray[256];
Zero(freqsArray);
@@ -491,7 +491,7 @@ namespace NCodecs {
BuildDecoder();
}
};
-
+
THuffmanCodec::TImpl::THuffmanCache::THuffmanCache(const THuffmanCodec::TImpl& codec)
: Original(codec)
{
@@ -512,7 +512,7 @@ namespace NCodecs {
CacheEntries[i] = e;
break;
}
-
+
for (TBuffer::TConstIterator it = decoded.Begin(); it != decoded.End(); ++it) {
DecodeCache.push_back(*it);
}
@@ -558,32 +558,32 @@ namespace NCodecs {
MyTraits.SizeOnDecodeMultiplier = 8;
MyTraits.RecommendedSampleSize = 1 << 21;
}
-
+
THuffmanCodec::~THuffmanCodec() = default;
-
+
ui8 THuffmanCodec::Encode(TStringBuf in, TBuffer& bbb) const {
if (Y_UNLIKELY(!Trained))
ythrow TCodecException() << " not trained";
-
+
return Impl->Encode(in, bbb);
}
-
+
void THuffmanCodec::Decode(TStringBuf in, TBuffer& bbb) const {
Impl->Decode(in, bbb);
}
-
+
void THuffmanCodec::Save(IOutputStream* out) const {
Impl->Save(out);
}
-
+
void THuffmanCodec::Load(IInputStream* in) {
Impl->Load(in);
}
-
+
void THuffmanCodec::DoLearn(ISequenceReader& in) {
Impl->Learn(&in);
}
-
+
void THuffmanCodec::LearnByFreqs(const TArrayRef<std::pair<char, ui64>>& freqs) {
Impl->LearnByFreqs(freqs);
Trained = true;