aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/compproto/huff.h
diff options
context:
space:
mode:
authorAnton Samokhvalov <pg83@yandex.ru>2022-02-10 16:45:15 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:45:15 +0300
commit72cb13b4aff9bc9cf22e49251bc8fd143f82538f (patch)
treeda2c34829458c7d4e74bdfbdf85dff449e9e7fb8 /library/cpp/compproto/huff.h
parent778e51ba091dc39e7b7fcab2b9cf4dbedfb6f2b5 (diff)
downloadydb-72cb13b4aff9bc9cf22e49251bc8fd143f82538f.tar.gz
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 1 of 2.
Diffstat (limited to 'library/cpp/compproto/huff.h')
-rw-r--r--library/cpp/compproto/huff.h702
1 files changed, 351 insertions, 351 deletions
diff --git a/library/cpp/compproto/huff.h b/library/cpp/compproto/huff.h
index fa5c139189..a27c014bbb 100644
--- a/library/cpp/compproto/huff.h
+++ b/library/cpp/compproto/huff.h
@@ -7,396 +7,396 @@
#include <util/generic/algorithm.h>
#include <utility>
-#include <queue>
+#include <queue>
#include "compressor.h"
namespace NCompProto {
- template <size_t CacheSize, typename TEntry>
- struct TCache {
- ui32 CacheKey[CacheSize];
- TEntry CacheVal[CacheSize];
- size_t Hits;
- size_t Misses;
- ui32 Hash(ui32 key) {
- return key % CacheSize;
- }
- TCache() {
- Hits = 0;
- Misses = 0;
- Clear();
- }
- void Clear() {
- for (size_t i = 0; i < CacheSize; ++i) {
- ui32 j = 0;
- for (; Hash(j) == i; ++j)
- ;
- CacheKey[i] = j;
- }
- }
- };
-
- struct TCode {
- i64 Probability;
- ui32 Start;
- ui32 Bits;
- ui32 Prefix;
- ui32 PrefLength;
- TCode(i64 probability = 0, ui32 start = 0, ui32 bits = 0)
- : Probability(probability)
- , Start(start)
- , Bits(bits)
- {
- }
-
- bool operator<(const TCode& code) const {
- return Probability < code.Probability;
+ template <size_t CacheSize, typename TEntry>
+ struct TCache {
+ ui32 CacheKey[CacheSize];
+ TEntry CacheVal[CacheSize];
+ size_t Hits;
+ size_t Misses;
+ ui32 Hash(ui32 key) {
+ return key % CacheSize;
+ }
+ TCache() {
+ Hits = 0;
+ Misses = 0;
+ Clear();
+ }
+ void Clear() {
+ for (size_t i = 0; i < CacheSize; ++i) {
+ ui32 j = 0;
+ for (; Hash(j) == i; ++j)
+ ;
+ CacheKey[i] = j;
+ }
+ }
+ };
+
+ struct TCode {
+ i64 Probability;
+ ui32 Start;
+ ui32 Bits;
+ ui32 Prefix;
+ ui32 PrefLength;
+ TCode(i64 probability = 0, ui32 start = 0, ui32 bits = 0)
+ : Probability(probability)
+ , Start(start)
+ , Bits(bits)
+ {
}
- bool operator>(const TCode& code) const {
- return Probability > code.Probability;
- }
- };
-
- struct TAccum {
- struct TTable {
- TAutoPtr<TTable> Tables[16];
- i64 Counts[16];
- TTable(const TTable& other) {
- for (size_t i = 0; i < 16; ++i) {
- Counts[i] = other.Counts[i];
- if (other.Tables[i].Get()) {
- Tables[i].Reset(new TTable(*other.Tables[i].Get()));
- }
+ bool operator<(const TCode& code) const {
+ return Probability < code.Probability;
+ }
+
+ bool operator>(const TCode& code) const {
+ return Probability > code.Probability;
+ }
+ };
+
+ struct TAccum {
+ struct TTable {
+ TAutoPtr<TTable> Tables[16];
+ i64 Counts[16];
+ TTable(const TTable& other) {
+ for (size_t i = 0; i < 16; ++i) {
+ Counts[i] = other.Counts[i];
+ if (other.Tables[i].Get()) {
+ Tables[i].Reset(new TTable(*other.Tables[i].Get()));
+ }
+ }
+ }
+ TTable() {
+ for (auto& count : Counts)
+ count = 0;
+ }
+
+ i64 GetCellCount(size_t i) {
+ i64 count = Counts[i];
+ if (Tables[i].Get()) {
+ for (size_t j = 0; j < 16; ++j) {
+ count += Tables[i]->GetCellCount(j);
+ }
}
- }
- TTable() {
- for (auto& count : Counts)
- count = 0;
+ return count;
}
- i64 GetCellCount(size_t i) {
- i64 count = Counts[i];
- if (Tables[i].Get()) {
- for (size_t j = 0; j < 16; ++j) {
- count += Tables[i]->GetCellCount(j);
- }
- }
- return count;
- }
-
- i64 GetCount() {
- i64 count = 0;
+ i64 GetCount() {
+ i64 count = 0;
for (size_t j = 0; j < 16; ++j) {
- count += GetCellCount(j);
+ count += GetCellCount(j);
}
- return count;
+ return count;
}
- void GenerateFreqs(TVector<std::pair<i64, TCode>>& codes, int depth, int termDepth, ui32 code, i64 cnt) {
- if (depth == termDepth) {
- for (size_t i = 0; i < 16; ++i) {
- i64 iCount = GetCellCount(i);
- if (Tables[i].Get()) {
- Counts[i] = iCount;
- Tables[i].Reset(nullptr);
- }
-
- if (iCount > cnt || (termDepth == 0 && iCount > 0)) {
- std::pair<i64, TCode> codep;
- codep.first = iCount;
- codep.second.Probability = iCount;
- codep.second.Start = code + (i << (28 - depth));
- codep.second.Bits = 28 - depth;
- codes.push_back(codep);
- Counts[i] = 0;
- }
- }
- }
+ void GenerateFreqs(TVector<std::pair<i64, TCode>>& codes, int depth, int termDepth, ui32 code, i64 cnt) {
+ if (depth == termDepth) {
+ for (size_t i = 0; i < 16; ++i) {
+ i64 iCount = GetCellCount(i);
+ if (Tables[i].Get()) {
+ Counts[i] = iCount;
+ Tables[i].Reset(nullptr);
+ }
+
+ if (iCount > cnt || (termDepth == 0 && iCount > 0)) {
+ std::pair<i64, TCode> codep;
+ codep.first = iCount;
+ codep.second.Probability = iCount;
+ codep.second.Start = code + (i << (28 - depth));
+ codep.second.Bits = 28 - depth;
+ codes.push_back(codep);
+ Counts[i] = 0;
+ }
+ }
+ }
for (size_t i = 0; i < 16; ++i) {
if (Tables[i].Get()) {
- Tables[i]->GenerateFreqs(codes, depth + 4, termDepth, code + (i << (28 - depth)), cnt);
+ Tables[i]->GenerateFreqs(codes, depth + 4, termDepth, code + (i << (28 - depth)), cnt);
}
}
}
- };
-
- TTable Root;
- int TableCount;
- i64 Total;
- ui64 Max;
-
- TAccum() {
- TableCount = 0;
- Total = 0;
- Max = 0;
- }
-
- void GenerateFreqs(TVector<std::pair<i64, TCode>>& codes, int mul) const {
- TTable root(Root);
-
- for (int i = 28; i > 0; i -= 4) {
- root.GenerateFreqs(codes, 0, i, 0, Total / mul);
- }
-
- i64 iCount = root.GetCount();
- if (iCount == 0)
- return;
- std::pair<i64, TCode> codep;
- codep.first = iCount;
- codep.second.Probability = iCount;
- codep.second.Start = 0;
- ui32 bits = 0;
- while (1) {
- if ((1ULL << bits) > Max)
- break;
- ++bits;
- }
- codep.second.Bits = bits;
- codes.push_back(codep);
+ };
+
+ TTable Root;
+ int TableCount;
+ i64 Total;
+ ui64 Max;
+
+ TAccum() {
+ TableCount = 0;
+ Total = 0;
+ Max = 0;
+ }
+
+ void GenerateFreqs(TVector<std::pair<i64, TCode>>& codes, int mul) const {
+ TTable root(Root);
+
+ for (int i = 28; i > 0; i -= 4) {
+ root.GenerateFreqs(codes, 0, i, 0, Total / mul);
+ }
+
+ i64 iCount = root.GetCount();
+ if (iCount == 0)
+ return;
+ std::pair<i64, TCode> codep;
+ codep.first = iCount;
+ codep.second.Probability = iCount;
+ codep.second.Start = 0;
+ ui32 bits = 0;
+ while (1) {
+ if ((1ULL << bits) > Max)
+ break;
+ ++bits;
+ }
+ codep.second.Bits = bits;
+ codes.push_back(codep);
}
- TCache<256, i64*> Cache;
-
- void AddMap(ui32 value, i64 weight = 1) {
- ui32 index = Cache.Hash(value);
- if (Cache.CacheKey[index] == value) {
- Cache.CacheVal[index][0] += weight;
- return;
- }
- TTable* root = &Root;
- for (size_t i = 0; i < 15; ++i) {
- ui32 index2 = (value >> (28 - i * 4)) & 0xf;
- if (!root->Tables[index2].Get()) {
- if (TableCount < 1024) {
- ++TableCount;
- root->Tables[index2].Reset(new TTable);
- } else {
- Cache.CacheKey[index2] = value;
- Cache.CacheVal[index2] = &root->Counts[index2];
- root->Counts[index2] += weight;
- return;
- }
+ TCache<256, i64*> Cache;
+
+ void AddMap(ui32 value, i64 weight = 1) {
+ ui32 index = Cache.Hash(value);
+ if (Cache.CacheKey[index] == value) {
+ Cache.CacheVal[index][0] += weight;
+ return;
+ }
+ TTable* root = &Root;
+ for (size_t i = 0; i < 15; ++i) {
+ ui32 index2 = (value >> (28 - i * 4)) & 0xf;
+ if (!root->Tables[index2].Get()) {
+ if (TableCount < 1024) {
+ ++TableCount;
+ root->Tables[index2].Reset(new TTable);
+ } else {
+ Cache.CacheKey[index2] = value;
+ Cache.CacheVal[index2] = &root->Counts[index2];
+ root->Counts[index2] += weight;
+ return;
+ }
}
- root = root->Tables[index2].Get();
+ root = root->Tables[index2].Get();
}
-
- Cache.CacheKey[index] = value;
- Cache.CacheVal[index] = &root->Counts[value & 0xf];
- root->Counts[value & 0xf] += weight;
+
+ Cache.CacheKey[index] = value;
+ Cache.CacheVal[index] = &root->Counts[value & 0xf];
+ root->Counts[value & 0xf] += weight;
}
- void Add(ui32 value, i64 weight = 1) {
- Max = ::Max(Max, (ui64)value);
- Total += weight;
- AddMap(value, weight);
- };
+ void Add(ui32 value, i64 weight = 1) {
+ Max = ::Max(Max, (ui64)value);
+ Total += weight;
+ AddMap(value, weight);
+ };
};
- struct THuffNode {
- i64 Weight;
- i64 Priority;
- THuffNode* Nodes[2];
- TCode* Code;
- THuffNode(i64 weight, i64 priority, TCode* code)
- : Weight(weight)
- , Priority(priority)
- , Code(code)
- {
- Nodes[0] = nullptr;
- Nodes[1] = nullptr;
- }
-
- void BuildPrefixes(ui32 depth, ui32 prefix) {
- if (Code) {
- Code->Prefix = prefix;
- Code->PrefLength = depth;
- return;
- }
- Nodes[0]->BuildPrefixes(depth + 1, prefix + (0UL << depth));
- Nodes[1]->BuildPrefixes(depth + 1, prefix + (1UL << depth));
- }
-
- i64 Iterate(size_t depth) const {
- if (Code) {
- return (depth + Code->Bits) * Code->Probability;
- }
- return Nodes[0]->Iterate(depth + 1) + Nodes[1]->Iterate(depth + 1);
+ struct THuffNode {
+ i64 Weight;
+ i64 Priority;
+ THuffNode* Nodes[2];
+ TCode* Code;
+ THuffNode(i64 weight, i64 priority, TCode* code)
+ : Weight(weight)
+ , Priority(priority)
+ , Code(code)
+ {
+ Nodes[0] = nullptr;
+ Nodes[1] = nullptr;
+ }
+
+ void BuildPrefixes(ui32 depth, ui32 prefix) {
+ if (Code) {
+ Code->Prefix = prefix;
+ Code->PrefLength = depth;
+ return;
+ }
+ Nodes[0]->BuildPrefixes(depth + 1, prefix + (0UL << depth));
+ Nodes[1]->BuildPrefixes(depth + 1, prefix + (1UL << depth));
+ }
+
+ i64 Iterate(size_t depth) const {
+ if (Code) {
+ return (depth + Code->Bits) * Code->Probability;
+ }
+ return Nodes[0]->Iterate(depth + 1) + Nodes[1]->Iterate(depth + 1);
}
- size_t Depth() const {
- if (Code) {
- return 0;
- }
- return Max(Nodes[0]->Depth(), Nodes[1]->Depth()) + 1;
+ size_t Depth() const {
+ if (Code) {
+ return 0;
+ }
+ return Max(Nodes[0]->Depth(), Nodes[1]->Depth()) + 1;
}
- };
-
- struct THLess {
- bool operator()(const THuffNode* a, const THuffNode* b) {
- if (a->Weight > b->Weight)
- return 1;
- if (a->Weight == b->Weight && a->Priority > b->Priority)
- return 1;
+ };
+
+ struct THLess {
+ bool operator()(const THuffNode* a, const THuffNode* b) {
+ if (a->Weight > b->Weight)
+ return 1;
+ if (a->Weight == b->Weight && a->Priority > b->Priority)
+ return 1;
return 0;
}
- };
-
- inline i64 BuildHuff(TVector<TCode>& codes) {
- TVector<TSimpleSharedPtr<THuffNode>> hold;
- std::priority_queue<THuffNode*, TVector<THuffNode*>, THLess> nodes;
- i64 ret = 0;
-
- int priority = 0;
- for (size_t i = 0; i < codes.size(); ++i) {
- TSimpleSharedPtr<THuffNode> node(new THuffNode(codes[i].Probability, priority++, &codes[i]));
- hold.push_back(node);
- nodes.push(node.Get());
- }
-
- while (nodes.size() > 1) {
- THuffNode* nodea = nodes.top();
- nodes.pop();
- THuffNode* nodeb = nodes.top();
- nodes.pop();
- TSimpleSharedPtr<THuffNode> node(new THuffNode(nodea->Weight + nodeb->Weight, priority++, nullptr));
- node->Nodes[0] = nodea;
- node->Nodes[1] = nodeb;
- hold.push_back(node);
- nodes.push(node.Get());
- }
-
- if (nodes.size()) {
- THuffNode* node = nodes.top();
- node->BuildPrefixes(0, 0);
- ret = node->Iterate(0);
- }
-
- return ret;
- };
-
- struct TCoderEntry {
- ui32 MinValue;
- ui16 Prefix;
- ui8 PrefixBits;
- ui8 AllBits;
-
- ui64 MaxValue() const {
- return MinValue + (1ULL << (AllBits - PrefixBits));
- }
- };
-
- inline i64 Analyze(const TAccum& acc, TVector<TCoderEntry>& retCodes) {
- i64 ret;
- for (int k = 256; k > 0; --k) {
- retCodes.clear();
- TVector<std::pair<i64, TCode>> pairs;
- acc.GenerateFreqs(pairs, k);
- TVector<TCode> codes;
- for (size_t i = 0; i < pairs.size(); ++i) {
- codes.push_back(pairs[i].second);
- }
-
- StableSort(codes.begin(), codes.end(), std::greater<TCode>());
-
- ret = BuildHuff(codes);
- bool valid = true;
- for (size_t i = 0; i < codes.size(); ++i) {
- TCoderEntry code;
- code.MinValue = codes[i].Start;
- code.Prefix = codes[i].Prefix;
- code.PrefixBits = codes[i].PrefLength;
- if (code.PrefixBits > 6)
- valid = false;
- code.AllBits = code.PrefixBits + codes[i].Bits;
- retCodes.push_back(code);
- }
- if (valid)
- return ret;
- }
-
- return ret;
- }
-
- struct TComparer {
- bool operator()(const TCoderEntry& e0, const TCoderEntry& e1) const {
- return e0.AllBits < e1.AllBits;
+ };
+
+ inline i64 BuildHuff(TVector<TCode>& codes) {
+ TVector<TSimpleSharedPtr<THuffNode>> hold;
+ std::priority_queue<THuffNode*, TVector<THuffNode*>, THLess> nodes;
+ i64 ret = 0;
+
+ int priority = 0;
+ for (size_t i = 0; i < codes.size(); ++i) {
+ TSimpleSharedPtr<THuffNode> node(new THuffNode(codes[i].Probability, priority++, &codes[i]));
+ hold.push_back(node);
+ nodes.push(node.Get());
+ }
+
+ while (nodes.size() > 1) {
+ THuffNode* nodea = nodes.top();
+ nodes.pop();
+ THuffNode* nodeb = nodes.top();
+ nodes.pop();
+ TSimpleSharedPtr<THuffNode> node(new THuffNode(nodea->Weight + nodeb->Weight, priority++, nullptr));
+ node->Nodes[0] = nodea;
+ node->Nodes[1] = nodeb;
+ hold.push_back(node);
+ nodes.push(node.Get());
+ }
+
+ if (nodes.size()) {
+ THuffNode* node = nodes.top();
+ node->BuildPrefixes(0, 0);
+ ret = node->Iterate(0);
+ }
+
+ return ret;
+ };
+
+ struct TCoderEntry {
+ ui32 MinValue;
+ ui16 Prefix;
+ ui8 PrefixBits;
+ ui8 AllBits;
+
+ ui64 MaxValue() const {
+ return MinValue + (1ULL << (AllBits - PrefixBits));
+ }
+ };
+
+ inline i64 Analyze(const TAccum& acc, TVector<TCoderEntry>& retCodes) {
+ i64 ret;
+ for (int k = 256; k > 0; --k) {
+ retCodes.clear();
+ TVector<std::pair<i64, TCode>> pairs;
+ acc.GenerateFreqs(pairs, k);
+ TVector<TCode> codes;
+ for (size_t i = 0; i < pairs.size(); ++i) {
+ codes.push_back(pairs[i].second);
+ }
+
+ StableSort(codes.begin(), codes.end(), std::greater<TCode>());
+
+ ret = BuildHuff(codes);
+ bool valid = true;
+ for (size_t i = 0; i < codes.size(); ++i) {
+ TCoderEntry code;
+ code.MinValue = codes[i].Start;
+ code.Prefix = codes[i].Prefix;
+ code.PrefixBits = codes[i].PrefLength;
+ if (code.PrefixBits > 6)
+ valid = false;
+ code.AllBits = code.PrefixBits + codes[i].Bits;
+ retCodes.push_back(code);
+ }
+ if (valid)
+ return ret;
}
- };
- struct TCoder {
- TVector<TCoderEntry> Entries;
- void Normalize() {
- TComparer comp;
- StableSort(Entries.begin(), Entries.end(), comp);
- }
- TCoder() {
- InitDefault();
- }
- void InitDefault() {
- ui64 cum = 0;
- Cache.Clear();
- Entries.clear();
- ui16 b = 1;
- for (ui16 i = 0; i < 40; ++i) {
- ui16 bits = Min(b, (ui16)(32));
- b = (b * 16) / 10 + 1;
- if (b > 32)
- b = 32;
- TCoderEntry entry;
- entry.PrefixBits = i + 1;
- entry.AllBits = entry.PrefixBits + bits;
- entry.MinValue = (ui32)Min(cum, (ui64)(ui32)(-1));
- cum += (1ULL << bits);
- entry.Prefix = ((1UL << i) - 1);
- Entries.push_back(entry);
- if (cum > (ui32)(-1)) {
- return;
- }
- };
- }
+ return ret;
+ }
- TCache<1024, TCoderEntry> Cache;
-
- ui64 RealCode(ui32 value, const TCoderEntry& entry, size_t& length) {
- length = entry.AllBits;
- return (ui64(value - entry.MinValue) << entry.PrefixBits) + entry.Prefix;
- }
-
- bool Empty() const {
- return Entries.empty();
+ struct TComparer {
+ bool operator()(const TCoderEntry& e0, const TCoderEntry& e1) const {
+ return e0.AllBits < e1.AllBits;
}
- const TCoderEntry& GetEntry(ui32 code, ui8& id) const {
- for (size_t i = 0; i < Entries.size(); ++i) {
- const TCoderEntry& entry = Entries[i];
- ui32 prefMask = (1UL << entry.PrefixBits) - 1UL;
- if (entry.Prefix == (code & prefMask)) {
- id = ui8(i);
- return entry;
- }
+ };
+
+ struct TCoder {
+ TVector<TCoderEntry> Entries;
+ void Normalize() {
+ TComparer comp;
+ StableSort(Entries.begin(), Entries.end(), comp);
+ }
+ TCoder() {
+ InitDefault();
+ }
+ void InitDefault() {
+ ui64 cum = 0;
+ Cache.Clear();
+ Entries.clear();
+ ui16 b = 1;
+ for (ui16 i = 0; i < 40; ++i) {
+ ui16 bits = Min(b, (ui16)(32));
+ b = (b * 16) / 10 + 1;
+ if (b > 32)
+ b = 32;
+ TCoderEntry entry;
+ entry.PrefixBits = i + 1;
+ entry.AllBits = entry.PrefixBits + bits;
+ entry.MinValue = (ui32)Min(cum, (ui64)(ui32)(-1));
+ cum += (1ULL << bits);
+ entry.Prefix = ((1UL << i) - 1);
+ Entries.push_back(entry);
+ if (cum > (ui32)(-1)) {
+ return;
+ }
+ };
+ }
+
+ TCache<1024, TCoderEntry> Cache;
+
+ ui64 RealCode(ui32 value, const TCoderEntry& entry, size_t& length) {
+ length = entry.AllBits;
+ return (ui64(value - entry.MinValue) << entry.PrefixBits) + entry.Prefix;
+ }
+
+ bool Empty() const {
+ return Entries.empty();
+ }
+ const TCoderEntry& GetEntry(ui32 code, ui8& id) const {
+ for (size_t i = 0; i < Entries.size(); ++i) {
+ const TCoderEntry& entry = Entries[i];
+ ui32 prefMask = (1UL << entry.PrefixBits) - 1UL;
+ if (entry.Prefix == (code & prefMask)) {
+ id = ui8(i);
+ return entry;
+ }
}
- ythrow yexception() << "bad entry";
- return Entries[0];
+ ythrow yexception() << "bad entry";
+ return Entries[0];
}
- ui64 Code(ui32 entry, size_t& length) {
- ui32 index = Cache.Hash(entry);
- if (Cache.CacheKey[index] == entry) {
- ++Cache.Hits;
+ ui64 Code(ui32 entry, size_t& length) {
+ ui32 index = Cache.Hash(entry);
+ if (Cache.CacheKey[index] == entry) {
+ ++Cache.Hits;
return RealCode(entry, Cache.CacheVal[index], length);
}
- ++Cache.Misses;
- for (size_t i = 0; i < Entries.size(); ++i) {
- if (entry >= Entries[i].MinValue && entry < Entries[i].MaxValue()) {
- Cache.CacheKey[index] = entry;
- Cache.CacheVal[index] = Entries[i];
- return RealCode(entry, Cache.CacheVal[index], length);
- }
- }
-
- ythrow yexception() << "bad huff tree";
- return 0;
+ ++Cache.Misses;
+ for (size_t i = 0; i < Entries.size(); ++i) {
+ if (entry >= Entries[i].MinValue && entry < Entries[i].MaxValue()) {
+ Cache.CacheKey[index] = entry;
+ Cache.CacheVal[index] = Entries[i];
+ return RealCode(entry, Cache.CacheVal[index], length);
+ }
+ }
+
+ ythrow yexception() << "bad huff tree";
+ return 0;
}
- };
+ };
-}
+}