diff options
author | Anton Samokhvalov <pg83@yandex.ru> | 2022-02-10 16:45:17 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:17 +0300 |
commit | d3a398281c6fd1d3672036cb2d63f842d2cb28c5 (patch) | |
tree | dd4bd3ca0f36b817e96812825ffaf10d645803f2 /library/cpp/comptable | |
parent | 72cb13b4aff9bc9cf22e49251bc8fd143f82538f (diff) | |
download | ydb-d3a398281c6fd1d3672036cb2d63f842d2cb28c5.tar.gz |
Restoring authorship annotation for Anton Samokhvalov <pg83@yandex.ru>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/comptable')
-rw-r--r-- | library/cpp/comptable/comptable.cpp | 730 | ||||
-rw-r--r-- | library/cpp/comptable/comptable.h | 94 | ||||
-rw-r--r-- | library/cpp/comptable/usage/usage.cpp | 30 | ||||
-rw-r--r-- | library/cpp/comptable/ut/comptable_ut.cpp | 28 |
4 files changed, 441 insertions, 441 deletions
diff --git a/library/cpp/comptable/comptable.cpp b/library/cpp/comptable/comptable.cpp index 8e87da3060..8a92d4d1aa 100644 --- a/library/cpp/comptable/comptable.cpp +++ b/library/cpp/comptable/comptable.cpp @@ -11,433 +11,433 @@ #include <cstdlib> namespace NCompTable { - static const ui32 magicHashMul = 0xd077cd1f; - static const size_t hashSizeLog = 18; - static const size_t hashSize = 1 << hashSizeLog; + static const ui32 magicHashMul = 0xd077cd1f; + static const size_t hashSizeLog = 18; + static const size_t hashSize = 1 << hashSizeLog; - size_t HashIndex(ui32 value, ui32 hashMul) { - return (value * hashMul) >> (32 - hashSizeLog); - } + size_t HashIndex(ui32 value, ui32 hashMul) { + return (value * hashMul) >> (32 - hashSizeLog); + } - void TCompressorTable::GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const { - ui64 code = value - entry.MinValue; - bitCode = (code << entry.PrefixBits) | entry.Prefix; - bitLength = entry.AllBits; - } - void TCompressorTable::GetLastHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const { - GetHuffCode(HuffCodes[9], value, bitCode, bitLength); - } - void TCompressorTable::GetHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const { - for (auto huffCode : HuffCodes) { - ui32 minValue = huffCode.MinValue; - if (minValue <= value && value < huffCode.MaxValue()) { - GetHuffCode(huffCode, value, bitCode, bitLength); - return; - } + void TCompressorTable::GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const { + ui64 code = value - entry.MinValue; + bitCode = (code << entry.PrefixBits) | entry.Prefix; + bitLength = entry.AllBits; + } + void TCompressorTable::GetLastHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const { + GetHuffCode(HuffCodes[9], value, bitCode, bitLength); + } + void TCompressorTable::GetHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const { + for (auto huffCode : HuffCodes) { + ui32 minValue = huffCode.MinValue; + if (minValue <= value && value < huffCode.MaxValue()) { + GetHuffCode(huffCode, value, bitCode, bitLength); + return; + } } - abort(); + abort(); } - ui8 TCompressorTable::GetHuffIndex(ui8 prefix) { - for (size_t i = 0; i < 10; ++i) { - if ((prefix & ((1 << HuffCodes[i].PrefixBits) - 1)) == HuffCodes[i].Prefix) { - return i; - } + ui8 TCompressorTable::GetHuffIndex(ui8 prefix) { + for (size_t i = 0; i < 10; ++i) { + if ((prefix & ((1 << HuffCodes[i].PrefixBits) - 1)) == HuffCodes[i].Prefix) { + return i; + } } - abort(); - return 0; + abort(); + return 0; } - void TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536]) { - i64 add = 1; - while (1) { - if (BuildHuffCodes(totalFreq, freqs, add)) { - return; - } - add = add * 2; + void TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536]) { + i64 add = 1; + while (1) { + if (BuildHuffCodes(totalFreq, freqs, add)) { + return; + } + add = add * 2; } } - bool TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add) { - TVector<NCompProto::TCode> codes; - size_t bits[] = {0, 1, 2, 4, 8, 10, 12, 14, 16, 32}; - size_t off = 0; - ui32 total = totalFreq; - for (size_t i = 0; i < 9; ++i) { - size_t size = 1 << bits[i]; - ui32 weight = 0; - for (size_t j = off; j < off + size && j < 65536; ++j) { - weight += freqs[j]; - } - codes.push_back(NCompProto::TCode(weight + add, ui32(off), bits[i])); - total = total > weight ? total - weight : 0; - off += size; + bool TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add) { + TVector<NCompProto::TCode> codes; + size_t bits[] = {0, 1, 2, 4, 8, 10, 12, 14, 16, 32}; + size_t off = 0; + ui32 total = totalFreq; + for (size_t i = 0; i < 9; ++i) { + size_t size = 1 << bits[i]; + ui32 weight = 0; + for (size_t j = off; j < off + size && j < 65536; ++j) { + weight += freqs[j]; + } + codes.push_back(NCompProto::TCode(weight + add, ui32(off), bits[i])); + total = total > weight ? total - weight : 0; + off += size; } - codes.push_back(NCompProto::TCode(total + add, 0, 32)); - i64 ret = NCompProto::BuildHuff(codes); - Y_UNUSED(ret); - for (size_t i = 0; i < codes.size(); ++i) { - NCompProto::TCoderEntry& code = HuffCodes[i]; - code.MinValue = codes[i].Start; - code.Prefix = codes[i].Prefix; - code.PrefixBits = codes[i].PrefLength; - code.AllBits = code.PrefixBits + codes[i].Bits; - if (code.PrefixBits > 8) { - return false; - } + codes.push_back(NCompProto::TCode(total + add, 0, 32)); + i64 ret = NCompProto::BuildHuff(codes); + Y_UNUSED(ret); + for (size_t i = 0; i < codes.size(); ++i) { + NCompProto::TCoderEntry& code = HuffCodes[i]; + code.MinValue = codes[i].Start; + code.Prefix = codes[i].Prefix; + code.PrefixBits = codes[i].PrefLength; + code.AllBits = code.PrefixBits + codes[i].Bits; + if (code.PrefixBits > 8) { + return false; + } + } + for (size_t i = 0; i < 256; ++i) { + HuffIndices[i] = GetHuffIndex(ui8(i)); } - for (size_t i = 0; i < 256; ++i) { - HuffIndices[i] = GetHuffIndex(ui8(i)); - } - return true; + return true; } - template <class TIterator> - void Iterate(const TStringBuf& data, TIterator& iterator) { - size_t i = 0; - iterator.Visit(ui32(data.size())); - for (; i + 3 < data.size(); i += 4) { + template <class TIterator> + void Iterate(const TStringBuf& data, TIterator& iterator) { + size_t i = 0; + iterator.Visit(ui32(data.size())); + for (; i + 3 < data.size(); i += 4) { iterator.Visit(reinterpret_cast<const ui32*>(data.data() + i)[0]); - } - if (i != data.size()) { - ui32 buffer[1] = {0}; + } + if (i != data.size()) { + ui32 buffer[1] = {0}; memcpy(buffer, data.data() + i, data.size() - i); - iterator.Visit(buffer[0]); - } + iterator.Visit(buffer[0]); + } } - class TDataCompressor { - ui32 Keys[hashSize]; - ui32 Vals[hashSize]; - ui64 BitCodes[hashSize]; - ui8 BitLengths[hashSize]; - ui32 HashMul; - const TCompressorTable Table; - - public: - TDataCompressor(const TCompressorTable& table) - : Table(table) - { - HashMul = table.HashMul; - for (size_t i = 0; i < hashSize; ++i) { - Keys[i] = 0; - Vals[i] = ui32(-1); - } - for (size_t i = 0; i < 65536; ++i) { - size_t index = HashIndex(table.Table[i], table.HashMul); - Keys[index] = table.Table[i]; - Vals[index] = i; - table.GetHuffCode(ui32(i), BitCodes[index], BitLengths[index]); - } + class TDataCompressor { + ui32 Keys[hashSize]; + ui32 Vals[hashSize]; + ui64 BitCodes[hashSize]; + ui8 BitLengths[hashSize]; + ui32 HashMul; + const TCompressorTable Table; + + public: + TDataCompressor(const TCompressorTable& table) + : Table(table) + { + HashMul = table.HashMul; + for (size_t i = 0; i < hashSize; ++i) { + Keys[i] = 0; + Vals[i] = ui32(-1); + } + for (size_t i = 0; i < 65536; ++i) { + size_t index = HashIndex(table.Table[i], table.HashMul); + Keys[index] = table.Table[i]; + Vals[index] = i; + table.GetHuffCode(ui32(i), BitCodes[index], BitLengths[index]); + } } - void GetStat(ui32 val, ui32& stat) const { - size_t index = HashIndex(val, HashMul); - if (Keys[index] == val) { - stat = Vals[index]; - } else { - stat = ui32(-1); - } + void GetStat(ui32 val, ui32& stat) const { + size_t index = HashIndex(val, HashMul); + if (Keys[index] == val) { + stat = Vals[index]; + } else { + stat = ui32(-1); + } } - void GetStat(ui32 val, ui64& bitCode, ui8& bitLength) const { - size_t index = HashIndex(val, HashMul); - if (Keys[index] == val) { - bitCode = BitCodes[index]; - bitLength = BitLengths[index]; - } else { - Table.GetLastHuffCode(val, bitCode, bitLength); - } + void GetStat(ui32 val, ui64& bitCode, ui8& bitLength) const { + size_t index = HashIndex(val, HashMul); + if (Keys[index] == val) { + bitCode = BitCodes[index]; + bitLength = BitLengths[index]; + } else { + Table.GetLastHuffCode(val, bitCode, bitLength); + } } - size_t Compress4(const ui32* data4, ui8* dataBuff) const { - ui8* oldBuff = dataBuff; - ++dataBuff; - ui32 status = 0; - for (size_t i = 0; i < 4; ++i) { - status = (status << 2); - ui32 data = data4[i]; - if (data == 0) { - continue; - } - ui32 stat; - GetStat(data, stat); - if (stat < 256) { - memcpy(dataBuff, &stat, 1); - dataBuff += 1; - status += 1; - } else if (stat < 65536) { - memcpy(dataBuff, &stat, 2); - dataBuff += 2; - status += 2; - } else { - memcpy(dataBuff, &data, 4); - dataBuff += 4; - status += 3; - } - } - oldBuff[0] = ui8(status); - return dataBuff - oldBuff; + size_t Compress4(const ui32* data4, ui8* dataBuff) const { + ui8* oldBuff = dataBuff; + ++dataBuff; + ui32 status = 0; + for (size_t i = 0; i < 4; ++i) { + status = (status << 2); + ui32 data = data4[i]; + if (data == 0) { + continue; + } + ui32 stat; + GetStat(data, stat); + if (stat < 256) { + memcpy(dataBuff, &stat, 1); + dataBuff += 1; + status += 1; + } else if (stat < 65536) { + memcpy(dataBuff, &stat, 2); + dataBuff += 2; + status += 2; + } else { + memcpy(dataBuff, &data, 4); + dataBuff += 4; + status += 3; + } + } + oldBuff[0] = ui8(status); + return dataBuff - oldBuff; } - struct TCompressorIterator { - TVector<char>& Result; - const TDataCompressor& Compressor; - ui32 Cached[4]; - size_t Index; - size_t RealSize; - TCompressorIterator(TVector<char>& result, const TDataCompressor& compressor) - : Result(result) - , Compressor(compressor) - , Index(0) - , RealSize(0) - { + struct TCompressorIterator { + TVector<char>& Result; + const TDataCompressor& Compressor; + ui32 Cached[4]; + size_t Index; + size_t RealSize; + TCompressorIterator(TVector<char>& result, const TDataCompressor& compressor) + : Result(result) + , Compressor(compressor) + , Index(0) + , RealSize(0) + { + } + void Flush() { + Result.yresize(RealSize + 32); + RealSize += Compressor.Compress4(Cached, reinterpret_cast<ui8*>(Result.data()) + RealSize); + Index = 0; + } + void Visit(const ui32 data) { + Cached[Index] = data; + ++Index; + if (Index == 4) { + Flush(); + } + } + ~TCompressorIterator() { + if (Index != 0) { + for (size_t i = Index; i < 4; ++i) { + Cached[i] = 0; + } + Flush(); + } + Result.yresize(RealSize); } - void Flush() { - Result.yresize(RealSize + 32); - RealSize += Compressor.Compress4(Cached, reinterpret_cast<ui8*>(Result.data()) + RealSize); - Index = 0; - } - void Visit(const ui32 data) { - Cached[Index] = data; - ++Index; - if (Index == 4) { - Flush(); - } - } - ~TCompressorIterator() { - if (Index != 0) { - for (size_t i = Index; i < 4; ++i) { - Cached[i] = 0; - } - Flush(); - } - Result.yresize(RealSize); - } - }; - struct THQCompressorIterator { - TVector<char>& Result; - const TDataCompressor& Compressor; - size_t RealSize; - ui64 Offset; - THQCompressorIterator(TVector<char>& result, const TDataCompressor& compressor) - : Result(result) - , Compressor(compressor) - , RealSize(0) - , Offset(0) - { - } - void Visit(const ui32 data) { - size_t byteOff = Offset >> 3; - Result.yresize(byteOff + 32); - ui64 bitCode; - ui8 bitLength; - Compressor.GetStat(data, bitCode, bitLength); - ui64 dst; - memcpy(&dst, &Result[byteOff], sizeof(dst)); - ui64 mask = ((1ULL << (Offset & 7)) - 1ULL); - dst = (dst & mask) | (bitCode << (Offset & 7)); - memcpy(&Result[byteOff], &dst, sizeof(dst)); - Offset += bitLength; - } - ~THQCompressorIterator() { - Result.yresize((Offset + 7) >> 3); - if (Offset & 7) - Result.back() &= (1 << (Offset & 7)) - 1; - } - }; - template <bool HQ> - void Compress(const TStringBuf& stringBuf, TVector<char>& rslt) const { - if (!HQ) { - TCompressorIterator iterator(rslt, *this); - Iterate(stringBuf, iterator); + }; + struct THQCompressorIterator { + TVector<char>& Result; + const TDataCompressor& Compressor; + size_t RealSize; + ui64 Offset; + THQCompressorIterator(TVector<char>& result, const TDataCompressor& compressor) + : Result(result) + , Compressor(compressor) + , RealSize(0) + , Offset(0) + { + } + void Visit(const ui32 data) { + size_t byteOff = Offset >> 3; + Result.yresize(byteOff + 32); + ui64 bitCode; + ui8 bitLength; + Compressor.GetStat(data, bitCode, bitLength); + ui64 dst; + memcpy(&dst, &Result[byteOff], sizeof(dst)); + ui64 mask = ((1ULL << (Offset & 7)) - 1ULL); + dst = (dst & mask) | (bitCode << (Offset & 7)); + memcpy(&Result[byteOff], &dst, sizeof(dst)); + Offset += bitLength; + } + ~THQCompressorIterator() { + Result.yresize((Offset + 7) >> 3); + if (Offset & 7) + Result.back() &= (1 << (Offset & 7)) - 1; + } + }; + template <bool HQ> + void Compress(const TStringBuf& stringBuf, TVector<char>& rslt) const { + if (!HQ) { + TCompressorIterator iterator(rslt, *this); + Iterate(stringBuf, iterator); } else { - THQCompressorIterator iterator(rslt, *this); - Iterate(stringBuf, iterator); + THQCompressorIterator iterator(rslt, *this); + Iterate(stringBuf, iterator); } } - }; - - class TDataDecompressor { - const TCompressorTable Table; - - public: - TDataDecompressor(const TCompressorTable& table) - : Table(table) + }; + + class TDataDecompressor { + const TCompressorTable Table; + + public: + TDataDecompressor(const TCompressorTable& table) + : Table(table) { } - size_t Decompress(ui32* data4, const ui8* dataBuff, ui32 type) const { - if (type == 0) { - data4[0] = 0; - return 0; - } - if (type == 1) { - ui8 masked; - memcpy(&masked, dataBuff, sizeof(masked)); - data4[0] = Table.Table[masked]; - return 1; - } else if (type == 2) { - ui16 masked; - memcpy(&masked, dataBuff, sizeof(masked)); - data4[0] = Table.Table[masked]; - return 2; - } else { - memcpy(data4, dataBuff, sizeof(*data4)); - return 4; - } + size_t Decompress(ui32* data4, const ui8* dataBuff, ui32 type) const { + if (type == 0) { + data4[0] = 0; + return 0; + } + if (type == 1) { + ui8 masked; + memcpy(&masked, dataBuff, sizeof(masked)); + data4[0] = Table.Table[masked]; + return 1; + } else if (type == 2) { + ui16 masked; + memcpy(&masked, dataBuff, sizeof(masked)); + data4[0] = Table.Table[masked]; + return 2; + } else { + memcpy(data4, dataBuff, sizeof(*data4)); + return 4; + } + } + size_t Decompress(ui32* data4, const ui8* dataBuff) const { + ui32 status = *dataBuff; + const ui8* oldData = dataBuff; + ++dataBuff; + dataBuff += Decompress(data4 + 0, dataBuff, (status >> 6)); + dataBuff += Decompress(data4 + 1, dataBuff, (status >> 4) & 0x3); + dataBuff += Decompress(data4 + 2, dataBuff, (status >> 2) & 0x3); + dataBuff += Decompress(data4 + 3, dataBuff, (status)&0x3); + return dataBuff - oldData; } - size_t Decompress(ui32* data4, const ui8* dataBuff) const { - ui32 status = *dataBuff; - const ui8* oldData = dataBuff; - ++dataBuff; - dataBuff += Decompress(data4 + 0, dataBuff, (status >> 6)); - dataBuff += Decompress(data4 + 1, dataBuff, (status >> 4) & 0x3); - dataBuff += Decompress(data4 + 2, dataBuff, (status >> 2) & 0x3); - dataBuff += Decompress(data4 + 3, dataBuff, (status)&0x3); - return dataBuff - oldData; - } - void Decompress(ui32* data, const ui8* dataBuff, ui64& offset, ui64 border) const { - size_t off = offset >> 3; - ui64 val = 0; - if (border - off >= sizeof(val)) { - memcpy(&val, dataBuff + off, sizeof(val)); - } else { - memcpy(&val, dataBuff + off, border - off); + void Decompress(ui32* data, const ui8* dataBuff, ui64& offset, ui64 border) const { + size_t off = offset >> 3; + ui64 val = 0; + if (border - off >= sizeof(val)) { + memcpy(&val, dataBuff + off, sizeof(val)); + } else { + memcpy(&val, dataBuff + off, border - off); } - val >>= (offset & 7); - ui8 index = Table.HuffIndices[ui8(val)]; - const NCompProto::TCoderEntry& entry = Table.HuffCodes[index]; - ui8 allBits = entry.AllBits; - val = (val & ((1ULL << allBits) - 1ULL)) >> entry.PrefixBits; - val = val + entry.MinValue; - data[0] = (index == 9) ? val : Table.Table[val]; - offset += allBits; + val >>= (offset & 7); + ui8 index = Table.HuffIndices[ui8(val)]; + const NCompProto::TCoderEntry& entry = Table.HuffCodes[index]; + ui8 allBits = entry.AllBits; + val = (val & ((1ULL << allBits) - 1ULL)) >> entry.PrefixBits; + val = val + entry.MinValue; + data[0] = (index == 9) ? val : Table.Table[val]; + offset += allBits; } - size_t GetJunkSize() const { - return 8; - } - template <bool HQ> - void Decompress(const TStringBuf& dataBuf, TVector<char>& rslt) const { - rslt.clear(); - if (dataBuf.empty()) { - return; - } + size_t GetJunkSize() const { + return 8; + } + template <bool HQ> + void Decompress(const TStringBuf& dataBuf, TVector<char>& rslt) const { + rslt.clear(); + if (dataBuf.empty()) { + return; + } const ui8* src = reinterpret_cast<const ui8*>(dataBuf.data()); ui64 border = dataBuf.size(); - ui32 len = 0; - ui32 nullTerm = 1; - if (HQ) { - ui64 offset = 0; - ui32 length = 0; - Decompress(&length, src, offset, border); - size_t length32 = (length + 3) / 4; - rslt.yresize(length32 * 4 + nullTerm); - ui32* result = reinterpret_cast<ui32*>(rslt.data()); - len = length; - for (size_t i = 0; i < length32; ++i) { - Decompress(&result[i], src, offset, border); + ui32 len = 0; + ui32 nullTerm = 1; + if (HQ) { + ui64 offset = 0; + ui32 length = 0; + Decompress(&length, src, offset, border); + size_t length32 = (length + 3) / 4; + rslt.yresize(length32 * 4 + nullTerm); + ui32* result = reinterpret_cast<ui32*>(rslt.data()); + len = length; + for (size_t i = 0; i < length32; ++i) { + Decompress(&result[i], src, offset, border); + } + } else { + ui32 data[4]; + src += Decompress(data, src); + len = data[0]; + size_t length32x4 = (4 + len + 15) / 16; + rslt.yresize(length32x4 * 16 + nullTerm); + ui32* result = reinterpret_cast<ui32*>(rslt.data()); + for (size_t i = 0; i < 3; ++i) { + result[i] = data[i + 1]; + } + for (size_t j = 1; j < length32x4; ++j) { + src += Decompress(&result[j * 4 - 1], src); } - } else { - ui32 data[4]; - src += Decompress(data, src); - len = data[0]; - size_t length32x4 = (4 + len + 15) / 16; - rslt.yresize(length32x4 * 16 + nullTerm); - ui32* result = reinterpret_cast<ui32*>(rslt.data()); - for (size_t i = 0; i < 3; ++i) { - result[i] = data[i + 1]; - } - for (size_t j = 1; j < length32x4; ++j) { - src += Decompress(&result[j * 4 - 1], src); - } } - rslt.resize(len); + rslt.resize(len); } }; - - struct TSamplerIterator { - TDataSampler& Sampler; - TSamplerIterator(TDataSampler& sampler) - : Sampler(sampler) + + struct TSamplerIterator { + TDataSampler& Sampler; + TSamplerIterator(TDataSampler& sampler) + : Sampler(sampler) { } void Visit(const ui32 data) { - Sampler.AddStat(data); + Sampler.AddStat(data); } - }; - - struct TGreaterComparer { - //std::greater in arcadia??? - bool operator()(ui64 a, ui64 b) const { - return a > b; + }; + + struct TGreaterComparer { + //std::greater in arcadia??? + bool operator()(ui64 a, ui64 b) const { + return a > b; } }; - - TDataSampler::TDataSampler() { - memset(this, 0, sizeof(*this)); + + TDataSampler::TDataSampler() { + memset(this, 0, sizeof(*this)); } - void TDataSampler::BuildTable(TCompressorTable& table) const { - std::vector<ui64> sorted; - for (size_t i = 0; i < Size; ++i) { - ui64 res = (ui64(EntryHit[i]) << 32) + ui32(i); - sorted.push_back(res); + void TDataSampler::BuildTable(TCompressorTable& table) const { + std::vector<ui64> sorted; + for (size_t i = 0; i < Size; ++i) { + ui64 res = (ui64(EntryHit[i]) << 32) + ui32(i); + sorted.push_back(res); } - std::vector<ui32> entryTbl(Size); - std::sort(sorted.begin(), sorted.end(), TGreaterComparer()); - table.HashMul = magicHashMul; - i64 freqs[65536]; - for (size_t i = 0; i < 65536; ++i) { - ui32 ind = ui32(sorted[i]); - table.Table[i] = EntryVal[ind]; - freqs[i] = EntryHit[ind]; + std::vector<ui32> entryTbl(Size); + std::sort(sorted.begin(), sorted.end(), TGreaterComparer()); + table.HashMul = magicHashMul; + i64 freqs[65536]; + for (size_t i = 0; i < 65536; ++i) { + ui32 ind = ui32(sorted[i]); + table.Table[i] = EntryVal[ind]; + freqs[i] = EntryHit[ind]; } - table.BuildHuffCodes(Counter, freqs); + table.BuildHuffCodes(Counter, freqs); } - - void TDataSampler::AddStat(ui32 val) { - ++Counter; - size_t hashInd = HashIndex(val, magicHashMul); - if (EntryVal[hashInd] == val) { - ++EntryHit[hashInd]; - } else if (EntryHit[hashInd] > 1) { - --EntryHit[hashInd]; + + void TDataSampler::AddStat(ui32 val) { + ++Counter; + size_t hashInd = HashIndex(val, magicHashMul); + if (EntryVal[hashInd] == val) { + ++EntryHit[hashInd]; + } else if (EntryHit[hashInd] > 1) { + --EntryHit[hashInd]; } else { - EntryHit[hashInd] = 1; - EntryVal[hashInd] = val; + EntryHit[hashInd] = 1; + EntryVal[hashInd] = val; } } - - void TDataSampler::AddStat(const TStringBuf& stringBuf) { - TSamplerIterator iterator(*this); - Iterate(stringBuf, iterator); + + void TDataSampler::AddStat(const TStringBuf& stringBuf) { + TSamplerIterator iterator(*this); + Iterate(stringBuf, iterator); } - TChunkCompressor::TChunkCompressor(bool highQuality, const TCompressorTable& table) - : HighQuality(highQuality) + TChunkCompressor::TChunkCompressor(bool highQuality, const TCompressorTable& table) + : HighQuality(highQuality) { - Compressor.Reset(new TDataCompressor(table)); + Compressor.Reset(new TDataCompressor(table)); } - void TChunkCompressor::Compress(TStringBuf data, TVector<char>* rslt) const { - if (HighQuality) { - Compressor->Compress<1>(data, *rslt); - } else { - Compressor->Compress<0>(data, *rslt); - } + void TChunkCompressor::Compress(TStringBuf data, TVector<char>* rslt) const { + if (HighQuality) { + Compressor->Compress<1>(data, *rslt); + } else { + Compressor->Compress<0>(data, *rslt); + } } - TChunkCompressor::~TChunkCompressor() = default; + TChunkCompressor::~TChunkCompressor() = default; - TChunkDecompressor::TChunkDecompressor(bool highQuality, const TCompressorTable& table) - : HighQuality(highQuality) - { - Decompressor.Reset(new TDataDecompressor(table)); + TChunkDecompressor::TChunkDecompressor(bool highQuality, const TCompressorTable& table) + : HighQuality(highQuality) + { + Decompressor.Reset(new TDataDecompressor(table)); } - void TChunkDecompressor::Decompress(TStringBuf data, TVector<char>* rslt) const { - if (HighQuality) { - Decompressor->Decompress<1>(data, *rslt); - } else { - Decompressor->Decompress<0>(data, *rslt); - } + void TChunkDecompressor::Decompress(TStringBuf data, TVector<char>* rslt) const { + if (HighQuality) { + Decompressor->Decompress<1>(data, *rslt); + } else { + Decompressor->Decompress<0>(data, *rslt); + } } - TChunkDecompressor::~TChunkDecompressor() = default; + TChunkDecompressor::~TChunkDecompressor() = default; } diff --git a/library/cpp/comptable/comptable.h b/library/cpp/comptable/comptable.h index ff9f241964..d225fed7a0 100644 --- a/library/cpp/comptable/comptable.h +++ b/library/cpp/comptable/comptable.h @@ -1,5 +1,5 @@ #pragma once - + #include <util/generic/vector.h> #include <util/memory/blob.h> #include <util/ysaveload.h> @@ -7,62 +7,62 @@ #include <library/cpp/compproto/huff.h> namespace NCompTable { - struct TCompressorTable { - ui32 Table[65536]; - ui32 HashMul; - NCompProto::TCoderEntry HuffCodes[10]; - ui8 HuffIndices[256]; + struct TCompressorTable { + ui32 Table[65536]; + ui32 HashMul; + NCompProto::TCoderEntry HuffCodes[10]; + ui8 HuffIndices[256]; - void GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const; - void GetLastHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const; - void GetHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const; - ui8 GetHuffIndex(ui8 prefix); - void BuildHuffCodes(i64 totalFreq, i64 freqs[65536]); - bool BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add); - }; + void GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const; + void GetLastHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const; + void GetHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const; + ui8 GetHuffIndex(ui8 prefix); + void BuildHuffCodes(i64 totalFreq, i64 freqs[65536]); + bool BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add); + }; - struct TDataSampler { - enum { - Size = 1 << 18, - }; - ui32 EntryVal[Size]; - i64 EntryHit[Size]; - i64 Counter; + struct TDataSampler { + enum { + Size = 1 << 18, + }; + ui32 EntryVal[Size]; + i64 EntryHit[Size]; + i64 Counter; - public: - TDataSampler(); - void BuildTable(TCompressorTable& table) const; - void AddStat(ui32 val); - void AddStat(const TStringBuf& stringBuf); + public: + TDataSampler(); + void BuildTable(TCompressorTable& table) const; + void AddStat(ui32 val); + void AddStat(const TStringBuf& stringBuf); }; - class TDataCompressor; - class TDataDecompressor; + class TDataCompressor; + class TDataDecompressor; - class TChunkCompressor { - public: - TChunkCompressor(bool highQuality, const TCompressorTable& table); - void Compress(TStringBuf data, TVector<char>* result) const; - ~TChunkCompressor(); + class TChunkCompressor { + public: + TChunkCompressor(bool highQuality, const TCompressorTable& table); + void Compress(TStringBuf data, TVector<char>* result) const; + ~TChunkCompressor(); - private: - bool HighQuality; - THolder<TDataCompressor> Compressor; - }; + private: + bool HighQuality; + THolder<TDataCompressor> Compressor; + }; - class TChunkDecompressor { - public: - TChunkDecompressor(bool highQuality, const TCompressorTable& table); - void Decompress(TStringBuf data, TVector<char>* result) const; - ~TChunkDecompressor(); + class TChunkDecompressor { + public: + TChunkDecompressor(bool highQuality, const TCompressorTable& table); + void Decompress(TStringBuf data, TVector<char>* result) const; + ~TChunkDecompressor(); + + private: + bool HighQuality; + THolder<TDataDecompressor> Decompressor; + }; - private: - bool HighQuality; - THolder<TDataDecompressor> Decompressor; - }; +} -} - template <> class TSerializer<NCompTable::TCompressorTable> { public: diff --git a/library/cpp/comptable/usage/usage.cpp b/library/cpp/comptable/usage/usage.cpp index 550b6777c3..9997c83686 100644 --- a/library/cpp/comptable/usage/usage.cpp +++ b/library/cpp/comptable/usage/usage.cpp @@ -1,15 +1,15 @@ #include <library/cpp/comptable/comptable.h> -#include <util/random/random.h> -#include <util/random/fast.h> - +#include <util/random/random.h> +#include <util/random/fast.h> + #include <time.h> #include <stdlib.h> using namespace NCompTable; -template <bool HQ> -void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { +template <bool HQ> +void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { TVector<char> compressed; TVector<char> decompressed; @@ -19,8 +19,8 @@ void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { size_t origSize = 0; size_t compSize = 0; float cl1 = clock(); - for (size_t i = 0; i < lines.size(); ++i) { - const TString& line = lines[i]; + for (size_t i = 0; i < lines.size(); ++i) { + const TString& line = lines[i]; compressor.Compress(line, &compressed); origSize += line.size(); compSize += compressed.size(); @@ -28,20 +28,20 @@ void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { deCompressor.Decompress(in, &decompressed); if (decompressed.size() != line.size() || memcmp(decompressed.data(), line.data(), decompressed.size())) { Cout << i << "\n"; - Cout << line << "\n" - << TString(decompressed.data(), decompressed.size()) << "\n"; + Cout << line << "\n" + << TString(decompressed.data(), decompressed.size()) << "\n"; abort(); } } float cl2 = clock(); float secs = (cl2 - cl1) / CLOCKS_PER_SEC; Cout << "origSize: " << origSize << "\tcompSize: " << compSize << Endl; - Cout << "yep! compression + decompression speed " << origSize / 1024.0f / 1024.0f / secs << " mbps\n"; + Cout << "yep! compression + decompression speed " << origSize / 1024.0f / 1024.0f / secs << " mbps\n"; Cout << "yep! compression ratio " << double(origSize) / double(compSize + 1) << "\n"; } -int main(int argc, const char* argv[]) { - TReallyFastRng32 rr(17); +int main(int argc, const char* argv[]) { + TReallyFastRng32 rr(17); TVector<TString> lines; /*FILE *fp = fopen("res", "rb"); while (!feof(fp)) { @@ -52,16 +52,16 @@ int main(int argc, const char* argv[]) { //for (size_t i = 0; i < 10000000; ++i) { //for (size_t i = 0; i < 1000000; ++i) { for (size_t i = 0; i < 1000000; ++i) { - size_t size = rr.Uniform(32); + size_t size = rr.Uniform(32); TString res = "www.yandex.ru/yandsearch?text="; for (size_t j = 0; j < size; ++j) { - res += "qwer"[rr.Uniform(4)]; + res += "qwer"[rr.Uniform(4)]; } lines.push_back(res); } THolder<TDataSampler> sampler(new TDataSampler); for (size_t i = 0; i < lines.size(); ++i) { - sampler->AddStat(lines[i]); + sampler->AddStat(lines[i]); } TCompressorTable table; sampler->BuildTable(table); diff --git a/library/cpp/comptable/ut/comptable_ut.cpp b/library/cpp/comptable/ut/comptable_ut.cpp index 5bb2058fce..5901d0246f 100644 --- a/library/cpp/comptable/ut/comptable_ut.cpp +++ b/library/cpp/comptable/ut/comptable_ut.cpp @@ -1,13 +1,13 @@ #include <library/cpp/comptable/comptable.h> #include <library/cpp/testing/unittest/registar.h> -#include <util/random/random.h> -#include <util/random/fast.h> - +#include <util/random/random.h> +#include <util/random/fast.h> + using namespace NCompTable; -template <bool HQ> -void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { +template <bool HQ> +void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { TVector<char> compressed; TVector<char> decompressed; @@ -21,8 +21,8 @@ void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { size_t origSize = 0; size_t compSize = 0; - for (size_t i = 0; i < lines.size(); ++i) { - const TString& line = lines[i]; + for (size_t i = 0; i < lines.size(); ++i) { + const TString& line = lines[i]; compressor.Compress(line, &compressed); origSize += line.size(); compSize += compressed.size(); @@ -30,30 +30,30 @@ void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { deCompressor.Decompress(in, &decompressed); UNIT_ASSERT(decompressed.size() == line.size() && memcmp(decompressed.data(), line.data(), decompressed.size()) == 0); } - UNIT_ASSERT_EQUAL(origSize, 45491584); + UNIT_ASSERT_EQUAL(origSize, 45491584); if (HQ) { - UNIT_ASSERT_EQUAL(compSize, 11074583); + UNIT_ASSERT_EQUAL(compSize, 11074583); } else { - UNIT_ASSERT_EQUAL(compSize, 17459336); + UNIT_ASSERT_EQUAL(compSize, 17459336); } UNIT_ASSERT(compSize < origSize); } Y_UNIT_TEST_SUITE(TestComptable) { Y_UNIT_TEST(TestComptableCompressDecompress) { - TReallyFastRng32 rr(17); + TReallyFastRng32 rr(17); TVector<TString> lines; for (size_t i = 0; i < 1000000; ++i) { - size_t size = rr.Uniform(32); + size_t size = rr.Uniform(32); TString res = "www.yandex.ru/yandsearch?text="; for (size_t j = 0; j < size; ++j) { - res += "qwer"[rr.Uniform(4)]; + res += "qwer"[rr.Uniform(4)]; } lines.push_back(res); } THolder<TDataSampler> sampler(new TDataSampler); for (size_t i = 0; i < lines.size(); ++i) { - sampler->AddStat(lines[i]); + sampler->AddStat(lines[i]); } TCompressorTable table; sampler->BuildTable(table); |