diff options
author | snow <snow@yandex-team.ru> | 2022-02-10 16:49:51 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:49:51 +0300 |
commit | b3f165d1a2e8a2dbfcbb351310aeb05fae1f8a6c (patch) | |
tree | 8f92b813636cef1b46a00d28d05c485d385f1ac0 /library | |
parent | c943ab142d4182af287664691cf116c143172792 (diff) | |
download | ydb-b3f165d1a2e8a2dbfcbb351310aeb05fae1f8a6c.tar.gz |
Restoring authorship annotation for <snow@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'library')
-rw-r--r-- | library/cpp/comptable/comptable.cpp | 140 | ||||
-rw-r--r-- | library/cpp/comptable/comptable.h | 50 | ||||
-rw-r--r-- | library/cpp/comptable/usage/usage.cpp | 94 | ||||
-rw-r--r-- | library/cpp/comptable/usage/ya.make | 10 | ||||
-rw-r--r-- | library/cpp/comptable/ut/comptable_ut.cpp | 82 | ||||
-rw-r--r-- | library/cpp/comptable/ut/ya.make | 8 | ||||
-rw-r--r-- | library/cpp/comptable/ya.make | 12 |
7 files changed, 198 insertions, 198 deletions
diff --git a/library/cpp/comptable/comptable.cpp b/library/cpp/comptable/comptable.cpp index 8a92d4d1aa..ca4b5758e1 100644 --- a/library/cpp/comptable/comptable.cpp +++ b/library/cpp/comptable/comptable.cpp @@ -1,24 +1,24 @@ -#include <util/system/defaults.h> -#include <util/system/filemap.h> +#include <util/system/defaults.h> +#include <util/system/filemap.h> #include <util/generic/string.h> -#include <util/generic/vector.h> -#include <util/string/cast.h> -#include <util/stream/file.h> +#include <util/generic/vector.h> +#include <util/string/cast.h> +#include <util/stream/file.h> #include <library/cpp/compproto/huff.h> - -#include "comptable.h" - + +#include "comptable.h" + #include <cstdlib> -namespace NCompTable { +namespace NCompTable { static const ui32 magicHashMul = 0xd077cd1f; static const size_t hashSizeLog = 18; static const size_t hashSize = 1 << hashSizeLog; - + size_t HashIndex(ui32 value, ui32 hashMul) { return (value * hashMul) >> (32 - hashSizeLog); } - + void TCompressorTable::GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const { ui64 code = value - entry.MinValue; bitCode = (code << entry.PrefixBits) | entry.Prefix; @@ -34,19 +34,19 @@ namespace NCompTable { GetHuffCode(huffCode, value, bitCode, bitLength); return; } - } + } abort(); - } + } ui8 TCompressorTable::GetHuffIndex(ui8 prefix) { for (size_t i = 0; i < 10; ++i) { if ((prefix & ((1 << HuffCodes[i].PrefixBits) - 1)) == HuffCodes[i].Prefix) { return i; } - } + } abort(); return 0; - } - + } + void TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536]) { i64 add = 1; while (1) { @@ -54,9 +54,9 @@ namespace NCompTable { return; } add = add * 2; - } - } - + } + } + bool TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add) { TVector<NCompProto::TCode> codes; size_t bits[] = {0, 1, 2, 4, 8, 10, 12, 14, 16, 32}; @@ -71,7 +71,7 @@ namespace NCompTable { codes.push_back(NCompProto::TCode(weight + add, ui32(off), bits[i])); total = total > weight ? total - weight : 0; off += size; - } + } codes.push_back(NCompProto::TCode(total + add, 0, 32)); i64 ret = NCompProto::BuildHuff(codes); Y_UNUSED(ret); @@ -84,13 +84,13 @@ namespace NCompTable { if (code.PrefixBits > 8) { return false; } - } + } for (size_t i = 0; i < 256; ++i) { HuffIndices[i] = GetHuffIndex(ui8(i)); } return true; - } - + } + template <class TIterator> void Iterate(const TStringBuf& data, TIterator& iterator) { size_t i = 0; @@ -103,8 +103,8 @@ namespace NCompTable { memcpy(buffer, data.data() + i, data.size() - i); iterator.Visit(buffer[0]); } - } - + } + class TDataCompressor { ui32 Keys[hashSize]; ui32 Vals[hashSize]; @@ -128,7 +128,7 @@ namespace NCompTable { Vals[index] = i; table.GetHuffCode(ui32(i), BitCodes[index], BitLengths[index]); } - } + } void GetStat(ui32 val, ui32& stat) const { size_t index = HashIndex(val, HashMul); if (Keys[index] == val) { @@ -136,7 +136,7 @@ namespace NCompTable { } else { stat = ui32(-1); } - } + } void GetStat(ui32 val, ui64& bitCode, ui8& bitLength) const { size_t index = HashIndex(val, HashMul); if (Keys[index] == val) { @@ -145,7 +145,7 @@ namespace NCompTable { } else { Table.GetLastHuffCode(val, bitCode, bitLength); } - } + } size_t Compress4(const ui32* data4, ui8* dataBuff) const { ui8* oldBuff = dataBuff; ++dataBuff; @@ -174,7 +174,7 @@ namespace NCompTable { } oldBuff[0] = ui8(status); return dataBuff - oldBuff; - } + } struct TCompressorIterator { TVector<char>& Result; const TDataCompressor& Compressor; @@ -187,7 +187,7 @@ namespace NCompTable { , Index(0) , RealSize(0) { - } + } void Flush() { Result.yresize(RealSize + 32); RealSize += Compressor.Compress4(Cached, reinterpret_cast<ui8*>(Result.data()) + RealSize); @@ -246,11 +246,11 @@ namespace NCompTable { if (!HQ) { TCompressorIterator iterator(rslt, *this); Iterate(stringBuf, iterator); - } else { + } else { THQCompressorIterator iterator(rslt, *this); Iterate(stringBuf, iterator); - } - } + } + } }; class TDataDecompressor { @@ -259,8 +259,8 @@ namespace NCompTable { public: TDataDecompressor(const TCompressorTable& table) : Table(table) - { - } + { + } size_t Decompress(ui32* data4, const ui8* dataBuff, ui32 type) const { if (type == 0) { data4[0] = 0; @@ -280,7 +280,7 @@ namespace NCompTable { memcpy(data4, dataBuff, sizeof(*data4)); return 4; } - } + } size_t Decompress(ui32* data4, const ui8* dataBuff) const { ui32 status = *dataBuff; const ui8* oldData = dataBuff; @@ -298,7 +298,7 @@ namespace NCompTable { memcpy(&val, dataBuff + off, sizeof(val)); } else { memcpy(&val, dataBuff + off, border - off); - } + } val >>= (offset & 7); ui8 index = Table.HuffIndices[ui8(val)]; const NCompProto::TCoderEntry& entry = Table.HuffCodes[index]; @@ -307,7 +307,7 @@ namespace NCompTable { val = val + entry.MinValue; data[0] = (index == 9) ? val : Table.Table[val]; offset += allBits; - } + } size_t GetJunkSize() const { return 8; } @@ -331,7 +331,7 @@ namespace NCompTable { len = length; for (size_t i = 0; i < length32; ++i) { Decompress(&result[i], src, offset, border); - } + } } else { ui32 data[4]; src += Decompress(data, src); @@ -345,39 +345,39 @@ namespace NCompTable { for (size_t j = 1; j < length32x4; ++j) { src += Decompress(&result[j * 4 - 1], src); } - } + } rslt.resize(len); - } - }; + } + }; struct TSamplerIterator { TDataSampler& Sampler; TSamplerIterator(TDataSampler& sampler) : Sampler(sampler) - { - } - void Visit(const ui32 data) { + { + } + void Visit(const ui32 data) { Sampler.AddStat(data); - } + } }; struct TGreaterComparer { //std::greater in arcadia??? bool operator()(ui64 a, ui64 b) const { return a > b; - } - }; + } + }; TDataSampler::TDataSampler() { memset(this, 0, sizeof(*this)); - } - + } + void TDataSampler::BuildTable(TCompressorTable& table) const { std::vector<ui64> sorted; for (size_t i = 0; i < Size; ++i) { ui64 res = (ui64(EntryHit[i]) << 32) + ui32(i); sorted.push_back(res); - } + } std::vector<ui32> entryTbl(Size); std::sort(sorted.begin(), sorted.end(), TGreaterComparer()); table.HashMul = magicHashMul; @@ -386,9 +386,9 @@ namespace NCompTable { ui32 ind = ui32(sorted[i]); table.Table[i] = EntryVal[ind]; freqs[i] = EntryHit[ind]; - } + } table.BuildHuffCodes(Counter, freqs); - } + } void TDataSampler::AddStat(ui32 val) { ++Counter; @@ -397,47 +397,47 @@ namespace NCompTable { ++EntryHit[hashInd]; } else if (EntryHit[hashInd] > 1) { --EntryHit[hashInd]; - } else { + } else { EntryHit[hashInd] = 1; EntryVal[hashInd] = val; - } - } + } + } void TDataSampler::AddStat(const TStringBuf& stringBuf) { TSamplerIterator iterator(*this); Iterate(stringBuf, iterator); - } - + } + TChunkCompressor::TChunkCompressor(bool highQuality, const TCompressorTable& table) : HighQuality(highQuality) - { + { Compressor.Reset(new TDataCompressor(table)); - } - + } + void TChunkCompressor::Compress(TStringBuf data, TVector<char>* rslt) const { if (HighQuality) { Compressor->Compress<1>(data, *rslt); } else { Compressor->Compress<0>(data, *rslt); } - } - + } + TChunkCompressor::~TChunkCompressor() = default; - + TChunkDecompressor::TChunkDecompressor(bool highQuality, const TCompressorTable& table) : HighQuality(highQuality) { Decompressor.Reset(new TDataDecompressor(table)); - } - + } + void TChunkDecompressor::Decompress(TStringBuf data, TVector<char>* rslt) const { if (HighQuality) { Decompressor->Decompress<1>(data, *rslt); } else { Decompressor->Decompress<0>(data, *rslt); } - } - + } + TChunkDecompressor::~TChunkDecompressor() = default; - -} + +} diff --git a/library/cpp/comptable/comptable.h b/library/cpp/comptable/comptable.h index d225fed7a0..3ec626965c 100644 --- a/library/cpp/comptable/comptable.h +++ b/library/cpp/comptable/comptable.h @@ -1,18 +1,18 @@ -#pragma once - -#include <util/generic/vector.h> -#include <util/memory/blob.h> -#include <util/ysaveload.h> +#pragma once +#include <util/generic/vector.h> +#include <util/memory/blob.h> +#include <util/ysaveload.h> + #include <library/cpp/compproto/huff.h> - -namespace NCompTable { + +namespace NCompTable { struct TCompressorTable { ui32 Table[65536]; ui32 HashMul; NCompProto::TCoderEntry HuffCodes[10]; ui8 HuffIndices[256]; - + void GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const; void GetLastHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const; void GetHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const; @@ -20,7 +20,7 @@ namespace NCompTable { void BuildHuffCodes(i64 totalFreq, i64 freqs[65536]); bool BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add); }; - + struct TDataSampler { enum { Size = 1 << 18, @@ -28,48 +28,48 @@ namespace NCompTable { ui32 EntryVal[Size]; i64 EntryHit[Size]; i64 Counter; - + public: TDataSampler(); void BuildTable(TCompressorTable& table) const; void AddStat(ui32 val); void AddStat(const TStringBuf& stringBuf); - }; - + }; + class TDataCompressor; class TDataDecompressor; - + class TChunkCompressor { public: TChunkCompressor(bool highQuality, const TCompressorTable& table); void Compress(TStringBuf data, TVector<char>* result) const; ~TChunkCompressor(); - + private: bool HighQuality; THolder<TDataCompressor> Compressor; }; - + class TChunkDecompressor { public: TChunkDecompressor(bool highQuality, const TCompressorTable& table); void Decompress(TStringBuf data, TVector<char>* result) const; ~TChunkDecompressor(); - + private: bool HighQuality; THolder<TDataDecompressor> Decompressor; }; - + } -template <> -class TSerializer<NCompTable::TCompressorTable> { -public: +template <> +class TSerializer<NCompTable::TCompressorTable> { +public: static inline void Save(IOutputStream* out, const NCompTable::TCompressorTable& entry) { - SavePodType(out, entry); - } + SavePodType(out, entry); + } static inline void Load(IInputStream* in, NCompTable::TCompressorTable& entry) { - LoadPodType(in, entry); - } -}; + LoadPodType(in, entry); + } +}; diff --git a/library/cpp/comptable/usage/usage.cpp b/library/cpp/comptable/usage/usage.cpp index 9997c83686..60ed9b9422 100644 --- a/library/cpp/comptable/usage/usage.cpp +++ b/library/cpp/comptable/usage/usage.cpp @@ -1,75 +1,75 @@ #include <library/cpp/comptable/comptable.h> - + #include <util/random/random.h> #include <util/random/fast.h> #include <time.h> #include <stdlib.h> -using namespace NCompTable; - +using namespace NCompTable; + template <bool HQ> void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { TVector<char> compressed; TVector<char> decompressed; - - TChunkCompressor compressor(HQ, table); - TChunkDecompressor deCompressor(HQ, table); - - size_t origSize = 0; - size_t compSize = 0; - float cl1 = clock(); + + TChunkCompressor compressor(HQ, table); + TChunkDecompressor deCompressor(HQ, table); + + size_t origSize = 0; + size_t compSize = 0; + float cl1 = clock(); for (size_t i = 0; i < lines.size(); ++i) { const TString& line = lines[i]; - compressor.Compress(line, &compressed); - origSize += line.size(); - compSize += compressed.size(); - TStringBuf in(compressed.data(), compressed.size()); - deCompressor.Decompress(in, &decompressed); + compressor.Compress(line, &compressed); + origSize += line.size(); + compSize += compressed.size(); + TStringBuf in(compressed.data(), compressed.size()); + deCompressor.Decompress(in, &decompressed); if (decompressed.size() != line.size() || memcmp(decompressed.data(), line.data(), decompressed.size())) { - Cout << i << "\n"; + Cout << i << "\n"; Cout << line << "\n" << TString(decompressed.data(), decompressed.size()) << "\n"; - abort(); - } - } - float cl2 = clock(); - float secs = (cl2 - cl1) / CLOCKS_PER_SEC; - Cout << "origSize: " << origSize << "\tcompSize: " << compSize << Endl; + abort(); + } + } + float cl2 = clock(); + float secs = (cl2 - cl1) / CLOCKS_PER_SEC; + Cout << "origSize: " << origSize << "\tcompSize: " << compSize << Endl; Cout << "yep! compression + decompression speed " << origSize / 1024.0f / 1024.0f / secs << " mbps\n"; - Cout << "yep! compression ratio " << double(origSize) / double(compSize + 1) << "\n"; -} - + Cout << "yep! compression ratio " << double(origSize) / double(compSize + 1) << "\n"; +} + int main(int argc, const char* argv[]) { TReallyFastRng32 rr(17); TVector<TString> lines; - /*FILE *fp = fopen("res", "rb"); - while (!feof(fp)) { - char buff[4096]; - fscanf(fp, "%s", buff); + /*FILE *fp = fopen("res", "rb"); + while (!feof(fp)) { + char buff[4096]; + fscanf(fp, "%s", buff); lines.push_back(TString(buff)); - }*/ - //for (size_t i = 0; i < 10000000; ++i) { - //for (size_t i = 0; i < 1000000; ++i) { + }*/ + //for (size_t i = 0; i < 10000000; ++i) { + //for (size_t i = 0; i < 1000000; ++i) { for (size_t i = 0; i < 1000000; ++i) { size_t size = rr.Uniform(32); TString res = "www.yandex.ru/yandsearch?text="; - for (size_t j = 0; j < size; ++j) { + for (size_t j = 0; j < size; ++j) { res += "qwer"[rr.Uniform(4)]; - } - lines.push_back(res); - } - THolder<TDataSampler> sampler(new TDataSampler); - for (size_t i = 0; i < lines.size(); ++i) { + } + lines.push_back(res); + } + THolder<TDataSampler> sampler(new TDataSampler); + for (size_t i = 0; i < lines.size(); ++i) { sampler->AddStat(lines[i]); - } - TCompressorTable table; - sampler->BuildTable(table); - - DoTest<true>(table, lines); - DoTest<false>(table, lines); - + } + TCompressorTable table; + sampler->BuildTable(table); + + DoTest<true>(table, lines); + DoTest<false>(table, lines); + Y_UNUSED(argc); Y_UNUSED(argv); - return 0; -} + return 0; +} diff --git a/library/cpp/comptable/usage/ya.make b/library/cpp/comptable/usage/ya.make index ab31e7528c..b23b545623 100644 --- a/library/cpp/comptable/usage/ya.make +++ b/library/cpp/comptable/usage/ya.make @@ -1,13 +1,13 @@ -PROGRAM() - +PROGRAM() + OWNER(ironpeter) - + SRCS( usage.cpp ) - + PEERDIR( library/cpp/comptable ) -END() +END() diff --git a/library/cpp/comptable/ut/comptable_ut.cpp b/library/cpp/comptable/ut/comptable_ut.cpp index 5901d0246f..0f58a60e87 100644 --- a/library/cpp/comptable/ut/comptable_ut.cpp +++ b/library/cpp/comptable/ut/comptable_ut.cpp @@ -1,64 +1,64 @@ #include <library/cpp/comptable/comptable.h> #include <library/cpp/testing/unittest/registar.h> - + #include <util/random/random.h> #include <util/random/fast.h> -using namespace NCompTable; - +using namespace NCompTable; + template <bool HQ> void DoTest(const TCompressorTable& table, const TVector<TString>& lines) { TVector<char> compressed; TVector<char> decompressed; - - TChunkCompressor compressor(HQ, table); - TStringStream tmp; - Save(&tmp, table); - TCompressorTable tableLoaded; - Load(&tmp, tableLoaded); - UNIT_ASSERT(memcmp(&table, &tableLoaded, sizeof(table)) == 0); - TChunkDecompressor deCompressor(HQ, tableLoaded); - - size_t origSize = 0; - size_t compSize = 0; + + TChunkCompressor compressor(HQ, table); + TStringStream tmp; + Save(&tmp, table); + TCompressorTable tableLoaded; + Load(&tmp, tableLoaded); + UNIT_ASSERT(memcmp(&table, &tableLoaded, sizeof(table)) == 0); + TChunkDecompressor deCompressor(HQ, tableLoaded); + + size_t origSize = 0; + size_t compSize = 0; for (size_t i = 0; i < lines.size(); ++i) { const TString& line = lines[i]; - compressor.Compress(line, &compressed); - origSize += line.size(); - compSize += compressed.size(); - TStringBuf in(compressed.data(), compressed.size()); - deCompressor.Decompress(in, &decompressed); + compressor.Compress(line, &compressed); + origSize += line.size(); + compSize += compressed.size(); + TStringBuf in(compressed.data(), compressed.size()); + deCompressor.Decompress(in, &decompressed); UNIT_ASSERT(decompressed.size() == line.size() && memcmp(decompressed.data(), line.data(), decompressed.size()) == 0); - } + } UNIT_ASSERT_EQUAL(origSize, 45491584); - if (HQ) { + if (HQ) { UNIT_ASSERT_EQUAL(compSize, 11074583); - } else { + } else { UNIT_ASSERT_EQUAL(compSize, 17459336); - } - UNIT_ASSERT(compSize < origSize); -} - + } + UNIT_ASSERT(compSize < origSize); +} + Y_UNIT_TEST_SUITE(TestComptable) { Y_UNIT_TEST(TestComptableCompressDecompress) { TReallyFastRng32 rr(17); TVector<TString> lines; - for (size_t i = 0; i < 1000000; ++i) { + for (size_t i = 0; i < 1000000; ++i) { size_t size = rr.Uniform(32); TString res = "www.yandex.ru/yandsearch?text="; - for (size_t j = 0; j < size; ++j) { + for (size_t j = 0; j < size; ++j) { res += "qwer"[rr.Uniform(4)]; - } - lines.push_back(res); - } - THolder<TDataSampler> sampler(new TDataSampler); - for (size_t i = 0; i < lines.size(); ++i) { + } + lines.push_back(res); + } + THolder<TDataSampler> sampler(new TDataSampler); + for (size_t i = 0; i < lines.size(); ++i) { sampler->AddStat(lines[i]); - } - TCompressorTable table; - sampler->BuildTable(table); - - DoTest<true>(table, lines); - DoTest<false>(table, lines); - } -} + } + TCompressorTable table; + sampler->BuildTable(table); + + DoTest<true>(table, lines); + DoTest<false>(table, lines); + } +} diff --git a/library/cpp/comptable/ut/ya.make b/library/cpp/comptable/ut/ya.make index d0a49793a5..75bc8d1031 100644 --- a/library/cpp/comptable/ut/ya.make +++ b/library/cpp/comptable/ut/ya.make @@ -1,9 +1,9 @@ UNITTEST_FOR(library/cpp/comptable) - + OWNER(ironpeter) - + SRCS( comptable_ut.cpp ) - -END() + +END() diff --git a/library/cpp/comptable/ya.make b/library/cpp/comptable/ya.make index 314603c62a..0e52c5b698 100644 --- a/library/cpp/comptable/ya.make +++ b/library/cpp/comptable/ya.make @@ -1,13 +1,13 @@ -LIBRARY() - +LIBRARY() + OWNER(ironpeter) - + SRCS( comptable.cpp ) - + PEERDIR( library/cpp/compproto ) - -END() + +END() |