aboutsummaryrefslogtreecommitdiffstats
path: root/library
diff options
context:
space:
mode:
authorsnow <snow@yandex-team.ru>2022-02-10 16:49:51 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:49:51 +0300
commitb3f165d1a2e8a2dbfcbb351310aeb05fae1f8a6c (patch)
tree8f92b813636cef1b46a00d28d05c485d385f1ac0 /library
parentc943ab142d4182af287664691cf116c143172792 (diff)
downloadydb-b3f165d1a2e8a2dbfcbb351310aeb05fae1f8a6c.tar.gz
Restoring authorship annotation for <snow@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'library')
-rw-r--r--library/cpp/comptable/comptable.cpp140
-rw-r--r--library/cpp/comptable/comptable.h50
-rw-r--r--library/cpp/comptable/usage/usage.cpp94
-rw-r--r--library/cpp/comptable/usage/ya.make10
-rw-r--r--library/cpp/comptable/ut/comptable_ut.cpp82
-rw-r--r--library/cpp/comptable/ut/ya.make8
-rw-r--r--library/cpp/comptable/ya.make12
7 files changed, 198 insertions, 198 deletions
diff --git a/library/cpp/comptable/comptable.cpp b/library/cpp/comptable/comptable.cpp
index 8a92d4d1aa..ca4b5758e1 100644
--- a/library/cpp/comptable/comptable.cpp
+++ b/library/cpp/comptable/comptable.cpp
@@ -1,24 +1,24 @@
-#include <util/system/defaults.h>
-#include <util/system/filemap.h>
+#include <util/system/defaults.h>
+#include <util/system/filemap.h>
#include <util/generic/string.h>
-#include <util/generic/vector.h>
-#include <util/string/cast.h>
-#include <util/stream/file.h>
+#include <util/generic/vector.h>
+#include <util/string/cast.h>
+#include <util/stream/file.h>
#include <library/cpp/compproto/huff.h>
-
-#include "comptable.h"
-
+
+#include "comptable.h"
+
#include <cstdlib>
-namespace NCompTable {
+namespace NCompTable {
static const ui32 magicHashMul = 0xd077cd1f;
static const size_t hashSizeLog = 18;
static const size_t hashSize = 1 << hashSizeLog;
-
+
size_t HashIndex(ui32 value, ui32 hashMul) {
return (value * hashMul) >> (32 - hashSizeLog);
}
-
+
void TCompressorTable::GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const {
ui64 code = value - entry.MinValue;
bitCode = (code << entry.PrefixBits) | entry.Prefix;
@@ -34,19 +34,19 @@ namespace NCompTable {
GetHuffCode(huffCode, value, bitCode, bitLength);
return;
}
- }
+ }
abort();
- }
+ }
ui8 TCompressorTable::GetHuffIndex(ui8 prefix) {
for (size_t i = 0; i < 10; ++i) {
if ((prefix & ((1 << HuffCodes[i].PrefixBits) - 1)) == HuffCodes[i].Prefix) {
return i;
}
- }
+ }
abort();
return 0;
- }
-
+ }
+
void TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536]) {
i64 add = 1;
while (1) {
@@ -54,9 +54,9 @@ namespace NCompTable {
return;
}
add = add * 2;
- }
- }
-
+ }
+ }
+
bool TCompressorTable::BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add) {
TVector<NCompProto::TCode> codes;
size_t bits[] = {0, 1, 2, 4, 8, 10, 12, 14, 16, 32};
@@ -71,7 +71,7 @@ namespace NCompTable {
codes.push_back(NCompProto::TCode(weight + add, ui32(off), bits[i]));
total = total > weight ? total - weight : 0;
off += size;
- }
+ }
codes.push_back(NCompProto::TCode(total + add, 0, 32));
i64 ret = NCompProto::BuildHuff(codes);
Y_UNUSED(ret);
@@ -84,13 +84,13 @@ namespace NCompTable {
if (code.PrefixBits > 8) {
return false;
}
- }
+ }
for (size_t i = 0; i < 256; ++i) {
HuffIndices[i] = GetHuffIndex(ui8(i));
}
return true;
- }
-
+ }
+
template <class TIterator>
void Iterate(const TStringBuf& data, TIterator& iterator) {
size_t i = 0;
@@ -103,8 +103,8 @@ namespace NCompTable {
memcpy(buffer, data.data() + i, data.size() - i);
iterator.Visit(buffer[0]);
}
- }
-
+ }
+
class TDataCompressor {
ui32 Keys[hashSize];
ui32 Vals[hashSize];
@@ -128,7 +128,7 @@ namespace NCompTable {
Vals[index] = i;
table.GetHuffCode(ui32(i), BitCodes[index], BitLengths[index]);
}
- }
+ }
void GetStat(ui32 val, ui32& stat) const {
size_t index = HashIndex(val, HashMul);
if (Keys[index] == val) {
@@ -136,7 +136,7 @@ namespace NCompTable {
} else {
stat = ui32(-1);
}
- }
+ }
void GetStat(ui32 val, ui64& bitCode, ui8& bitLength) const {
size_t index = HashIndex(val, HashMul);
if (Keys[index] == val) {
@@ -145,7 +145,7 @@ namespace NCompTable {
} else {
Table.GetLastHuffCode(val, bitCode, bitLength);
}
- }
+ }
size_t Compress4(const ui32* data4, ui8* dataBuff) const {
ui8* oldBuff = dataBuff;
++dataBuff;
@@ -174,7 +174,7 @@ namespace NCompTable {
}
oldBuff[0] = ui8(status);
return dataBuff - oldBuff;
- }
+ }
struct TCompressorIterator {
TVector<char>& Result;
const TDataCompressor& Compressor;
@@ -187,7 +187,7 @@ namespace NCompTable {
, Index(0)
, RealSize(0)
{
- }
+ }
void Flush() {
Result.yresize(RealSize + 32);
RealSize += Compressor.Compress4(Cached, reinterpret_cast<ui8*>(Result.data()) + RealSize);
@@ -246,11 +246,11 @@ namespace NCompTable {
if (!HQ) {
TCompressorIterator iterator(rslt, *this);
Iterate(stringBuf, iterator);
- } else {
+ } else {
THQCompressorIterator iterator(rslt, *this);
Iterate(stringBuf, iterator);
- }
- }
+ }
+ }
};
class TDataDecompressor {
@@ -259,8 +259,8 @@ namespace NCompTable {
public:
TDataDecompressor(const TCompressorTable& table)
: Table(table)
- {
- }
+ {
+ }
size_t Decompress(ui32* data4, const ui8* dataBuff, ui32 type) const {
if (type == 0) {
data4[0] = 0;
@@ -280,7 +280,7 @@ namespace NCompTable {
memcpy(data4, dataBuff, sizeof(*data4));
return 4;
}
- }
+ }
size_t Decompress(ui32* data4, const ui8* dataBuff) const {
ui32 status = *dataBuff;
const ui8* oldData = dataBuff;
@@ -298,7 +298,7 @@ namespace NCompTable {
memcpy(&val, dataBuff + off, sizeof(val));
} else {
memcpy(&val, dataBuff + off, border - off);
- }
+ }
val >>= (offset & 7);
ui8 index = Table.HuffIndices[ui8(val)];
const NCompProto::TCoderEntry& entry = Table.HuffCodes[index];
@@ -307,7 +307,7 @@ namespace NCompTable {
val = val + entry.MinValue;
data[0] = (index == 9) ? val : Table.Table[val];
offset += allBits;
- }
+ }
size_t GetJunkSize() const {
return 8;
}
@@ -331,7 +331,7 @@ namespace NCompTable {
len = length;
for (size_t i = 0; i < length32; ++i) {
Decompress(&result[i], src, offset, border);
- }
+ }
} else {
ui32 data[4];
src += Decompress(data, src);
@@ -345,39 +345,39 @@ namespace NCompTable {
for (size_t j = 1; j < length32x4; ++j) {
src += Decompress(&result[j * 4 - 1], src);
}
- }
+ }
rslt.resize(len);
- }
- };
+ }
+ };
struct TSamplerIterator {
TDataSampler& Sampler;
TSamplerIterator(TDataSampler& sampler)
: Sampler(sampler)
- {
- }
- void Visit(const ui32 data) {
+ {
+ }
+ void Visit(const ui32 data) {
Sampler.AddStat(data);
- }
+ }
};
struct TGreaterComparer {
//std::greater in arcadia???
bool operator()(ui64 a, ui64 b) const {
return a > b;
- }
- };
+ }
+ };
TDataSampler::TDataSampler() {
memset(this, 0, sizeof(*this));
- }
-
+ }
+
void TDataSampler::BuildTable(TCompressorTable& table) const {
std::vector<ui64> sorted;
for (size_t i = 0; i < Size; ++i) {
ui64 res = (ui64(EntryHit[i]) << 32) + ui32(i);
sorted.push_back(res);
- }
+ }
std::vector<ui32> entryTbl(Size);
std::sort(sorted.begin(), sorted.end(), TGreaterComparer());
table.HashMul = magicHashMul;
@@ -386,9 +386,9 @@ namespace NCompTable {
ui32 ind = ui32(sorted[i]);
table.Table[i] = EntryVal[ind];
freqs[i] = EntryHit[ind];
- }
+ }
table.BuildHuffCodes(Counter, freqs);
- }
+ }
void TDataSampler::AddStat(ui32 val) {
++Counter;
@@ -397,47 +397,47 @@ namespace NCompTable {
++EntryHit[hashInd];
} else if (EntryHit[hashInd] > 1) {
--EntryHit[hashInd];
- } else {
+ } else {
EntryHit[hashInd] = 1;
EntryVal[hashInd] = val;
- }
- }
+ }
+ }
void TDataSampler::AddStat(const TStringBuf& stringBuf) {
TSamplerIterator iterator(*this);
Iterate(stringBuf, iterator);
- }
-
+ }
+
TChunkCompressor::TChunkCompressor(bool highQuality, const TCompressorTable& table)
: HighQuality(highQuality)
- {
+ {
Compressor.Reset(new TDataCompressor(table));
- }
-
+ }
+
void TChunkCompressor::Compress(TStringBuf data, TVector<char>* rslt) const {
if (HighQuality) {
Compressor->Compress<1>(data, *rslt);
} else {
Compressor->Compress<0>(data, *rslt);
}
- }
-
+ }
+
TChunkCompressor::~TChunkCompressor() = default;
-
+
TChunkDecompressor::TChunkDecompressor(bool highQuality, const TCompressorTable& table)
: HighQuality(highQuality)
{
Decompressor.Reset(new TDataDecompressor(table));
- }
-
+ }
+
void TChunkDecompressor::Decompress(TStringBuf data, TVector<char>* rslt) const {
if (HighQuality) {
Decompressor->Decompress<1>(data, *rslt);
} else {
Decompressor->Decompress<0>(data, *rslt);
}
- }
-
+ }
+
TChunkDecompressor::~TChunkDecompressor() = default;
-
-}
+
+}
diff --git a/library/cpp/comptable/comptable.h b/library/cpp/comptable/comptable.h
index d225fed7a0..3ec626965c 100644
--- a/library/cpp/comptable/comptable.h
+++ b/library/cpp/comptable/comptable.h
@@ -1,18 +1,18 @@
-#pragma once
-
-#include <util/generic/vector.h>
-#include <util/memory/blob.h>
-#include <util/ysaveload.h>
+#pragma once
+#include <util/generic/vector.h>
+#include <util/memory/blob.h>
+#include <util/ysaveload.h>
+
#include <library/cpp/compproto/huff.h>
-
-namespace NCompTable {
+
+namespace NCompTable {
struct TCompressorTable {
ui32 Table[65536];
ui32 HashMul;
NCompProto::TCoderEntry HuffCodes[10];
ui8 HuffIndices[256];
-
+
void GetHuffCode(const NCompProto::TCoderEntry& entry, ui32 value, ui64& bitCode, ui8& bitLength) const;
void GetLastHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const;
void GetHuffCode(ui32 value, ui64& bitCode, ui8& bitLength) const;
@@ -20,7 +20,7 @@ namespace NCompTable {
void BuildHuffCodes(i64 totalFreq, i64 freqs[65536]);
bool BuildHuffCodes(i64 totalFreq, i64 freqs[65536], i64 add);
};
-
+
struct TDataSampler {
enum {
Size = 1 << 18,
@@ -28,48 +28,48 @@ namespace NCompTable {
ui32 EntryVal[Size];
i64 EntryHit[Size];
i64 Counter;
-
+
public:
TDataSampler();
void BuildTable(TCompressorTable& table) const;
void AddStat(ui32 val);
void AddStat(const TStringBuf& stringBuf);
- };
-
+ };
+
class TDataCompressor;
class TDataDecompressor;
-
+
class TChunkCompressor {
public:
TChunkCompressor(bool highQuality, const TCompressorTable& table);
void Compress(TStringBuf data, TVector<char>* result) const;
~TChunkCompressor();
-
+
private:
bool HighQuality;
THolder<TDataCompressor> Compressor;
};
-
+
class TChunkDecompressor {
public:
TChunkDecompressor(bool highQuality, const TCompressorTable& table);
void Decompress(TStringBuf data, TVector<char>* result) const;
~TChunkDecompressor();
-
+
private:
bool HighQuality;
THolder<TDataDecompressor> Decompressor;
};
-
+
}
-template <>
-class TSerializer<NCompTable::TCompressorTable> {
-public:
+template <>
+class TSerializer<NCompTable::TCompressorTable> {
+public:
static inline void Save(IOutputStream* out, const NCompTable::TCompressorTable& entry) {
- SavePodType(out, entry);
- }
+ SavePodType(out, entry);
+ }
static inline void Load(IInputStream* in, NCompTable::TCompressorTable& entry) {
- LoadPodType(in, entry);
- }
-};
+ LoadPodType(in, entry);
+ }
+};
diff --git a/library/cpp/comptable/usage/usage.cpp b/library/cpp/comptable/usage/usage.cpp
index 9997c83686..60ed9b9422 100644
--- a/library/cpp/comptable/usage/usage.cpp
+++ b/library/cpp/comptable/usage/usage.cpp
@@ -1,75 +1,75 @@
#include <library/cpp/comptable/comptable.h>
-
+
#include <util/random/random.h>
#include <util/random/fast.h>
#include <time.h>
#include <stdlib.h>
-using namespace NCompTable;
-
+using namespace NCompTable;
+
template <bool HQ>
void DoTest(const TCompressorTable& table, const TVector<TString>& lines) {
TVector<char> compressed;
TVector<char> decompressed;
-
- TChunkCompressor compressor(HQ, table);
- TChunkDecompressor deCompressor(HQ, table);
-
- size_t origSize = 0;
- size_t compSize = 0;
- float cl1 = clock();
+
+ TChunkCompressor compressor(HQ, table);
+ TChunkDecompressor deCompressor(HQ, table);
+
+ size_t origSize = 0;
+ size_t compSize = 0;
+ float cl1 = clock();
for (size_t i = 0; i < lines.size(); ++i) {
const TString& line = lines[i];
- compressor.Compress(line, &compressed);
- origSize += line.size();
- compSize += compressed.size();
- TStringBuf in(compressed.data(), compressed.size());
- deCompressor.Decompress(in, &decompressed);
+ compressor.Compress(line, &compressed);
+ origSize += line.size();
+ compSize += compressed.size();
+ TStringBuf in(compressed.data(), compressed.size());
+ deCompressor.Decompress(in, &decompressed);
if (decompressed.size() != line.size() || memcmp(decompressed.data(), line.data(), decompressed.size())) {
- Cout << i << "\n";
+ Cout << i << "\n";
Cout << line << "\n"
<< TString(decompressed.data(), decompressed.size()) << "\n";
- abort();
- }
- }
- float cl2 = clock();
- float secs = (cl2 - cl1) / CLOCKS_PER_SEC;
- Cout << "origSize: " << origSize << "\tcompSize: " << compSize << Endl;
+ abort();
+ }
+ }
+ float cl2 = clock();
+ float secs = (cl2 - cl1) / CLOCKS_PER_SEC;
+ Cout << "origSize: " << origSize << "\tcompSize: " << compSize << Endl;
Cout << "yep! compression + decompression speed " << origSize / 1024.0f / 1024.0f / secs << " mbps\n";
- Cout << "yep! compression ratio " << double(origSize) / double(compSize + 1) << "\n";
-}
-
+ Cout << "yep! compression ratio " << double(origSize) / double(compSize + 1) << "\n";
+}
+
int main(int argc, const char* argv[]) {
TReallyFastRng32 rr(17);
TVector<TString> lines;
- /*FILE *fp = fopen("res", "rb");
- while (!feof(fp)) {
- char buff[4096];
- fscanf(fp, "%s", buff);
+ /*FILE *fp = fopen("res", "rb");
+ while (!feof(fp)) {
+ char buff[4096];
+ fscanf(fp, "%s", buff);
lines.push_back(TString(buff));
- }*/
- //for (size_t i = 0; i < 10000000; ++i) {
- //for (size_t i = 0; i < 1000000; ++i) {
+ }*/
+ //for (size_t i = 0; i < 10000000; ++i) {
+ //for (size_t i = 0; i < 1000000; ++i) {
for (size_t i = 0; i < 1000000; ++i) {
size_t size = rr.Uniform(32);
TString res = "www.yandex.ru/yandsearch?text=";
- for (size_t j = 0; j < size; ++j) {
+ for (size_t j = 0; j < size; ++j) {
res += "qwer"[rr.Uniform(4)];
- }
- lines.push_back(res);
- }
- THolder<TDataSampler> sampler(new TDataSampler);
- for (size_t i = 0; i < lines.size(); ++i) {
+ }
+ lines.push_back(res);
+ }
+ THolder<TDataSampler> sampler(new TDataSampler);
+ for (size_t i = 0; i < lines.size(); ++i) {
sampler->AddStat(lines[i]);
- }
- TCompressorTable table;
- sampler->BuildTable(table);
-
- DoTest<true>(table, lines);
- DoTest<false>(table, lines);
-
+ }
+ TCompressorTable table;
+ sampler->BuildTable(table);
+
+ DoTest<true>(table, lines);
+ DoTest<false>(table, lines);
+
Y_UNUSED(argc);
Y_UNUSED(argv);
- return 0;
-}
+ return 0;
+}
diff --git a/library/cpp/comptable/usage/ya.make b/library/cpp/comptable/usage/ya.make
index ab31e7528c..b23b545623 100644
--- a/library/cpp/comptable/usage/ya.make
+++ b/library/cpp/comptable/usage/ya.make
@@ -1,13 +1,13 @@
-PROGRAM()
-
+PROGRAM()
+
OWNER(ironpeter)
-
+
SRCS(
usage.cpp
)
-
+
PEERDIR(
library/cpp/comptable
)
-END()
+END()
diff --git a/library/cpp/comptable/ut/comptable_ut.cpp b/library/cpp/comptable/ut/comptable_ut.cpp
index 5901d0246f..0f58a60e87 100644
--- a/library/cpp/comptable/ut/comptable_ut.cpp
+++ b/library/cpp/comptable/ut/comptable_ut.cpp
@@ -1,64 +1,64 @@
#include <library/cpp/comptable/comptable.h>
#include <library/cpp/testing/unittest/registar.h>
-
+
#include <util/random/random.h>
#include <util/random/fast.h>
-using namespace NCompTable;
-
+using namespace NCompTable;
+
template <bool HQ>
void DoTest(const TCompressorTable& table, const TVector<TString>& lines) {
TVector<char> compressed;
TVector<char> decompressed;
-
- TChunkCompressor compressor(HQ, table);
- TStringStream tmp;
- Save(&tmp, table);
- TCompressorTable tableLoaded;
- Load(&tmp, tableLoaded);
- UNIT_ASSERT(memcmp(&table, &tableLoaded, sizeof(table)) == 0);
- TChunkDecompressor deCompressor(HQ, tableLoaded);
-
- size_t origSize = 0;
- size_t compSize = 0;
+
+ TChunkCompressor compressor(HQ, table);
+ TStringStream tmp;
+ Save(&tmp, table);
+ TCompressorTable tableLoaded;
+ Load(&tmp, tableLoaded);
+ UNIT_ASSERT(memcmp(&table, &tableLoaded, sizeof(table)) == 0);
+ TChunkDecompressor deCompressor(HQ, tableLoaded);
+
+ size_t origSize = 0;
+ size_t compSize = 0;
for (size_t i = 0; i < lines.size(); ++i) {
const TString& line = lines[i];
- compressor.Compress(line, &compressed);
- origSize += line.size();
- compSize += compressed.size();
- TStringBuf in(compressed.data(), compressed.size());
- deCompressor.Decompress(in, &decompressed);
+ compressor.Compress(line, &compressed);
+ origSize += line.size();
+ compSize += compressed.size();
+ TStringBuf in(compressed.data(), compressed.size());
+ deCompressor.Decompress(in, &decompressed);
UNIT_ASSERT(decompressed.size() == line.size() && memcmp(decompressed.data(), line.data(), decompressed.size()) == 0);
- }
+ }
UNIT_ASSERT_EQUAL(origSize, 45491584);
- if (HQ) {
+ if (HQ) {
UNIT_ASSERT_EQUAL(compSize, 11074583);
- } else {
+ } else {
UNIT_ASSERT_EQUAL(compSize, 17459336);
- }
- UNIT_ASSERT(compSize < origSize);
-}
-
+ }
+ UNIT_ASSERT(compSize < origSize);
+}
+
Y_UNIT_TEST_SUITE(TestComptable) {
Y_UNIT_TEST(TestComptableCompressDecompress) {
TReallyFastRng32 rr(17);
TVector<TString> lines;
- for (size_t i = 0; i < 1000000; ++i) {
+ for (size_t i = 0; i < 1000000; ++i) {
size_t size = rr.Uniform(32);
TString res = "www.yandex.ru/yandsearch?text=";
- for (size_t j = 0; j < size; ++j) {
+ for (size_t j = 0; j < size; ++j) {
res += "qwer"[rr.Uniform(4)];
- }
- lines.push_back(res);
- }
- THolder<TDataSampler> sampler(new TDataSampler);
- for (size_t i = 0; i < lines.size(); ++i) {
+ }
+ lines.push_back(res);
+ }
+ THolder<TDataSampler> sampler(new TDataSampler);
+ for (size_t i = 0; i < lines.size(); ++i) {
sampler->AddStat(lines[i]);
- }
- TCompressorTable table;
- sampler->BuildTable(table);
-
- DoTest<true>(table, lines);
- DoTest<false>(table, lines);
- }
-}
+ }
+ TCompressorTable table;
+ sampler->BuildTable(table);
+
+ DoTest<true>(table, lines);
+ DoTest<false>(table, lines);
+ }
+}
diff --git a/library/cpp/comptable/ut/ya.make b/library/cpp/comptable/ut/ya.make
index d0a49793a5..75bc8d1031 100644
--- a/library/cpp/comptable/ut/ya.make
+++ b/library/cpp/comptable/ut/ya.make
@@ -1,9 +1,9 @@
UNITTEST_FOR(library/cpp/comptable)
-
+
OWNER(ironpeter)
-
+
SRCS(
comptable_ut.cpp
)
-
-END()
+
+END()
diff --git a/library/cpp/comptable/ya.make b/library/cpp/comptable/ya.make
index 314603c62a..0e52c5b698 100644
--- a/library/cpp/comptable/ya.make
+++ b/library/cpp/comptable/ya.make
@@ -1,13 +1,13 @@
-LIBRARY()
-
+LIBRARY()
+
OWNER(ironpeter)
-
+
SRCS(
comptable.cpp
)
-
+
PEERDIR(
library/cpp/compproto
)
-
-END()
+
+END()