aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/codecs/zstd_dict_codec.cpp
diff options
context:
space:
mode:
author Ruslan Kovalev <ruslan.a.kovalev@gmail.com> 2022-02-10 16:46:45 +0300
committer Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:46:45 +0300
commit 9123176b341b6f2658cff5132482b8237c1416c8 (patch)
tree 49e222ea1c5804306084bb3ae065bb702625360f /library/cpp/codecs/zstd_dict_codec.cpp
parent 59e19371de37995fcb36beb16cd6ec030af960bc (diff)
download ydb-9123176b341b6f2658cff5132482b8237c1416c8.tar.gz
Restoring authorship annotation for Ruslan Kovalev <ruslan.a.kovalev@gmail.com>. Commit 2 of 2.
Diffstat (limited to 'library/cpp/codecs/zstd_dict_codec.cpp')
-rw-r--r-- library/cpp/codecs/zstd_dict_codec.cpp 440
1 files changed, 220 insertions, 220 deletions
diff --git a/library/cpp/codecs/zstd_dict_codec.cpp b/library/cpp/codecs/zstd_dict_codec.cpp
index d543736b3d..c42a2879e6 100644
--- a/library/cpp/codecs/zstd_dict_codec.cpp
+++ b/library/cpp/codecs/zstd_dict_codec.cpp
@@ -1,173 +1,173 @@
-#include "zstd_dict_codec.h"
-
+#include "zstd_dict_codec.h"
+
#include <library/cpp/packers/packers.h>
-
-#include <util/generic/ptr.h>
-#include <util/generic/refcount.h>
-#include <util/generic/noncopyable.h>
-#include <util/string/builder.h>
-#include <util/system/src_location.h>
-#include <util/ysaveload.h>
-
-#define ZDICT_STATIC_LINKING_ONLY
-
+
+#include <util/generic/ptr.h>
+#include <util/generic/refcount.h>
+#include <util/generic/noncopyable.h>
+#include <util/string/builder.h>
+#include <util/system/src_location.h>
+#include <util/ysaveload.h>
+
+#define ZDICT_STATIC_LINKING_ONLY
+
#include <contrib/libs/zstd/include/zdict.h>
#include <contrib/libs/zstd/include/zstd.h>
#include <contrib/libs/zstd/include/zstd_errors.h>
-
-// See IGNIETFERRO-320 for possible bugs
-
-namespace NCodecs {
- class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> {
- template <class T, size_t Deleter(T*)>
- class TPtrHolder : TMoveOnly {
- T* Ptr = nullptr;
-
- public:
- TPtrHolder() = default;
-
- TPtrHolder(T* dict)
- : Ptr(dict)
+
+// See IGNIETFERRO-320 for possible bugs
+
+namespace NCodecs {
+ class TZStdDictCodec::TImpl: public TAtomicRefCount<TZStdDictCodec::TImpl> {
+ template <class T, size_t Deleter(T*)>
+ class TPtrHolder : TMoveOnly {
+ T* Ptr = nullptr;
+
+ public:
+ TPtrHolder() = default;
+
+ TPtrHolder(T* dict)
+ : Ptr(dict)
{
}
-
- T* Get() {
- return Ptr;
- }
-
- const T* Get() const {
- return Ptr;
- }
-
- void Reset(T* dict) {
- Dispose();
- Ptr = dict;
- }
-
- void Dispose() {
- if (Ptr) {
- Deleter(Ptr);
- Ptr = nullptr;
- }
- }
-
- ~TPtrHolder() {
- Dispose();
- }
- };
-
- using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>;
- using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>;
- using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>;
- using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>;
-
- using TSizePacker = NPackers::TPacker<ui64>;
-
- public:
- static const ui32 SampleSize = (1 << 22) * 5;
-
- explicit TImpl(ui32 comprLevel)
- : CompressionLevel(comprLevel)
- {
- const size_t zeroSz = TSizePacker().MeasureLeaf(0);
- Zero.Resize(zeroSz);
+
+ T* Get() {
+ return Ptr;
+ }
+
+ const T* Get() const {
+ return Ptr;
+ }
+
+ void Reset(T* dict) {
+ Dispose();
+ Ptr = dict;
+ }
+
+ void Dispose() {
+ if (Ptr) {
+ Deleter(Ptr);
+ Ptr = nullptr;
+ }
+ }
+
+ ~TPtrHolder() {
+ Dispose();
+ }
+ };
+
+ using TCDict = TPtrHolder<ZSTD_CDict, ZSTD_freeCDict>;
+ using TDDict = TPtrHolder<ZSTD_DDict, ZSTD_freeDDict>;
+ using TCCtx = TPtrHolder<ZSTD_CCtx, ZSTD_freeCCtx>;
+ using TDCtx = TPtrHolder<ZSTD_DCtx, ZSTD_freeDCtx>;
+
+ using TSizePacker = NPackers::TPacker<ui64>;
+
+ public:
+ static const ui32 SampleSize = (1 << 22) * 5;
+
+ explicit TImpl(ui32 comprLevel)
+ : CompressionLevel(comprLevel)
+ {
+ const size_t zeroSz = TSizePacker().MeasureLeaf(0);
+ Zero.Resize(zeroSz);
TSizePacker().PackLeaf(Zero.data(), 0, zeroSz);
- }
-
- ui32 GetCompressionLevel() const {
- return CompressionLevel;
- }
-
- ui8 Encode(TStringBuf in, TBuffer& outbuf) const {
- outbuf.Clear();
-
+ }
+
+ ui32 GetCompressionLevel() const {
+ return CompressionLevel;
+ }
+
+ ui8 Encode(TStringBuf in, TBuffer& outbuf) const {
+ outbuf.Clear();
+
if (in.empty()) {
- return 0;
- }
-
- TSizePacker packer;
-
+ return 0;
+ }
+
+ TSizePacker packer;
+
const char* rawBeg = in.data();
const size_t rawSz = in.size();
-
- const size_t szSz = packer.MeasureLeaf(rawSz);
- const size_t maxDatSz = ZSTD_compressBound(rawSz);
-
- outbuf.Resize(szSz + maxDatSz);
+
+ const size_t szSz = packer.MeasureLeaf(rawSz);
+ const size_t maxDatSz = ZSTD_compressBound(rawSz);
+
+ outbuf.Resize(szSz + maxDatSz);
packer.PackLeaf(outbuf.data(), rawSz, szSz);
-
- TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)};
- const size_t resSz = CheckSize(ZSTD_compress_usingCDict(
+
+ TCCtx ctx{CheckPtr(ZSTD_createCCtx(), __LOCATION__)};
+ const size_t resSz = CheckSize(ZSTD_compress_usingCDict(
ctx.Get(), outbuf.data() + szSz, maxDatSz, rawBeg, rawSz, CDict.Get()),
__LOCATION__);
-
- if (resSz < rawSz) {
- outbuf.Resize(resSz + szSz);
- } else {
+
+ if (resSz < rawSz) {
+ outbuf.Resize(resSz + szSz);
+ } else {
outbuf.Resize(Zero.size() + rawSz);
memcpy(outbuf.data(), Zero.data(), Zero.size());
memcpy(outbuf.data() + Zero.size(), rawBeg, rawSz);
- }
- return 0;
- }
-
- void Decode(TStringBuf in, TBuffer& outbuf) const {
- outbuf.Clear();
-
+ }
+ return 0;
+ }
+
+ void Decode(TStringBuf in, TBuffer& outbuf) const {
+ outbuf.Clear();
+
if (in.empty()) {
- return;
- }
-
- TSizePacker packer;
-
+ return;
+ }
+
+ TSizePacker packer;
+
const char* rawBeg = in.data();
size_t rawSz = in.size();
-
- const size_t szSz = packer.SkipLeaf(rawBeg);
- ui64 datSz = 0;
- packer.UnpackLeaf(rawBeg, datSz);
-
- rawBeg += szSz;
- rawSz -= szSz;
-
- if (!datSz) {
- outbuf.Resize(rawSz);
+
+ const size_t szSz = packer.SkipLeaf(rawBeg);
+ ui64 datSz = 0;
+ packer.UnpackLeaf(rawBeg, datSz);
+
+ rawBeg += szSz;
+ rawSz -= szSz;
+
+ if (!datSz) {
+ outbuf.Resize(rawSz);
memcpy(outbuf.data(), rawBeg, rawSz);
- } else {
+ } else {
// size_t zSz = ZSTD_getDecompressedSize(rawBeg, rawSz);
// Y_ENSURE_EX(datSz == zSz, TCodecException() << datSz << " != " << zSz);
- outbuf.Resize(datSz);
- TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)};
- CheckSize(ZSTD_decompress_usingDDict(
+ outbuf.Resize(datSz);
+ TDCtx ctx{CheckPtr(ZSTD_createDCtx(), __LOCATION__)};
+ CheckSize(ZSTD_decompress_usingDDict(
ctx.Get(), outbuf.data(), outbuf.size(), rawBeg, rawSz, DDict.Get()),
__LOCATION__);
- outbuf.Resize(datSz);
- }
- }
-
+ outbuf.Resize(datSz);
+ }
+ }
+
bool Learn(ISequenceReader& in, bool throwOnError) {
- TBuffer data;
+ TBuffer data;
TVector<size_t> lens;
-
- data.Reserve(2 * SampleSize);
- TStringBuf r;
- while (in.NextRegion(r)) {
- if (!r) {
- continue;
- }
+
+ data.Reserve(2 * SampleSize);
+ TStringBuf r;
+ while (in.NextRegion(r)) {
+ if (!r) {
+ continue;
+ }
data.Append(r.data(), r.size());
lens.push_back(r.size());
- }
-
+ }
+
ZDICT_legacy_params_t params;
- memset(&params, 0, sizeof(params));
+ memset(&params, 0, sizeof(params));
params.zParams.compressionLevel = 1;
params.zParams.notificationLevel = 1;
- Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size()));
-
- if (!lens) {
- Dict.Reset();
- } else {
+ Dict.Resize(Max<size_t>(1 << 20, data.Size() + 16 * lens.size()));
+
+ if (!lens) {
+ Dict.Reset();
+ } else {
size_t trainResult = ZDICT_trainFromBuffer_legacy(
Dict.data(), Dict.size(), data.Data(), const_cast<const size_t*>(&lens[0]), lens.size(), params);
if (ZSTD_isError(trainResult)) {
@@ -177,105 +177,105 @@ namespace NCodecs {
CheckSize(trainResult, __LOCATION__);
}
Dict.Resize(trainResult);
- Dict.ShrinkToFit();
- }
- InitContexts();
+ Dict.ShrinkToFit();
+ }
+ InitContexts();
return true;
- }
-
+ }
+
void Save(IOutputStream* out) const {
- ::Save(out, Dict);
- }
-
+ ::Save(out, Dict);
+ }
+
void Load(IInputStream* in) {
- ::Load(in, Dict);
- InitContexts();
- }
-
- void InitContexts() {
+ ::Load(in, Dict);
+ InitContexts();
+ }
+
+ void InitContexts() {
CDict.Reset(CheckPtr(ZSTD_createCDict(Dict.data(), Dict.size(), CompressionLevel), __LOCATION__));
DDict.Reset(CheckPtr(ZSTD_createDDict(Dict.data(), Dict.size()), __LOCATION__));
- }
-
- static size_t CheckSize(size_t sz, TSourceLocation loc) {
- if (ZSTD_isError(sz)) {
- ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")";
- }
- return sz;
- }
-
- template <class T>
- static T* CheckPtr(T* t, TSourceLocation loc) {
+ }
+
+ static size_t CheckSize(size_t sz, TSourceLocation loc) {
+ if (ZSTD_isError(sz)) {
+ ythrow TCodecException() << loc << " " << ZSTD_getErrorName(sz) << " (code " << (int)ZSTD_getErrorCode(sz) << ")";
+ }
+ return sz;
+ }
+
+ template <class T>
+ static T* CheckPtr(T* t, TSourceLocation loc) {
Y_ENSURE_EX(t, TCodecException() << loc << " "
<< "unexpected nullptr");
- return t;
- }
-
- private:
- ui32 CompressionLevel = 1;
-
- TBuffer Zero;
- TBuffer Dict;
-
- TCDict CDict;
- TDDict DDict;
- };
-
- TZStdDictCodec::TZStdDictCodec(ui32 comprLevel)
- : Impl(new TImpl(comprLevel))
- {
- MyTraits.NeedsTraining = true;
- MyTraits.SizeOnEncodeMultiplier = 2;
- MyTraits.SizeOnDecodeMultiplier = 10;
- MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar
- }
-
+ return t;
+ }
+
+ private:
+ ui32 CompressionLevel = 1;
+
+ TBuffer Zero;
+ TBuffer Dict;
+
+ TCDict CDict;
+ TDDict DDict;
+ };
+
+ TZStdDictCodec::TZStdDictCodec(ui32 comprLevel)
+ : Impl(new TImpl(comprLevel))
+ {
+ MyTraits.NeedsTraining = true;
+ MyTraits.SizeOnEncodeMultiplier = 2;
+ MyTraits.SizeOnDecodeMultiplier = 10;
+ MyTraits.RecommendedSampleSize = TImpl::SampleSize; // same as for solar
+ }
+
TZStdDictCodec::~TZStdDictCodec() {
}
-
+
TString TZStdDictCodec::GetName() const {
return TStringBuilder() << MyName() << "-" << Impl->GetCompressionLevel();
- }
-
- ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const {
- return Impl->Encode(in, out);
- }
-
- void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const {
- Impl->Decode(in, out);
- }
-
- void TZStdDictCodec::DoLearn(ISequenceReader& in) {
- Impl = new TImpl(Impl->GetCompressionLevel());
+ }
+
+ ui8 TZStdDictCodec::Encode(TStringBuf in, TBuffer& out) const {
+ return Impl->Encode(in, out);
+ }
+
+ void TZStdDictCodec::Decode(TStringBuf in, TBuffer& out) const {
+ Impl->Decode(in, out);
+ }
+
+ void TZStdDictCodec::DoLearn(ISequenceReader& in) {
+ Impl = new TImpl(Impl->GetCompressionLevel());
Impl->Learn(in, true/*throwOnError*/);
- }
-
+ }
+
bool TZStdDictCodec::DoTryToLearn(ISequenceReader& in) {
Impl = new TImpl(Impl->GetCompressionLevel());
return Impl->Learn(in, false/*throwOnError*/);
}
void TZStdDictCodec::Save(IOutputStream* out) const {
- Impl->Save(out);
- }
-
+ Impl->Save(out);
+ }
+
void TZStdDictCodec::Load(IInputStream* in) {
- Impl->Load(in);
- }
-
+ Impl->Load(in);
+ }
+
TVector<TString> TZStdDictCodec::ListCompressionNames() {
TVector<TString> res;
- for (int i = 1; i <= ZSTD_maxCLevel(); ++i) {
- res.emplace_back(TStringBuilder() << MyName() << "-" << i);
- }
- return res;
- }
-
- int TZStdDictCodec::ParseCompressionName(TStringBuf name) {
- int c = 0;
- TryFromString(name.After('-'), c);
- Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name);
- return c;
- }
-
-}
+ for (int i = 1; i <= ZSTD_maxCLevel(); ++i) {
+ res.emplace_back(TStringBuilder() << MyName() << "-" << i);
+ }
+ return res;
+ }
+
+ int TZStdDictCodec::ParseCompressionName(TStringBuf name) {
+ int c = 0;
+ TryFromString(name.After('-'), c);
+ Y_ENSURE_EX(name.Before('-') == MyName() && c > 0 && c <= ZSTD_maxCLevel(), TCodecException() << "invald codec name" << name);
+ return c;
+ }
+
+}