diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /library/cpp/streams/lz/lz.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'library/cpp/streams/lz/lz.cpp')
-rw-r--r-- | library/cpp/streams/lz/lz.cpp | 731 |
1 files changed, 731 insertions, 0 deletions
diff --git a/library/cpp/streams/lz/lz.cpp b/library/cpp/streams/lz/lz.cpp new file mode 100644 index 0000000000..b65bb3ed96 --- /dev/null +++ b/library/cpp/streams/lz/lz.cpp @@ -0,0 +1,731 @@ +#include "lz.h" + +#include <util/system/yassert.h> +#include <util/system/byteorder.h> +#include <util/memory/addstorage.h> +#include <util/generic/buffer.h> +#include <util/generic/utility.h> +#include <util/generic/singleton.h> +#include <util/generic/yexception.h> +#include <util/stream/mem.h> + +#include <contrib/libs/lz4/lz4.h> +#include <contrib/libs/fastlz/fastlz.h> +#include <contrib/libs/snappy/snappy.h> +#include <contrib/libs/quicklz/quicklz.h> +#include <contrib/libs/minilzo/minilzo.h> + +static inline ui8 HostToLittle(ui8 t) noexcept { + return t; +} + +static inline ui8 LittleToHost(ui8 t) noexcept { + return t; +} + +struct TCommonData { + static const size_t overhead = sizeof(ui16) + sizeof(ui8); +}; + +const size_t SIGNATURE_SIZE = 4; + +template <class TCompressor, class TBase> +class TCompressorBase: public TAdditionalStorage<TCompressorBase<TCompressor, TBase>>, public TCompressor, public TCommonData { +public: + inline TCompressorBase(IOutputStream* slave, ui16 blockSize) + : Slave_(slave) + , BlockSize_(blockSize) + { + /* + * save signature + */ + static_assert(sizeof(TCompressor::signature) - 1 == SIGNATURE_SIZE, "expect sizeof(TCompressor::signature) - 1 == SIGNATURE_SIZE"); + Slave_->Write(TCompressor::signature, sizeof(TCompressor::signature) - 1); + + /* + * save version + */ + this->Save((ui32)1); + + /* + * save block size + */ + this->Save(BlockSize()); + } + + inline ~TCompressorBase() { + } + + inline void Write(const char* buf, size_t len) { + while (len) { + const ui16 toWrite = (ui16)Min<size_t>(len, this->BlockSize()); + + this->WriteBlock(buf, toWrite); + + buf += toWrite; + len -= toWrite; + } + } + + inline void Flush() { + } + + inline void Finish() { + this->Flush(); + this->WriteBlock(nullptr, 0); + } + + template <class T> + static inline void Save(T t, IOutputStream* out) { + t = HostToLittle(t); + + out->Write(&t, sizeof(t)); + } + + template <class T> + inline void Save(T t) { + Save(t, Slave_); + } + +private: + inline void* Block() const noexcept { + return this->AdditionalData(); + } + + inline ui16 BlockSize() const noexcept { + return BlockSize_; + } + + inline void WriteBlock(const void* ptr, ui16 len) { + Y_ASSERT(len <= this->BlockSize()); + + ui8 compressed = false; + + if (len) { + const size_t out = this->Compress((const char*)ptr, len, (char*)Block(), this->AdditionalDataLength()); + // catch compressor buffer overrun (e.g. SEARCH-2043) + //Y_VERIFY(out <= this->Hint(this->BlockSize())); + + if (out < len || TCompressor::SaveIncompressibleChunks()) { + compressed = true; + ptr = Block(); + len = (ui16)out; + } + } + + char tmp[overhead]; + TMemoryOutput header(tmp, sizeof(tmp)); + + this->Save(len, &header); + this->Save(compressed, &header); + + using TPart = IOutputStream::TPart; + if (ptr) { + const TPart parts[] = { + TPart(tmp, sizeof(tmp)), + TPart(ptr, len), + }; + + Slave_->Write(parts, sizeof(parts) / sizeof(*parts)); + } else { + Slave_->Write(tmp, sizeof(tmp)); + } + } + +private: + IOutputStream* Slave_; + const ui16 BlockSize_; +}; + +template <class T> +static inline T GLoad(IInputStream* input) { + T t; + + if (input->Load(&t, sizeof(t)) != sizeof(t)) { + ythrow TDecompressorError() << "stream error"; + } + + return LittleToHost(t); +} + +class TDecompressSignature { +public: + inline TDecompressSignature(IInputStream* input) { + if (input->Load(Buffer_, SIGNATURE_SIZE) != SIGNATURE_SIZE) { + ythrow TDecompressorError() << "can not load stream signature"; + } + } + + template <class TDecompressor> + inline bool Check() const { + static_assert(sizeof(TDecompressor::signature) - 1 == SIGNATURE_SIZE, "expect sizeof(TDecompressor::signature) - 1 == SIGNATURE_SIZE"); + return memcmp(TDecompressor::signature, Buffer_, SIGNATURE_SIZE) == 0; + } + +private: + char Buffer_[SIGNATURE_SIZE]; +}; + +template <class TDecompressor> +static inline IInputStream* ConsumeSignature(IInputStream* input) { + TDecompressSignature sign(input); + if (!sign.Check<TDecompressor>()) { + ythrow TDecompressorError() << "incorrect signature"; + } + return input; +} + +template <class TDecompressor> +class TDecompressorBaseImpl: public TDecompressor, public TCommonData { +public: + static inline ui32 CheckVer(ui32 v) { + if (v != 1) { + ythrow yexception() << TStringBuf("incorrect stream version: ") << v; + } + + return v; + } + + inline TDecompressorBaseImpl(IInputStream* slave) + : Slave_(slave) + , Input_(nullptr, 0) + , Eof_(false) + , Version_(CheckVer(Load<ui32>())) + , BlockSize_(Load<ui16>()) + , OutBufSize_(TDecompressor::Hint(BlockSize_)) + , Tmp_(2 * OutBufSize_) + , In_(Tmp_.Data()) + , Out_(In_ + OutBufSize_) + { + this->InitFromStream(Slave_); + } + + inline ~TDecompressorBaseImpl() { + } + + inline size_t Read(void* buf, size_t len) { + size_t ret = Input_.Read(buf, len); + + if (ret) { + return ret; + } + + if (Eof_) { + return 0; + } + + this->FillNextBlock(); + + ret = Input_.Read(buf, len); + + if (ret) { + return ret; + } + + Eof_ = true; + + return 0; + } + + inline void FillNextBlock() { + char tmp[overhead]; + + if (Slave_->Load(tmp, sizeof(tmp)) != sizeof(tmp)) { + ythrow TDecompressorError() << "can not read block header"; + } + + TMemoryInput header(tmp, sizeof(tmp)); + + const ui16 len = GLoad<ui16>(&header); + if (len > Tmp_.Capacity()) { + ythrow TDecompressorError() << "invalid len inside block header"; + } + const ui8 compressed = GLoad<ui8>(&header); + + if (compressed > 1) { + ythrow TDecompressorError() << "broken header"; + } + + if (Slave_->Load(In_, len) != len) { + ythrow TDecompressorError() << "can not read data"; + } + + if (compressed) { + const size_t ret = this->Decompress(In_, len, Out_, OutBufSize_); + + Input_.Reset(Out_, ret); + } else { + Input_.Reset(In_, len); + } + } + + template <class T> + inline T Load() { + return GLoad<T>(Slave_); + } + +protected: + IInputStream* Slave_; + TMemoryInput Input_; + bool Eof_; + const ui32 Version_; + const ui16 BlockSize_; + const size_t OutBufSize_; + TBuffer Tmp_; + char* In_; + char* Out_; +}; + +template <class TDecompressor, class TBase> +class TDecompressorBase: public TDecompressorBaseImpl<TDecompressor> { +public: + inline TDecompressorBase(IInputStream* slave) + : TDecompressorBaseImpl<TDecompressor>(ConsumeSignature<TDecompressor>(slave)) + { + } + + inline ~TDecompressorBase() { + } +}; + +#define DEF_COMPRESSOR_COMMON(rname, name) \ + rname::~rname() { \ + try { \ + Finish(); \ + } catch (...) { \ + } \ + } \ + \ + void rname::DoWrite(const void* buf, size_t len) { \ + if (!Impl_) { \ + ythrow yexception() << "can not write to finalized stream"; \ + } \ + \ + Impl_->Write((const char*)buf, len); \ + } \ + \ + void rname::DoFlush() { \ + if (!Impl_) { \ + ythrow yexception() << "can not flush finalized stream"; \ + } \ + \ + Impl_->Flush(); \ + } \ + \ + void rname::DoFinish() { \ + THolder<TImpl> impl(Impl_.Release()); \ + \ + if (impl) { \ + impl->Finish(); \ + } \ + } + +#define DEF_COMPRESSOR(rname, name) \ + class rname::TImpl: public TCompressorBase<name, TImpl> { \ + public: \ + inline TImpl(IOutputStream* out, ui16 blockSize) \ + : TCompressorBase<name, TImpl>(out, blockSize) { \ + } \ + }; \ + \ + rname::rname(IOutputStream* slave, ui16 blockSize) \ + : Impl_(new (TImpl::Hint(blockSize)) TImpl(slave, blockSize)) { \ + } \ + \ + DEF_COMPRESSOR_COMMON(rname, name) + +#define DEF_DECOMPRESSOR(rname, name) \ + class rname::TImpl: public TDecompressorBase<name, TImpl> { \ + public: \ + inline TImpl(IInputStream* in) \ + : TDecompressorBase<name, TImpl>(in) { \ + } \ + }; \ + \ + rname::rname(IInputStream* slave) \ + : Impl_(new TImpl(slave)) { \ + } \ + \ + rname::~rname() { \ + } \ + \ + size_t rname::DoRead(void* buf, size_t len) { \ + return Impl_->Read(buf, len); \ + } + +/* + * MiniLzo + */ +class TMiniLzo { + class TInit { + public: + inline TInit() { + if (lzo_init() != LZO_E_OK) { + ythrow yexception() << "can not init lzo engine"; + } + } + }; + +public: + static const char signature[]; + + inline TMiniLzo() { + Singleton<TInit>(); + } + + inline ~TMiniLzo() { + } + + static inline size_t Hint(size_t len) noexcept { + // see SEARCH-2043 and, e.g. examples at + // http://stackoverflow.com/questions/4235019/how-to-get-lzo-to-work-with-a-file-stream + return len + (len / 16) + 64 + 3; + } + + static inline bool SaveIncompressibleChunks() noexcept { + return false; + } +}; + +const char TMiniLzo::signature[] = "YLZO"; + +template <size_t N> +class TFixedArray { +public: + inline TFixedArray() noexcept { + memset(WorkMem_, 0, sizeof(WorkMem_)); + } + +protected: + char WorkMem_[N]; +}; + +class TMiniLzoCompressor: public TMiniLzo, public TFixedArray<LZO1X_MEM_COMPRESS + 1> { +public: + inline size_t Compress(const char* data, size_t len, char* ptr, size_t /*dstMaxSize*/) { + lzo_uint out = 0; + lzo1x_1_compress((const lzo_bytep)data, len, (lzo_bytep)ptr, &out, WorkMem_); + + return out; + } +}; + +class TMiniLzoDecompressor: public TMiniLzo, public TFixedArray<LZO1X_MEM_DECOMPRESS + 1> { +public: + inline size_t Decompress(const char* data, size_t len, char* ptr, size_t /*max*/) { + lzo_uint ret = 0; + + lzo1x_decompress((const lzo_bytep)data, len, (lzo_bytep)ptr, &ret, WorkMem_); + + return ret; + } + + inline void InitFromStream(IInputStream*) const noexcept { + } +}; + +DEF_COMPRESSOR(TLzoCompress, TMiniLzoCompressor) +DEF_DECOMPRESSOR(TLzoDecompress, TMiniLzoDecompressor) + +/* + * FastLZ + */ +class TFastLZ { +public: + static const char signature[]; + + static inline size_t Hint(size_t len) noexcept { + return Max<size_t>((size_t)(len * 1.06), 100); + } + + inline size_t Compress(const char* data, size_t len, char* ptr, size_t /*dstMaxSize*/) { + return fastlz_compress(data, len, ptr); + } + + inline size_t Decompress(const char* data, size_t len, char* ptr, size_t max) { + return fastlz_decompress(data, len, ptr, max); + } + + inline void InitFromStream(IInputStream*) const noexcept { + } + + static inline bool SaveIncompressibleChunks() noexcept { + return false; + } +}; + +const char TFastLZ::signature[] = "YLZF"; + +DEF_COMPRESSOR(TLzfCompress, TFastLZ) +DEF_DECOMPRESSOR(TLzfDecompress, TFastLZ) + +/* + * LZ4 + */ +class TLZ4 { +public: + static const char signature[]; + + static inline size_t Hint(size_t len) noexcept { + return Max<size_t>((size_t)(len * 1.06), 100); + } + + inline size_t Compress(const char* data, size_t len, char* ptr, size_t dstMaxSize) { + return LZ4_compress_default(data, ptr, len, dstMaxSize); + } + + inline size_t Decompress(const char* data, size_t len, char* ptr, size_t max) { + int res = LZ4_decompress_safe(data, ptr, len, max); + if (res < 0) + ythrow TDecompressorError(); + return res; + } + + inline void InitFromStream(IInputStream*) const noexcept { + } + + static inline bool SaveIncompressibleChunks() noexcept { + return false; + } +}; + +const char TLZ4::signature[] = "LZ.4"; + +DEF_COMPRESSOR(TLz4Compress, TLZ4) +DEF_DECOMPRESSOR(TLz4Decompress, TLZ4) + +/* + * Snappy + */ +class TSnappy { +public: + static const char signature[]; + + static inline size_t Hint(size_t len) noexcept { + return Max<size_t>(snappy::MaxCompressedLength(len), 100); + } + + inline size_t Compress(const char* data, size_t len, char* ptr, size_t /*dstMaxSize*/) { + size_t reslen = 0; + snappy::RawCompress(data, len, ptr, &reslen); + return reslen; + } + + inline size_t Decompress(const char* data, size_t len, char* ptr, size_t) { + size_t srclen = 0; + if (!snappy::GetUncompressedLength(data, len, &srclen) || !snappy::RawUncompress(data, len, ptr)) + ythrow TDecompressorError(); + return srclen; + } + + inline void InitFromStream(IInputStream*) const noexcept { + } + + static inline bool SaveIncompressibleChunks() noexcept { + return false; + } +}; + +const char TSnappy::signature[] = "Snap"; + +DEF_COMPRESSOR(TSnappyCompress, TSnappy) +DEF_DECOMPRESSOR(TSnappyDecompress, TSnappy) + +/* + * QuickLZ + */ +class TQuickLZBase { +public: + static const char signature[]; + + static inline size_t Hint(size_t len) noexcept { + return len + 500; + } + + inline TQuickLZBase() + : Table_(nullptr) + { + } + + inline void Init(unsigned ver, unsigned lev, unsigned mod, unsigned type) { + Table_ = LzqTable(ver, lev, mod); + + if (!Table_) { + ythrow yexception() << "unsupported lzq stream(" << ver << ", " << lev << ", " << mod << ")"; + } + + const size_t size = Table_->Setting(3) + Table_->Setting(type); + + Mem_.Reset(::operator new(size)); + memset(Mem_.Get(), 0, size); + } + + inline bool SaveIncompressibleChunks() const noexcept { + // we must save incompressible chunks "as is" + // after compressor run in streaming mode + return Table_->Setting(3); + } + +protected: + const TQuickLZMethods* Table_; + THolder<void> Mem_; +}; + +const char TQuickLZBase::signature[] = "YLZQ"; + +class TQuickLZCompress: public TQuickLZBase { +public: + inline size_t Compress(const char* data, size_t len, char* ptr, size_t /*dstMaxSize*/) { + return Table_->Compress(data, ptr, len, (char*)Mem_.Get()); + } +}; + +class TQuickLZDecompress: public TQuickLZBase { +public: + inline size_t Decompress(const char* data, size_t /*len*/, char* ptr, size_t /*max*/) { + return Table_->Decompress(data, ptr, (char*)Mem_.Get()); + } + + inline void InitFromStream(IInputStream* in) { + const ui8 ver = ::GLoad<ui8>(in); + const ui8 lev = ::GLoad<ui8>(in); + const ui8 mod = ::GLoad<ui8>(in); + + Init(ver, lev, mod, 2); + } +}; + +class TLzqCompress::TImpl: public TCompressorBase<TQuickLZCompress, TImpl> { +public: + inline TImpl(IOutputStream* out, ui16 blockSize, EVersion ver, unsigned level, EMode mode) + : TCompressorBase<TQuickLZCompress, TImpl>(out, blockSize) + { + memset(AdditionalData(), 0, AdditionalDataLength()); + + Init(ver, level, mode, 1); + + Save((ui8)ver); + Save((ui8)level); + Save((ui8)mode); + } +}; + +TLzqCompress::TLzqCompress(IOutputStream* slave, ui16 blockSize, EVersion ver, unsigned level, EMode mode) + : Impl_(new (TImpl::Hint(blockSize)) TImpl(slave, blockSize, ver, level, mode)) +{ +} + +DEF_COMPRESSOR_COMMON(TLzqCompress, TQuickLZCompress) +DEF_DECOMPRESSOR(TLzqDecompress, TQuickLZDecompress) + +namespace { + template <class T> + struct TInputHolder { + static inline T Set(T t) noexcept { + return t; + } + }; + + template <class T> + struct TInputHolder<TAutoPtr<T>> { + inline T* Set(TAutoPtr<T> v) noexcept { + V_ = v; + + return V_.Get(); + } + + TAutoPtr<T> V_; + }; + + // Decompressing input streams without signature verification + template <class TInput, class TDecompressor> + class TLzDecompressInput: public TInputHolder<TInput>, public IInputStream { + public: + inline TLzDecompressInput(TInput in) + : Impl_(this->Set(in)) + { + } + + private: + size_t DoRead(void* buf, size_t len) override { + return Impl_.Read(buf, len); + } + + private: + TDecompressorBaseImpl<TDecompressor> Impl_; + }; +} + +template <class T> +static TAutoPtr<IInputStream> TryOpenLzDecompressorX(const TDecompressSignature& s, T input) { + if (s.Check<TLZ4>()) + return new TLzDecompressInput<T, TLZ4>(input); + + if (s.Check<TSnappy>()) + return new TLzDecompressInput<T, TSnappy>(input); + + if (s.Check<TMiniLzo>()) + return new TLzDecompressInput<T, TMiniLzoDecompressor>(input); + + if (s.Check<TFastLZ>()) + return new TLzDecompressInput<T, TFastLZ>(input); + + if (s.Check<TQuickLZDecompress>()) + return new TLzDecompressInput<T, TQuickLZDecompress>(input); + + return nullptr; +} + +template <class T> +static inline TAutoPtr<IInputStream> TryOpenLzDecompressorImpl(const TStringBuf& signature, T input) { + if (signature.size() == SIGNATURE_SIZE) { + TMemoryInput mem(signature.data(), signature.size()); + TDecompressSignature s(&mem); + + return TryOpenLzDecompressorX(s, input); + } + + return nullptr; +} + +template <class T> +static inline TAutoPtr<IInputStream> TryOpenLzDecompressorImpl(T input) { + TDecompressSignature s(&*input); + + return TryOpenLzDecompressorX(s, input); +} + +template <class T> +static inline TAutoPtr<IInputStream> OpenLzDecompressorImpl(T input) { + TAutoPtr<IInputStream> ret = TryOpenLzDecompressorImpl(input); + + if (!ret) { + ythrow TDecompressorError() << "Unknown compression format"; + } + + return ret; +} + +TAutoPtr<IInputStream> OpenLzDecompressor(IInputStream* input) { + return OpenLzDecompressorImpl(input); +} + +TAutoPtr<IInputStream> TryOpenLzDecompressor(IInputStream* input) { + return TryOpenLzDecompressorImpl(input); +} + +TAutoPtr<IInputStream> TryOpenLzDecompressor(const TStringBuf& signature, IInputStream* input) { + return TryOpenLzDecompressorImpl(signature, input); +} + +TAutoPtr<IInputStream> OpenOwnedLzDecompressor(TAutoPtr<IInputStream> input) { + return OpenLzDecompressorImpl(input); +} + +TAutoPtr<IInputStream> TryOpenOwnedLzDecompressor(TAutoPtr<IInputStream> input) { + return TryOpenLzDecompressorImpl(input); +} + +TAutoPtr<IInputStream> TryOpenOwnedLzDecompressor(const TStringBuf& signature, TAutoPtr<IInputStream> input) { + return TryOpenLzDecompressorImpl(signature, input); +} |