1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
#pragma once
#include <memory>
#include <vector>
#include <boost/noncopyable.hpp>
#include <Compression/CompressionInfo.h>
#include <base/types.h>
#include <Parsers/IAST.h>
#include <Common/SipHash.h>
namespace DB
{
/// Fuzz-testing entry point, forward-declared here so that ICompressionCodec can name it as a friend.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size);
/**
* Represents interface for compression codecs like LZ4, ZSTD, etc.
*/
class ICompressionCodec : private boost::noncopyable
{
public:
virtual ~ICompressionCodec() = default;
/// Byte which indicates codec in compressed file
virtual uint8_t getMethodByte() const = 0;
/// Codec description, for example "ZSTD(2)" or "LZ4,LZ4HC(5)"
virtual ASTPtr getCodecDesc() const;
/// Codec description with "CODEC" prefix, for example "CODEC(ZSTD(2))" or
/// "CODEC(LZ4,LZ4HC(5))"
ASTPtr getFullCodecDesc() const;
/// Hash, that depends on codec ast and optional parameters like data type
virtual void updateHash(SipHash & hash) const = 0;
UInt64 getHash() const;
/// Compressed bytes from uncompressed source to dest. Dest should preallocate memory
UInt32 compress(const char * source, UInt32 source_size, char * dest) const;
/// Decompress bytes from compressed source to dest. Dest should preallocate memory;
UInt32 decompress(const char * source, UInt32 source_size, char * dest) const;
/// Three kinds of codec mode:
/// Synchronous mode which is commonly used by default;
/// --- For the codec with HW decompressor, it means submit request to HW and busy wait till complete.
/// Asynchronous mode which required HW decompressor support;
/// --- For the codec with HW decompressor, it means submit request to HW and return immediately.
/// --- Must be used in pair with flushAsynchronousDecompressRequests.
/// SoftwareFallback mode is exclusively defined for the codec with HW decompressor, enable its capability of "fallback to SW codec".
enum class CodecMode
{
Synchronous,
Asynchronous,
SoftwareFallback
};
/// Get current decompression mode
CodecMode getDecompressMode() const{ return decompressMode; }
/// if set mode to CodecMode::Asynchronous, must be followed with flushAsynchronousDecompressRequests
void setDecompressMode(CodecMode mode){ decompressMode = mode; }
/// Flush result for previous asynchronous decompression requests.
/// This function must be called following several requests offload to HW.
/// To make sure asynchronous results have been flushed into target buffer completely.
/// Meanwhile, source and target buffer for decompression can not be overwritten until this function execute completely.
/// Otherwise it would conflict with HW offloading and cause exception.
/// For QPL deflate, it support the maximum number of requests equal to DeflateQplJobHWPool::jobPoolSize
virtual void flushAsynchronousDecompressRequests(){}
/// Number of bytes, that will be used to compress uncompressed_size bytes with current codec
virtual UInt32 getCompressedReserveSize(UInt32 uncompressed_size) const
{
return getHeaderSize() + getMaxCompressedDataSize(uncompressed_size);
}
/// Some codecs (LZ4, for example) require additional bytes at end of buffer
virtual UInt32 getAdditionalSizeAtTheEndOfBuffer() const { return 0; }
/// Size of header in compressed data on disk
static constexpr UInt8 getHeaderSize() { return COMPRESSED_BLOCK_HEADER_SIZE; }
/// Read size of compressed block from compressed source
static UInt32 readCompressedBlockSize(const char * source);
/// Read size of decompressed block from compressed source
static UInt32 readDecompressedBlockSize(const char * source);
/// Read method byte from compressed source
static uint8_t readMethod(const char * source);
/// Return true if this codec actually compressing something. Otherwise it can be just transformation that helps compression (e.g. Delta).
virtual bool isCompression() const = 0;
/// Is it a generic compression algorithm like lz4, zstd. Usually it does not make sense to apply generic compression more than single time.
virtual bool isGenericCompression() const = 0;
/// If it is a post-processing codec such as encryption. Usually it does not make sense to apply non-post-processing codecs after this.
virtual bool isEncryption() const { return false; }
/// If it is a specialized codec for floating-point time series. Applying it to non-floating point data is suspicious.
virtual bool isFloatingPointTimeSeriesCodec() const { return false; }
/// If the codec's purpose is to calculate deltas between consecutive values.
virtual bool isDeltaCompression() const { return false; }
/// It is a codec available only for evaluation purposes and not meant to be used in production.
/// It will not be allowed to use unless the user will turn off the safety switch.
virtual bool isExperimental() const { return false; }
/// Is this the DEFLATE_QPL codec?
virtual bool isDeflateQpl() const { return false; }
/// If it does nothing.
virtual bool isNone() const { return false; }
protected:
/// This is used for fuzz testing
friend int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size);
/// Return size of compressed data without header
virtual UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const { return uncompressed_size; }
/// Actually compress data without header
virtual UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const = 0;
/// Actually decompress data without header
virtual void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const = 0;
/// Construct and set codec description from codec name and arguments. Must be called in codec constructor.
void setCodecDescription(const String & name, const ASTs & arguments = {});
private:
ASTPtr full_codec_desc;
CodecMode decompressMode{CodecMode::Synchronous};
};
/// Shared-ownership pointer (std::shared_ptr) to a codec.
using CompressionCodecPtr = std::shared_ptr<ICompressionCodec>;
/// A sequence (std::vector) of codec pointers.
using Codecs = std::vector<CompressionCodecPtr>;
}
|