1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
|
#include "decompress.h"
#include <contrib/libs/lzma/liblzma/api/lzma.h>
#include <util/generic/yexception.h>
#include <util/stream/output.h>
#include <util/stream/str.h>
#include <util/stream/zerocopy.h>
// Based on https://fossies.org/linux/xz/doc/examples/02_decompress.c
///////////////////////////////////////////////////////////////////////////////
//
/// \file 02_decompress.c
/// \brief Decompress .xz files to stdout
///
/// Usage: ./02_decompress INPUT_FILES... > OUTFILE
///
/// Example: ./02_decompress foo.xz bar.xz > foobar
//
// Author: Lasse Collin
//
// This file has been put into the public domain.
// You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
namespace {
/// Minimal source-of-bytes interface used by the decoder in this file.
/// It hides whether the compressed data comes from a copying stream or
/// from a zero-copy one.
class IInput {
public:
    /// Makes \p ptr point at the next portion of input and returns its size
    /// in bytes. The decoder in this file treats a return value of 0 as
    /// "no more input".
    virtual size_t Next(const ui8*& ptr) = 0;

    virtual ~IInput() = default;
};
/// IInput adapter for a plain IInputStream: every Next() call reads the next
/// portion of the stream into an internal fixed-size buffer and exposes that
/// buffer to the caller.
class TCopyInput final: public IInput {
public:
    /// \param slave stream to read compressed bytes from; it is stored as a
    ///              raw pointer and never deleted by this class.
    explicit TCopyInput(IInputStream* slave)
        : Slave_(slave)
    {
    }

    /// Points \p ptr at the internal buffer and refills the buffer from the
    /// underlying stream.
    ///
    /// \return whatever IInputStream::Read() returned for this refill.
    size_t Next(const ui8*& ptr) override {
        ptr = Inbuf_;
        return Slave_->Read(Inbuf_, sizeof(Inbuf_));
    }

private:
    IInputStream* Slave_;
    // Overwritten on every Next() call, so a pointer handed out earlier is
    // only meaningful until the next refill.
    ui8 Inbuf_[4096];
};
/// IInput adapter for an IZeroCopyInput: Next() is forwarded as is, so the
/// caller receives a pointer supplied by the underlying stream and no
/// intermediate copy is made by this class.
class TZeroCopy final: public IInput {
public:
    /// \param slave zero-copy stream to pull compressed bytes from; it is
    ///              stored as a raw pointer and never deleted by this class.
    explicit TZeroCopy(IZeroCopyInput* slave)
        : Slave_(slave)
    {
    }

    /// Forwards to IZeroCopyInput::Next().
    ///
    /// \return whatever the underlying Next() returned.
    size_t Next(const ui8*& ptr) override {
        return Slave_->Next(&ptr);
    }

private:
    IZeroCopyInput* Slave_;
};
/// Builds the IInput adapter for a plain (copying) input stream.
std::unique_ptr<IInput> createInput(IInputStream* slave) {
    std::unique_ptr<IInput> adapter = std::make_unique<TCopyInput>(slave);
    return adapter;
}
/// Builds the IInput adapter for a zero-copy input stream.
std::unique_ptr<IInput> createInput(IZeroCopyInput* slave) {
    std::unique_ptr<IInput> adapter = std::make_unique<TZeroCopy>(slave);
    return adapter;
}
}
/// Private implementation of TUnbufferedXzDecompress: a pull-style wrapper
/// around a liblzma decoder created with lzma_auto_decoder().
///
/// Compressed bytes are fetched on demand through an IInput adapter and are
/// decoded directly into the buffer passed to DoRead(). The decoding loop is
/// adapted from the xz "02_decompress.c" example.
class TUnbufferedXzDecompress::TImpl {
public:
    /// Wraps \p slave (anything createInput() accepts) and initializes the
    /// decoder. Fails through Y_ENSURE if the decoder cannot be initialized.
    template <class T>
    explicit TImpl(T* slave)
        : Input_(createInput(slave))
        , Strm_(LZMA_STREAM_INIT)
    {
        TString err;
        Y_ENSURE(initDecoder(&Strm_, err),
                 "Error initializing the decoder: " << err);
        // No input has been fetched yet; decompress() pulls the first
        // portion when it sees avail_in == 0.
        Strm_.next_in = nullptr;
        Strm_.avail_in = 0;
    }

    ~TImpl() {
        // Free the memory allocated for the decoder.
        lzma_end(&Strm_);
    }

    /// Decodes up to \p len bytes into \p buf.
    ///
    /// \return the number of bytes written to \p buf. Once the decoder has
    ///         reported LZMA_STREAM_END, every later call returns 0 without
    ///         touching the decoder.
    ///
    /// Fails through Y_ENSURE when the decoder reports an error.
    size_t DoRead(void* buf, size_t len) {
        if (IsOutFinished_) {
            return 0;
        }
        size_t res = 0;
        TString err;
        Y_ENSURE(decompress(buf, len, res, err),
                 "lzma decoder error: " << err);
        return res;
    }

private:
    /// Runs the decoder until \p buf is full or the stream ends.
    ///
    /// \param buf    destination buffer
    /// \param len    capacity of \p buf in bytes
    /// \param outLen set to the number of bytes produced (only on success)
    /// \param err    set to a description followed by "[code]" (only on failure)
    /// \return true on success, false if lzma_code() reported an error
    bool decompress(void* buf, size_t len, size_t& outLen, TString& err) {
        // The decoder is initialized with LZMA_CONCATENATED, so lzma_code()
        // has to be told when there will be no more input. This is done by
        // passing LZMA_FINISH instead of LZMA_RUN, in the same way as it is
        // done when encoding.
        //
        // When LZMA_CONCATENATED isn't used, there is no need to use
        // LZMA_FINISH to tell when all the input has been read, but it is
        // still OK to use it. Without LZMA_CONCATENATED the decoder stops
        // after the first .xz stream and some unused data may be left in
        // strm->next_in.
        //
        // The action is derived from IsInFinished_ instead of being reset to
        // LZMA_RUN on every call: end of input may be detected in a call
        // that returns because the output buffer is full, and liblzma
        // requires that once LZMA_FINISH has been passed, the same action is
        // passed on every following lzma_code() call (switching back to
        // LZMA_RUN yields LZMA_PROG_ERROR).
        lzma_action action = IsInFinished_ ? LZMA_FINISH : LZMA_RUN;

        Strm_.next_out = static_cast<ui8*>(buf);
        Strm_.avail_out = len;

        while (true) {
            if (Strm_.avail_in == 0 && !IsInFinished_) {
                const size_t size = Input_->Next(Strm_.next_in);
                if (size == 0) {
                    // End of input: from now on lzma_code() must be told
                    // that no more input will be coming.
                    IsInFinished_ = true;
                    action = LZMA_FINISH;
                } else {
                    Strm_.avail_in = size;
                }
            }

            const lzma_ret ret = lzma_code(&Strm_, action);

            if (ret == LZMA_STREAM_END) {
                // Once everything has been decoded successfully, the return
                // value of lzma_code() is LZMA_STREAM_END.
                //
                // It is important to check for LZMA_STREAM_END explicitly.
                // Do not assume that getting LZMA_OK, or not getting any
                // more output, means that everything has been decoded
                // successfully.
                IsOutFinished_ = true;
            }

            // NOTE(review): a full output buffer is checked before the error
            // code, so a non-OK `ret` that arrives together with
            // avail_out == 0 is not reported by this call.
            if (Strm_.avail_out == 0 || ret == LZMA_STREAM_END) {
                outLen = len - Strm_.avail_out;
                return true;
            }

            if (ret != LZMA_OK) {
                // It's neither LZMA_OK nor LZMA_STREAM_END, so it must be an
                // error code.
                setError(err, describeCodeError(ret), ret);
                return false;
            }
        }
    }

    /// Maps an error returned by lzma_code() to a human-readable message.
    ///
    /// See lzma/base.h (src/liblzma/api/lzma/base.h in the source package or
    /// e.g. /usr/include/lzma/base.h depending on the install prefix) for the
    /// list and documentation of possible values. Many values listed in the
    /// lzma_ret enumeration aren't possible here, but can be made possible by
    /// enabling a memory usage limit or adding flags to the decoder
    /// initialization.
    static const char* describeCodeError(lzma_ret ret) {
        switch (ret) {
            case LZMA_MEM_ERROR:
                return "Memory allocation failed";
            case LZMA_FORMAT_ERROR:
                // .xz magic bytes weren't found.
                return "The input is not in the .xz format";
            case LZMA_OPTIONS_ERROR:
                // For example, the headers specify a filter that isn't
                // supported by this liblzma version (or it hasn't been
                // enabled when building liblzma). Upgrading to a newer
                // liblzma might help.
                //
                // Note that it is unlikely that the file has accidentally
                // become corrupt if you get this error. The integrity of the
                // .xz headers is always verified with a CRC32, so
                // unintentionally corrupt files can be distinguished from
                // unsupported files.
                return "Unsupported compression options";
            case LZMA_DATA_ERROR:
                return "Compressed file is corrupt";
            case LZMA_BUF_ERROR:
                // Typically this error means that a valid file has got
                // truncated, but it might also be a damaged part in the file
                // that makes the decoder think the file is truncated.
                return "Compressed file is truncated or otherwise corrupt";
            default:
                // This is most likely LZMA_PROG_ERROR.
                return "Unknown error, possibly a bug";
        }
    }

    /// Stores \p text followed by "[<numeric code>]" in \p err. Knowing the
    /// numeric code is important for debugging, especially when there is no
    /// specific message for it.
    static void setError(TString& err, const char* text, lzma_ret ret) {
        err = text;
        TStringOutput out(err);
        out << "[" << static_cast<int>(ret) << "]";
    }

    /// Initializes \p strm as a decoder via lzma_auto_decoder().
    ///
    /// \return true on success; on failure returns false and stores a
    ///         description followed by "[code]" in \p err.
    static bool initDecoder(lzma_stream* strm, TString& err) {
        // The decoder supports a memory usage limit and a set of flags.
        //
        // The memory usage of the decompressor depends on the settings used
        // to compress a .xz file. It can vary from less than a megabyte to
        // a few gigabytes, but in practice (at least for now) it rarely
        // exceeds 65 MiB because that's how much memory is required to
        // decompress files created with "xz -9". Settings requiring more
        // memory take extra effort to use and don't (at least for now)
        // provide significantly better compression in most cases.
        //
        // A memory usage limit is useful if it is important that the
        // decompressor won't consume gigabytes of memory. Here no limit is
        // used, which is expressed by setting the limit to UINT64_MAX.
        //
        // The .xz format allows concatenating compressed files as is:
        //
        //     echo foo | xz > foobar.xz
        //     echo bar | xz >> foobar.xz
        //
        // When decompressing normal standalone .xz files, LZMA_CONCATENATED
        // should always be used to support decompression of concatenated
        // .xz files. If LZMA_CONCATENATED isn't used, the decoder will stop
        // after the first .xz stream. This can be useful when .xz data has
        // been embedded inside another file format.
        //
        // Flags other than LZMA_CONCATENATED are supported too, and can be
        // combined with bitwise-or. See lzma/container.h
        // (src/liblzma/api/lzma/container.h in the source package or e.g.
        // /usr/include/lzma/container.h depending on the install prefix)
        // for details.
        const lzma_ret ret = lzma_auto_decoder(
            strm, UINT64_MAX, LZMA_CONCATENATED);

        // Return successfully if the initialization went fine.
        if (ret == LZMA_OK) {
            return true;
        }

        // Something went wrong. The possible errors are documented in
        // lzma/container.h.
        //
        // Note that LZMA_MEMLIMIT_ERROR is never possible here. If a very
        // tiny limit is specified, the error is delayed until the first
        // headers have been parsed by a call to lzma_code().
        const char* text = nullptr;
        switch (ret) {
            case LZMA_MEM_ERROR:
                text = "Memory allocation failed";
                break;
            case LZMA_OPTIONS_ERROR:
                text = "Unsupported decompressor flags";
                break;
            default:
                // This is most likely LZMA_PROG_ERROR indicating a bug in
                // this program or in liblzma.
                text = "Unknown error, possibly a bug";
                break;
        }
        setError(err, text, ret);
        return false;
    }

private:
    std::unique_ptr<IInput> Input_;
    lzma_stream Strm_;
    // Set once Input_->Next() has returned 0; no further input is requested.
    bool IsInFinished_ = false;
    // Set once lzma_code() has returned LZMA_STREAM_END.
    bool IsOutFinished_ = false;
};
TUnbufferedXzDecompress::TUnbufferedXzDecompress(IInputStream* slave)
: Impl_(std::make_unique<TImpl>(slave))
{
}
TUnbufferedXzDecompress::TUnbufferedXzDecompress(IZeroCopyInput* slave)
: Impl_(std::make_unique<TImpl>(slave))
{
}
// Defaulted here, after the definition of TImpl in this file.
// NOTE(review): presumably Impl_ is a smart pointer to a TImpl that is only
// forward-declared in decompress.h, which is why the destructor cannot be
// defaulted in the header - confirm against the header.
TUnbufferedXzDecompress::~TUnbufferedXzDecompress() = default;
/// Forwards the read request to the implementation object and returns the
/// number of bytes it reports.
size_t TUnbufferedXzDecompress::DoRead(void* buf, size_t len) {
    const size_t produced = Impl_->DoRead(buf, len);
    return produced;
}
|