summaryrefslogtreecommitdiffstats
path: root/contrib/libs/zstd/lib/decompress
diff options
context:
space:
mode:
authorthegeorg <[email protected]>2023-02-27 12:38:58 +0300
committerthegeorg <[email protected]>2023-02-27 12:38:58 +0300
commite82ffade6959fe8feaf41fe12a57d9f56873be40 (patch)
tree784bf61d9ca6cab4beabc995186d9b41e762ae60 /contrib/libs/zstd/lib/decompress
parente58cceed352de42c1526ab4eecdfeee158b1ede3 (diff)
Update contrib/libs/zstd to 1.5.4
Diffstat (limited to 'contrib/libs/zstd/lib/decompress')
-rw-r--r--contrib/libs/zstd/lib/decompress/huf_decompress.c875
-rw-r--r--contrib/libs/zstd/lib/decompress/huf_decompress_amd64.S69
-rw-r--r--contrib/libs/zstd/lib/decompress/zstd_ddict.c7
-rw-r--r--contrib/libs/zstd/lib/decompress/zstd_ddict.h2
-rw-r--r--contrib/libs/zstd/lib/decompress/zstd_decompress.c206
-rw-r--r--contrib/libs/zstd/lib/decompress/zstd_decompress_block.c251
-rw-r--r--contrib/libs/zstd/lib/decompress/zstd_decompress_block.h2
-rw-r--r--contrib/libs/zstd/lib/decompress/zstd_decompress_internal.h6
8 files changed, 823 insertions, 595 deletions
diff --git a/contrib/libs/zstd/lib/decompress/huf_decompress.c b/contrib/libs/zstd/lib/decompress/huf_decompress.c
index 2027188255e..c2d1f633a49 100644
--- a/contrib/libs/zstd/lib/decompress/huf_decompress.c
+++ b/contrib/libs/zstd/lib/decompress/huf_decompress.c
@@ -1,7 +1,7 @@
/* ******************************************************************
* huff0 huffman decoder,
* part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
*
* You can contact the author at :
* - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,10 +19,10 @@
#include "../common/compiler.h"
#include "../common/bitstream.h" /* BIT_* */
#include "../common/fse.h" /* to compress headers */
-#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "../common/error_private.h"
#include "../common/zstd_internal.h"
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
/* **************************************************************
* Constants
@@ -43,10 +43,14 @@
#error "Cannot force the use of the X1 and X2 decoders at the same time!"
#endif
-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
+/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
+ * supported at runtime, so we can add the BMI2 target attribute.
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
+ */
+#if DYNAMIC_BMI2
+# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
#else
-# define HUF_ASM_X86_64_BMI2_ATTRS
+# define HUF_FAST_BMI2_ATTRS
#endif
#ifdef __cplusplus
@@ -56,18 +60,12 @@
#endif
#define HUF_ASM_DECL HUF_EXTERN_C
-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
+#if DYNAMIC_BMI2
# define HUF_NEED_BMI2_FUNCTION 1
#else
# define HUF_NEED_BMI2_FUNCTION 0
#endif
-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
-# define HUF_NEED_DEFAULT_FUNCTION 1
-#else
-# define HUF_NEED_DEFAULT_FUNCTION 0
-#endif
-
/* **************************************************************
* Error Management
****************************************************************/
@@ -84,6 +82,11 @@
/* **************************************************************
* BMI2 Variant Wrappers
****************************************************************/
+typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
+ const void *cSrc,
+ size_t cSrcSize,
+ const HUF_DTable *DTable);
+
#if DYNAMIC_BMI2
#define HUF_DGEN(fn) \
@@ -105,9 +108,9 @@
} \
\
static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
{ \
- if (bmi2) { \
+ if (flags & HUF_flags_bmi2) { \
return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
} \
return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
@@ -117,9 +120,9 @@
#define HUF_DGEN(fn) \
static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \
{ \
- (void)bmi2; \
+ (void)flags; \
return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
}
@@ -138,15 +141,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
return dtd;
}
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-
-static size_t HUF_initDStream(BYTE const* ip) {
+static size_t HUF_initFastDStream(BYTE const* ip) {
BYTE const lastByte = ip[7];
- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
size_t const value = MEM_readLEST(ip) | 1;
assert(bitsConsumed <= 8);
+ assert(sizeof(size_t) == 8);
return value << bitsConsumed;
}
+
+
+/**
+ * The input/output arguments to the Huffman fast decoding loop:
+ *
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
+ * dt [in] - The decoding table.
+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
+ * as long as it is above ilimit, but that indicates corruption.
+ */
typedef struct {
BYTE const* ip[4];
BYTE* op[4];
@@ -155,15 +171,17 @@ typedef struct {
BYTE const* ilimit;
BYTE* oend;
BYTE const* iend[4];
-} HUF_DecompressAsmArgs;
+} HUF_DecompressFastArgs;
+
+typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
/**
- * Initializes args for the asm decoding loop.
- * @returns 0 on success
- * 1 if the fallback implementation should be used.
+ * Initializes args for the fast decoding loop.
+ * @returns 1 on success
+ * 0 if the fallback implementation should be used.
* Or an error code on failure.
*/
-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
+static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
{
void const* dt = DTable + 1;
U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
@@ -172,9 +190,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
BYTE* const oend = (BYTE*)dst + dstSize;
- /* The following condition is false on x32 platform,
- * but HUF_asm is not compatible with this ABI */
- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
+ /* The fast decoding loop assumes 64-bit little-endian.
+ * This condition is false on x32.
+ */
+ if (!MEM_isLittleEndian() || MEM_32bits())
+ return 0;
/* strict minimum : jump table + 1 byte per stream */
if (srcSize < 10)
@@ -185,7 +205,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
* On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
*/
if (dtLog != HUF_DECODER_FAST_TABLELOG)
- return 1;
+ return 0;
/* Read the jump table. */
{
@@ -199,13 +219,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
args->iend[2] = args->iend[1] + length2;
args->iend[3] = args->iend[2] + length3;
- /* HUF_initDStream() requires this, and this small of an input
+ /* HUF_initFastDStream() requires this, and this small of an input
* won't benefit from the ASM loop anyways.
* length1 must be >= 16 so that ip[0] >= ilimit before the loop
* starts.
*/
if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
- return 1;
+ return 0;
if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */
}
/* ip[] contains the position that is currently loaded into bits[]. */
@@ -222,7 +242,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
/* No point to call the ASM loop for tiny outputs. */
if (args->op[3] >= oend)
- return 1;
+ return 0;
/* bits[] is the bit container.
* It is read from the MSB down to the LSB.
@@ -231,10 +251,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
* set, so that CountTrailingZeros(bits[]) can be used
* to count how many bits we've consumed.
*/
- args->bits[0] = HUF_initDStream(args->ip[0]);
- args->bits[1] = HUF_initDStream(args->ip[1]);
- args->bits[2] = HUF_initDStream(args->ip[2]);
- args->bits[3] = HUF_initDStream(args->ip[3]);
+ args->bits[0] = HUF_initFastDStream(args->ip[0]);
+ args->bits[1] = HUF_initFastDStream(args->ip[1]);
+ args->bits[2] = HUF_initFastDStream(args->ip[2]);
+ args->bits[3] = HUF_initFastDStream(args->ip[3]);
/* If ip[] >= ilimit, it is guaranteed to be safe to
* reload bits[]. It may be beyond its section, but is
@@ -245,10 +265,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
args->oend = oend;
args->dt = dt;
- return 0;
+ return 1;
}
-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
+static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
{
/* Validate that we haven't overwritten. */
if (args->op[stream] > segmentEnd)
@@ -262,15 +282,15 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
return ERROR(corruption_detected);
/* Construct the BIT_DStream_t. */
- bit->bitContainer = MEM_readLE64(args->ip[stream]);
- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
+ assert(sizeof(size_t) == 8);
+ bit->bitContainer = MEM_readLEST(args->ip[stream]);
+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
bit->start = (const char*)args->iend[0];
bit->limitPtr = bit->start + sizeof(size_t);
bit->ptr = (const char*)args->ip[stream];
return 0;
}
-#endif
#ifndef HUF_FORCE_DECOMPRESS_X2
@@ -287,10 +307,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi
static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
U64 D4;
if (MEM_isLittleEndian()) {
- D4 = (symbol << 8) + nbBits;
+ D4 = (U64)((symbol << 8) + nbBits);
} else {
- D4 = symbol + (nbBits << 8);
+ D4 = (U64)(symbol + (nbBits << 8));
}
+ assert(D4 < (1U << 16));
D4 *= 0x0001000100010001ULL;
return D4;
}
@@ -333,13 +354,7 @@ typedef struct {
BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
} HUF_ReadDTableX1_Workspace;
-
-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
-{
- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
{
U32 tableLog = 0;
U32 nbSymbols = 0;
@@ -354,7 +369,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
/* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */
- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
if (HUF_isError(iSize)) return iSize;
@@ -381,9 +396,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
* rankStart[0] is not filled because there are no entries in the table for
* weight 0.
*/
- {
- int n;
- int nextRankStart = 0;
+ { int n;
+ U32 nextRankStart = 0;
int const unroll = 4;
int const nLimit = (int)nbSymbols - unroll + 1;
for (n=0; n<(int)tableLog+1; n++) {
@@ -410,10 +424,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
* We can switch based on the length to a different inner loop which is
* optimized for that particular case.
*/
- {
- U32 w;
- int symbol=wksp->rankVal[0];
- int rankStart=0;
+ { U32 w;
+ int symbol = wksp->rankVal[0];
+ int rankStart = 0;
for (w=1; w<tableLog+1; ++w) {
int const symbolCount = wksp->rankVal[w];
int const length = (1 << w) >> 1;
@@ -523,7 +536,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
while (p < pEnd)
HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
- return pEnd-pStart;
+ return (size_t)(pEnd-pStart);
}
FORCE_INLINE_TEMPLATE size_t
@@ -549,6 +562,10 @@ HUF_decompress1X1_usingDTable_internal_body(
return dstSize;
}
+/* HUF_decompress4X1_usingDTable_internal_body():
+ * Conditions :
+ * @dstSize >= 6
+ */
FORCE_INLINE_TEMPLATE size_t
HUF_decompress4X1_usingDTable_internal_body(
void* dst, size_t dstSize,
@@ -592,6 +609,7 @@ HUF_decompress4X1_usingDTable_internal_body(
if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -654,38 +672,142 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
}
#endif
-#if HUF_NEED_DEFAULT_FUNCTION
static
size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
size_t cSrcSize, HUF_DTable const* DTable) {
return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}
-#endif
#if ZSTD_ENABLE_ASM_X86_64_BMI2
-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
-static HUF_ASM_X86_64_BMI2_ATTRS
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+ U64 bits[4];
+ BYTE const* ip[4];
+ BYTE* op[4];
+ U16 const* const dtable = (U16 const*)args->dt;
+ BYTE* const oend = args->oend;
+ BYTE const* const ilimit = args->ilimit;
+
+ /* Copy the arguments to local variables */
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+ ZSTD_memcpy(&ip, &args->ip, sizeof(ip));
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+ assert(MEM_isLittleEndian());
+ assert(!MEM_32bits());
+
+ for (;;) {
+ BYTE* olimit;
+ int stream;
+ int symbol;
+
+ /* Assert loop preconditions */
+#ifndef NDEBUG
+ for (stream = 0; stream < 4; ++stream) {
+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
+ assert(ip[stream] >= ilimit);
+ }
+#endif
+ /* Compute olimit */
+ {
+ /* Each iteration produces 5 output symbols per stream */
+ size_t const oiters = (size_t)(oend - op[3]) / 5;
+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
+ * per stream.
+ */
+ size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
+ /* We can safely run iters iterations before running bounds checks */
+ size_t const iters = MIN(oiters, iiters);
+ size_t const symbols = iters * 5;
+
+ /* We can simply check that op[3] < olimit, instead of checking all
+ * of our bounds, since we can't hit the other bounds until we've run
+ * iters iterations, which only happens when op[3] == olimit.
+ */
+ olimit = op[3] + symbols;
+
+ /* Exit fast decoding loop once we get close to the end. */
+ if (op[3] + 20 > olimit)
+ break;
+
+ /* Exit the decoding loop if any input pointer has crossed the
+ * previous one. This indicates corruption, and a precondition
+ * to our loop is that ip[i] >= ip[0].
+ */
+ for (stream = 1; stream < 4; ++stream) {
+ if (ip[stream] < ip[stream - 1])
+ goto _out;
+ }
+ }
+
+#ifndef NDEBUG
+ for (stream = 1; stream < 4; ++stream) {
+ assert(ip[stream] >= ip[stream - 1]);
+ }
+#endif
+
+ do {
+ /* Decode 5 symbols in each of the 4 streams */
+ for (symbol = 0; symbol < 5; ++symbol) {
+ for (stream = 0; stream < 4; ++stream) {
+ int const index = (int)(bits[stream] >> 53);
+ int const entry = (int)dtable[index];
+ bits[stream] <<= (entry & 63);
+ op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF);
+ }
+ }
+ /* Reload the bitstreams */
+ for (stream = 0; stream < 4; ++stream) {
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
+ int const nbBits = ctz & 7;
+ int const nbBytes = ctz >> 3;
+ op[stream] += 5;
+ ip[stream] -= nbBytes;
+ bits[stream] = MEM_read64(ip[stream]) | 1;
+ bits[stream] <<= nbBits;
+ }
+ } while (op[3] < olimit);
+ }
+
+_out:
+
+ /* Save the final values of each of the state variables back to args. */
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+ ZSTD_memcpy(&args->ip, &ip, sizeof(ip));
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+/**
+ * @returns @p dstSize on success (>= 6)
+ * 0 if the fallback implementation should be used
+ * An error if an error occurred
+ */
+static HUF_FAST_BMI2_ATTRS
size_t
-HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+HUF_decompress4X1_usingDTable_internal_fast(
void* dst, size_t dstSize,
const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
+ const HUF_DTable* DTable,
+ HUF_DecompressFastLoopFn loopFn)
{
void const* dt = DTable + 1;
const BYTE* const iend = (const BYTE*)cSrc + 6;
BYTE* const oend = (BYTE*)dst + dstSize;
- HUF_DecompressAsmArgs args;
- {
- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
- FORWARD_IF_ERROR(ret, "Failed to init asm args");
- if (ret != 0)
- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+ HUF_DecompressFastArgs args;
+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
+ if (ret == 0)
+ return 0;
}
assert(args.ip[0] >= args.ilimit);
- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
+ loopFn(&args);
/* Our loop guarantees that ip[] >= ilimit and that we haven't
* overwritten any op[].
@@ -698,8 +820,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
(void)iend;
/* finish bit streams one by one. */
- {
- size_t const segmentSize = (dstSize+3) / 4;
+ { size_t const segmentSize = (dstSize+3) / 4;
BYTE* segmentEnd = (BYTE*)dst;
int i;
for (i = 0; i < 4; ++i) {
@@ -716,97 +837,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
}
/* decoded size */
+ assert(dstSize != 0);
return dstSize;
}
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
-
-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
- const void *cSrc,
- size_t cSrcSize,
- const HUF_DTable *DTable);
HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
{
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
+
#if DYNAMIC_BMI2
- if (bmi2) {
+ if (flags & HUF_flags_bmi2) {
+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
# if ZSTD_ENABLE_ASM_X86_64_BMI2
- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+ if (!(flags & HUF_flags_disableAsm)) {
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+ }
# endif
+ } else {
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}
-#else
- (void)bmi2;
#endif
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+ if (!(flags & HUF_flags_disableAsm)) {
+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+ }
#endif
-}
-
-size_t HUF_decompress1X1_usingDTable(
- void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
-{
- DTableDesc dtd = HUF_getDTableDesc(DTable);
- if (dtd.tableType != 0) return ERROR(GENERIC);
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
-
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize)
-{
- const BYTE* ip = (const BYTE*) cSrc;
-
- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
- if (HUF_isError(hSize)) return hSize;
- if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
- ip += hSize; cSrcSize -= hSize;
-
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X1_usingDTable(
- void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
-{
- DTableDesc dtd = HUF_getDTableDesc(DTable);
- if (dtd.tableType != 0) return ERROR(GENERIC);
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+ if (!(flags & HUF_flags_disableFast)) {
+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+ if (ret != 0)
+ return ret;
+ }
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}
-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize, int bmi2)
+ void* workSpace, size_t wkspSize, int flags)
{
const BYTE* ip = (const BYTE*) cSrc;
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
if (HUF_isError(hSize)) return hSize;
if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
ip += hSize; cSrcSize -= hSize;
- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
-}
-
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize)
-{
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
}
-
#endif /* HUF_FORCE_DECOMPRESS_X2 */
@@ -989,7 +1072,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
const sortedSymbol_t* sortedList,
- const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
const U32 nbBitsBaseline)
{
U32* const rankVal = rankValOrigin[0];
@@ -1044,14 +1127,7 @@ typedef struct {
size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
const void* src, size_t srcSize,
- void* workSpace, size_t wkspSize)
-{
- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
- const void* src, size_t srcSize,
- void* workSpace, size_t wkspSize, int bmi2)
+ void* workSpace, size_t wkspSize, int flags)
{
U32 tableLog, maxW, nbSymbols;
DTableDesc dtd = HUF_getDTableDesc(DTable);
@@ -1073,7 +1149,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
/* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */
- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
if (HUF_isError(iSize)) return iSize;
/* check result */
@@ -1244,6 +1320,11 @@ HUF_decompress1X2_usingDTable_internal_body(
/* decoded size */
return dstSize;
}
+
+/* HUF_decompress4X2_usingDTable_internal_body():
+ * Conditions:
+ * @dstSize >= 6
+ */
FORCE_INLINE_TEMPLATE size_t
HUF_decompress4X2_usingDTable_internal_body(
void* dst, size_t dstSize,
@@ -1284,8 +1365,9 @@ HUF_decompress4X2_usingDTable_internal_body(
DTableDesc const dtd = HUF_getDTableDesc(DTable);
U32 const dtLog = dtd.tableLog;
- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */
+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */
CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1370,36 +1452,177 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
}
#endif
-#if HUF_NEED_DEFAULT_FUNCTION
static
size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
size_t cSrcSize, HUF_DTable const* DTable) {
return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}
-#endif
#if ZSTD_ENABLE_ASM_X86_64_BMI2
-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
+
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+ U64 bits[4];
+ BYTE const* ip[4];
+ BYTE* op[4];
+ BYTE* oend[4];
+ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
+ BYTE const* const ilimit = args->ilimit;
+
+ /* Copy the arguments to local registers. */
+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+ ZSTD_memcpy(&ip, &args->ip, sizeof(ip));
+ ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+ oend[0] = op[1];
+ oend[1] = op[2];
+ oend[2] = op[3];
+ oend[3] = args->oend;
+
+ assert(MEM_isLittleEndian());
+ assert(!MEM_32bits());
+
+ for (;;) {
+ BYTE* olimit;
+ int stream;
+ int symbol;
+
+ /* Assert loop preconditions */
+#ifndef NDEBUG
+ for (stream = 0; stream < 4; ++stream) {
+ assert(op[stream] <= oend[stream]);
+ assert(ip[stream] >= ilimit);
+ }
+#endif
+ /* Compute olimit */
+ {
+ /* Each loop does 5 table lookups for each of the 4 streams.
+ * Each table lookup consumes up to 11 bits of input, and produces
+ * up to 2 bytes of output.
+ */
+ /* We can consume up to 7 bytes of input per iteration per stream.
+ * We also know that each input pointer is >= ip[0]. So we can run
+ * iters loops before running out of input.
+ */
+ size_t iters = (size_t)(ip[0] - ilimit) / 7;
+ /* Each iteration can produce up to 10 bytes of output per stream.
+ * Each output stream my advance at different rates. So take the
+ * minimum number of safe iterations among all the output streams.
+ */
+ for (stream = 0; stream < 4; ++stream) {
+ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
+ iters = MIN(iters, oiters);
+ }
+
+ /* Each iteration produces at least 5 output symbols. So until
+ * op[3] crosses olimit, we know we haven't executed iters
+ * iterations yet. This saves us maintaining an iters counter,
+ * at the expense of computing the remaining # of iterations
+ * more frequently.
+ */
+ olimit = op[3] + (iters * 5);
+
+ /* Exit the fast decoding loop if we are too close to the end. */
+ if (op[3] + 10 > olimit)
+ break;
+
+ /* Exit the decoding loop if any input pointer has crossed the
+ * previous one. This indicates corruption, and a precondition
+ * to our loop is that ip[i] >= ip[0].
+ */
+ for (stream = 1; stream < 4; ++stream) {
+ if (ip[stream] < ip[stream - 1])
+ goto _out;
+ }
+ }
+
+#ifndef NDEBUG
+ for (stream = 1; stream < 4; ++stream) {
+ assert(ip[stream] >= ip[stream - 1]);
+ }
+#endif
+
+ do {
+ /* Do 5 table lookups for each of the first 3 streams */
+ for (symbol = 0; symbol < 5; ++symbol) {
+ for (stream = 0; stream < 3; ++stream) {
+ int const index = (int)(bits[stream] >> 53);
+ HUF_DEltX2 const entry = dtable[index];
+ MEM_write16(op[stream], entry.sequence);
+ bits[stream] <<= (entry.nbBits);
+ op[stream] += (entry.length);
+ }
+ }
+ /* Do 1 table lookup from the final stream */
+ {
+ int const index = (int)(bits[3] >> 53);
+ HUF_DEltX2 const entry = dtable[index];
+ MEM_write16(op[3], entry.sequence);
+ bits[3] <<= (entry.nbBits);
+ op[3] += (entry.length);
+ }
+ /* Do 4 table lookups from the final stream & reload bitstreams */
+ for (stream = 0; stream < 4; ++stream) {
+ /* Do a table lookup from the final stream.
+ * This is interleaved with the reloading to reduce register
+ * pressure. This shouldn't be necessary, but compilers can
+ * struggle with codegen with high register pressure.
+ */
+ {
+ int const index = (int)(bits[3] >> 53);
+ HUF_DEltX2 const entry = dtable[index];
+ MEM_write16(op[3], entry.sequence);
+ bits[3] <<= (entry.nbBits);
+ op[3] += (entry.length);
+ }
+ /* Reload the bistreams. The final bitstream must be reloaded
+ * after the 5th symbol was decoded.
+ */
+ {
+ int const ctz = ZSTD_countTrailingZeros64(bits[stream]);
+ int const nbBits = ctz & 7;
+ int const nbBytes = ctz >> 3;
+ ip[stream] -= nbBytes;
+ bits[stream] = MEM_read64(ip[stream]) | 1;
+ bits[stream] <<= nbBits;
+ }
+ }
+ } while (op[3] < olimit);
+ }
-static HUF_ASM_X86_64_BMI2_ATTRS size_t
-HUF_decompress4X2_usingDTable_internal_bmi2_asm(
+_out:
+
+ /* Save the final values of each of the state variables back to args. */
+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+ ZSTD_memcpy(&args->ip, &ip, sizeof(ip));
+ ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+
+static HUF_FAST_BMI2_ATTRS size_t
+HUF_decompress4X2_usingDTable_internal_fast(
void* dst, size_t dstSize,
const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable) {
+ const HUF_DTable* DTable,
+ HUF_DecompressFastLoopFn loopFn) {
void const* dt = DTable + 1;
const BYTE* const iend = (const BYTE*)cSrc + 6;
BYTE* const oend = (BYTE*)dst + dstSize;
- HUF_DecompressAsmArgs args;
+ HUF_DecompressFastArgs args;
{
- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
FORWARD_IF_ERROR(ret, "Failed to init asm args");
- if (ret != 0)
- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+ if (ret == 0)
+ return 0;
}
assert(args.ip[0] >= args.ilimit);
- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
+ loopFn(&args);
/* note : op4 already verified within main loop */
assert(args.ip[0] >= iend);
@@ -1430,91 +1653,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
/* decoded size */
return dstSize;
}
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
- size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+ size_t cSrcSize, HUF_DTable const* DTable, int flags)
{
+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
+
#if DYNAMIC_BMI2
- if (bmi2) {
+ if (flags & HUF_flags_bmi2) {
+ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
# if ZSTD_ENABLE_ASM_X86_64_BMI2
- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+ if (!(flags & HUF_flags_disableAsm)) {
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+ }
# endif
+ } else {
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}
-#else
- (void)bmi2;
#endif
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+ if (!(flags & HUF_flags_disableAsm)) {
+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+ }
#endif
+
+ if (!(flags & HUF_flags_disableFast)) {
+ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+ if (ret != 0)
+ return ret;
+ }
+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}
HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
-size_t HUF_decompress1X2_usingDTable(
- void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
-{
- DTableDesc dtd = HUF_getDTableDesc(DTable);
- if (dtd.tableType != 1) return ERROR(GENERIC);
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
-
size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize)
+ void* workSpace, size_t wkspSize, int flags)
{
const BYTE* ip = (const BYTE*) cSrc;
size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
- workSpace, wkspSize);
+ workSpace, wkspSize, flags);
if (HUF_isError(hSize)) return hSize;
if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
ip += hSize; cSrcSize -= hSize;
- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X2_usingDTable(
- void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
-{
- DTableDesc dtd = HUF_getDTableDesc(DTable);
- if (dtd.tableType != 1) return ERROR(GENERIC);
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
}
-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize, int bmi2)
+ void* workSpace, size_t wkspSize, int flags)
{
const BYTE* ip = (const BYTE*) cSrc;
size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
- workSpace, wkspSize);
+ workSpace, wkspSize, flags);
if (HUF_isError(hSize)) return hSize;
if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
ip += hSize; cSrcSize -= hSize;
- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
}
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize)
-{
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-
#endif /* HUF_FORCE_DECOMPRESS_X1 */
@@ -1522,44 +1726,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
/* Universal decompression selectors */
/* ***********************************/
-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
- const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
-{
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
- (void)dtd;
- assert(dtd.tableType == 0);
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
- (void)dtd;
- assert(dtd.tableType == 1);
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
- const void* cSrc, size_t cSrcSize,
- const HUF_DTable* DTable)
-{
- DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
- (void)dtd;
- assert(dtd.tableType == 0);
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
- (void)dtd;
- assert(dtd.tableType == 1);
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
@@ -1614,36 +1780,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
#endif
}
-
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
- size_t dstSize, const void* cSrc,
- size_t cSrcSize, void* workSpace,
- size_t wkspSize)
-{
- /* validation checks */
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
- if (cSrcSize == 0) return ERROR(corruption_detected);
-
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
- (void)algoNb;
- assert(algoNb == 0);
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
- (void)algoNb;
- assert(algoNb == 1);
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#else
- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
- cSrcSize, workSpace, wkspSize):
- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#endif
- }
-}
-
size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
const void* cSrc, size_t cSrcSize,
- void* workSpace, size_t wkspSize)
+ void* workSpace, size_t wkspSize, int flags)
{
/* validation checks */
if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1656,71 +1795,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
(void)algoNb;
assert(algoNb == 0);
return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
- cSrcSize, workSpace, wkspSize);
+ cSrcSize, workSpace, wkspSize, flags);
#elif defined(HUF_FORCE_DECOMPRESS_X2)
(void)algoNb;
assert(algoNb == 1);
return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
- cSrcSize, workSpace, wkspSize);
+ cSrcSize, workSpace, wkspSize, flags);
#else
return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
- cSrcSize, workSpace, wkspSize):
+ cSrcSize, workSpace, wkspSize, flags):
HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
- cSrcSize, workSpace, wkspSize);
+ cSrcSize, workSpace, wkspSize, flags);
#endif
}
}
-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
{
DTableDesc const dtd = HUF_getDTableDesc(DTable);
#if defined(HUF_FORCE_DECOMPRESS_X1)
(void)dtd;
assert(dtd.tableType == 0);
- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#elif defined(HUF_FORCE_DECOMPRESS_X2)
(void)dtd;
assert(dtd.tableType == 1);
- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#else
- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#endif
}
#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
{
const BYTE* ip = (const BYTE*) cSrc;
- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
if (HUF_isError(hSize)) return hSize;
if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
ip += hSize; cSrcSize -= hSize;
- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
}
#endif
-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
{
DTableDesc const dtd = HUF_getDTableDesc(DTable);
#if defined(HUF_FORCE_DECOMPRESS_X1)
(void)dtd;
assert(dtd.tableType == 0);
- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#elif defined(HUF_FORCE_DECOMPRESS_X2)
(void)dtd;
assert(dtd.tableType == 1);
- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#else
- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
#endif
}
-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
{
/* validation checks */
if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1730,160 +1869,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
#if defined(HUF_FORCE_DECOMPRESS_X1)
(void)algoNb;
assert(algoNb == 0);
- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
#elif defined(HUF_FORCE_DECOMPRESS_X2)
(void)algoNb;
assert(algoNb == 1);
- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
#else
- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
#endif
}
}
-
-#ifndef ZSTD_NO_UNUSED_FUNCTIONS
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
-{
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
- return HUF_readDTableX1_wksp(DTable, src, srcSize,
- workSpace, sizeof(workSpace));
-}
-
-size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize)
-{
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
- return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
- workSpace, sizeof(workSpace));
-}
-
-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
- return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
-}
-#endif
-
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
-{
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
- return HUF_readDTableX2_wksp(DTable, src, srcSize,
- workSpace, sizeof(workSpace));
-}
-
-size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize)
-{
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
- return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
- workSpace, sizeof(workSpace));
-}
-
-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
- return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
-}
-#endif
-
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
- workSpace, sizeof(workSpace));
-}
-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
- HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
- return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
-}
-#endif
-
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize)
-{
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
- workSpace, sizeof(workSpace));
-}
-
-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
- HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
- return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
-}
-#endif
-
-typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
-
-size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
-#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
- static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
-#endif
-
- /* validation checks */
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
-
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
- (void)algoNb;
- assert(algoNb == 0);
- return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
- (void)algoNb;
- assert(algoNb == 1);
- return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
-#else
- return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
-#endif
- }
-}
-
-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
- /* validation checks */
- if (dstSize == 0) return ERROR(dstSize_tooSmall);
- if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
- if (cSrcSize == dstSize) { ZSTD_memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
- if (cSrcSize == 1) { ZSTD_memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
-
- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
- (void)algoNb;
- assert(algoNb == 0);
- return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
- (void)algoNb;
- assert(algoNb == 1);
- return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
-#else
- return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
- HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
-#endif
- }
-}
-
-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
-{
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
- return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
- workSpace, sizeof(workSpace));
-}
-
-size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
- const void* cSrc, size_t cSrcSize)
-{
- U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
- return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
- workSpace, sizeof(workSpace));
-}
-#endif
diff --git a/contrib/libs/zstd/lib/decompress/huf_decompress_amd64.S b/contrib/libs/zstd/lib/decompress/huf_decompress_amd64.S
index 49589cb6114..671624fe343 100644
--- a/contrib/libs/zstd/lib/decompress/huf_decompress_amd64.S
+++ b/contrib/libs/zstd/lib/decompress/huf_decompress_amd64.S
@@ -1,5 +1,5 @@
/*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -30,14 +30,14 @@
* TODO: Support Windows calling convention.
*/
-ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
-ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
-ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
-ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
-.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
-.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
-.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
-.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
+ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
+ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
+ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
+ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
+.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
+.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
+.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
+.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
.text
/* Sets up register mappings for clarity.
@@ -95,8 +95,9 @@ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
/* Define both _HUF_* & HUF_* symbols because MacOS
* C symbols are prefixed with '_' & Linux symbols aren't.
*/
-_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
-HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
+_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
+HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
+ ZSTD_CET_ENDBRANCH
/* Save all registers - even if they are callee saved for simplicity. */
push %rax
push %rbx
@@ -350,8 +351,9 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
pop %rax
ret
-_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
-HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
+_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
+HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
+ ZSTD_CET_ENDBRANCH
/* Save all registers - even if they are callee saved for simplicity. */
push %rax
push %rbx
@@ -427,41 +429,30 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
/* r15 = (ip0 - ilimit) / 7 */
movq %rdx, %r15
- movabsq $-3689348814741910323, %rdx
- movq 8(%rsp), %rax /* rax = oend0 */
- subq %op0, %rax /* rax = oend0 - op0 */
- mulq %rdx
- shrq $3, %rdx /* rdx = rax / 10 */
-
- /* r15 = min(%rdx, %r15) */
- cmpq %rdx, %r15
- cmova %rdx, %r15
+ /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
+ movq 8(%rsp), %rax /* rax = oend0 */
+ subq %op0, %rax /* rax = oend0 - op0 */
+ movq 16(%rsp), %rdx /* rdx = oend1 */
+ subq %op1, %rdx /* rdx = oend1 - op1 */
- movabsq $-3689348814741910323, %rdx
- movq 16(%rsp), %rax /* rax = oend1 */
- subq %op1, %rax /* rax = oend1 - op1 */
- mulq %rdx
- shrq $3, %rdx /* rdx = rax / 10 */
-
- /* r15 = min(%rdx, %r15) */
- cmpq %rdx, %r15
- cmova %rdx, %r15
+ cmpq %rax, %rdx
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
- movabsq $-3689348814741910323, %rdx
movq 24(%rsp), %rax /* rax = oend2 */
subq %op2, %rax /* rax = oend2 - op2 */
- mulq %rdx
- shrq $3, %rdx /* rdx = rax / 10 */
- /* r15 = min(%rdx, %r15) */
- cmpq %rdx, %r15
- cmova %rdx, %r15
+ cmpq %rax, %rdx
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
- movabsq $-3689348814741910323, %rdx
movq 32(%rsp), %rax /* rax = oend3 */
subq %op3, %rax /* rax = oend3 - op3 */
+
+ cmpq %rax, %rdx
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
+
+ movabsq $-3689348814741910323, %rax
mulq %rdx
- shrq $3, %rdx /* rdx = rax / 10 */
+ shrq $3, %rdx /* rdx = rdx / 10 */
/* r15 = min(%rdx, %r15) */
cmpq %rdx, %r15
diff --git a/contrib/libs/zstd/lib/decompress/zstd_ddict.c b/contrib/libs/zstd/lib/decompress/zstd_ddict.c
index ce335477b32..ad5c34a7fc0 100644
--- a/contrib/libs/zstd/lib/decompress/zstd_ddict.c
+++ b/contrib/libs/zstd/lib/decompress/zstd_ddict.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -19,7 +19,6 @@
#include "../common/mem.h" /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "zstd_decompress_internal.h"
#include "zstd_ddict.h"
@@ -134,7 +133,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
ZSTD_memcpy(internalBuffer, dict, dictSize);
}
ddict->dictSize = dictSize;
- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */
/* parse dictionary content */
FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
@@ -240,5 +239,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
{
if (ddict==NULL) return 0;
- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+ return ddict->dictID;
}
diff --git a/contrib/libs/zstd/lib/decompress/zstd_ddict.h b/contrib/libs/zstd/lib/decompress/zstd_ddict.h
index bd03268b508..c4ca8877a07 100644
--- a/contrib/libs/zstd/lib/decompress/zstd_ddict.h
+++ b/contrib/libs/zstd/lib/decompress/zstd_ddict.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
diff --git a/contrib/libs/zstd/lib/decompress/zstd_decompress.c b/contrib/libs/zstd/lib/decompress/zstd_decompress.c
index b8bbefd538f..093a32716df 100644
--- a/contrib/libs/zstd/lib/decompress/zstd_decompress.c
+++ b/contrib/libs/zstd/lib/decompress/zstd_decompress.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -59,13 +59,13 @@
#include "../common/mem.h" /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include <contrib/libs/xxhash/xxhash.h> /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */
#include "../common/zstd_internal.h" /* blockProperties_t */
#include "zstd_decompress_internal.h" /* ZSTD_DCtx */
#include "zstd_ddict.h" /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */
+#include "../common/bits.h" /* ZSTD_highbit32 */
#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
# include "../legacy/zstd_legacy.h"
@@ -78,11 +78,11 @@
*************************************/
#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
- * Currently, that means a 0.75 load factor.
- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
- * the load factor of the ddict hash set.
- */
+#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
+ * Currently, that means a 0.75 load factor.
+ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
+ * the load factor of the ddict hash set.
+ */
#define DDICT_HASHSET_TABLE_BASE_SIZE 64
#define DDICT_HASHSET_RESIZE_FACTOR 2
@@ -243,6 +243,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
dctx->outBufferMode = ZSTD_bm_buffered;
dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
+ dctx->disableHufAsm = 0;
}
static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
@@ -438,16 +439,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
* note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
* @return : 0, `zfhPtr` is correctly filled,
* >0, `srcSize` is too small, value is wanted `srcSize` amount,
- * or an error code, which can be tested using ZSTD_isError() */
+** or an error code, which can be tested using ZSTD_isError() */
size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
{
const BYTE* ip = (const BYTE*)src;
size_t const minInputSize = ZSTD_startingInputLength(format);
- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */
- if (srcSize < minInputSize) return minInputSize;
- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter");
+ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize);
+ if (srcSize > 0) {
+ /* note : technically could be considered an assert(), since it's an invalid entry */
+ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0");
+ }
+ if (srcSize < minInputSize) {
+ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) {
+ /* when receiving less than @minInputSize bytes,
+ * control these bytes at least correspond to a supported magic number
+ * in order to error out early if they don't.
+ **/
+ size_t const toCopy = MIN(4, srcSize);
+ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER);
+ assert(src != NULL);
+ ZSTD_memcpy(hbuf, src, toCopy);
+ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) {
+ /* not a zstd frame : let's check if it's a skippable frame */
+ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START);
+ ZSTD_memcpy(hbuf, src, toCopy);
+ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) {
+ RETURN_ERROR(prefix_unknown,
+ "first bytes don't correspond to any supported magic number");
+ } } }
+ return minInputSize;
+ }
+
+ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */
if ( (format != ZSTD_f_zstd1_magicless)
&& (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
@@ -757,10 +782,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
ip += 4;
}
+ frameSizeInfo.nbBlocks = nbBlocks;
frameSizeInfo.compressedSize = (size_t)(ip - ipstart);
frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
? zfh.frameContentSize
- : nbBlocks * zfh.blockSizeMax;
+ : (unsigned long long)nbBlocks * zfh.blockSizeMax;
return frameSizeInfo;
}
}
@@ -800,6 +826,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
return bound;
}
+size_t ZSTD_decompressionMargin(void const* src, size_t srcSize)
+{
+ size_t margin = 0;
+ unsigned maxBlockSize = 0;
+
+ /* Iterate over each frame */
+ while (srcSize > 0) {
+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+ size_t const compressedSize = frameSizeInfo.compressedSize;
+ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+ ZSTD_frameHeader zfh;
+
+ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), "");
+ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+ return ERROR(corruption_detected);
+
+ if (zfh.frameType == ZSTD_frame) {
+ /* Add the frame header to our margin */
+ margin += zfh.headerSize;
+ /* Add the checksum to our margin */
+ margin += zfh.checksumFlag ? 4 : 0;
+ /* Add 3 bytes per block */
+ margin += 3 * frameSizeInfo.nbBlocks;
+
+ /* Compute the max block size */
+ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax);
+ } else {
+ assert(zfh.frameType == ZSTD_skippableFrame);
+ /* Add the entire skippable frame size to our margin. */
+ margin += compressedSize;
+ }
+
+ assert(srcSize >= compressedSize);
+ src = (const BYTE*)src + compressedSize;
+ srcSize -= compressedSize;
+ }
+
+ /* Add the max block size back to the margin. */
+ margin += maxBlockSize;
+
+ return margin;
+}
/*-*************************************************************
* Frame decoding
@@ -825,7 +893,7 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
if (srcSize == 0) return 0;
RETURN_ERROR(dstBuffer_null, "");
}
- ZSTD_memcpy(dst, src, srcSize);
+ ZSTD_memmove(dst, src, srcSize);
return srcSize;
}
@@ -903,6 +971,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
/* Loop on each block */
while (1) {
+ BYTE* oBlockEnd = oend;
size_t decodedSize;
blockProperties_t blockProperties;
size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties);
@@ -912,16 +981,34 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
remainingSrcSize -= ZSTD_blockHeaderSize;
RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, "");
+ if (ip >= op && ip < oBlockEnd) {
+ /* We are decompressing in-place. Limit the output pointer so that we
+ * don't overwrite the block that we are currently reading. This will
+ * fail decompression if the input & output pointers aren't spaced
+ * far enough apart.
+ *
+ * This is important to set, even when the pointers are far enough
+ * apart, because ZSTD_decompressBlock_internal() can decide to store
+ * literals in the output buffer, after the block it is decompressing.
+ * Since we don't want anything to overwrite our input, we have to tell
+ * ZSTD_decompressBlock_internal to never write past ip.
+ *
+ * See ZSTD_allocateLiteralsBuffer() for reference.
+ */
+ oBlockEnd = op + (ip - op);
+ }
+
switch(blockProperties.blockType)
{
case bt_compressed:
- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming);
+ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming);
break;
case bt_raw :
+ /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */
decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize);
break;
case bt_rle :
- decodedSize = ZSTD_setRleBlock(op, (size_t)(oend-op), *ip, blockProperties.origSize);
+ decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize);
break;
case bt_reserved :
default:
@@ -956,6 +1043,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
}
ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0);
/* Allow caller to get size read */
+ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr);
*srcPtr = ip;
*srcSizePtr = remainingSrcSize;
return (size_t)(op-ostart);
@@ -1108,8 +1196,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
/**
- * Similar to ZSTD_nextSrcSizeToDecompress(), but when when a block input can be streamed,
- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can
+ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we
+ * allow taking a partial block as the input. Currently only raw uncompressed blocks can
* be streamed.
*
* For blocks that can be streamed, this allows us to reduce the latency until we produce
@@ -1309,7 +1397,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
default:
assert(0); /* impossible */
- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */
+ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */
}
}
@@ -1350,11 +1438,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
/* in minimal huffman, we always use X1 variants */
size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
dictPtr, dictEnd - dictPtr,
- workspace, workspaceSize);
+ workspace, workspaceSize, /* flags */ 0);
#else
size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
dictPtr, (size_t)(dictEnd - dictPtr),
- workspace, workspaceSize);
+ workspace, workspaceSize, /* flags */ 0);
#endif
RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
dictPtr += hSize;
@@ -1453,7 +1541,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
dctx->prefixStart = NULL;
dctx->virtualStart = NULL;
dctx->dictEnd = NULL;
- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */
dctx->litEntropy = dctx->fseEntropy = 0;
dctx->dictID = 0;
dctx->bType = bt_reserved;
@@ -1515,7 +1603,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
* This could for one of the following reasons :
* - The frame does not require a dictionary (most common case).
* - The frame was built with dictID intentionally removed.
- * Needed dictionary is a hidden information.
+ * Needed dictionary is a hidden piece of information.
* Note : this use case also happens when using a non-conformant dictionary.
* - `srcSize` is too small, and as a result, frame header could not be decoded.
* Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
@@ -1524,7 +1612,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
* ZSTD_getFrameHeader(), which will provide a more precise error code. */
unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
{
- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 };
size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
if (ZSTD_isError(hError)) return 0;
return zfp.dictID;
@@ -1631,7 +1719,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
size_t ZSTD_initDStream(ZSTD_DStream* zds)
{
DEBUGLOG(4, "ZSTD_initDStream");
- return ZSTD_initDStream_usingDDict(zds, NULL);
+ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
+ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), "");
+ return ZSTD_startingInputLength(zds->format);
}
/* ZSTD_initDStream_usingDDict() :
@@ -1639,6 +1729,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
* this function cannot fail */
size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
{
+ DEBUGLOG(4, "ZSTD_initDStream_usingDDict");
FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
return ZSTD_startingInputLength(dctx->format);
@@ -1649,6 +1740,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
* this function cannot fail */
size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
{
+ DEBUGLOG(4, "ZSTD_resetDStream");
FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
return ZSTD_startingInputLength(dctx->format);
}
@@ -1720,6 +1812,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
return bounds;
+ case ZSTD_d_disableHuffmanAssembly:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
default:;
}
bounds.error = ERROR(parameter_unsupported);
@@ -1760,6 +1857,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
case ZSTD_d_refMultipleDDicts:
*value = (int)dctx->refMultipleDDicts;
return 0;
+ case ZSTD_d_disableHuffmanAssembly:
+ *value = (int)dctx->disableHufAsm;
+ return 0;
default:;
}
RETURN_ERROR(parameter_unsupported, "");
@@ -1793,6 +1893,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
}
dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
return 0;
+ case ZSTD_d_disableHuffmanAssembly:
+ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
+ dctx->disableHufAsm = value != 0;
+ return 0;
default:;
}
RETURN_ERROR(parameter_unsupported, "");
@@ -1980,7 +2084,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
if (zds->refMultipleDDicts && zds->ddictSet) {
ZSTD_DCtx_selectFrameDDict(zds);
}
- DEBUGLOG(5, "header size : %u", (U32)hSize);
if (ZSTD_isError(hSize)) {
#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart);
@@ -2012,6 +2115,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
zds->lhSize += remainingInput;
}
input->pos = input->size;
+ /* check first few bytes */
+ FORWARD_IF_ERROR(
+ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format),
+ "First few bytes detected incorrect" );
+ /* return hint input size */
return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */
}
assert(ip != NULL);
@@ -2029,8 +2137,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
if (ZSTD_isError(decompressedSize)) return decompressedSize;
DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+ assert(istart != NULL);
ip = istart + cSize;
- op += decompressedSize;
+ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */
zds->expected = 0;
zds->streamStage = zdss_init;
someMoreWork = 0;
@@ -2114,6 +2223,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
}
if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */
FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
+ assert(ip != NULL);
ip += neededInSize;
/* Function modifies the stage so we must break */
break;
@@ -2128,7 +2238,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
int const isSkipFrame = ZSTD_isSkipFrame(zds);
size_t loadedSize;
/* At this point we shouldn't be decompressing a block that we can stream. */
- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip));
+ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)));
if (isSkipFrame) {
loadedSize = MIN(toLoad, (size_t)(iend-ip));
} else {
@@ -2137,8 +2247,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
"should never happen");
loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip));
}
- ip += loadedSize;
- zds->inPos += loadedSize;
+ if (loadedSize != 0) {
+ /* ip may be NULL */
+ ip += loadedSize;
+ zds->inPos += loadedSize;
+ }
if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */
/* decode loaded input */
@@ -2148,14 +2261,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
break;
}
case zdss_flush:
- { size_t const toFlushSize = zds->outEnd - zds->outStart;
+ {
+ size_t const toFlushSize = zds->outEnd - zds->outStart;
size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize);
- op += flushedSize;
+
+ op = op ? op + flushedSize : op;
+
zds->outStart += flushedSize;
if (flushedSize == toFlushSize) { /* flush completed */
zds->streamStage = zdss_read;
if ( (zds->outBuffSize < zds->fParams.frameContentSize)
- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
(int)(zds->outBuffSize - zds->outStart),
(U32)zds->fParams.blockSizeMax);
@@ -2169,7 +2285,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
default:
assert(0); /* impossible */
- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */
+ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */
} }
/* result */
@@ -2182,8 +2298,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
if ((ip==istart) && (op==ostart)) { /* no forward progress */
zds->noForwardProgress ++;
if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
- RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
+ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, "");
+ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, "");
assert(0);
}
} else {
@@ -2220,11 +2336,17 @@ size_t ZSTD_decompressStream_simpleArgs (
void* dst, size_t dstCapacity, size_t* dstPos,
const void* src, size_t srcSize, size_t* srcPos)
{
- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
- ZSTD_inBuffer input = { src, srcSize, *srcPos };
- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
- *dstPos = output.pos;
- *srcPos = input.pos;
- return cErr;
+ ZSTD_outBuffer output;
+ ZSTD_inBuffer input;
+ output.dst = dst;
+ output.size = dstCapacity;
+ output.pos = *dstPos;
+ input.src = src;
+ input.size = srcSize;
+ input.pos = *srcPos;
+ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+ *dstPos = output.pos;
+ *srcPos = input.pos;
+ return cErr;
+ }
}
diff --git a/contrib/libs/zstd/lib/decompress/zstd_decompress_block.c b/contrib/libs/zstd/lib/decompress/zstd_decompress_block.c
index 2e44d30d2f3..0a06a021e15 100644
--- a/contrib/libs/zstd/lib/decompress/zstd_decompress_block.c
+++ b/contrib/libs/zstd/lib/decompress/zstd_decompress_block.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -20,12 +20,12 @@
#include "../common/mem.h" /* low level memory routines */
#define FSE_STATIC_LINKING_ONLY
#include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "../common/zstd_internal.h"
#include "zstd_decompress_internal.h" /* ZSTD_DCtx */
#include "zstd_ddict.h" /* ZSTD_DDictDictContent */
#include "zstd_decompress_block.h"
+#include "../common/bits.h" /* ZSTD_highbit32 */
/*_*******************************************************
* Macros
@@ -89,7 +89,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const
dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
}
else {
- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
+ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
}
@@ -134,13 +134,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
ZSTD_FALLTHROUGH;
case set_compressed:
- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
{ size_t lhSize, litSize, litCSize;
U32 singleStream=0;
U32 const lhlCode = (istart[0] >> 2) & 3;
U32 const lhc = MEM_readLE32(istart);
size_t hufSuccess;
size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+ int const flags = 0
+ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+ | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
switch(lhlCode)
{
case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */
@@ -165,6 +168,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
}
RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ if (!singleStream)
+ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+ "Not enough literals (%zu) for the 4-streams mode (min %u)",
+ litSize, MIN_LITERALS_FOR_4_STREAMS);
RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
@@ -176,13 +183,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
if (litEncType==set_repeat) {
if (singleStream) {
- hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+ hufSuccess = HUF_decompress1X_usingDTable(
dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+ dctx->HUFptr, flags);
} else {
- hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+ hufSuccess = HUF_decompress4X_usingDTable(
dctx->litBuffer, litSize, istart+lhSize, litCSize,
- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+ dctx->HUFptr, flags);
}
} else {
if (singleStream) {
@@ -190,18 +198,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
hufSuccess = HUF_decompress1X_DCtx_wksp(
dctx->entropy.hufTable, dctx->litBuffer, litSize,
istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace));
+ sizeof(dctx->workspace), flags);
#else
- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+ hufSuccess = HUF_decompress1X1_DCtx_wksp(
dctx->entropy.hufTable, dctx->litBuffer, litSize,
istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+ sizeof(dctx->workspace), flags);
#endif
} else {
- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+ hufSuccess = HUF_decompress4X_hufOnly_wksp(
dctx->entropy.hufTable, dctx->litBuffer, litSize,
istart+lhSize, litCSize, dctx->workspace,
- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+ sizeof(dctx->workspace), flags);
}
}
if (dctx->litBufferLocation == ZSTD_split)
@@ -237,6 +245,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
break;
case 3:
lhSize = 3;
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
litSize = MEM_readLE24(istart) >> 4;
break;
}
@@ -279,12 +288,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
break;
case 1:
lhSize = 2;
+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
litSize = MEM_readLE16(istart) >> 4;
break;
case 3:
lhSize = 3;
+ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
litSize = MEM_readLE24(istart) >> 4;
- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
break;
}
RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
@@ -506,14 +516,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
for (i = 8; i < n; i += 8) {
MEM_write64(spread + pos + i, sv);
}
- pos += n;
+ assert(n>=0);
+ pos += (size_t)n;
}
}
/* Now we spread those positions across the table.
- * The benefit of doing it in two stages is that we avoid the the
+ * The benefit of doing it in two stages is that we avoid the
* variable size inner loop, which caused lots of branch misses.
* Now we can run through all the positions without any branch misses.
- * We unroll the loop twice, since that is what emperically worked best.
+ * We unroll the loop twice, since that is what empirically worked best.
*/
{
size_t position = 0;
@@ -540,7 +551,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
for (i=0; i<n; i++) {
tableDecode[position].baseValue = s;
position = (position + step) & tableMask;
- while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
+ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */
} }
assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
}
@@ -551,7 +562,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
for (u=0; u<tableSize; u++) {
U32 const symbol = tableDecode[u].baseValue;
U32 const nextState = symbolNext[symbol]++;
- tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+ tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
assert(nbAdditionalBits[symbol] < 255);
tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
@@ -964,6 +975,11 @@ size_t ZSTD_execSequence(BYTE* op,
assert(op != NULL /* Precondition */);
assert(oend_w < oend /* No underflow */);
+
+#if defined(__aarch64__)
+ /* prefetch sequence starting from match that will be used for copy later */
+ PREFETCH_L1(match);
+#endif
/* Handle edge cases in a slow path:
* - Read beyond end of literals
* - Match end is within WILDCOPY_OVERLIMIT of oend
@@ -1154,7 +1170,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
}
/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
* bits before reloading. This value is the maximum number of bytes we read
* after reloading when we are decoding long offsets.
*/
@@ -1169,9 +1185,27 @@ FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
{
seq_t seq;
+ /*
+ * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
+ * loaded in one operation and extracted its fields by simply shifting or
+ * bit-extracting on aarch64.
+ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
+ * operations that cause performance drop. This can be avoided by using this
+ * ZSTD_memcpy hack.
+ */
+#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
+ ZSTD_seqSymbol* const llDInfo = &llDInfoS;
+ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
+ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
+ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
+ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
+ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
+#else
const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+#endif
seq.matchLength = mlDInfo->baseValue;
seq.litLength = llDInfo->baseValue;
{ U32 const ofBase = ofDInfo->baseValue;
@@ -1186,9 +1220,13 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
U32 const llnbBits = llDInfo->nbBits;
U32 const mlnbBits = mlDInfo->nbBits;
U32 const ofnbBits = ofDInfo->nbBits;
+
+ assert(llBits <= MaxLLBits);
+ assert(mlBits <= MaxMLBits);
+ assert(ofBits <= MaxOff);
/*
* As gcc has better branch and block analyzers, sometimes it is only
- * valuable to mark likelyness for clang, it gives around 3-4% of
+ * valuable to mark likeliness for clang, it gives around 3-4% of
* performance.
*/
@@ -1201,13 +1239,16 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
#endif
ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
- assert(ofBits <= MaxOff);
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+ /* Always read extra bits, this keeps the logic simple,
+ * avoids branches, and avoids accidentally reading 0 bits.
+ */
+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
BIT_reloadDStream(&seqState->DStream);
- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
+ offset += BIT_readBitsFast(&seqState->DStream, extraBits);
} else {
offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
@@ -1552,7 +1593,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
- DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
(void)frame;
/* Regen sequences */
@@ -1945,34 +1986,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+/**
+ * @returns The total size of the history referencable by zstd, including
+ * both the prefix and the extDict. At @p op any offset larger than this
+ * is invalid.
+ */
+static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+{
+ return (size_t)(op - virtualStart);
+}
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-/* ZSTD_getLongOffsetsShare() :
+typedef struct {
+ unsigned longOffsetShare;
+ unsigned maxNbAdditionalBits;
+} ZSTD_OffsetInfo;
+
+/* ZSTD_getOffsetInfo() :
* condition : offTable must be valid
* @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- * compared to maximum possible of (1<<OffFSELog) */
-static unsigned
-ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+ * compared to maximum possible of (1<<OffFSELog),
+ * as well as the maximum number additional bits required.
+ */
+static ZSTD_OffsetInfo
+ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
{
- const void* ptr = offTable;
- U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
- const ZSTD_seqSymbol* table = offTable + 1;
- U32 const max = 1 << tableLog;
- U32 u, total = 0;
- DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
- assert(max <= (1 << OffFSELog)); /* max not too large */
- for (u=0; u<max; u++) {
- if (table[u].nbAdditionalBits > 22) total += 1;
+ ZSTD_OffsetInfo info = {0, 0};
+ /* If nbSeq == 0, then the offTable is uninitialized, but we have
+ * no sequences, so both values should be 0.
+ */
+ if (nbSeq != 0) {
+ const void* ptr = offTable;
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+ const ZSTD_seqSymbol* table = offTable + 1;
+ U32 const max = 1 << tableLog;
+ U32 u;
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+ assert(max <= (1 << OffFSELog)); /* max not too large */
+ for (u=0; u<max; u++) {
+ info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+ if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+ }
+
+ assert(tableLog <= OffFSELog);
+ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */
}
- assert(tableLog <= OffFSELog);
- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+ return info;
+}
- return total;
+/**
+ * @returns The maximum offset we can decode in one read of our bitstream, without
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
+ * than this must use the long offset decoder.
+ */
+static size_t ZSTD_maxShortOffset(void)
+{
+ if (MEM_64bits()) {
+ /* We can decode any offset without reloading bits.
+ * This might change if the max window size grows.
+ */
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+ return (size_t)-1;
+ } else {
+ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
+ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
+ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
+ */
+ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
+ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
+ return maxOffset;
+ }
}
-#endif
size_t
ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
@@ -1980,20 +2066,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
{ /* blockType == blockCompressed */
const BYTE* ip = (const BYTE*)src;
- /* isLongOffset must be true if there are long offsets.
- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
- * We don't expect that to be the case in 64-bit mode.
- * In block mode, window size is not known, so we have to be conservative.
- * (note: but it could be evaluated from current-lowLimit)
- */
- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+ /* Note : the wording of the specification
+ * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
+ * This generally does not happen, as it makes little sense,
+ * since an uncompressed block would feature same size and have no decompression cost.
+ * Also, note that decoder from reference libzstd before < v1.5.4
+ * would consider this edge case as an error.
+ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
+ * for broader compatibility with the deployed ecosystem of zstd decoders */
+ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
/* Decode literals section */
{ size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
if (ZSTD_isError(litCSize)) return litCSize;
ip += litCSize;
srcSize -= litCSize;
@@ -2001,6 +2088,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
/* Build Decoding Tables */
{
+ /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+ */
+ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
+ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
+ /* isLongOffset must be true if there are long offsets.
+ * Offsets are long if they are larger than ZSTD_maxShortOffset().
+ * We don't expect that to be the case in 64-bit mode.
+ *
+ * We check here to see if our history is large enough to allow long offsets.
+ * If it isn't, then we can't possible have (valid) long offsets. If the offset
+ * is invalid, then it is okay to read it incorrectly.
+ *
+ * If isLongOffsets is true, then we will later check our decoding table to see
+ * if it is even possible to generate long offsets.
+ */
+ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
/* These macros control at build-time which decompressor implementation
* we use. If neither is defined, we do some inspection and dispatch at
* runtime.
@@ -2008,6 +2112,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
!defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
int usePrefetchDecoder = dctx->ddictIsCold;
+#else
+ /* Set to 1 to avoid computing offset info if we don't need to.
+ * Otherwise this value is ignored.
+ */
+ int usePrefetchDecoder = 1;
#endif
int nbSeq;
size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -2017,26 +2126,38 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if ( !usePrefetchDecoder
- && (!frame || (dctx->fParams.windowSize > (1<<24)))
- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
- usePrefetchDecoder = (shareLongOffsets >= minShare);
+ /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+ * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+ * NOTE: could probably use a larger nbSeq limit
+ */
+ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+ * enough, then we know it is impossible to have too long an offset in this block, so we can
+ * use the regular offset decoder.
+ */
+ isLongOffset = ZSTD_lo_isRegularOffset;
+ }
+ if (!usePrefetchDecoder) {
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+ usePrefetchDecoder = (info.longOffsetShare >= minShare);
+ }
}
-#endif
dctx->ddictIsCold = 0;
#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
!defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
- if (usePrefetchDecoder)
+ if (usePrefetchDecoder) {
+#else
+ (void)usePrefetchDecoder;
+ {
#endif
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
#endif
+ }
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
/* else */
diff --git a/contrib/libs/zstd/lib/decompress/zstd_decompress_block.h b/contrib/libs/zstd/lib/decompress/zstd_decompress_block.h
index c61a9d0c4b3..67791dbc3ad 100644
--- a/contrib/libs/zstd/lib/decompress/zstd_decompress_block.h
+++ b/contrib/libs/zstd/lib/decompress/zstd_decompress_block.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
diff --git a/contrib/libs/zstd/lib/decompress/zstd_decompress_internal.h b/contrib/libs/zstd/lib/decompress/zstd_decompress_internal.h
index 2b5a53850ac..c2ec5d9fbef 100644
--- a/contrib/libs/zstd/lib/decompress/zstd_decompress_internal.h
+++ b/contrib/libs/zstd/lib/decompress/zstd_decompress_internal.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -75,12 +75,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = {
#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
#define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
+#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12
typedef struct {
ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */
ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */
U32 rep[ZSTD_REP_NUM];
U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
} ZSTD_entropyDTables_t;
@@ -164,6 +165,7 @@ struct ZSTD_DCtx_s
ZSTD_dictUses_e dictUses;
ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */
ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
+ int disableHufAsm;
/* streaming */
ZSTD_dStreamStage streamStage;