path: root/contrib/libs/zstd/lib/compress/zstd_fast.c
author    robot-contrib <[email protected]>  2025-02-21 20:33:28 +0300
committer robot-contrib <[email protected]>  2025-02-21 22:22:00 +0300
commit    005b8a9fd31e325164fdaad9ccde51222d9dc1c7 (patch)
tree      927528cb5e48c5a3af50e324fd4476fe7d3c9657 /contrib/libs/zstd/lib/compress/zstd_fast.c
parent    74abb8215ba55f6017c2504dc3312470949afa89 (diff)
Update contrib/libs/zstd to 1.5.7
Release highlights:
* The compression speed for small data blocks has been notably (~10%) improved at fast compression levels.
* The `--patch-from` functionality of the zstd CLI ... had its speed reduced to uncomfortable levels; v1.5.7 largely mitigates the speed impact of high compression levels (18+).
* The compression ratio has been enhanced slightly for large data across all compression levels.

commit_hash:c66283dd802e1fd38484f7179848006c58f2baa4
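For context on the first highlight, here is a minimal sketch (illustrative, not part of this commit) of the one-shot public API compressing a small block at a fast level, which is the code path served by zstd_fast.c:

#include <stdio.h>
#include <string.h>
#include <zstd.h>

int main(void)
{
    /* A "small data block" in the sense of the release notes: a few KiB. */
    char src[4096];
    char dst[ZSTD_COMPRESSBOUND(4096)];
    size_t csize;
    memset(src, 'a', sizeof(src));

    /* Level 1 is a fast level, handled by the zstd_fast.c block compressor. */
    csize = ZSTD_compress(dst, sizeof(dst), src, sizeof(src), 1);
    if (ZSTD_isError(csize)) {
        fprintf(stderr, "compression error: %s\n", ZSTD_getErrorName(csize));
        return 1;
    }
    printf("compressed %zu -> %zu bytes\n", sizeof(src), csize);
    return 0;
}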
Diffstat (limited to 'contrib/libs/zstd/lib/compress/zstd_fast.c')
-rw-r--r--  contrib/libs/zstd/lib/compress/zstd_fast.c | 165
1 file changed, 91 insertions, 74 deletions
diff --git a/contrib/libs/zstd/lib/compress/zstd_fast.c b/contrib/libs/zstd/lib/compress/zstd_fast.c
index 6c4554cfca7..ee25bcbac8d 100644
--- a/contrib/libs/zstd/lib/compress/zstd_fast.c
+++ b/contrib/libs/zstd/lib/compress/zstd_fast.c
@@ -13,7 +13,7 @@
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
-void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
+void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms,
const void* const end,
ZSTD_dictTableLoadMethod_e dtlm)
{
@@ -45,12 +45,12 @@ void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */
ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
- } } } }
+ } } } }
}
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
-void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
+void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms,
const void* const end,
ZSTD_dictTableLoadMethod_e dtlm)
{
@@ -84,7 +84,7 @@ void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
} } } }
}
-void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+void ZSTD_fillHashTable(ZSTD_MatchState_t* ms,
const void* const end,
ZSTD_dictTableLoadMethod_e dtlm,
ZSTD_tableFillPurpose_e tfp)
@@ -97,6 +97,50 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
}
+typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit);
+
+static int
+ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
+{
+ /* Array of ~random data, should have low probability of matching data.
+ * Load from here if the index is invalid.
+ * Used to avoid unpredictable branches. */
+ static const BYTE dummy[] = {0x12,0x34,0x56,0x78};
+
+ /* currentIdx >= lowLimit is a (somewhat) unpredictable branch.
+ * However expression below compiles into conditional move.
+ */
+ const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy);
+ /* Note: this used to be written as : return test1 && test2;
+ * Unfortunately, once inlined, these tests become branches,
+ * in which case it becomes critical that they are executed in the right order (test1 then test2).
+ * So we have to write these tests in a specific manner to ensure their ordering.
+ */
+ if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0;
+ /* force ordering of these tests, which matters once the function is inlined, as they become branches */
+#if defined(__GNUC__)
+ __asm__("");
+#endif
+ return matchIdx >= idxLowLimit;
+}
+
+static int
+ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
+{
+ /* using a branch instead of a cmov,
+ * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true,
+ * aka almost all candidates are within range */
+ U32 mval;
+ if (matchIdx >= idxLowLimit) {
+ mval = MEM_read32(matchAddress);
+ } else {
+ mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */
+ }
+
+ return (MEM_read32(currentPtr) == mval);
+}
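
Editorial illustration, not part of the diff: the essential trick in ZSTD_match4Found_cmov() above is to steer the load address (rather than the comparison result) through a conditional select, so the always-performed 4-byte load reads either the candidate or a harmless dummy buffer and the compiler can emit a cmov instead of a hard-to-predict branch. A standalone sketch of the same pattern, with hypothetical names:

#include <stdint.h>
#include <string.h>

static const unsigned char kDummy4[4] = { 0x12, 0x34, 0x56, 0x78 };

/* Hypothetical helper: returns 1 iff the candidate index is in range AND the
 * 4 bytes at the candidate address equal the 4 bytes at the current position. */
static int match4_select_sketch(const unsigned char* cur,
                                const unsigned char* cand,
                                uint32_t candIdx, uint32_t lowLimit)
{
    /* Selecting an address, not a value: compilers commonly lower this
     * ternary to a conditional move, avoiding the unpredictable branch. */
    const unsigned char* p = (candIdx >= lowLimit) ? cand : kDummy4;
    uint32_t a, b;
    memcpy(&a, cur, 4);  /* stands in for MEM_read32() */
    memcpy(&b, p, 4);
    /* Note: the real function orders these two tests explicitly (with an
     * empty asm statement on GCC/Clang) so they keep this order once inlined. */
    return (a == b) && (candIdx >= lowLimit);
}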
+
+
/**
* If you squint hard enough (and ignore repcodes), the search operation at any
* given position is broken into 4 stages:
@@ -146,15 +190,14 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_noDict_generic(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize,
- U32 const mls, U32 const hasStep)
+ U32 const mls, int useCmov)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
U32* const hashTable = ms->hashTable;
U32 const hlog = cParams->hashLog;
- /* support stepSize of 0 */
- size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2;
+ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */
const BYTE* const base = ms->window.base;
const BYTE* const istart = (const BYTE*)src;
const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
@@ -176,8 +219,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic(
size_t hash0; /* hash for ip0 */
size_t hash1; /* hash for ip1 */
- U32 idx; /* match idx for ip0 */
- U32 mval; /* src value at match idx */
+ U32 matchIdx; /* match idx for ip0 */
U32 offcode;
const BYTE* match0;
@@ -190,6 +232,7 @@ size_t ZSTD_compressBlock_fast_noDict_generic(
size_t step;
const BYTE* nextStep;
const size_t kStepIncr = (1 << (kSearchStrength - 1));
+ const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch;
DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
ip0 += (ip0 == prefixStart);
@@ -218,7 +261,7 @@ _start: /* Requires: ip0 */
hash0 = ZSTD_hashPtr(ip0, hlog, mls);
hash1 = ZSTD_hashPtr(ip1, hlog, mls);
- idx = hashTable[hash0];
+ matchIdx = hashTable[hash0];
do {
/* load repcode match for ip[2]*/
@@ -238,35 +281,25 @@ _start: /* Requires: ip0 */
offcode = REPCODE1_TO_OFFBASE;
mLength += 4;
- /* First write next hash table entry; we've already calculated it.
- * This write is known to be safe because the ip1 is before the
+ /* Write next hash table entry: it's already calculated.
+ * This write is known to be safe because ip1 is before the
* repcode (ip2). */
hashTable[hash1] = (U32)(ip1 - base);
goto _match;
}
- /* load match for ip[0] */
- if (idx >= prefixStartIndex) {
- mval = MEM_read32(base + idx);
- } else {
- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
- }
-
- /* check match at ip[0] */
- if (MEM_read32(ip0) == mval) {
- /* found a match! */
-
- /* First write next hash table entry; we've already calculated it.
- * This write is known to be safe because the ip1 == ip0 + 1, so
- * we know we will resume searching after ip1 */
+ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
+ /* Write next hash table entry (it's already calculated).
+ * This write is known to be safe because the ip1 == ip0 + 1,
+ * so searching will resume after ip1 */
hashTable[hash1] = (U32)(ip1 - base);
goto _offset;
}
/* lookup ip[1] */
- idx = hashTable[hash1];
+ matchIdx = hashTable[hash1];
/* hash ip[2] */
hash0 = hash1;
@@ -281,36 +314,19 @@ _start: /* Requires: ip0 */
current0 = (U32)(ip0 - base);
hashTable[hash0] = current0;
- /* load match for ip[0] */
- if (idx >= prefixStartIndex) {
- mval = MEM_read32(base + idx);
- } else {
- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
- }
-
- /* check match at ip[0] */
- if (MEM_read32(ip0) == mval) {
- /* found a match! */
-
- /* first write next hash table entry; we've already calculated it */
+ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
+ /* Write next hash table entry, since it's already calculated */
if (step <= 4) {
- /* We need to avoid writing an index into the hash table >= the
- * position at which we will pick up our searching after we've
- * taken this match.
- *
- * The minimum possible match has length 4, so the earliest ip0
- * can be after we take this match will be the current ip0 + 4.
- * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
- * write this position.
- */
+ /* Avoid writing an index if it's >= position where search will resume.
+ * The minimum possible match has length 4, so search can resume at ip0 + 4.
+ */
hashTable[hash1] = (U32)(ip1 - base);
}
-
goto _offset;
}
/* lookup ip[1] */
- idx = hashTable[hash1];
+ matchIdx = hashTable[hash1];
/* hash ip[2] */
hash0 = hash1;
@@ -332,7 +348,7 @@ _start: /* Requires: ip0 */
} while (ip3 < ilimit);
_cleanup:
- /* Note that there are probably still a couple positions we could search.
+ /* Note that there are probably still a couple positions one could search.
* However, it seems to be a meaningful performance hit to try to search
* them. So let's not. */
@@ -361,7 +377,7 @@ _cleanup:
_offset: /* Requires: ip0, idx */
/* Compute the offset code. */
- match0 = base + idx;
+ match0 = base + matchIdx;
rep_offset2 = rep_offset1;
rep_offset1 = (U32)(ip0-match0);
offcode = OFFSET_TO_OFFBASE(rep_offset1);
@@ -406,12 +422,12 @@ _match: /* Requires: ip0, match0, offcode */
goto _start;
}
-#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \
- static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
+#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \
+ static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
void const* src, size_t srcSize) \
{ \
- return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
+ return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \
}
ZSTD_GEN_FAST_FN(noDict, 4, 1)
@@ -425,13 +441,15 @@ ZSTD_GEN_FAST_FN(noDict, 6, 0)
ZSTD_GEN_FAST_FN(noDict, 7, 0)
size_t ZSTD_compressBlock_fast(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize)
{
- U32 const mls = ms->cParams.minMatch;
+ U32 const mml = ms->cParams.minMatch;
+ /* use cmov when "candidate in range" branch is likely unpredictable */
+ int const useCmov = ms->cParams.windowLog < 19;
assert(ms->dictMatchState == NULL);
- if (ms->cParams.targetLength > 1) {
- switch(mls)
+ if (useCmov) {
+ switch(mml)
{
default: /* includes case 3 */
case 4 :
@@ -444,7 +462,8 @@ size_t ZSTD_compressBlock_fast(
return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
}
} else {
- switch(mls)
+ /* use a branch instead */
+ switch(mml)
{
default: /* includes case 3 */
case 4 :
@@ -456,14 +475,13 @@ size_t ZSTD_compressBlock_fast(
case 7 :
return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
}
-
}
}
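
Editorial illustration, not part of the diff: which variant is selected above depends on the effective windowLog, which callers can pin through the advanced parameter API. A sketch of forcing a small window (and therefore, per the heuristic above, the cmov variant) using standard zstd calls; the wrapper name is hypothetical:

#include <zstd.h>

/* Compress at a fast level with windowLog pinned to 17, i.e. below the
 * windowLog < 19 threshold used by ZSTD_compressBlock_fast() above. */
static size_t compress_small_window_sketch(void* dst, size_t dstCap,
                                           const void* src, size_t srcSize)
{
    size_t r;
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    if (cctx == NULL) return 0;  /* simplified error handling for the sketch */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 1);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, 17);
    r = ZSTD_compress2(cctx, dst, dstCap, src, srcSize);
    ZSTD_freeCCtx(cctx);
    return r;
}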
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_dictMatchState_generic(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -482,7 +500,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
const BYTE* const ilimit = iend - HASH_READ_SIZE;
U32 offset_1=rep[0], offset_2=rep[1];
- const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const ZSTD_MatchState_t* const dms = ms->dictMatchState;
const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
const U32* const dictHashTable = dms->hashTable;
const U32 dictStartIndex = dms->window.dictLimit;
@@ -546,8 +564,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
hashTable[hash0] = curr; /* update hash table */
- if (((U32) ((prefixStartIndex - 1) - repIndex) >=
- 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+ if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex))
&& (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
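
Editorial note, not part of the diff: ZSTD_index_overlap_check() packages the unsigned-underflow test visible in the removed lines; its real definition lives in zstd's internal headers, so the expansion below is an assumption modeled on the replaced expression. The sketch shows which repcode indices get rejected near the dict/prefix boundary:

#include <stdio.h>

/* Assumed expansion, mirroring the removed expression: true when
 * repIndex + 3 < prefixStartIndex (a 4-byte match fits entirely before the
 * prefix) or repIndex >= prefixStartIndex (entirely inside the prefix),
 * relying on intentional unsigned underflow for the latter case. */
#define INDEX_OVERLAP_CHECK_SKETCH(prefixStartIndex, repIndex) \
    ((unsigned)(((prefixStartIndex) - 1) - (repIndex)) >= 3)

int main(void)
{
    unsigned const prefixStart = 100;
    unsigned idx;
    for (idx = 95; idx <= 102; idx++) {
        printf("repIndex=%3u -> %s\n", idx,
               INDEX_OVERLAP_CHECK_SKETCH(prefixStart, idx)
                   ? "ok" : "rejected (overlaps boundary)");
    }
    return 0;
}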
@@ -580,8 +597,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
}
}
- if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
- /* found a regular match */
+ if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) {
+ /* found a regular match of size >= 4 */
U32 const offset = (U32) (ip0 - match);
mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
while (((ip0 > anchor) & (match > prefixStart))
@@ -631,7 +648,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
dictBase - dictIndexDelta + repIndex2 :
base + repIndex2;
- if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+ if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2))
&& (MEM_read32(repMatch2) == MEM_read32(ip0))) {
const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
@@ -667,7 +684,7 @@ ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)
size_t ZSTD_compressBlock_fast_dictMatchState(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize)
{
U32 const mls = ms->cParams.minMatch;
@@ -690,7 +707,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState(
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_extDict_generic(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -925,7 +942,7 @@ _match: /* Requires: ip0, match0, offcode, matchEnd */
while (ip0 <= ilimit) {
U32 const repIndex2 = (U32)(ip0-base) - offset_2;
const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */
+ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0))
&& (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
@@ -948,7 +965,7 @@ ZSTD_GEN_FAST_FN(extDict, 6, 0)
ZSTD_GEN_FAST_FN(extDict, 7, 0)
size_t ZSTD_compressBlock_fast_extDict(
- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize)
{
U32 const mls = ms->cParams.minMatch;
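
Editorial illustration, not part of the diff: the dictMatchState variant above is reached when compressing with an attached dictionary rather than through the plain one-shot call. A sketch using the stable dictionary API; the wrapper name is hypothetical:

#include <zstd.h>

/* Compressing with a prepared dictionary at a fast level typically routes
 * block compression through ZSTD_compressBlock_fast_dictMatchState(). */
static size_t compress_with_dict_sketch(void* dst, size_t dstCap,
                                        const void* src, size_t srcSize,
                                        const void* dict, size_t dictSize)
{
    size_t r;
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    ZSTD_CDict* const cdict = ZSTD_createCDict(dict, dictSize, 1 /* fast level */);
    if (cctx == NULL || cdict == NULL) {
        ZSTD_freeCCtx(cctx);    /* zstd free functions accept NULL */
        ZSTD_freeCDict(cdict);
        return 0;               /* simplified error handling for the sketch */
    }
    r = ZSTD_compress_usingCDict(cctx, dst, dstCap, src, srcSize, cdict);
    ZSTD_freeCDict(cdict);
    ZSTD_freeCCtx(cctx);
    return r;
}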