| author | Ivan Blinkov <ivan@blinkov.ru> | 2022-02-10 16:47:10 +0300 |
|---|---|---|
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:10 +0300 |
| commit | 1aeb9a455974457866f78722ad98114bafc84e8a (patch) | |
| tree | e4340eaf1668684d83a0a58c36947c5def5350ad /contrib/libs/hyperscan/src/nfa/shufti.c | |
| parent | bd5ef432f5cfb1e18851381329d94665a4c22470 (diff) | |
| download | ydb-1aeb9a455974457866f78722ad98114bafc84e8a.tar.gz | |
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/nfa/shufti.c')
-rw-r--r-- | contrib/libs/hyperscan/src/nfa/shufti.c | 1144
1 file changed, 572 insertions, 572 deletions
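The file being diffed implements shufti, Hyperscan's shuffle-based search for a byte class: each input byte is split into its two nibbles, each nibble indexes a 16-entry table, and the byte is in the class when the two lookups share a set bit. As a rough illustration of that encoding (not Hyperscan code: the whole class is naively put in bucket 0 here, whereas Hyperscan's compiler assigns buckets in shufticompile.cpp, which is outside this diff):

```c
/* Sketch of the shufti class encoding: two 16-byte nibble tables.
 * Bucket assignment is hard-coded to bit 0 for this toy class. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t lo[16] = {0}, hi[16] = {0};
    for (const char *p = "abc"; *p; p++) {   /* class = {'a','b','c'} */
        lo[(uint8_t)*p & 0xf] |= 1;          /* bit 0 == bucket 0 */
        hi[(uint8_t)*p >> 4] |= 1;
    }
    const char *buf = "0123abcd";
    for (const char *p = buf; *p; p++) {
        /* the same test shuftiFwdSlow() performs byte by byte */
        if (lo[(uint8_t)*p & 0xf] & hi[(uint8_t)*p >> 4]) {
            printf("first match at offset %td ('%c')\n", p - buf, *p);
            return 0;
        }
    }
    puts("no match");
    return 0;
}
```

The SIMD variants below perform exactly this test on 16, 32 or 64 bytes at a time, with a PSHUFB-style shuffle standing in for the table lookups.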
diff --git a/contrib/libs/hyperscan/src/nfa/shufti.c b/contrib/libs/hyperscan/src/nfa/shufti.c
index 09ffc0cf9a..59138a4798 100644
--- a/contrib/libs/hyperscan/src/nfa/shufti.c
+++ b/contrib/libs/hyperscan/src/nfa/shufti.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -34,7 +34,7 @@
 #include "shufti.h"
 #include "ue2common.h"
-#include "util/arch.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 #include "util/unaligned.h"
@@ -71,65 +71,65 @@ void dumpMsk##_t##AsChars(m##_t msk) { \
 
 #endif
 
-/** \brief Naive byte-by-byte implementation. */
-static really_inline
-const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf,
-                        const u8 *buf_end) {
-    assert(buf < buf_end);
-
-    for (; buf < buf_end; ++buf) {
-        u8 c = *buf;
-        if (lo[c & 0xf] & hi[c >> 4]) {
-            break;
-        }
-    }
-    return buf;
-}
-
-/** \brief Naive byte-by-byte implementation. */
-static really_inline
-const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
-                        const u8 *buf_end) {
-    assert(buf < buf_end);
-
-    for (buf_end--; buf_end >= buf; buf_end--) {
-        u8 c = *buf_end;
-        if (lo[c & 0xf] & hi[c >> 4]) {
-            break;
-        }
-    }
-    return buf_end;
-}
-
-#if !defined(HAVE_AVX2)
+/** \brief Naive byte-by-byte implementation. */
+static really_inline
+const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf,
+                        const u8 *buf_end) {
+    assert(buf < buf_end);
+
+    for (; buf < buf_end; ++buf) {
+        u8 c = *buf;
+        if (lo[c & 0xf] & hi[c >> 4]) {
+            break;
+        }
+    }
+    return buf;
+}
+
+/** \brief Naive byte-by-byte implementation. */
+static really_inline
+const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
+                        const u8 *buf_end) {
+    assert(buf < buf_end);
+
+    for (buf_end--; buf_end >= buf; buf_end--) {
+        u8 c = *buf_end;
+        if (lo[c & 0xf] & hi[c >> 4]) {
+            break;
+        }
+    }
+    return buf_end;
+}
+
+#if !defined(HAVE_AVX2)
 /* Normal SSSE3 shufti */
 
-#ifdef DEBUG
-DUMP_MSK(128)
-#endif
-
+#ifdef DEBUG
+DUMP_MSK(128)
+#endif
+
 #define GET_LO_4(chars) and128(chars, low4bits)
-#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
 
 static really_inline
-u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
-          const m128 compare) {
-    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
-    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
-    m128 t = and128(c_lo, c_hi);
-
+u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
+          const m128 compare) {
+    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
+    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
+    m128 t = and128(c_lo, c_hi);
+
 #ifdef DEBUG
-    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
-    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
-    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
-    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
-    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
+    DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n");
+    DEBUG_PRINTF("  char: "); dumpMsk128(chars); printf("\n");
+    DEBUG_PRINTF("  c_lo: "); dumpMsk128(c_lo); printf("\n");
+    DEBUG_PRINTF("  c_hi: "); dumpMsk128(c_hi); printf("\n");
+    DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
 #endif
 
-    return movemask128(eq128(t, compare));
-}
+    return movemask128(eq128(t, compare));
+}
 
-static really_inline
-const u8 *firstMatch(const u8 *buf, u32 z) {
+static really_inline
+const u8 *firstMatch(const u8 *buf, u32 z) {
     if (unlikely(z != 0xffff)) {
         u32 pos = ctz32(~z & 0xffff);
         assert(pos < 16);
@@ -142,9 +142,9 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
 static really_inline
 const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                    const m128 low4bits, const m128 zeroes) {
-    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
+    u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
 
-    return firstMatch(buf, z);
+    return firstMatch(buf, z);
 }
 
 const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
@@ -219,8 +219,8 @@ const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) {
 static really_inline
 const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf,
                    const m128 low4bits, const m128 zeroes) {
-    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
-    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
+    m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars));
+    m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars));
     m128 t = and128(c_lo, c_hi);
 
 #ifdef DEBUG
@@ -289,8 +289,8 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,
                     const m128 ones) {
     m128 chars_lo = GET_LO_4(chars);
     m128 chars_hi = GET_HI_4(chars);
-    m128 c_lo = pshufb_m128(mask1_lo, chars_lo);
-    m128 c_hi = pshufb_m128(mask1_hi, chars_hi);
+    m128 c_lo = pshufb_m128(mask1_lo, chars_lo);
+    m128 c_hi = pshufb_m128(mask1_hi, chars_hi);
     m128 t = or128(c_lo, c_hi);
 
 #ifdef DEBUG
@@ -301,9 +301,9 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,
     DEBUG_PRINTF("     t: "); dumpMsk128(t); printf("\n");
 #endif
 
-    m128 c2_lo = pshufb_m128(mask2_lo, chars_lo);
-    m128 c2_hi = pshufb_m128(mask2_hi, chars_hi);
-    m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1));
+    m128 c2_lo = pshufb_m128(mask2_lo, chars_lo);
+    m128 c2_hi = pshufb_m128(mask2_hi, chars_hi);
+    m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1));
 
 #ifdef DEBUG
     DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n");
@@ -311,9 +311,9 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,
     DEBUG_PRINTF("    t2: "); dumpMsk128(t2); printf("\n");
 #endif
 
-    u32 z = movemask128(eq128(t2, ones));
-    DEBUG_PRINTF("    z: 0x%08x\n", z);
-    return firstMatch(buf, z);
+    u32 z = movemask128(eq128(t2, ones));
+    DEBUG_PRINTF("    z: 0x%08x\n", z);
+    return firstMatch(buf, z);
 }
 
 const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
@@ -360,41 +360,41 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
     return buf_end;
 }
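`fwdBlock2` above extends the scheme to two-byte classes: a second pair of tables tests byte i+1, its result is shifted back one byte (`rshiftbyte_m128(..., 1)`) and OR-combined with the first, so a position survives only if byte i hits class 1 and byte i+1 hits class 2. A scalar sketch of that combine, with the literal pair "ab" standing in for the two classes:

```c
/* Scalar rendering of the fwdBlock2 combine: OR of per-byte "failure"
 * flags, with the second flag shifted back by one position. */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    const char *buf = "xxaybzab";
    size_t n = strlen(buf);
    for (size_t i = 0; i + 1 < n; i++) {
        int fail1 = buf[i] != 'a';       /* byte i outside class 1   */
        int fail2 = buf[i + 1] != 'b';   /* byte i+1 outside class 2 */
        if ((fail1 | fail2) == 0) {      /* both hit -> pair match   */
            printf("pair match at offset %zu\n", i);   /* -> 6 */
            return 0;
        }
    }
    puts("no match");
    return 0;
}
```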
char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); #endif - return movemask256(eq256(t, compare)); -} - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - DEBUG_PRINTF("z 0x%08x\n", z); + return movemask256(eq256(t, compare)); +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + DEBUG_PRINTF("z 0x%08x\n", z); if (unlikely(z != 0xffffffff)) { u32 pos = ctz32(~z); assert(pos < 32); - DEBUG_PRINTF("match @ pos %u\n", pos); + DEBUG_PRINTF("match @ pos %u\n", pos); return buf + pos; } else { return NULL; // no match @@ -402,44 +402,44 @@ const u8 *firstMatch(const u8 *buf, u32 z) { } static really_inline -const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, - const m256 low4bits) { - // do the hi and lo shuffles in the one avx register - m256 c = combine2x128(rshift64_m128(chars, 4), chars); - c = and256(c, low4bits); - m256 c_shuf = pshufb_m256(mask, c); - m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); - // the upper 32-bits can't match - u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); - - return firstMatch(buf, z); -} - -static really_inline -const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const m256 low4bits) { - // run shufti over two overlapping 16-byte unaligned reads - const m256 mask = combine2x128(mask_hi, mask_lo); - m128 chars = loadu128(buf); - const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits); - if (rv) { - return rv; - } - - chars = loadu128(buf_end - 16); - rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits); - if (rv) { - return rv; - } - return buf_end; -} - -static really_inline +const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf = pshufb_m256(mask, c); + m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask = combine2x128(mask_hi, mask_lo); + m128 chars = loadu128(buf); + const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf_end - 16); + rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + return buf_end; +} + +static really_inline const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, const m256 low4bits, const m256 zeroes) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - return firstMatch(buf, z); + return firstMatch(buf, z); } /* takes 128 bit masks, but operates on 256 bits of data */ @@ -447,20 +447,20 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *buf_end) { 
@@ -447,20 +447,20 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                      const u8 *buf_end) {
     assert(buf && buf_end);
     assert(buf < buf_end);
-    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);
+    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);
 
     // Slow path for small cases.
-    if (buf_end - buf < 16) {
+    if (buf_end - buf < 16) {
         return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                              buf, buf_end);
     }
 
-    const m256 low4bits = set32x8(0xf);
-
-    if (buf_end - buf <= 32) {
-        return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
-    }
-
+    const m256 low4bits = set32x8(0xf);
+
+    if (buf_end - buf <= 32) {
+        return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits);
+    }
+
     const m256 zeroes = zeroes256();
     const m256 wide_mask_lo = set2x128(mask_lo);
     const m256 wide_mask_hi = set2x128(mask_hi);
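All the exec routines in this file share this tiering: a scalar path for tiny buffers, overlapping unaligned short reads up to one block, then whole aligned blocks with an overlapping unaligned mop-up at the end. The overlap is harmless because the scan is idempotent. A scalar sketch of the control flow, with a trivial byte loop standing in for the vector block:

```c
/* Overlapping-window scan pattern, scalar stand-in (BLOCK is 32 or 64
 * in the real code). */
#include <stdio.h>
#include <string.h>

#define BLOCK 8

static const char *block_find(const char *p, char target) {
    for (int i = 0; i < BLOCK; i++)
        if (p[i] == target)
            return p + i;
    return NULL;
}

int main(void) {
    const char *buf = "aaaaaaaaaaaaaaaaaaaZa";   /* 'Z' at offset 19 */
    const char *end = buf + strlen(buf);
    const char *p = buf, *rv;
    while (end - p >= BLOCK) {                   /* whole blocks */
        if ((rv = block_find(p, 'Z')))
            goto hit;
        p += BLOCK;
    }
    /* mop-up read may overlap the last whole block */
    if (p != end && (rv = block_find(end - BLOCK, 'Z')))
        goto hit;
    puts("no match");
    return 0;
hit:
    printf("match at offset %td\n", rv - buf);
    return 0;
}
```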
@@ -503,7 +503,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
 }
 
 static really_inline
-const u8 *lastMatch(const u8 *buf, u32 z) {
+const u8 *lastMatch(const u8 *buf, u32 z) {
     if (unlikely(z != 0xffffffff)) {
         u32 pos = clz32(~z);
         DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
@@ -516,8 +516,8 @@ const u8 *lastMatch(const u8 *buf, u32 z) {
 static really_inline
 const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
                    const m256 low4bits, const m256 zeroes) {
-    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
-    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
+    m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars));
+    m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars));
     m256 t = and256(c_lo, c_hi);
 
 #ifdef DEBUG
@@ -528,45 +528,45 @@ const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf,
     DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
 #endif
 
-    u32 z = movemask256(eq256(t, zeroes));
-    return lastMatch(buf, z);
-}
-
-static really_inline
-const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
-                        const m256 low4bits) {
-    // do the hi and lo shuffles in the one avx register
-    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
-    c = and256(c, low4bits);
-    m256 c_shuf = pshufb_m256(mask, c);
-    m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
-    // the upper 32-bits can't match
-    u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));
-
-    return lastMatch(buf, z);
-}
-
-static really_inline
-const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf,
-                         const u8 *buf_end, const m256 low4bits) {
-    // run shufti over two overlapping 16-byte unaligned reads
-    const m256 mask = combine2x128(mask_hi, mask_lo);
-
-    m128 chars = loadu128(buf_end - 16);
-    const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits);
-    if (rv) {
-        return rv;
-    }
-
-    chars = loadu128(buf);
-    rv = revBlockShort(mask, chars, buf, low4bits);
-    if (rv) {
-        return rv;
-    }
-    return buf - 1;
+    u32 z = movemask256(eq256(t, zeroes));
+    return lastMatch(buf, z);
 }
-
+static really_inline
+const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf,
+                        const m256 low4bits) {
+    // do the hi and lo shuffles in the one avx register
+    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
+    c = and256(c, low4bits);
+    m256 c_shuf = pshufb_m256(mask, c);
+    m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf));
+    // the upper 32-bits can't match
+    u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128()));
+
+    return lastMatch(buf, z);
+}
+
+static really_inline
+const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                         const u8 *buf_end, const m256 low4bits) {
+    // run shufti over two overlapping 16-byte unaligned reads
+    const m256 mask = combine2x128(mask_hi, mask_lo);
+
+    m128 chars = loadu128(buf_end - 16);
+    const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits);
+    if (rv) {
+        return rv;
+    }
+
+    chars = loadu128(buf);
+    rv = revBlockShort(mask, chars, buf, low4bits);
+    if (rv) {
+        return rv;
+    }
+    return buf - 1;
+}
+
+
 /* takes 128 bit masks, but operates on 256 bits of data */
 const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
                       const u8 *buf_end) {
@@ -574,17 +574,17 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
     assert(buf < buf_end);
 
     // Slow path for small cases.
-    if (buf_end - buf < 16) {
+    if (buf_end - buf < 16) {
         return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi,
                              buf, buf_end);
     }
 
-    const m256 low4bits = set32x8(0xf);
-
-    if (buf_end - buf <= 32) {
-        return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
-    }
-
+    const m256 low4bits = set32x8(0xf);
+
+    if (buf_end - buf <= 32) {
+        return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits);
+    }
+
     const m256 zeroes = zeroes256();
     const m256 wide_mask_lo = set2x128(mask_lo);
     const m256 wide_mask_hi = set2x128(mask_hi);
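The reverse scan's `lastMatch` above mirrors `firstMatch` with a count-leading-zeros: match bits are clear in z, so `~z` has 1s at matches and `31 - clz32(~z)` is the highest (right-most in memory) matching position. A tiny demonstration of the bit trick, using the gcc/clang builtin in place of Hyperscan's `clz32`:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t z = 0xffffbffe;      /* bytes 0 and 14 match (bits clear) */
    if (z != 0xffffffff) {
        uint32_t pos = (uint32_t)__builtin_clz(~z);
        printf("last match at byte %u\n", 31 - pos);   /* -> 14 */
    }
    return 0;
}
```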
@@ -630,8 +630,8 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,
     DEBUG_PRINTF("buf %p\n", buf);
     m256 chars_lo = GET_LO_4(chars);
     m256 chars_hi = GET_HI_4(chars);
-    m256 c_lo = pshufb_m256(mask1_lo, chars_lo);
-    m256 c_hi = pshufb_m256(mask1_hi, chars_hi);
+    m256 c_lo = pshufb_m256(mask1_lo, chars_lo);
+    m256 c_hi = pshufb_m256(mask1_hi, chars_hi);
     m256 t = or256(c_lo, c_hi);
 
 #ifdef DEBUG
@@ -642,71 +642,71 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi,
     DEBUG_PRINTF("     t: "); dumpMsk256(t); printf("\n");
 #endif
 
-    m256 c2_lo = pshufb_m256(mask2_lo, chars_lo);
-    m256 c2_hi = pshufb_m256(mask2_hi, chars_hi);
-    m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));
+    m256 c2_lo = pshufb_m256(mask2_lo, chars_lo);
+    m256 c2_hi = pshufb_m256(mask2_hi, chars_hi);
+    m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));
 
 #ifdef DEBUG
     DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n");
     DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n");
     DEBUG_PRINTF("    t2: "); dumpMsk256(t2); printf("\n");
 #endif
 
-    u32 z = movemask256(eq256(t2, ones));
-
-    return firstMatch(buf, z);
-}
-
-static really_inline
-const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
-                         const m256 low4bits) {
-    // do the hi and lo shuffles in the one avx register
-    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
-    c = and256(c, low4bits);
-    m256 c_shuf1 = pshufb_m256(mask1, c);
-    m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1);
-    m256 t0 = or256(c_shuf1, c_shuf2);
-    m128 t = or128(movdq_hi(t0), cast256to128(t0));
-    // the upper 32-bits can't match
-    u32 z = 0xffff0000U | movemask128(eq128(t, ones128()));
-
-    return firstMatch(buf, z);
-}
-
-static really_inline
-const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
-                            m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
-    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
-    const m256 low4bits = set32x8(0xf);
-    // run shufti over two overlapping 16-byte unaligned reads
-    const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
-    const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
-    m128 chars = loadu128(buf);
-    const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits);
-    if (rv) {
-        return rv;
-    }
+    u32 z = movemask256(eq256(t2, ones));
 
-    chars = loadu128(buf_end - 16);
-    rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits);
-    if (rv) {
-        return rv;
-    }
-    return buf_end;
+    return firstMatch(buf, z);
 }
 
+static really_inline
+const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf,
+                         const m256 low4bits) {
+    // do the hi and lo shuffles in the one avx register
+    m256 c = combine2x128(rshift64_m128(chars, 4), chars);
+    c = and256(c, low4bits);
+    m256 c_shuf1 = pshufb_m256(mask1, c);
+    m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1);
+    m256 t0 = or256(c_shuf1, c_shuf2);
+    m128 t = or128(movdq_hi(t0), cast256to128(t0));
+    // the upper 32-bits can't match
+    u32 z = 0xffff0000U | movemask128(eq128(t, ones128()));
+
+    return firstMatch(buf, z);
+}
+
+static really_inline
+const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo,
+                            m128 mask2_hi, const u8 *buf, const u8 *buf_end) {
+    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
+    const m256 low4bits = set32x8(0xf);
+    // run shufti over two overlapping 16-byte unaligned reads
+    const m256 mask1 = combine2x128(mask1_hi, mask1_lo);
+    const m256 mask2 = combine2x128(mask2_hi, mask2_lo);
+    m128 chars = loadu128(buf);
+    const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits);
+    if (rv) {
+        return rv;
+    }
+
+    chars = loadu128(buf_end - 16);
+    rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits);
+    if (rv) {
+        return rv;
+    }
+    return buf_end;
+}
+
 /* takes 128 bit masks, but operates on 256 bits of data */
 const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
                            m128 mask2_lo, m128 mask2_hi,
                            const u8 *buf, const u8 *buf_end) {
-    /* we should always have at least 16 bytes */
-    assert(buf_end - buf >= 16);
-    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
-
+    /* we should always have at least 16 bytes */
+    assert(buf_end - buf >= 16);
+    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
+
     if (buf_end - buf < 32) {
-        return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf,
-                                 buf_end);
+        return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf,
+                                 buf_end);
     }
-
+
     const m256 ones = ones256();
     const m256 low4bits = set32x8(0xf);
     const m256 wide_mask1_lo = set2x128(mask1_lo);
@@ -751,347 +751,347 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
     return buf_end;
 }
 
-#else // defined(HAVE_AVX512)
-
-#ifdef DEBUG
-DUMP_MSK(512)
-#endif
-
-static really_inline
-u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits,
-           const m512 compare) {
-    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
-    m512 c_hi = pshufb_m512(mask_hi,
-                            rshift64_m512(andnot512(low4bits, chars), 4));
-    m512 t = and512(c_lo, c_hi);
-
-#ifdef DEBUG
-    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
-    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
-    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
-    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
-    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
-#endif
-
-    return eq512mask(t, compare);
-}
-static really_inline
-const u8 *firstMatch64(const u8 *buf, u64a z) {
-    DEBUG_PRINTF("z 0x%016llx\n", z);
-    if (unlikely(z != ~0ULL)) {
-        u32 pos = ctz64(~z);
-        DEBUG_PRINTF("match @ pos %u\n", pos);
-        assert(pos < 64);
-        return buf + pos;
-    } else {
-        return NULL; // no match
-    }
-}
-
-static really_inline
-const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
-                      const m512 low4bits, const m512 zeroes) {
-    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
-
-    return firstMatch64(buf, z);
-}
-
-static really_inline
-const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
-                         const u8 *buf_end, const m512 low4bits,
-                         const m512 zeroes) {
-    DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf);
-    uintptr_t len = buf_end - buf;
-    assert(len <= 64);
-
-    // load mask
-    u64a k = (~0ULL) >> (64 - len);
-    DEBUG_PRINTF("load mask 0x%016llx\n", k);
-
-    m512 chars = loadu_maskz_m512(k, buf);
-
-    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
-
-    // reuse the load mask to indicate valid bytes
-    return firstMatch64(buf, z | ~k);
-}
-
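`shortShufti512` shows the AVX-512 refinement: a k-mask both limits the unaligned load to the valid `[buf, buf_end)` range and marks which result bits are meaningful. A hedged standalone demo of the same masked-load idea (Hyperscan's `loadu_maskz_m512` presumably wraps `_mm512_maskz_loadu_epi8`; the match sense here is simplified to "bit set = match", whereas the real code inverts it and ORs in `~k`):

```c
/* Compile with: cc -mavx512bw -O2 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const char buf[] = "only twenty bytes!!!";
    uint64_t len = 20;
    __mmask64 k = (~0ULL) >> (64 - len);   /* low `len` bits set */
    /* lanes beyond the mask are zeroed, so nothing past buf_end is read */
    __m512i chars = _mm512_maskz_loadu_epi8(k, buf);
    uint64_t z = _mm512_cmpeq_epi8_mask(chars, _mm512_set1_epi8('y'));
    z &= k;                                /* ignore lanes past len */
    if (z)
        printf("first 'y' at offset %d\n", __builtin_ctzll(z));  /* -> 3 */
    return 0;
}
```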
-/* takes 128 bit masks, but operates on 512 bits of data */
-const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
-                     const u8 *buf_end) {
-    assert(buf && buf_end);
-    assert(buf < buf_end);
-    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);
-    DEBUG_PRINTF("b %s\n", buf);
-
-    const m512 low4bits = set64x8(0xf);
-    const m512 zeroes = zeroes512();
-    const m512 wide_mask_lo = set4x128(mask_lo);
-    const m512 wide_mask_hi = set4x128(mask_hi);
-    const u8 *rv;
-
-    // small cases.
-    if (buf_end - buf <= 64) {
-        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
-                            zeroes);
-        return rv ? rv : buf_end;
-    }
-
-    assert(buf_end - buf >= 64);
-
-    // Preconditioning: most of the time our buffer won't be aligned.
-    if ((uintptr_t)buf % 64) {
-        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf,
-                            ROUNDUP_PTR(buf, 64), low4bits, zeroes);
-        if (rv) {
-            return rv;
-        }
-        buf = ROUNDUP_PTR(buf, 64);
-    }
-
-    const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64);
-    while (buf < last_block) {
-        m512 lchars = load512(buf);
-        rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits,
-                         zeroes);
-        if (rv) {
-            return rv;
-        }
-        buf += 64;
-    }
-
-    if (buf == buf_end) {
-        goto done;
-    }
-
-    // Use an unaligned load to mop up the last 64 bytes and get an accurate
-    // picture to buf_end.
-    assert(buf <= buf_end && buf >= buf_end - 64);
-    m512 chars = loadu512(buf_end - 64);
-    rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits,
-                     zeroes);
-    if (rv) {
-        return rv;
-    }
-done:
-    return buf_end;
-}
-
-static really_inline
-const u8 *lastMatch64(const u8 *buf, u64a z) {
-    DEBUG_PRINTF("z 0x%016llx\n", z);
-    if (unlikely(z != ~0ULL)) {
-        u32 pos = clz64(~z);
-        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
-        return buf + (63 - pos);
-    } else {
-        return NULL; // no match
-    }
-}
-
-static really_inline
-const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
-                          const u8 *buf_end, const m512 low4bits,
-                          const m512 zeroes) {
-    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
-    uintptr_t len = buf_end - buf;
-    assert(len <= 64);
-
-    // load mask
-    u64a k = (~0ULL) >> (64 - len);
-    DEBUG_PRINTF("load mask 0x%016llx\n", k);
-
-    m512 chars = loadu_maskz_m512(k, buf);
-
-    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
-
-    // reuse the load mask to indicate valid bytes
-    return lastMatch64(buf, z | ~k);
-}
-
-static really_inline
-const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
-                      const m512 low4bits, const m512 zeroes) {
-    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
-    m512 c_hi = pshufb_m512(mask_hi,
-                            rshift64_m512(andnot512(low4bits, chars), 4));
-    m512 t = and512(c_lo, c_hi);
-
-#ifdef DEBUG
-    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
-    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
-    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
-    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
-    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
-#endif
-
-    u64a z = eq512mask(t, zeroes);
-    return lastMatch64(buf, z);
-}
-
%zu\n", buf, buf_end - buf); - - const m512 ones = ones512(); - const m512 low4bits = set64x8(0xf); - const m512 wide_mask1_lo = set4x128(mask1_lo); - const m512 wide_mask1_hi = set4x128(mask1_hi); - const m512 wide_mask2_lo = set4x128(mask2_lo); - const m512 wide_mask2_hi = set4x128(mask2_hi); - const u8 *rv; - - if (buf_end - buf <= 64) { - rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, buf, buf_end, low4bits, ones); - DEBUG_PRINTF("rv %p\n", rv); - return rv ? rv : buf_end; - } - - // Preconditioning: most of the time our buffer won't be aligned. - if ((uintptr_t)buf % 64) { - rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64), - low4bits, ones); - if (rv) { - return rv; - } - - buf = ROUNDUP_PTR(buf, 64); - } - - const u8 *last_block = buf_end - 64; - while (buf < last_block) { - m512 lchars = load512(buf); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, - wide_mask2_hi, lchars, buf, low4bits, ones, ~0); - if (rv) { - return rv; - } - buf += 64; - } - - // Use an unaligned load to mop up the last 64 bytes and get an accurate - // picture to buf_end. - m512 chars = loadu512(buf_end - 64); - rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, - chars, buf_end - 64, low4bits, ones, ~0); - if (rv) { - return rv; - } - - return buf_end; -} -#endif +#else // defined(HAVE_AVX512) + +#ifdef DEBUG +DUMP_MSK(512) +#endif + +static really_inline +u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, + const m512 compare) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + return eq512mask(t, compare); +} +static really_inline +const u8 *firstMatch64(const u8 *buf, u64a z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = ctz64(~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 64); + return buf + pos; + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + return firstMatch64(buf, z); +} + +static really_inline +const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { + DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + // load mask + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_maskz_m512(k, buf); + + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + // reuse the load mask to indicate valid bytes + return firstMatch64(buf, z | ~k); +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const m512 low4bits = 
-static really_inline
-const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi,
-                    m512 chars, const u8 *buf, const m512 low4bits,
-                    const m512 ones, __mmask64 k) {
-    DEBUG_PRINTF("buf %p %.64s\n", buf, buf);
-    m512 chars_lo = and512(chars, low4bits);
-    m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4);
-    m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo);
-    m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi);
-    m512 t = or512(c_lo, c_hi);
-
-#ifdef DEBUG
-    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
-    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
-    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
-    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
-    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
-#endif
-
-    m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo);
-    m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi);
-    m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1));
-
-#ifdef DEBUG
-    DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n");
-    DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n");
-    DEBUG_PRINTF("    t2: "); dumpMsk512(t2); printf("\n");
-#endif
-    u64a z = eq512mask(t2, ones);
-
-    return firstMatch64(buf, z | ~k);
-}
-
-static really_inline
-const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo,
-                               m512 mask2_hi, const u8 *buf, const u8 *buf_end,
-                               const m512 low4bits, const m512 ones) {
-    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
-    uintptr_t len = buf_end - buf;
-    assert(len <= 64);
-
-    u64a k = (~0ULL) >> (64 - len);
-    DEBUG_PRINTF("load mask 0x%016llx\n", k);
-
-    m512 chars = loadu_mask_m512(ones, k, buf);
-
-    const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf,
-                             low4bits, ones, k);
-
-    return rv;
-}
-
-/* takes 128 bit masks, but operates on 512 bits of data */
-const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
-                           m128 mask2_lo, m128 mask2_hi,
-                           const u8 *buf, const u8 *buf_end) {
-    /* we should always have at least 16 bytes */
-    assert(buf_end - buf >= 16);
-    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
-
-    const m512 ones = ones512();
-    const m512 low4bits = set64x8(0xf);
-    const m512 wide_mask1_lo = set4x128(mask1_lo);
-    const m512 wide_mask1_hi = set4x128(mask1_hi);
-    const m512 wide_mask2_lo = set4x128(mask2_lo);
-    const m512 wide_mask2_hi = set4x128(mask2_hi);
-    const u8 *rv;
-
-    if (buf_end - buf <= 64) {
-        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
-                                  wide_mask2_hi, buf, buf_end, low4bits, ones);
-        DEBUG_PRINTF("rv %p\n", rv);
-        return rv ? rv : buf_end;
-    }
-
-    // Preconditioning: most of the time our buffer won't be aligned.
-    if ((uintptr_t)buf % 64) {
-        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
-                                  wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64),
-                                  low4bits, ones);
-        if (rv) {
-            return rv;
-        }
-
-        buf = ROUNDUP_PTR(buf, 64);
-    }
-
-    const u8 *last_block = buf_end - 64;
-    while (buf < last_block) {
-        m512 lchars = load512(buf);
-        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
-                       wide_mask2_hi, lchars, buf, low4bits, ones, ~0);
-        if (rv) {
-            return rv;
-        }
-        buf += 64;
-    }
-
-    // Use an unaligned load to mop up the last 64 bytes and get an accurate
-    // picture to buf_end.
-    m512 chars = loadu512(buf_end - 64);
-    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
-                   chars, buf_end - 64, low4bits, ones, ~0);
-    if (rv) {
-        return rv;
-    }
-
-    return buf_end;
-}
-#endif
+#else // defined(HAVE_AVX512)
+
+#ifdef DEBUG
+DUMP_MSK(512)
+#endif
+
+static really_inline
+u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits,
+           const m512 compare) {
+    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
+    m512 c_hi = pshufb_m512(mask_hi,
+                            rshift64_m512(andnot512(low4bits, chars), 4));
+    m512 t = and512(c_lo, c_hi);
+
+#ifdef DEBUG
+    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
+    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
+    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
+    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
+    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
+#endif
+
+    return eq512mask(t, compare);
+}
+static really_inline
+const u8 *firstMatch64(const u8 *buf, u64a z) {
+    DEBUG_PRINTF("z 0x%016llx\n", z);
+    if (unlikely(z != ~0ULL)) {
+        u32 pos = ctz64(~z);
+        DEBUG_PRINTF("match @ pos %u\n", pos);
+        assert(pos < 64);
+        return buf + pos;
+    } else {
+        return NULL; // no match
+    }
+}
+
+static really_inline
+const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
+                      const m512 low4bits, const m512 zeroes) {
+    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
+
+    return firstMatch64(buf, z);
+}
+
+static really_inline
+const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
+                         const u8 *buf_end, const m512 low4bits,
+                         const m512 zeroes) {
+    DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf);
+    uintptr_t len = buf_end - buf;
+    assert(len <= 64);
+
+    // load mask
+    u64a k = (~0ULL) >> (64 - len);
+    DEBUG_PRINTF("load mask 0x%016llx\n", k);
+
+    m512 chars = loadu_maskz_m512(k, buf);
+
+    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
+
+    // reuse the load mask to indicate valid bytes
+    return firstMatch64(buf, z | ~k);
+}
+
+/* takes 128 bit masks, but operates on 512 bits of data */
+const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                     const u8 *buf_end) {
+    assert(buf && buf_end);
+    assert(buf < buf_end);
+    DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf);
+    DEBUG_PRINTF("b %s\n", buf);
+
+    const m512 low4bits = set64x8(0xf);
+    const m512 zeroes = zeroes512();
+    const m512 wide_mask_lo = set4x128(mask_lo);
+    const m512 wide_mask_hi = set4x128(mask_hi);
+    const u8 *rv;
+
+    // small cases.
+    if (buf_end - buf <= 64) {
+        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
+                            zeroes);
+        return rv ? rv : buf_end;
+    }
+
+    assert(buf_end - buf >= 64);
+
+    // Preconditioning: most of the time our buffer won't be aligned.
+    if ((uintptr_t)buf % 64) {
+        rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf,
+                            ROUNDUP_PTR(buf, 64), low4bits, zeroes);
+        if (rv) {
+            return rv;
+        }
+        buf = ROUNDUP_PTR(buf, 64);
+    }
+
+    const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64);
+    while (buf < last_block) {
+        m512 lchars = load512(buf);
+        rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits,
+                         zeroes);
+        if (rv) {
+            return rv;
+        }
+        buf += 64;
+    }
+
+    if (buf == buf_end) {
+        goto done;
+    }
+
+    // Use an unaligned load to mop up the last 64 bytes and get an accurate
+    // picture to buf_end.
+    assert(buf <= buf_end && buf >= buf_end - 64);
+    m512 chars = loadu512(buf_end - 64);
+    rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits,
+                     zeroes);
+    if (rv) {
+        return rv;
+    }
+done:
+    return buf_end;
+}
+
+static really_inline
+const u8 *lastMatch64(const u8 *buf, u64a z) {
+    DEBUG_PRINTF("z 0x%016llx\n", z);
+    if (unlikely(z != ~0ULL)) {
+        u32 pos = clz64(~z);
+        DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos);
+        return buf + (63 - pos);
+    } else {
+        return NULL; // no match
+    }
+}
+
+static really_inline
+const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf,
+                          const u8 *buf_end, const m512 low4bits,
+                          const m512 zeroes) {
+    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
+    uintptr_t len = buf_end - buf;
+    assert(len <= 64);
+
+    // load mask
+    u64a k = (~0ULL) >> (64 - len);
+    DEBUG_PRINTF("load mask 0x%016llx\n", k);
+
+    m512 chars = loadu_maskz_m512(k, buf);
+
+    u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes);
+
+    // reuse the load mask to indicate valid bytes
+    return lastMatch64(buf, z | ~k);
+}
+
+static really_inline
+const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf,
+                      const m512 low4bits, const m512 zeroes) {
+    m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits));
+    m512 c_hi = pshufb_m512(mask_hi,
+                            rshift64_m512(andnot512(low4bits, chars), 4));
+    m512 t = and512(c_lo, c_hi);
+
+#ifdef DEBUG
+    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
+    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
+    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
+    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
+    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
+#endif
+
+    u64a z = eq512mask(t, zeroes);
+    return lastMatch64(buf, z);
+}
+
+/* takes 128 bit masks, but operates on 512 bits of data */
+const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+                      const u8 *buf_end) {
+    DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end);
+    assert(buf && buf_end);
+    assert(buf < buf_end);
+
+    const m512 low4bits = set64x8(0xf);
+    const m512 zeroes = zeroes512();
+    const m512 wide_mask_lo = set4x128(mask_lo);
+    const m512 wide_mask_hi = set4x128(mask_hi);
+    const u8 *rv;
+
+    if (buf_end - buf < 64) {
+        rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits,
+                             zeroes);
+        return rv ? rv : buf - 1;
+    }
+
+    if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) {
+        // peel off unaligned portion
+        assert(buf_end - buf >= 64);
+        DEBUG_PRINTF("start\n");
+        rv = rshortShufti512(wide_mask_lo, wide_mask_hi,
+                             ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits,
+                             zeroes);
+        if (rv) {
+            return rv;
+        }
+        buf_end = ROUNDDOWN_PTR(buf_end, 64);
+    }
+
+    const u8 *last_block = ROUNDUP_PTR(buf, 64);
+    while (buf_end > last_block) {
+        buf_end -= 64;
+        m512 lchars = load512(buf_end);
+        rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits,
+                         zeroes);
+        if (rv) {
+            return rv;
+        }
+    }
+    if (buf_end == buf) {
+        goto done;
+    }
+    // Use an unaligned load to mop up the last 64 bytes and get an accurate
+    // picture to buf.
+    m512 chars = loadu512(buf);
+    rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes);
+    if (rv) {
+        return rv;
+    }
+done:
+    return buf - 1;
+}
+
+static really_inline
+const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi,
+                    m512 chars, const u8 *buf, const m512 low4bits,
+                    const m512 ones, __mmask64 k) {
+    DEBUG_PRINTF("buf %p %.64s\n", buf, buf);
+    m512 chars_lo = and512(chars, low4bits);
+    m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4);
+    m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo);
+    m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi);
+    m512 t = or512(c_lo, c_hi);
+
+#ifdef DEBUG
+    DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n");
+    DEBUG_PRINTF("  char: "); dumpMsk512(chars); printf("\n");
+    DEBUG_PRINTF("  c_lo: "); dumpMsk512(c_lo); printf("\n");
+    DEBUG_PRINTF("  c_hi: "); dumpMsk512(c_hi); printf("\n");
+    DEBUG_PRINTF("     t: "); dumpMsk512(t); printf("\n");
+#endif
+
+    m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo);
+    m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi);
+    m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1));
+
+#ifdef DEBUG
+    DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n");
+    DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n");
+    DEBUG_PRINTF("    t2: "); dumpMsk512(t2); printf("\n");
+#endif
+    u64a z = eq512mask(t2, ones);
+
+    return firstMatch64(buf, z | ~k);
+}
+
+static really_inline
+const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo,
+                               m512 mask2_hi, const u8 *buf, const u8 *buf_end,
+                               const m512 low4bits, const m512 ones) {
+    DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf);
+    uintptr_t len = buf_end - buf;
+    assert(len <= 64);
+
+    u64a k = (~0ULL) >> (64 - len);
+    DEBUG_PRINTF("load mask 0x%016llx\n", k);
+
+    m512 chars = loadu_mask_m512(ones, k, buf);
+
+    const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf,
+                             low4bits, ones, k);
+
+    return rv;
+}
+
+/* takes 128 bit masks, but operates on 512 bits of data */
+const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
+                           m128 mask2_lo, m128 mask2_hi,
+                           const u8 *buf, const u8 *buf_end) {
+    /* we should always have at least 16 bytes */
+    assert(buf_end - buf >= 16);
+    DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf);
+
+    const m512 ones = ones512();
+    const m512 low4bits = set64x8(0xf);
+    const m512 wide_mask1_lo = set4x128(mask1_lo);
+    const m512 wide_mask1_hi = set4x128(mask1_hi);
+    const m512 wide_mask2_lo = set4x128(mask2_lo);
+    const m512 wide_mask2_hi = set4x128(mask2_hi);
+    const u8 *rv;
+
+    if (buf_end - buf <= 64) {
+        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
+                                  wide_mask2_hi, buf, buf_end, low4bits, ones);
+        DEBUG_PRINTF("rv %p\n", rv);
+        return rv ? rv : buf_end;
+    }
+
+    // Preconditioning: most of the time our buffer won't be aligned.
+    if ((uintptr_t)buf % 64) {
+        rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
+                                  wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64),
+                                  low4bits, ones);
+        if (rv) {
+            return rv;
+        }
+
+        buf = ROUNDUP_PTR(buf, 64);
+    }
+
+    const u8 *last_block = buf_end - 64;
+    while (buf < last_block) {
+        m512 lchars = load512(buf);
+        rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo,
+                       wide_mask2_hi, lchars, buf, low4bits, ones, ~0);
+        if (rv) {
+            return rv;
+        }
+        buf += 64;
+    }
+
+    // Use an unaligned load to mop up the last 64 bytes and get an accurate
+    // picture to buf_end.
+    m512 chars = loadu512(buf_end - 64);
+    rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi,
+                   chars, buf_end - 64, low4bits, ones, ~0);
+    if (rv) {
+        return rv;
+    }
+
+    return buf_end;
+}
+#endif
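A note on the return conventions visible throughout the diff: forward scans return `buf_end` when nothing matches and reverse scans return `buf - 1`, so callers compare against a sentinel rather than checking NULL. A minimal usage sketch with a scalar stand-in (`shufti_fwd` is ours, mirroring `shuftiFwdSlow`, not the real `shuftiExec`):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const uint8_t *shufti_fwd(const uint8_t *lo, const uint8_t *hi,
                                 const uint8_t *buf, const uint8_t *buf_end) {
    for (; buf < buf_end; ++buf) {
        if (lo[*buf & 0xf] & hi[*buf >> 4]) {
            return buf;        /* first matching byte */
        }
    }
    return buf_end;            /* sentinel: no match */
}

int main(void) {
    uint8_t lo[16] = {0}, hi[16] = {0};
    lo['!' & 0xf] |= 1;        /* class = {'!'} in bucket 0 */
    hi['!' >> 4] |= 1;
    const uint8_t *buf = (const uint8_t *)"no bang here";
    const uint8_t *end = buf + strlen((const char *)buf);
    const uint8_t *rv = shufti_fwd(lo, hi, buf, end);
    if (rv == end) {
        puts("no match");
    } else {
        printf("match at offset %td\n", rv - buf);
    }
    return 0;
}
```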