path: root/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h
author: Ivan Blinkov <ivan@blinkov.ru> 2022-02-10 16:47:11 +0300
committer: Daniil Cherednik <dcherednik@yandex-team.ru> 2022-02-10 16:47:11 +0300
commit: 5b283123c882433dafbaf6b338adeea16c1a0ea0 (patch)
tree: 339adc63bce23800021202ae4a8328a843dc447a /contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h
parent: 1aeb9a455974457866f78722ad98114bafc84e8a (diff)
download: ydb-5b283123c882433dafbaf6b338adeea16c1a0ea0.tar.gz
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h')
-rw-r--r--  contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h  886
1 file changed, 443 insertions, 443 deletions
diff --git a/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h b/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h
index 692761c5e6..b76800eb04 100644
--- a/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h
+++ b/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h
@@ -1,50 +1,50 @@
-/*
+/*
* Copyright (c) 2016-2020, Intel Corporation
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/** \file
- * \brief Teddy literal matcher: common runtime procedures.
- */
-
-#ifndef TEDDY_RUNTIME_COMMON_H_
-#define TEDDY_RUNTIME_COMMON_H_
-
-#include "fdr_confirm.h"
-#include "fdr_confirm_runtime.h"
-#include "ue2common.h"
-#include "util/bitutils.h"
-#include "util/simd_utils.h"
-#include "util/uniform_ops.h"
-
-extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
-#if defined(HAVE_AVX2)
-extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
-#endif
-
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: common runtime procedures.
+ */
+
+#ifndef TEDDY_RUNTIME_COMMON_H_
+#define TEDDY_RUNTIME_COMMON_H_
+
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/simd_utils.h"
+#include "util/uniform_ops.h"
+
+extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
+#if defined(HAVE_AVX2)
+extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
+#endif
+
#if defined(HAVE_AVX512VBMI)
static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -55,405 +55,405 @@ static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
};
#endif
-#ifdef ARCH_64_BIT
-#define TEDDY_CONF_TYPE u64a
-#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
-#else
-#define TEDDY_CONF_TYPE u32
-#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
-#endif
-
-#define CHECK_HWLM_TERMINATE_MATCHING \
-do { \
- if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
- return HWLM_TERMINATED; \
- } \
-} while (0);
-
-#define CHECK_FLOOD \
-do { \
- if (unlikely(ptr > tryFloodDetect)) { \
- tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \
- &floodBackoff, &control, iterBytes); \
- CHECK_HWLM_TERMINATE_MATCHING; \
- } \
-} while (0);
-
-/*
- * \brief Copy a block of [0,15] bytes efficiently.
- *
- * This function is a workaround intended to stop some compilers from
- * synthesizing a memcpy function call out of the copy of a small number of
- * bytes that we do in vectoredLoad128.
- */
-static really_inline
-void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
- switch (len) {
- case 0:
- break;
- case 1:
- *dst = *src;
- break;
- case 2:
- unaligned_store_u16(dst, unaligned_load_u16(src));
- break;
- case 3:
- unaligned_store_u16(dst, unaligned_load_u16(src));
- dst[2] = src[2];
- break;
- case 4:
- unaligned_store_u32(dst, unaligned_load_u32(src));
- break;
- case 5:
- case 6:
- case 7:
- /* Perform copy with two overlapping 4-byte chunks. */
- unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
- unaligned_store_u32(dst, unaligned_load_u32(src));
- break;
- case 8:
- unaligned_store_u64a(dst, unaligned_load_u64a(src));
- break;
- default:
- /* Perform copy with two overlapping 8-byte chunks. */
- assert(len < 16);
- unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
- unaligned_store_u64a(dst, unaligned_load_u64a(src));
- break;
- }
-}
-
-// Note: p_mask is an output param that initialises a poison mask.
-// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
-// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
-// then 0xff for the remaining bytes.
-// ptr >= lo:
-// no history.
-// for end/short zone, ptr==lo and start_offset==0
-// for start zone, see below
-// lo ptr hi hi
-// |----------|-------|----------------|............|
-// -start 0 -start+offset MIN(avail,16)
-// p_mask ffff..ff0000...........00ffff..........
-// ptr < lo:
-// only start zone.
-// history
-// ptr lo hi hi
-// |----------|-------|----------------|............|
-// 0 start start+offset end(<=16)
-// p_mask ffff.....ffffff..ff0000...........00ffff..........
-static really_inline
-m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
- const u8 *lo, const u8 *hi,
- const u8 *buf_history, size_t len_history,
- const u32 nMasks) {
- union {
- u8 val8[16];
- m128 val128;
- } u;
- u.val128 = zeroes128();
-
- uintptr_t copy_start;
- uintptr_t copy_len;
-
- if (ptr >= lo) { // short/end/start zone
- uintptr_t start = (uintptr_t)(ptr - lo);
- uintptr_t avail = (uintptr_t)(hi - ptr);
- if (avail >= 16) {
- assert(start_offset - start <= 16);
- *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
- + 16 - start_offset + start);
- return loadu128(ptr);
- }
- assert(start_offset - start <= avail);
- *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
- + 16 - start_offset + start);
- copy_start = 0;
- copy_len = avail;
- } else { // start zone
- uintptr_t need = MIN((uintptr_t)(lo - ptr),
- MIN(len_history, nMasks - 1));
- uintptr_t start = (uintptr_t)(lo - ptr);
- uintptr_t i;
- for (i = start - need; i < start; i++) {
- u.val8[i] = buf_history[len_history - (start - i)];
- }
- uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
- assert(start + start_offset <= end);
- *p_mask = loadu128(p_mask_arr[end - start - start_offset]
- + 16 - start - start_offset);
- copy_start = start;
- copy_len = end - start;
- }
-
- // Runt block from the buffer.
- copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);
-
- return u.val128;
-}
-
-#if defined(HAVE_AVX2)
-/*
- * \brief Copy a block of [0,31] bytes efficiently.
- *
- * This function is a workaround intended to stop some compilers from
- * synthesizing a memcpy function call out of the copy of a small number of
- * bytes that we do in vectoredLoad256.
- */
-static really_inline
-void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
- switch (len) {
- case 0:
- break;
- case 1:
- *dst = *src;
- break;
- case 2:
- unaligned_store_u16(dst, unaligned_load_u16(src));
- break;
- case 3:
- unaligned_store_u16(dst, unaligned_load_u16(src));
- dst[2] = src[2];
- break;
- case 4:
- unaligned_store_u32(dst, unaligned_load_u32(src));
- break;
- case 5:
- case 6:
- case 7:
- /* Perform copy with two overlapping 4-byte chunks. */
- unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
- unaligned_store_u32(dst, unaligned_load_u32(src));
- break;
- case 8:
- unaligned_store_u64a(dst, unaligned_load_u64a(src));
- break;
- case 9:
- case 10:
- case 11:
- case 12:
- case 13:
- case 14:
- case 15:
- /* Perform copy with two overlapping 8-byte chunks. */
- unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
- unaligned_store_u64a(dst, unaligned_load_u64a(src));
- break;
- case 16:
- storeu128(dst, loadu128(src));
- break;
- default:
- /* Perform copy with two overlapping 16-byte chunks. */
- assert(len < 32);
- storeu128(dst + len - 16, loadu128(src + len - 16));
- storeu128(dst, loadu128(src));
- break;
- }
-}
-
-// Note: p_mask is an output param that initialises a poison mask.
-// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
-// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
-// then 0xff for the remaining bytes.
-// ptr >= lo:
-// no history.
-// for end/short zone, ptr==lo and start_offset==0
-// for start zone, see below
-// lo ptr hi hi
-// |----------|-------|----------------|............|
-// -start 0 -start+offset MIN(avail,32)
-// p_mask ffff..ff0000...........00ffff..........
-// ptr < lo:
-// only start zone.
-// history
-// ptr lo hi hi
-// |----------|-------|----------------|............|
-// 0 start start+offset end(<=32)
-// p_mask ffff.....ffffff..ff0000...........00ffff..........
-static really_inline
-m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
- const u8 *lo, const u8 *hi,
- const u8 *buf_history, size_t len_history,
- const u32 nMasks) {
- union {
- u8 val8[32];
- m256 val256;
- } u;
- u.val256 = zeroes256();
-
- uintptr_t copy_start;
- uintptr_t copy_len;
-
- if (ptr >= lo) { // short/end/start zone
- uintptr_t start = (uintptr_t)(ptr - lo);
- uintptr_t avail = (uintptr_t)(hi - ptr);
- if (avail >= 32) {
- assert(start_offset - start <= 32);
- *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
- + 32 - start_offset + start);
- return loadu256(ptr);
- }
- assert(start_offset - start <= avail);
- *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
- + 32 - start_offset + start);
- copy_start = 0;
- copy_len = avail;
- } else { //start zone
- uintptr_t need = MIN((uintptr_t)(lo - ptr),
- MIN(len_history, nMasks - 1));
- uintptr_t start = (uintptr_t)(lo - ptr);
- uintptr_t i;
- for (i = start - need; i < start; i++) {
- u.val8[i] = buf_history[len_history - (start - i)];
- }
- uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
- assert(start + start_offset <= end);
- *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
- + 32 - start - start_offset);
- copy_start = start;
- copy_len = end - start;
- }
-
- // Runt block from the buffer.
- copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
-
- return u.val256;
-}
-#endif // HAVE_AVX2
-
-#if defined(HAVE_AVX512)
-// Note: p_mask is an output param that initialises a poison mask.
-// u64a k = ones_u64a << n' >> m'; // m' < n'
-// *p_mask = set_mask_m512(~k);
-// means p_mask consists of:
-// (n' - m') poison bytes "0xff" at the beginning,
-// followed by (64 - n') valid bytes "0x00",
-// then m' poison bytes "0xff" at the end.
-// ptr >= lo:
-// no history.
-// for end/short zone, ptr==lo and start_offset==0
-// for start zone, see below
-// lo ptr hi hi
-// |----------|-------|----------------|............|
-// -start 0 -start+offset MIN(avail,64)
-// p_mask ffff..ff0000...........00ffff..........
-// ptr < lo:
-// only start zone.
-// history
-// ptr lo hi hi
-// |----------|-------|----------------|............|
-// 0 start start+offset end(<=64)
-// p_mask ffff.....ffffff..ff0000...........00ffff..........
-static really_inline
-m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
- const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
- const u32 nMasks) {
- m512 val;
-
- uintptr_t copy_start;
- uintptr_t copy_len;
-
- if (ptr >= lo) { // short/end/start zone
- uintptr_t start = (uintptr_t)(ptr - lo);
- uintptr_t avail = (uintptr_t)(hi - ptr);
- if (avail >= 64) {
- assert(start_offset - start <= 64);
- u64a k = ones_u64a << (start_offset - start);
- *p_mask = set_mask_m512(~k);
- return loadu512(ptr);
- }
- assert(start_offset - start <= avail);
- u64a k = ones_u64a << (64 - avail + start_offset - start)
- >> (64 - avail);
- *p_mask = set_mask_m512(~k);
- copy_start = 0;
- copy_len = avail;
- } else { //start zone
- uintptr_t need = MIN((uintptr_t)(lo - ptr),
- MIN(hlen, nMasks - 1));
- uintptr_t start = (uintptr_t)(lo - ptr);
- u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need);
- val = loadu_maskz_m512(j, &hbuf[hlen - start]);
- uintptr_t end = MIN(64, (uintptr_t)(hi - ptr));
- assert(start + start_offset <= end);
- u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end);
- *p_mask = set_mask_m512(~k);
- copy_start = start;
- copy_len = end - start;
- }
-
- assert(copy_len < 64);
- assert(copy_len > 0);
- u64a j = ones_u64a >> (64 - copy_len) << copy_start;
- val = loadu_mask_m512(val, j, ptr);
-
- return val;
-}
-#endif // HAVE_AVX512
-
-static really_inline
-u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
+#ifdef ARCH_64_BIT
+#define TEDDY_CONF_TYPE u64a
+#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf)
+#else
+#define TEDDY_CONF_TYPE u32
+#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf)
+#endif
+
+#define CHECK_HWLM_TERMINATE_MATCHING \
+do { \
+ if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
+ return HWLM_TERMINATED; \
+ } \
+} while (0);
+
+#define CHECK_FLOOD \
+do { \
+ if (unlikely(ptr > tryFloodDetect)) { \
+ tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \
+ &floodBackoff, &control, iterBytes); \
+ CHECK_HWLM_TERMINATE_MATCHING; \
+ } \
+} while (0);
+
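
The two macros above bundle an early-return check into something that can be dropped between statements; note that, unlike the textbook form, they keep the terminating semicolon inside the definition, so each expands to a complete statement on its own. A minimal standalone sketch of the underlying do { ... } while (0) idiom, with invented names (DEMO_TERMINATE, scan_demo) standing in for the HWLM machinery:

    #include <stdio.h>

    /* Invented stand-in for CHECK_HWLM_TERMINATE_MATCHING: a
     * multi-statement body wrapped in do/while(0) so it behaves as a
     * single statement at the expansion site. */
    #define DEMO_TERMINATE(control) \
    do {                            \
        if ((control) == 1) {       \
            return 0;               \
        }                           \
    } while (0)

    static int scan_demo(int control) {
        DEMO_TERMINATE(control); /* early out, like HWLM_TERMINATED */
        printf("still scanning\n");
        return 1;
    }

    int main(void) {
        printf("-> %d\n", scan_demo(1)); /* -> 0 */
        printf("-> %d\n", scan_demo(0)); /* prints, then -> 1 */
        return 0;
    }
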
+/*
+ * \brief Copy a block of [0,15] bytes efficiently.
+ *
+ * This function is a workaround intended to stop some compilers from
+ * synthesizing a memcpy function call out of the copy of a small number of
+ * bytes that we do in vectoredLoad128.
+ */
+static really_inline
+void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
+ switch (len) {
+ case 0:
+ break;
+ case 1:
+ *dst = *src;
+ break;
+ case 2:
+ unaligned_store_u16(dst, unaligned_load_u16(src));
+ break;
+ case 3:
+ unaligned_store_u16(dst, unaligned_load_u16(src));
+ dst[2] = src[2];
+ break;
+ case 4:
+ unaligned_store_u32(dst, unaligned_load_u32(src));
+ break;
+ case 5:
+ case 6:
+ case 7:
+ /* Perform copy with two overlapping 4-byte chunks. */
+ unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+ unaligned_store_u32(dst, unaligned_load_u32(src));
+ break;
+ case 8:
+ unaligned_store_u64a(dst, unaligned_load_u64a(src));
+ break;
+ default:
+ /* Perform copy with two overlapping 8-byte chunks. */
+ assert(len < 16);
+ unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+ unaligned_store_u64a(dst, unaligned_load_u64a(src));
+ break;
+ }
+}
+
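
copyRuntBlock128 avoids a libc memcpy call by covering the odd lengths with two overlapping fixed-size chunks. A standalone sketch of that trick for the [5,7] case, assuming nothing from Hyperscan: load_u32/store_u32 below are memcpy-based stand-ins for unaligned_load_u32/unaligned_store_u32 (compilers lower these fixed-size memcpys to single moves):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static uint32_t load_u32(const uint8_t *p) {
        uint32_t v;
        memcpy(&v, p, sizeof(v)); /* safe unaligned load */
        return v;
    }

    static void store_u32(uint8_t *p, uint32_t v) {
        memcpy(p, &v, sizeof(v)); /* safe unaligned store */
    }

    static void copy_5_to_7(uint8_t *dst, const uint8_t *src, size_t len) {
        assert(len >= 5 && len <= 7);
        /* two 4-byte moves cover any length in [5,7]; the chunks simply
         * overlap in the middle */
        store_u32(dst + len - 4, load_u32(src + len - 4)); /* tail */
        store_u32(dst, load_u32(src));                     /* head */
    }

    int main(void) {
        const uint8_t src[7] = { 1, 2, 3, 4, 5, 6, 7 };
        uint8_t dst[7] = { 0 };
        copy_5_to_7(dst, src, 6);
        printf("%d %d\n", dst[0], dst[5]); /* 1 6 */
        return 0;
    }
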
+// Note: p_mask is an output param that initialises a poison mask.
+// *p_mask = load128(p_mask_arr[n] + 16 - m) means:
+// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
+// then 0xff for the remaining bytes.
+// ptr >= lo:
+// no history.
+// for end/short zone, ptr==lo and start_offset==0
+// for start zone, see below
+// lo ptr hi hi
+// |----------|-------|----------------|............|
+// -start 0 -start+offset MIN(avail,16)
+// p_mask ffff..ff0000...........00ffff..........
+// ptr < lo:
+// only start zone.
+// history
+// ptr lo hi hi
+// |----------|-------|----------------|............|
+// 0 start start+offset end(<=16)
+// p_mask ffff.....ffffff..ff0000...........00ffff..........
+static really_inline
+m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
+ const u8 *lo, const u8 *hi,
+ const u8 *buf_history, size_t len_history,
+ const u32 nMasks) {
+ union {
+ u8 val8[16];
+ m128 val128;
+ } u;
+ u.val128 = zeroes128();
+
+ uintptr_t copy_start;
+ uintptr_t copy_len;
+
+ if (ptr >= lo) { // short/end/start zone
+ uintptr_t start = (uintptr_t)(ptr - lo);
+ uintptr_t avail = (uintptr_t)(hi - ptr);
+ if (avail >= 16) {
+ assert(start_offset - start <= 16);
+ *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
+ + 16 - start_offset + start);
+ return loadu128(ptr);
+ }
+ assert(start_offset - start <= avail);
+ *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
+ + 16 - start_offset + start);
+ copy_start = 0;
+ copy_len = avail;
+ } else { // start zone
+ uintptr_t need = MIN((uintptr_t)(lo - ptr),
+ MIN(len_history, nMasks - 1));
+ uintptr_t start = (uintptr_t)(lo - ptr);
+ uintptr_t i;
+ for (i = start - need; i < start; i++) {
+ u.val8[i] = buf_history[len_history - (start - i)];
+ }
+ uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
+ assert(start + start_offset <= end);
+ *p_mask = loadu128(p_mask_arr[end - start - start_offset]
+ + 16 - start - start_offset);
+ copy_start = start;
+ copy_len = end - start;
+ }
+
+ // Runt block from the buffer.
+ copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+ return u.val128;
+}
+
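
The windowed load from p_mask_arr above turns a single 32-byte table row into any ff/00/ff pattern. A standalone sketch with plain byte arrays; the row layout (16 bytes of 0xff, then n of 0x00, then 0xff again) is reconstructed from the comment above, not copied from the real table:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void build_row(uint8_t row[32], unsigned n) {
        memset(row, 0xff, 32);
        memset(row + 16, 0x00, n); /* n zero bytes starting at index 16 */
    }

    int main(void) {
        uint8_t row[32], mask[16];
        unsigned n = 5, m = 3;     /* expect 3 x ff, 5 x 00, 8 x ff */
        build_row(row, n);
        /* read 16 bytes starting at offset 16 - m, as loadu128 does */
        memcpy(mask, row + 16 - m, sizeof(mask));
        for (unsigned i = 0; i < 16; i++) {
            printf("%02x ", mask[i]);
        }
        printf("\n"); /* ff ff ff 00 00 00 00 00 ff ff ff ff ff ff ff ff */
        return 0;
    }
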
+#if defined(HAVE_AVX2)
+/*
+ * \brief Copy a block of [0,31] bytes efficiently.
+ *
+ * This function is a workaround intended to stop some compilers from
+ * synthesizing a memcpy function call out of the copy of a small number of
+ * bytes that we do in vectoredLoad256.
+ */
+static really_inline
+void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
+ switch (len) {
+ case 0:
+ break;
+ case 1:
+ *dst = *src;
+ break;
+ case 2:
+ unaligned_store_u16(dst, unaligned_load_u16(src));
+ break;
+ case 3:
+ unaligned_store_u16(dst, unaligned_load_u16(src));
+ dst[2] = src[2];
+ break;
+ case 4:
+ unaligned_store_u32(dst, unaligned_load_u32(src));
+ break;
+ case 5:
+ case 6:
+ case 7:
+ /* Perform copy with two overlapping 4-byte chunks. */
+ unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4));
+ unaligned_store_u32(dst, unaligned_load_u32(src));
+ break;
+ case 8:
+ unaligned_store_u64a(dst, unaligned_load_u64a(src));
+ break;
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ /* Perform copy with two overlapping 8-byte chunks. */
+ unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8));
+ unaligned_store_u64a(dst, unaligned_load_u64a(src));
+ break;
+ case 16:
+ storeu128(dst, loadu128(src));
+ break;
+ default:
+ /* Perform copy with two overlapping 16-byte chunks. */
+ assert(len < 32);
+ storeu128(dst + len - 16, loadu128(src + len - 16));
+ storeu128(dst, loadu128(src));
+ break;
+ }
+}
+
+// Note: p_mask is an output param that initialises a poison mask.
+// *p_mask = load256(p_mask_arr256[n] + 32 - m) means:
+// m bytes of 0xff at the beginning, followed by n bytes of 0x00,
+// then 0xff for the remaining bytes.
+// ptr >= lo:
+// no history.
+// for end/short zone, ptr==lo and start_offset==0
+// for start zone, see below
+// lo ptr hi hi
+// |----------|-------|----------------|............|
+// -start 0 -start+offset MIN(avail,32)
+// p_mask ffff..ff0000...........00ffff..........
+// ptr < lo:
+// only start zone.
+// history
+// ptr lo hi hi
+// |----------|-------|----------------|............|
+// 0 start start+offset end(<=32)
+// p_mask ffff.....ffffff..ff0000...........00ffff..........
+static really_inline
+m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+ const u8 *lo, const u8 *hi,
+ const u8 *buf_history, size_t len_history,
+ const u32 nMasks) {
+ union {
+ u8 val8[32];
+ m256 val256;
+ } u;
+ u.val256 = zeroes256();
+
+ uintptr_t copy_start;
+ uintptr_t copy_len;
+
+ if (ptr >= lo) { // short/end/start zone
+ uintptr_t start = (uintptr_t)(ptr - lo);
+ uintptr_t avail = (uintptr_t)(hi - ptr);
+ if (avail >= 32) {
+ assert(start_offset - start <= 32);
+ *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
+ + 32 - start_offset + start);
+ return loadu256(ptr);
+ }
+ assert(start_offset - start <= avail);
+ *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
+ + 32 - start_offset + start);
+ copy_start = 0;
+ copy_len = avail;
+ } else { //start zone
+ uintptr_t need = MIN((uintptr_t)(lo - ptr),
+ MIN(len_history, nMasks - 1));
+ uintptr_t start = (uintptr_t)(lo - ptr);
+ uintptr_t i;
+ for (i = start - need; i < start; i++) {
+ u.val8[i] = buf_history[len_history - (start - i)];
+ }
+ uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
+ assert(start + start_offset <= end);
+ *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
+ + 32 - start - start_offset);
+ copy_start = start;
+ copy_len = end - start;
+ }
+
+ // Runt block from the buffer.
+ copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len);
+
+ return u.val256;
+}
+#endif // HAVE_AVX2
+
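
Both vectoredLoad128 and vectoredLoad256 prime their staging buffer with trailing history bytes in the start zone: the last `need` history bytes land immediately before offset `start`, so the vector reads as if the scanned buffer extended `start` bytes to the left. A standalone sketch of just that indexing, with local stand-ins for buf_history/len_history:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint8_t history[8] = { 'a','b','c','d','e','f','g','h' };
        size_t len_history = 8;
        size_t start = 5;   /* lo - ptr: vector begins 5 bytes before buf */
        size_t need = 3;    /* history bytes the masks can actually use */
        uint8_t staging[32] = { 0 };

        /* same loop shape as the start-zone branch above */
        for (size_t i = start - need; i < start; i++) {
            staging[i] = history[len_history - (start - i)];
        }
        /* staging[2..4] now holds "fgh", the final 3 history bytes */
        printf("%c%c%c\n", staging[2], staging[3], staging[4]);
        return 0;
    }
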
+#if defined(HAVE_AVX512)
+// Note: p_mask is an output param that initialises a poison mask.
+// u64a k = ones_u64a << n' >> m'; // m' < n'
+// *p_mask = set_mask_m512(~k);
+// means p_mask consists of:
+// (n' - m') poison bytes "0xff" at the beginning,
+// followed by (64 - n') valid bytes "0x00",
+// then m' poison bytes "0xff" at the end.
+// ptr >= lo:
+// no history.
+// for end/short zone, ptr==lo and start_offset==0
+// for start zone, see below
+// lo ptr hi hi
+// |----------|-------|----------------|............|
+// -start 0 -start+offset MIN(avail,64)
+// p_mask ffff..ff0000...........00ffff..........
+// ptr < lo:
+// only start zone.
+// history
+// ptr lo hi hi
+// |----------|-------|----------------|............|
+// 0 start start+offset end(<=64)
+// p_mask ffff.....ffffff..ff0000...........00ffff..........
+static really_inline
+m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset,
+ const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen,
+ const u32 nMasks) {
+ m512 val;
+
+ uintptr_t copy_start;
+ uintptr_t copy_len;
+
+ if (ptr >= lo) { // short/end/start zone
+ uintptr_t start = (uintptr_t)(ptr - lo);
+ uintptr_t avail = (uintptr_t)(hi - ptr);
+ if (avail >= 64) {
+ assert(start_offset - start <= 64);
+ u64a k = ones_u64a << (start_offset - start);
+ *p_mask = set_mask_m512(~k);
+ return loadu512(ptr);
+ }
+ assert(start_offset - start <= avail);
+ u64a k = ones_u64a << (64 - avail + start_offset - start)
+ >> (64 - avail);
+ *p_mask = set_mask_m512(~k);
+ copy_start = 0;
+ copy_len = avail;
+ } else { //start zone
+ uintptr_t need = MIN((uintptr_t)(lo - ptr),
+ MIN(hlen, nMasks - 1));
+ uintptr_t start = (uintptr_t)(lo - ptr);
+ u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need);
+ val = loadu_maskz_m512(j, &hbuf[hlen - start]);
+ uintptr_t end = MIN(64, (uintptr_t)(hi - ptr));
+ assert(start + start_offset <= end);
+ u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end);
+ *p_mask = set_mask_m512(~k);
+ copy_start = start;
+ copy_len = end - start;
+ }
+
+ assert(copy_len < 64);
+ assert(copy_len > 0);
+ u64a j = ones_u64a >> (64 - copy_len) << copy_start;
+ val = loadu_mask_m512(val, j, ptr);
+
+ return val;
+}
+#endif // HAVE_AVX512
+
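
The k-mask note above compresses the whole poison layout into two shifts of an all-ones word. A standalone check of that arithmetic, with uint64_t standing in for u64a and n' = 10, m' = 4: the complement should poison the first n' - m' = 6 byte lanes and the last m' = 4, leaving 64 - n' = 54 valid lanes in between.

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint64_t ones = ~0ULL;
        unsigned np = 10, mp = 4;        /* n', m' with m' < n' */
        uint64_t k = ones << np >> mp;   /* bits 6..59 set */
        uint64_t poison = ~k;            /* bit i -> byte lane i */
        for (unsigned lane = 0; lane < 64; lane++) {
            putchar((poison >> lane) & 1 ? 'f' : '0');
        }
        putchar('\n'); /* ffffff 0...0 ffff : 6 leading, 54 valid, 4 trailing */
        return 0;
    }
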
+static really_inline
+u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte,
UNUSED CautionReason reason) {
- u64a confVal = 0;
- const u8 *buf = a->buf;
- size_t len = a->len;
- const u8 *confirm_loc = ptr + byte - 7;
+ u64a confVal = 0;
+ const u8 *buf = a->buf;
+ size_t len = a->len;
+ const u8 *confirm_loc = ptr + byte - 7;
#if defined(HAVE_AVX512VBMI)
if (likely(confirm_loc >= buf)) {
#else
- if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
+ if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) {
#endif
- confVal = lv_u64a(confirm_loc, buf, buf + len);
- } else { // r == VECTORING, confirm_loc < buf
- u64a histBytes = a->histBytes;
- confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
- // stitch together confVal and history
- u32 overhang = buf - confirm_loc;
- histBytes >>= 64 - (overhang * 8);
- confVal |= histBytes;
- }
- return confVal;
-}
-
-static really_inline
-void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
- const u32 *confBase, CautionReason reason,
- const struct FDR_Runtime_Args *a, const u8 *ptr,
- hwlmcb_rv_t *control, u32 *last_match) {
- do {
- u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
- u32 byte = bit / bucket + offset;
- u32 idx = bit % bucket;
- u32 cf = confBase[idx];
- if (!cf) {
- continue;
- }
- const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
- ((const u8 *)confBase + cf);
- if (!(fdrc->groups & *control)) {
- continue;
- }
- u64a tmp = 0;
- u64a confVal = getConfVal(a, ptr, byte, reason);
- confWithBit(fdrc, a, ptr - a->buf + byte, control,
- last_match, confVal, &tmp, 0);
- } while (unlikely(*conf));
-}
-
-static really_inline
-const m128 *getMaskBase(const struct Teddy *teddy) {
- return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
-}
-
-static really_inline
-const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
- return (const u64a *)((const u8 *)getMaskBase(teddy)
- + ROUNDUP_CL(2 * numMask * sizeof(m128)));
-}
-
-static really_inline
-const u32 *getConfBase(const struct Teddy *teddy) {
- return (const u32 *)((const u8 *)teddy + teddy->confOffset);
-}
-
-#endif /* TEDDY_RUNTIME_COMMON_H_ */
+ confVal = lv_u64a(confirm_loc, buf, buf + len);
+ } else { // r == VECTORING, confirm_loc < buf
+ u64a histBytes = a->histBytes;
+ confVal = lv_u64a_ce(confirm_loc, buf, buf + len);
+ // stitch together confVal and history
+ u32 overhang = buf - confirm_loc;
+ histBytes >>= 64 - (overhang * 8);
+ confVal |= histBytes;
+ }
+ return confVal;
+}
+
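
getConfVal builds an 8-byte little-endian window ending at the match byte; when the window starts `overhang` bytes before the buffer, the newest history bytes (kept in the top of a->histBytes) are shifted down to fill the missing low bytes. A standalone sketch of the stitch, with invented byte values and uint64_t for u64a:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* in-buffer part of the window; the low `overhang` bytes are
         * unknown (zero), as a cautious load would produce */
        uint64_t confVal   = 0x1122334455000000ULL;
        /* last 8 history bytes; the most recent byte is most significant */
        uint64_t histBytes = 0xaabbccddeeff0102ULL;
        unsigned overhang = 3;           /* bytes needed from history */

        histBytes >>= 64 - overhang * 8; /* keep the newest 3 bytes */
        confVal |= histBytes;            /* 0x1122334455aabbcc */
        printf("%016llx\n", (unsigned long long)confVal);
        return 0;
    }
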
+static really_inline
+void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
+ const u32 *confBase, CautionReason reason,
+ const struct FDR_Runtime_Args *a, const u8 *ptr,
+ hwlmcb_rv_t *control, u32 *last_match) {
+ do {
+ u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf);
+ u32 byte = bit / bucket + offset;
+ u32 idx = bit % bucket;
+ u32 cf = confBase[idx];
+ if (!cf) {
+ continue;
+ }
+ const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+ ((const u8 *)confBase + cf);
+ if (!(fdrc->groups & *control)) {
+ continue;
+ }
+ u64a tmp = 0;
+ u64a confVal = getConfVal(a, ptr, byte, reason);
+ confWithBit(fdrc, a, ptr - a->buf + byte, control,
+ last_match, confVal, &tmp, 0);
+ } while (unlikely(*conf));
+}
+
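
do_confWithBit_teddy walks the confirm word one set bit at a time via TEDDY_FIND_AND_CLEAR_LSB, decoding each bit into a byte position (bit / bucket) and a bucket index (bit % bucket). A standalone sketch of that loop shape, using the GCC/Clang __builtin_ctzll as a stand-in for findAndClearLSB_64:

    #include <stdint.h>
    #include <stdio.h>

    static unsigned find_and_clear_lsb(uint64_t *conf) {
        unsigned bit = (unsigned)__builtin_ctzll(*conf); /* conf != 0 */
        *conf &= *conf - 1;                              /* clear that bit */
        return bit;
    }

    int main(void) {
        uint64_t conf = 0x8000000000000005ULL; /* bits 0, 2 and 63 set */
        unsigned bucket = 8;
        do {
            unsigned bit = find_and_clear_lsb(&conf);
            printf("bit=%u byte=%u idx=%u\n",
                   bit, bit / bucket, bit % bucket);
        } while (conf);
        return 0;
    }
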
+static really_inline
+const m128 *getMaskBase(const struct Teddy *teddy) {
+ return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
+}
+
+static really_inline
+const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
+ return (const u64a *)((const u8 *)getMaskBase(teddy)
+ + ROUNDUP_CL(2 * numMask * sizeof(m128)));
+}
+
+static really_inline
+const u32 *getConfBase(const struct Teddy *teddy) {
+ return (const u32 *)((const u8 *)teddy + teddy->confOffset);
+}
+
+#endif /* TEDDY_RUNTIME_COMMON_H_ */
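
The three getters above locate regions of the Teddy structure by rounding each preceding region up to a cache line. A standalone sketch of that arithmetic, with ROUNDUP_CL_DEMO as a local stand-in for ROUNDUP_CL and illustrative sizes:

    #include <stddef.h>
    #include <stdio.h>

    #define CL 64
    #define ROUNDUP_CL_DEMO(x) (((x) + CL - 1) / CL * CL)

    int main(void) {
        size_t teddy_hdr = 40;                 /* pretend header size */
        size_t num_masks = 2, m128_sz = 16;
        size_t mask_base = ROUNDUP_CL_DEMO(teddy_hdr);
        size_t reinforced = mask_base
                          + ROUNDUP_CL_DEMO(2 * num_masks * m128_sz);
        printf("masks at %zu, reinforced masks at %zu\n",
               mask_base, reinforced);         /* 64, 128 */
        return 0;
    }
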