| | | |
|---|---|---|
| author | Ivan Blinkov <ivan@blinkov.ru> | 2022-02-10 16:47:11 +0300 |
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:11 +0300 |
| commit | 5b283123c882433dafbaf6b338adeea16c1a0ea0 (patch) | |
| tree | 339adc63bce23800021202ae4a8328a843dc447a /contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h | |
| parent | 1aeb9a455974457866f78722ad98114bafc84e8a (diff) | |
| download | ydb-5b283123c882433dafbaf6b338adeea16c1a0ea0.tar.gz | |
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h')
-rw-r--r-- | contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h | 886 |
1 file changed, 443 insertions, 443 deletions
diff --git a/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h b/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h index 692761c5e6..b76800eb04 100644 --- a/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h +++ b/contrib/libs/hyperscan/src/fdr/teddy_runtime_common.h @@ -1,50 +1,50 @@ -/* +/* * Copyright (c) 2016-2020, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Teddy literal matcher: common runtime procedures. - */ - -#ifndef TEDDY_RUNTIME_COMMON_H_ -#define TEDDY_RUNTIME_COMMON_H_ - -#include "fdr_confirm.h" -#include "fdr_confirm_runtime.h" -#include "ue2common.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" -#include "util/uniform_ops.h" - -extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; -#if defined(HAVE_AVX2) -extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64]; -#endif - + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: common runtime procedures. + */ + +#ifndef TEDDY_RUNTIME_COMMON_H_ +#define TEDDY_RUNTIME_COMMON_H_ + +#include "fdr_confirm.h" +#include "fdr_confirm_runtime.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/uniform_ops.h" + +extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; +#if defined(HAVE_AVX2) +extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64]; +#endif + #if defined(HAVE_AVX512VBMI) static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -55,405 +55,405 @@ static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { }; #endif -#ifdef ARCH_64_BIT -#define TEDDY_CONF_TYPE u64a -#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf) -#else -#define TEDDY_CONF_TYPE u32 -#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf) -#endif - -#define CHECK_HWLM_TERMINATE_MATCHING \ -do { \ - if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ - return HWLM_TERMINATED; \ - } \ -} while (0); - -#define CHECK_FLOOD \ -do { \ - if (unlikely(ptr > tryFloodDetect)) { \ - tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \ - &floodBackoff, &control, iterBytes); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ -} while (0); - -/* - * \brief Copy a block of [0,15] bytes efficiently. - * - * This function is a workaround intended to stop some compilers from - * synthesizing a memcpy function call out of the copy of a small number of - * bytes that we do in vectoredLoad128. - */ -static really_inline -void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) { - switch (len) { - case 0: - break; - case 1: - *dst = *src; - break; - case 2: - unaligned_store_u16(dst, unaligned_load_u16(src)); - break; - case 3: - unaligned_store_u16(dst, unaligned_load_u16(src)); - dst[2] = src[2]; - break; - case 4: - unaligned_store_u32(dst, unaligned_load_u32(src)); - break; - case 5: - case 6: - case 7: - /* Perform copy with two overlapping 4-byte chunks. */ - unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); - unaligned_store_u32(dst, unaligned_load_u32(src)); - break; - case 8: - unaligned_store_u64a(dst, unaligned_load_u64a(src)); - break; - default: - /* Perform copy with two overlapping 8-byte chunks. */ - assert(len < 16); - unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); - unaligned_store_u64a(dst, unaligned_load_u64a(src)); - break; - } -} - -// Note: p_mask is an output param that initialises a poison mask. -// *p_mask = load128(p_mask_arr[n] + 16 - m) means: -// m byte 0xff in the beginning, followed by n byte 0x00, -// then followed by the rest bytes 0xff. -// ptr >= lo: -// no history. 
-// for end/short zone, ptr==lo and start_offset==0 -// for start zone, see below -// lo ptr hi hi -// |----------|-------|----------------|............| -// -start 0 -start+offset MIN(avail,16) -// p_mask ffff..ff0000...........00ffff.......... -// ptr < lo: -// only start zone. -// history -// ptr lo hi hi -// |----------|-------|----------------|............| -// 0 start start+offset end(<=16) -// p_mask ffff.....ffffff..ff0000...........00ffff.......... -static really_inline -m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset, - const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history, - const u32 nMasks) { - union { - u8 val8[16]; - m128 val128; - } u; - u.val128 = zeroes128(); - - uintptr_t copy_start; - uintptr_t copy_len; - - if (ptr >= lo) { // short/end/start zone - uintptr_t start = (uintptr_t)(ptr - lo); - uintptr_t avail = (uintptr_t)(hi - ptr); - if (avail >= 16) { - assert(start_offset - start <= 16); - *p_mask = loadu128(p_mask_arr[16 - start_offset + start] - + 16 - start_offset + start); - return loadu128(ptr); - } - assert(start_offset - start <= avail); - *p_mask = loadu128(p_mask_arr[avail - start_offset + start] - + 16 - start_offset + start); - copy_start = 0; - copy_len = avail; - } else { // start zone - uintptr_t need = MIN((uintptr_t)(lo - ptr), - MIN(len_history, nMasks - 1)); - uintptr_t start = (uintptr_t)(lo - ptr); - uintptr_t i; - for (i = start - need; i < start; i++) { - u.val8[i] = buf_history[len_history - (start - i)]; - } - uintptr_t end = MIN(16, (uintptr_t)(hi - ptr)); - assert(start + start_offset <= end); - *p_mask = loadu128(p_mask_arr[end - start - start_offset] - + 16 - start - start_offset); - copy_start = start; - copy_len = end - start; - } - - // Runt block from the buffer. - copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len); - - return u.val128; -} - -#if defined(HAVE_AVX2) -/* - * \brief Copy a block of [0,31] bytes efficiently. - * - * This function is a workaround intended to stop some compilers from - * synthesizing a memcpy function call out of the copy of a small number of - * bytes that we do in vectoredLoad256. - */ -static really_inline -void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { - switch (len) { - case 0: - break; - case 1: - *dst = *src; - break; - case 2: - unaligned_store_u16(dst, unaligned_load_u16(src)); - break; - case 3: - unaligned_store_u16(dst, unaligned_load_u16(src)); - dst[2] = src[2]; - break; - case 4: - unaligned_store_u32(dst, unaligned_load_u32(src)); - break; - case 5: - case 6: - case 7: - /* Perform copy with two overlapping 4-byte chunks. */ - unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); - unaligned_store_u32(dst, unaligned_load_u32(src)); - break; - case 8: - unaligned_store_u64a(dst, unaligned_load_u64a(src)); - break; - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - /* Perform copy with two overlapping 8-byte chunks. */ - unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); - unaligned_store_u64a(dst, unaligned_load_u64a(src)); - break; - case 16: - storeu128(dst, loadu128(src)); - break; - default: - /* Perform copy with two overlapping 16-byte chunks. */ - assert(len < 32); - storeu128(dst + len - 16, loadu128(src + len - 16)); - storeu128(dst, loadu128(src)); - break; - } -} - -// Note: p_mask is an output param that initialises a poison mask. 
-// *p_mask = load256(p_mask_arr256[n] + 32 - m) means: -// m byte 0xff in the beginning, followed by n byte 0x00, -// then followed by the rest bytes 0xff. -// ptr >= lo: -// no history. -// for end/short zone, ptr==lo and start_offset==0 -// for start zone, see below -// lo ptr hi hi -// |----------|-------|----------------|............| -// -start 0 -start+offset MIN(avail,32) -// p_mask ffff..ff0000...........00ffff.......... -// ptr < lo: -// only start zone. -// history -// ptr lo hi hi -// |----------|-------|----------------|............| -// 0 start start+offset end(<=32) -// p_mask ffff.....ffffff..ff0000...........00ffff.......... -static really_inline -m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset, - const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history, - const u32 nMasks) { - union { - u8 val8[32]; - m256 val256; - } u; - u.val256 = zeroes256(); - - uintptr_t copy_start; - uintptr_t copy_len; - - if (ptr >= lo) { // short/end/start zone - uintptr_t start = (uintptr_t)(ptr - lo); - uintptr_t avail = (uintptr_t)(hi - ptr); - if (avail >= 32) { - assert(start_offset - start <= 32); - *p_mask = loadu256(p_mask_arr256[32 - start_offset + start] - + 32 - start_offset + start); - return loadu256(ptr); - } - assert(start_offset - start <= avail); - *p_mask = loadu256(p_mask_arr256[avail - start_offset + start] - + 32 - start_offset + start); - copy_start = 0; - copy_len = avail; - } else { //start zone - uintptr_t need = MIN((uintptr_t)(lo - ptr), - MIN(len_history, nMasks - 1)); - uintptr_t start = (uintptr_t)(lo - ptr); - uintptr_t i; - for (i = start - need; i < start; i++) { - u.val8[i] = buf_history[len_history - (start - i)]; - } - uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); - assert(start + start_offset <= end); - *p_mask = loadu256(p_mask_arr256[end - start - start_offset] - + 32 - start - start_offset); - copy_start = start; - copy_len = end - start; - } - - // Runt block from the buffer. - copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); - - return u.val256; -} -#endif // HAVE_AVX2 - -#if defined(HAVE_AVX512) -// Note: p_mask is an output param that initialises a poison mask. -// u64a k = ones_u64a << n' >> m'; // m' < n' -// *p_mask = set_mask_m512(~k); -// means p_mask is consist of: -// (n' - m') poison bytes "0xff" at the beginning, -// followed by (64 - n') valid bytes "0x00", -// then followed by the rest m' poison bytes "0xff". -// ptr >= lo: -// no history. -// for end/short zone, ptr==lo and start_offset==0 -// for start zone, see below -// lo ptr hi hi -// |----------|-------|----------------|............| -// -start 0 -start+offset MIN(avail,64) -// p_mask ffff..ff0000...........00ffff.......... -// ptr < lo: -// only start zone. -// history -// ptr lo hi hi -// |----------|-------|----------------|............| -// 0 start start+offset end(<=64) -// p_mask ffff.....ffffff..ff0000...........00ffff.......... 
-static really_inline -m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, - const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen, - const u32 nMasks) { - m512 val; - - uintptr_t copy_start; - uintptr_t copy_len; - - if (ptr >= lo) { // short/end/start zone - uintptr_t start = (uintptr_t)(ptr - lo); - uintptr_t avail = (uintptr_t)(hi - ptr); - if (avail >= 64) { - assert(start_offset - start <= 64); - u64a k = ones_u64a << (start_offset - start); - *p_mask = set_mask_m512(~k); - return loadu512(ptr); - } - assert(start_offset - start <= avail); - u64a k = ones_u64a << (64 - avail + start_offset - start) - >> (64 - avail); - *p_mask = set_mask_m512(~k); - copy_start = 0; - copy_len = avail; - } else { //start zone - uintptr_t need = MIN((uintptr_t)(lo - ptr), - MIN(hlen, nMasks - 1)); - uintptr_t start = (uintptr_t)(lo - ptr); - u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need); - val = loadu_maskz_m512(j, &hbuf[hlen - start]); - uintptr_t end = MIN(64, (uintptr_t)(hi - ptr)); - assert(start + start_offset <= end); - u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end); - *p_mask = set_mask_m512(~k); - copy_start = start; - copy_len = end - start; - } - - assert(copy_len < 64); - assert(copy_len > 0); - u64a j = ones_u64a >> (64 - copy_len) << copy_start; - val = loadu_mask_m512(val, j, ptr); - - return val; -} -#endif // HAVE_AVX512 - -static really_inline -u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte, +#ifdef ARCH_64_BIT +#define TEDDY_CONF_TYPE u64a +#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf) +#else +#define TEDDY_CONF_TYPE u32 +#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf) +#endif + +#define CHECK_HWLM_TERMINATE_MATCHING \ +do { \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ +} while (0); + +#define CHECK_FLOOD \ +do { \ + if (unlikely(ptr > tryFloodDetect)) { \ + tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \ + &floodBackoff, &control, iterBytes); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while (0); + +/* + * \brief Copy a block of [0,15] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad128. + */ +static really_inline +void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. */ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + default: + /* Perform copy with two overlapping 8-byte chunks. */ + assert(len < 16); + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + } +} + +// Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load128(p_mask_arr[n] + 16 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. 
+// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,16) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=16) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + union { + u8 val8[16]; + m128 val128; + } u; + u.val128 = zeroes128(); + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 16) { + assert(start_offset - start <= 16); + *p_mask = loadu128(p_mask_arr[16 - start_offset + start] + + 16 - start_offset + start); + return loadu128(ptr); + } + assert(start_offset - start <= avail); + *p_mask = loadu128(p_mask_arr[avail - start_offset + start] + + 16 - start_offset + start); + copy_start = 0; + copy_len = avail; + } else { // start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(len_history, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; + } + uintptr_t end = MIN(16, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + *p_mask = loadu128(p_mask_arr[end - start - start_offset] + + 16 - start - start_offset); + copy_start = start; + copy_len = end - start; + } + + // Runt block from the buffer. + copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val128; +} + +#if defined(HAVE_AVX2) +/* + * \brief Copy a block of [0,31] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad256. + */ +static really_inline +void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. */ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* Perform copy with two overlapping 8-byte chunks. */ + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + default: + /* Perform copy with two overlapping 16-byte chunks. */ + assert(len < 32); + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + } +} + +// Note: p_mask is an output param that initialises a poison mask. 
+// *p_mask = load256(p_mask_arr256[n] + 32 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,32) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=32) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + union { + u8 val8[32]; + m256 val256; + } u; + u.val256 = zeroes256(); + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 32) { + assert(start_offset - start <= 32); + *p_mask = loadu256(p_mask_arr256[32 - start_offset + start] + + 32 - start_offset + start); + return loadu256(ptr); + } + assert(start_offset - start <= avail); + *p_mask = loadu256(p_mask_arr256[avail - start_offset + start] + + 32 - start_offset + start); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(len_history, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; + } + uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + *p_mask = loadu256(p_mask_arr256[end - start - start_offset] + + 32 - start - start_offset); + copy_start = start; + copy_len = end - start; + } + + // Runt block from the buffer. + copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val256; +} +#endif // HAVE_AVX2 + +#if defined(HAVE_AVX512) +// Note: p_mask is an output param that initialises a poison mask. +// u64a k = ones_u64a << n' >> m'; // m' < n' +// *p_mask = set_mask_m512(~k); +// means p_mask is consist of: +// (n' - m') poison bytes "0xff" at the beginning, +// followed by (64 - n') valid bytes "0x00", +// then followed by the rest m' poison bytes "0xff". +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,64) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=64) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... 
+static really_inline +m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen, + const u32 nMasks) { + m512 val; + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 64) { + assert(start_offset - start <= 64); + u64a k = ones_u64a << (start_offset - start); + *p_mask = set_mask_m512(~k); + return loadu512(ptr); + } + assert(start_offset - start <= avail); + u64a k = ones_u64a << (64 - avail + start_offset - start) + >> (64 - avail); + *p_mask = set_mask_m512(~k); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(hlen, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need); + val = loadu_maskz_m512(j, &hbuf[hlen - start]); + uintptr_t end = MIN(64, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end); + *p_mask = set_mask_m512(~k); + copy_start = start; + copy_len = end - start; + } + + assert(copy_len < 64); + assert(copy_len > 0); + u64a j = ones_u64a >> (64 - copy_len) << copy_start; + val = loadu_mask_m512(val, j, ptr); + + return val; +} +#endif // HAVE_AVX512 + +static really_inline +u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte, UNUSED CautionReason reason) { - u64a confVal = 0; - const u8 *buf = a->buf; - size_t len = a->len; - const u8 *confirm_loc = ptr + byte - 7; + u64a confVal = 0; + const u8 *buf = a->buf; + size_t len = a->len; + const u8 *confirm_loc = ptr + byte - 7; #if defined(HAVE_AVX512VBMI) if (likely(confirm_loc >= buf)) { #else - if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { + if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { #endif - confVal = lv_u64a(confirm_loc, buf, buf + len); - } else { // r == VECTORING, confirm_loc < buf - u64a histBytes = a->histBytes; - confVal = lv_u64a_ce(confirm_loc, buf, buf + len); - // stitch together confVal and history - u32 overhang = buf - confirm_loc; - histBytes >>= 64 - (overhang * 8); - confVal |= histBytes; - } - return confVal; -} - -static really_inline -void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, - const u32 *confBase, CautionReason reason, - const struct FDR_Runtime_Args *a, const u8 *ptr, - hwlmcb_rv_t *control, u32 *last_match) { - do { - u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); - u32 byte = bit / bucket + offset; - u32 idx = bit % bucket; - u32 cf = confBase[idx]; - if (!cf) { - continue; - } - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { - continue; - } - u64a tmp = 0; - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBit(fdrc, a, ptr - a->buf + byte, control, - last_match, confVal, &tmp, 0); - } while (unlikely(*conf)); -} - -static really_inline -const m128 *getMaskBase(const struct Teddy *teddy) { - return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); -} - -static really_inline -const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) { - return (const u64a *)((const u8 *)getMaskBase(teddy) - + ROUNDUP_CL(2 * numMask * sizeof(m128))); -} - -static really_inline -const u32 *getConfBase(const struct Teddy *teddy) { - return (const u32 *)((const u8 *)teddy + teddy->confOffset); -} - 
-#endif /* TEDDY_RUNTIME_COMMON_H_ */ + confVal = lv_u64a(confirm_loc, buf, buf + len); + } else { // r == VECTORING, confirm_loc < buf + u64a histBytes = a->histBytes; + confVal = lv_u64a_ce(confirm_loc, buf, buf + len); + // stitch together confVal and history + u32 overhang = buf - confirm_loc; + histBytes >>= 64 - (overhang * 8); + confVal |= histBytes; + } + return confVal; +} + +static really_inline +void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, + const u32 *confBase, CautionReason reason, + const struct FDR_Runtime_Args *a, const u8 *ptr, + hwlmcb_rv_t *control, u32 *last_match) { + do { + u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); + u32 byte = bit / bucket + offset; + u32 idx = bit % bucket; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + u64a tmp = 0; + u64a confVal = getConfVal(a, ptr, byte, reason); + confWithBit(fdrc, a, ptr - a->buf + byte, control, + last_match, confVal, &tmp, 0); + } while (unlikely(*conf)); +} + +static really_inline +const m128 *getMaskBase(const struct Teddy *teddy) { + return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); +} + +static really_inline +const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const u64a *)((const u8 *)getMaskBase(teddy) + + ROUNDUP_CL(2 * numMask * sizeof(m128))); +} + +static really_inline +const u32 *getConfBase(const struct Teddy *teddy) { + return (const u32 *)((const u8 *)teddy + teddy->confOffset); +} + +#endif /* TEDDY_RUNTIME_COMMON_H_ */ |
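The comments above vectoredLoad128 and vectoredLoad256 describe p_mask as a poison mask: bytes outside the valid [lo, hi) window read as 0xff so they cannot contribute a match, while in-window bytes read as 0x00. The scalar sketch below is illustrative only (plain C, no SIMD; poison_mask_16 is an invented name, not a Hyperscan function) and simply builds the "m bytes 0xff, n bytes 0x00, rest 0xff" pattern that the p_mask_arr indexing selects.

```c
#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: build a 16-byte poison mask laid out as the header's
 * comment describes: m leading poison bytes (0xff), then n valid bytes
 * (0x00), then poison again for the remainder of the vector. */
static void poison_mask_16(uint8_t mask[16], size_t m, size_t n) {
    assert(m + n <= 16);
    memset(mask, 0xff, 16);     /* poison everything by default */
    memset(mask + m, 0x00, n);  /* carve out the n valid bytes  */
}

int main(void) {
    uint8_t mask[16];
    poison_mask_16(mask, 3, 9); /* 3 poisoned, 9 valid, 4 poisoned */
    for (int i = 0; i < 16; i++) {
        printf("%02x ", mask[i]);
    }
    printf("\n");               /* ff ff ff 00 00 00 00 00 00 00 00 00 ff ff ff ff */
    return 0;
}
```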
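On the AVX-512 path the same poison mask is derived from a 64-bit k-mask with two shifts, as the comment above vectoredLoad512 spells out: k = ~0 << n >> m (with m < n) leaves a run of 64 - n set bits, and ~k marks the remaining lanes as poison before set_mask_m512 expands it to bytes. A minimal, intrinsic-free sketch of that bit arithmetic; n = 20 and m = 4 are example values chosen only for illustration.

```c
#include <stdio.h>
#include <stdint.h>

/* Sketch of the AVX-512 path's k-mask arithmetic:
 * k = ~0 << n >> m (with m < n) leaves a run of (64 - n) set bits;
 * in ~k those positions are 0 (valid lanes) and everything else is 1
 * (poisoned lanes), which set_mask_m512(~k) expands to 0x00/0xff bytes. */
int main(void) {
    const unsigned n = 20, m = 4;          /* example shift amounts, m < n */
    uint64_t k = ~UINT64_C(0) << n >> m;

    uint64_t valid = ~k;                   /* 1-bits here are poisoned lanes */
    for (unsigned i = 0; i < 64; i++) {
        putchar((valid >> i) & 1 ? 'f' : '0');   /* low lane printed first */
    }
    putchar('\n');
    /* Expect (n - m) = 16 poisoned lanes, then (64 - n) = 44 valid lanes,
     * then m = 4 poisoned lanes. */
    return 0;
}
```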
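copyRuntBlock128 and copyRuntBlock256 exist to keep compilers from synthesizing a real memcpy call for these short, variable-length copies: an awkward length is covered by two fixed-size chunks that overlap in the middle. A standalone sketch of the same trick for lengths 5 to 7, using fixed-size memcpy in place of the header's unaligned load/store helpers (copy5to7 is an invented name, not part of Hyperscan).

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Illustrative sketch of the overlapping-chunk trick in copyRuntBlock128:
 * a copy of len in [5,7] is done as two 4-byte copies that overlap in the
 * middle, so no byte loop and no variable-length memcpy is needed.
 * Fixed-size memcpy calls compile down to single loads and stores. */
static void copy5to7(uint8_t *dst, const uint8_t *src, size_t len) {
    assert(len >= 5 && len <= 7);
    uint32_t head, tail;
    memcpy(&head, src, 4);                 /* first 4 bytes            */
    memcpy(&tail, src + len - 4, 4);       /* last 4 bytes (overlaps)  */
    memcpy(dst, &head, 4);
    memcpy(dst + len - 4, &tail, 4);
}

int main(void) {
    const uint8_t src[7] = { 1, 2, 3, 4, 5, 6, 7 };
    uint8_t dst[7] = { 0 };
    copy5to7(dst, src, 6);
    for (int i = 0; i < 6; i++) {
        printf("%d ", dst[i]);
    }
    printf("\n");   /* prints: 1 2 3 4 5 6 */
    return 0;
}
```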
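getConfVal assembles the 8-byte confirm window even when it begins before the scanned buffer: the in-buffer bytes are loaded as usual and the missing low bytes are spliced in from the saved history word, which is shifted right by 64 - overhang * 8 bits before being OR-ed in. The sketch below shows that splice on a little-endian machine; splice_conf and the sample values are illustrative, not Hyperscan's API.

```c
#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the history splice in getConfVal: when the 8-byte confirm
 * window starts `overhang` bytes before the scanned buffer (1 <= overhang
 * <= 7), the low `overhang` bytes come from the history word and the rest
 * from the buffer. Assumes a little-endian layout, as on x86. */
static uint64_t splice_conf(uint64_t hist_bytes, const uint8_t *buf,
                            unsigned overhang) {
    uint64_t conf = 0;
    /* bytes that do fall inside the buffer land in the upper positions */
    memcpy((uint8_t *)&conf + overhang, buf, 8 - overhang);
    /* history contributes its top `overhang` bytes as the low bytes */
    conf |= hist_bytes >> (64 - overhang * 8);
    return conf;
}

int main(void) {
    /* Last three history bytes in memory order are 0x11 0x22 0x33, with
     * 0x33 the most recent (it sits right before buf[0]); a little-endian
     * 64-bit load of that history puts them in the top bytes. */
    uint64_t hist = 0x3322110000000000ULL;
    const uint8_t buf[8] = { 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb };
    uint64_t conf = splice_conf(hist, buf, 3);
    printf("%016llx\n", (unsigned long long)conf);  /* 8877665544332211 */
    return 0;
}
```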
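do_confWithBit_teddy drains the confirmation word by repeatedly finding and clearing its lowest set bit, then splitting the bit index into a byte position (bit / bucket) and a bucket index (bit % bucket) before running the confirm step. A sketch of that loop, using the GCC/Clang builtin __builtin_ctzll as a stand-in for TEDDY_FIND_AND_CLEAR_LSB; the sample word and bucket count are only examples.

```c
#include <stdio.h>
#include <stdint.h>

/* Sketch of the conf-word walk in do_confWithBit_teddy: pop the lowest
 * set bit, split it into a byte offset and a bucket index, repeat until
 * the word is empty. __builtin_ctzll is a GCC/Clang builtin used here in
 * place of Hyperscan's findAndClearLSB_64. */
static unsigned find_and_clear_lsb64(uint64_t *word) {
    unsigned bit = (unsigned)__builtin_ctzll(*word);   /* *word must be non-zero */
    *word &= *word - 1;          /* clear the bit we just reported */
    return bit;
}

int main(void) {
    uint64_t conf = 0x0000000000010022ULL;   /* example confirmation word */
    const unsigned buckets = 8;              /* example: 8 buckets per byte */
    while (conf) {
        unsigned bit = find_and_clear_lsb64(&conf);
        printf("byte %u, bucket %u\n", bit / buckets, bit % buckets);
    }
    return 0;
}
```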
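getMaskBase, getReinforcedMaskBase and getConfBase locate the mask and confirm regions purely by offset arithmetic: the Teddy header rounded up to a cache line, then 2 * numMask 16-byte masks (again rounded up), with the confirm block at the stored confOffset. The sketch below uses a stand-in struct and made-up sizes, not the real Teddy bytecode layout, only to make that arithmetic concrete.

```c
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in layout math for getMaskBase / getReinforcedMaskBase /
 * getConfBase: round the header up to a cache line, then step over
 * 2 * numMask 16-byte masks. TeddyStandIn and its fields are invented
 * for illustration; only the offset arithmetic mirrors the header. */
#define CACHE_LINE 64u
#define ROUNDUP_CL(x) (((x) + CACHE_LINE - 1) & ~(uintptr_t)(CACHE_LINE - 1))

struct TeddyStandIn {
    uint32_t confOffset;   /* offset of the confirm data from the start */
    uint32_t numMask;      /* number of mask pairs */
};

int main(void) {
    struct TeddyStandIn t = { .confOffset = 1024, .numMask = 4 };
    uintptr_t base = 0;    /* pretend the engine starts at offset 0 */

    uintptr_t mask_base  = base + ROUNDUP_CL(sizeof(struct TeddyStandIn));
    uintptr_t reinforced = mask_base + ROUNDUP_CL(2 * t.numMask * 16u);
    uintptr_t conf_base  = base + t.confOffset;

    printf("masks at %zu, reinforced masks at %zu, confirm at %zu\n",
           (size_t)mask_base, (size_t)reinforced, (size_t)conf_base);
    return 0;
}
```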