| author | Ivan Blinkov <ivan@blinkov.ru> | 2022-02-10 16:47:10 +0300 |
|---|---|---|
| committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:10 +0300 |
| commit | 1aeb9a455974457866f78722ad98114bafc84e8a (patch) | |
| tree | e4340eaf1668684d83a0a58c36947c5def5350ad /contrib/libs/hyperscan/src/hwlm/noodle_engine_avx512.c | |
| parent | bd5ef432f5cfb1e18851381329d94665a4c22470 (diff) | |
| download | ydb-1aeb9a455974457866f78722ad98114bafc84e8a.tar.gz | |
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/hwlm/noodle_engine_avx512.c')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | contrib/libs/hyperscan/src/hwlm/noodle_engine_avx512.c | 382 |

1 file changed, 191 insertions, 191 deletions
```diff
diff --git a/contrib/libs/hyperscan/src/hwlm/noodle_engine_avx512.c b/contrib/libs/hyperscan/src/hwlm/noodle_engine_avx512.c
index 8cac1b15c2..9bf445821a 100644
--- a/contrib/libs/hyperscan/src/hwlm/noodle_engine_avx512.c
+++ b/contrib/libs/hyperscan/src/hwlm/noodle_engine_avx512.c
@@ -1,191 +1,191 @@
```

Because this commit only restores authorship annotation, the 191 deleted lines and the 191 inserted lines are byte-for-byte identical; the affected file content follows.

```c
/*
 * Copyright (c) 2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* noodle scan parts for AVX512 */

static really_inline
m512 getMask(u8 c, bool noCase) {
    u8 k = caseClear8(c, noCase);
    return set64x8(k);
}

static really_inline
m512 getCaseMask(void) {
    return set64x8(CASE_CLEAR);
}
```
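`caseClear8` and `set64x8` are hyperscan utility wrappers: the first ANDs the pattern byte with `CASE_CLEAR` when the search is caseless, the second broadcasts a byte to all 64 lanes of a ZMM register. The trick works because ASCII upper- and lower-case letters differ only in bit 5. A minimal scalar sketch of the idea, assuming `CASE_CLEAR` is `0xdf` (its value in hyperscan's headers):

```c
#include <assert.h>
#include <stdint.h>

#define CASE_CLEAR 0xdf /* clears bit 5, the ASCII case bit */

/* scalar model of caseClear8(): fold case only when asked to */
static uint8_t case_clear8(uint8_t c, int no_case) {
    return no_case ? (uint8_t)(c & CASE_CLEAR) : c;
}

int main(void) {
    /* 'a' (0x61) and 'A' (0x41) collapse to the same byte, so one
     * equality test against the folded pattern byte matches both cases
     * once the input vector is AND-ed with the same mask. */
    assert(case_clear8('a', 1) == case_clear8('A', 1));
    assert(case_clear8('a', 0) != case_clear8('A', 0));
    return 0;
}
```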
```c
// The short scan routine. It is used both to scan data up to an
// alignment boundary if needed and to finish off data that the aligned scan
// function can't handle (due to small/unaligned chunk at end)
static really_inline
hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf,
                             size_t len, bool noCase, m512 caseMask, m512 mask1,
                             const struct cb_info *cbi, size_t start,
                             size_t end) {
    const u8 *d = buf + start;
    ptrdiff_t scan_len = end - start;
    DEBUG_PRINTF("scan_len %zu\n", scan_len);
    assert(scan_len <= 64);
    if (!scan_len) {
        return HWLM_SUCCESS;
    }

    __mmask64 k = (~0ULL) >> (64 - scan_len);
    DEBUG_PRINTF("load mask 0x%016llx\n", k);

    m512 v = loadu_maskz_m512(k, d);

    if (noCase) {
        v = and512(v, caseMask);
    }

    // reuse the load mask to indicate valid bytes
    u64a z = masked_eq512mask(k, mask1, v);

    SINGLE_ZSCAN();

    return HWLM_SUCCESS;
}

static really_inline
hwlm_error_t scanSingle512(const struct noodTable *n, const u8 *buf, size_t len,
                           bool noCase, m512 caseMask, m512 mask1,
                           const struct cb_info *cbi, size_t start,
                           size_t end) {
    const u8 *d = buf + start;
    const u8 *e = buf + end;
    DEBUG_PRINTF("start %p end %p \n", d, e);
    assert(d < e);
    if (d + 64 >= e) {
        goto tail;
    }

    // peel off first part to cacheline boundary
    const u8 *d1 = ROUNDUP_PTR(d, 64);
    if (scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, start,
                        d1 - buf) == HWLM_TERMINATED) {
        return HWLM_TERMINATED;
    }
    d = d1;

    for (; d + 64 < e; d += 64) {
        DEBUG_PRINTF("d %p e %p \n", d, e);
        m512 v = noCase ? and512(load512(d), caseMask) : load512(d);

        u64a z = eq512mask(mask1, v);
        __builtin_prefetch(d + 128);

        SINGLE_ZSCAN();
    }

tail:
    DEBUG_PRINTF("d %p e %p \n", d, e);
    // finish off tail

    return scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, d - buf,
                           e - buf);
}
```
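`loadu_maskz_m512` and `masked_eq512mask` are hyperscan's wrappers over AVX-512 masked operations, and `SINGLE_ZSCAN()` is a macro from the shared noodle code that walks the set bits of `z` and fires the match callback. A rough standalone sketch of the short-scan core in raw intrinsics; the function name is illustrative, and it requires AVX512BW:

```c
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Report which of the first scan_len bytes at d equal c, without reading
 * past d + scan_len. Caller guarantees 1 <= scan_len <= 64 (the shift is
 * undefined for scan_len == 0, which scanSingleShort filters out first). */
static uint64_t scan_single_core(const uint8_t *d, size_t scan_len, uint8_t c) {
    __mmask64 k = (~0ULL) >> (64 - scan_len);   /* low scan_len bits set */
    __m512i v = _mm512_maskz_loadu_epi8(k, d);  /* masked-off lanes load as 0 */
    __m512i pat = _mm512_set1_epi8((char)c);
    /* Reusing k in the compare keeps the zeroed tail lanes from producing
     * false matches when c itself is 0. */
    return _mm512_mask_cmpeq_epi8_mask(k, pat, v); /* bit i => d[i] == c */
}
```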
```c
static really_inline
hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf,
                             size_t len, bool noCase, m512 caseMask, m512 mask1,
                             m512 mask2, const struct cb_info *cbi,
                             u64a *lastz0, size_t start, size_t end) {
    DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0);
    const u8 *d = buf + start;
    ptrdiff_t scan_len = end - start;
    if (!scan_len) {
        return HWLM_SUCCESS;
    }
    assert(scan_len <= 64);
    __mmask64 k = (~0ULL) >> (64 - scan_len);
    DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len);

    m512 v = loadu_maskz_m512(k, d);
    if (noCase) {
        v = and512(v, caseMask);
    }

    u64a z0 = masked_eq512mask(k, mask1, v);
    u64a z1 = masked_eq512mask(k, mask2, v);
    u64a z = (*lastz0 | (z0 << 1)) & z1;
    DEBUG_PRINTF("z 0x%016llx\n", z);

    DOUBLE_ZSCAN();
    *lastz0 = z0 >> (scan_len - 1);
    return HWLM_SUCCESS;
}
```
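The bit algebra here is easiest to verify in scalar form. In the sketch below (names hypothetical), bit `i` of `z0`/`z1` records that byte `i` matches the first/second pattern byte; shifting `z0` left by one lines up "first byte at `i`" with "second byte at `i + 1`", and `lastz0` carries a first-byte match on the final byte of one chunk into the next:

```c
#include <stdint.h>
#include <stdio.h>

/* scalar model of the double-byte match step over a chunk of n <= 64 bytes */
static uint64_t match_pairs(const uint8_t *d, size_t n, uint8_t c0, uint8_t c1,
                            uint64_t *lastz0) {
    uint64_t z0 = 0, z1 = 0;
    for (size_t i = 0; i < n; i++) {
        z0 |= (uint64_t)(d[i] == c0) << i;
        z1 |= (uint64_t)(d[i] == c1) << i;
    }
    uint64_t z = (*lastz0 | (z0 << 1)) & z1; /* bit i => pair ends at byte i */
    *lastz0 = z0 >> (n - 1); /* first-byte match on the last byte carries over */
    return z;
}

int main(void) {
    uint64_t lastz0 = 0;
    /* "ab" straddles the chunks: 'a' ends the first, 'b' opens the second */
    uint64_t za = match_pairs((const uint8_t *)"xya", 3, 'a', 'b', &lastz0);
    uint64_t zb = match_pairs((const uint8_t *)"bcd", 3, 'a', 'b', &lastz0);
    printf("za=0x%llx zb=0x%llx\n", /* expect za == 0 and zb == 1 */
           (unsigned long long)za, (unsigned long long)zb);
    return 0;
}
```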
```c
static really_inline
hwlm_error_t scanDouble512(const struct noodTable *n, const u8 *buf, size_t len,
                           bool noCase, m512 caseMask, m512 mask1, m512 mask2,
                           const struct cb_info *cbi, size_t start,
                           size_t end) {
    const u8 *d = buf + start;
    const u8 *e = buf + end;
    u64a lastz0 = 0;
    DEBUG_PRINTF("start %zu end %zu \n", start, end);
    assert(d < e);
    if (d + 64 >= e) {
        goto tail;
    }

    // peel off first part to cacheline boundary
    const u8 *d1 = ROUNDUP_PTR(d, 64);
    if (scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
                        &lastz0, start, d1 - buf) == HWLM_TERMINATED) {
        return HWLM_TERMINATED;
    }
    d = d1;

    for (; d + 64 < e; d += 64) {
        DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0);
        m512 v = noCase ? and512(load512(d), caseMask) : load512(d);

        /* we have to pull the masks out of the AVX registers because we can't
           byte shift between the lanes */
        u64a z0 = eq512mask(mask1, v);
        u64a z1 = eq512mask(mask2, v);
        u64a z = (lastz0 | (z0 << 1)) & z1;
        lastz0 = z0 >> 63;

        // On large packet buffers, this prefetch appears to get us about 2%.
        __builtin_prefetch(d + 256);

        DEBUG_PRINTF("z 0x%016llx\n", z);

        DOUBLE_ZSCAN();
    }

tail:
    DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf);
    // finish off tail

    return scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi,
                           &lastz0, d - buf, end);
}
```
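`ROUNDUP_PTR` lives in hyperscan's util headers; the peel step exists so that every `load512()` in the main loop is a 64-byte-aligned, full-cacheline load, with `scanDoubleShort` covering the ragged head and seeding `lastz0` for the loop. A plausible sketch of the rounding it performs, as an assumption rather than the actual macro:

```c
#include <stdint.h>

/* Round d up to the next 64-byte (cacheline) boundary. Hypothetical
 * stand-in for ROUNDUP_PTR(d, 64); if d is already aligned it is
 * returned unchanged, so the peel scan may cover zero bytes. */
static const uint8_t *roundup_ptr64(const uint8_t *d) {
    return (const uint8_t *)(((uintptr_t)d + 63) & ~(uintptr_t)63);
}
```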