/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "fdr.h"
#include "fdr_confirm.h"
#include "fdr_confirm_runtime.h"
#include "fdr_internal.h"
#include "fdr_loadval.h"
#include "flood_runtime.h"
#include "scratch.h"
#include "teddy.h"
#include "teddy_internal.h"
#include "util/arch.h"
#include "util/simd_utils.h"
#include "util/uniform_ops.h"

/** \brief number of bytes processed in each iteration */
#define ITER_BYTES      16

/** \brief total zone buffer size */
#define ZONE_TOTAL_SIZE 64

/** \brief maximum number of allowed zones */
#define ZONE_MAX        3

/** \brief Zone information.
 *
 * A zone represents a region of data to scan in FDR.
 *
 * The incoming buffer is split into multiple zones to ensure two properties:
 * 1: that we can safely read up to 8 bytes behind the current position to
 *    generate a hash, and
 * 2: that we can read the 3 bytes after the current byte (domain > 8).
 */
struct zone {
    /** \brief copied buffer, used only when it is a boundary zone. */
    u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];

    /** \brief shift amount for fdr state to avoid unwanted match. */
    u8 shift;

    /** \brief if boundary zone, start points into the zone buffer after the
     * pre-padding. Otherwise, points to the main buffer, appropriately. */
    const u8 *start;

    /** \brief if boundary zone, end points to the end of the zone. Otherwise,
     * points to the main buffer, appropriately. */
    const u8 *end;

    /** \brief the amount to adjust to go from a pointer in the zone's region
     * (between start and end) to a pointer in the original data buffer. */
    ptrdiff_t zone_pointer_adjust;

    /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
     * otherwise the end of the zone buf. floodPtr always points inside the
     * same buffer as the start pointer. */
    const u8 *floodPtr;
};
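
/*
 * Illustrative layout of a boundary zone buffer (a sketch, not normative;
 * the exact offsets are computed by the create*Zone() helpers below). For a
 * standalone scan of the four bytes "abcd", createShortZone() produces
 * roughly:
 *
 *   offset:   0                 4               16        20  21
 *   z->buf:   [ 16 history bytes               ][ a b c d ][\0][ unused... ]
 *                               ^z->start                  ^z->end
 *
 * z->start is z->end - ITER_BYTES, so the first z->shift == 12 bytes of the
 * iteration fall in history; the shift/or-mask machinery in
 * fdr_engine_exec() suppresses any matches reported from those bytes.
 * z->zone_pointer_adjust converts a pointer inside z->buf back to the
 * corresponding position in the caller's original buffer for reporting.
 */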

static const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
};

/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
 * so we force its generation. */
static really_inline
u64a andn(const u32 a, const u8 *b) {
    u64a r;
#if defined(HAVE_BMI) && !defined(NO_ASM)
    __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
#else
    r = unaligned_load_u32(b) & ~a;
#endif
    return r;
}
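
/*
 * A note on usage (a sketch of the arithmetic, based on the callers below):
 * the stride functions pass domain_mask_flipped == ~fdr->domainMask, so
 * andn(~mask, p) == unaligned_load_u32(p) & mask. With a 12-bit domain, for
 * example, the mask would be 0xfff and the result is the low 12 bits of the
 * little-endian word at p - the index used into the reach table ft.
 */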

/* generates an initial state mask based on the last byte-ish of history
 * rather than being all accepting. If there is no history to consider, the
 * state is generated based on the minimum length of each bucket in order to
 * prevent confirms. */
static really_inline
m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
                  const struct zone *z) {
    m128 s;
    if (len_history) {
        /* +1: the zones ensure that we can read the byte at z->end */
        u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
        tmp &= fdr->domainMask;
        s = load_m128_from_u64a(ft + tmp);
        s = rshiftbyte_m128(s, 1);
    } else {
        s = fdr->start;
    }
    return s;
}

static really_inline
void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
                       UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
                       const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
    /* +1: the zones ensure that we can read the byte at z->end */
    assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);

    u64a reach0 = andn(domain_mask_flipped, itPtr);
    u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
    u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
    u64a reach3 = andn(domain_mask_flipped, itPtr + 3);

    m128 st0 = load_m128_from_u64a(ft + reach0);
    m128 st1 = load_m128_from_u64a(ft + reach1);
    m128 st2 = load_m128_from_u64a(ft + reach2);
    m128 st3 = load_m128_from_u64a(ft + reach3);

    u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
    u64a reach5 = andn(domain_mask_flipped, itPtr + 5);
    u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
    u64a reach7 = andn(domain_mask_flipped, itPtr + 7);

    m128 st4 = load_m128_from_u64a(ft + reach4);
    m128 st5 = load_m128_from_u64a(ft + reach5);
    m128 st6 = load_m128_from_u64a(ft + reach6);
    m128 st7 = load_m128_from_u64a(ft + reach7);

    st1 = lshiftbyte_m128(st1, 1);
    st2 = lshiftbyte_m128(st2, 2);
    st3 = lshiftbyte_m128(st3, 3);
    st4 = lshiftbyte_m128(st4, 4);
    st5 = lshiftbyte_m128(st5, 5);
    st6 = lshiftbyte_m128(st6, 6);
    st7 = lshiftbyte_m128(st7, 7);

    st0 = or128(st0, st1);
    st2 = or128(st2, st3);
    st4 = or128(st4, st5);
    st6 = or128(st6, st7);
    st0 = or128(st0, st2);
    st4 = or128(st4, st6);
    st0 = or128(st0, st4);
    *s = or128(*s, st0);

    *conf0 = movq(*s);
    *s = rshiftbyte_m128(*s, 8);
    *conf0 ^= ~0ULL;

    u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
    u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
    u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
    u64a reach11 = andn(domain_mask_flipped, itPtr + 11);

    m128 st8 = load_m128_from_u64a(ft + reach8);
    m128 st9 = load_m128_from_u64a(ft + reach9);
    m128 st10 = load_m128_from_u64a(ft + reach10);
    m128 st11 = load_m128_from_u64a(ft + reach11);

    u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
    u64a reach13 = andn(domain_mask_flipped, itPtr + 13);
    u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
    u64a reach15 = andn(domain_mask_flipped, itPtr + 15);

    m128 st12 = load_m128_from_u64a(ft + reach12);
    m128 st13 = load_m128_from_u64a(ft + reach13);
    m128 st14 = load_m128_from_u64a(ft + reach14);
    m128 st15 = load_m128_from_u64a(ft + reach15);

    st9 = lshiftbyte_m128(st9, 1);
    st10 = lshiftbyte_m128(st10, 2);
    st11 = lshiftbyte_m128(st11, 3);
    st12 = lshiftbyte_m128(st12, 4);
    st13 = lshiftbyte_m128(st13, 5);
    st14 = lshiftbyte_m128(st14, 6);
    st15 = lshiftbyte_m128(st15, 7);

    st8 = or128(st8, st9);
    st10 = or128(st10, st11);
    st12 = or128(st12, st13);
    st14 = or128(st14, st15);
    st8 = or128(st8, st10);
    st12 = or128(st12, st14);
    st8 = or128(st8, st12);
    *s = or128(*s, st8);

    *conf8 = movq(*s);
    *s = rshiftbyte_m128(*s, 8);
    *conf8 ^= ~0ULL;
}
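
/*
 * How the conf words are consumed (a summary inferred from do_confirm_fdr()
 * below): after the or-tree above, a zero bit in the low eight bytes of the
 * state means "some bucket may have a literal ending here". movq() extracts
 * those 8 bytes and the ^= ~0ULL flips them so candidate positions become
 * 1-bits, letting findAndClearLSB_64() walk them. Bit b of conf0/conf8
 * encodes byte (b / 8 + offset) within the 16-byte block and bucket (b % 8).
 */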

static really_inline
void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
                       UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
                       const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
    assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);

    u64a reach0 = andn(domain_mask_flipped, itPtr);
    u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
    u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
    u64a reach6 = andn(domain_mask_flipped, itPtr + 6);

    m128 st0 = load_m128_from_u64a(ft + reach0);
    m128 st2 = load_m128_from_u64a(ft + reach2);
    m128 st4 = load_m128_from_u64a(ft + reach4);
    m128 st6 = load_m128_from_u64a(ft + reach6);

    u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
    u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
    u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
    u64a reach14 = andn(domain_mask_flipped, itPtr + 14);

    m128 st8 = load_m128_from_u64a(ft + reach8);
    m128 st10 = load_m128_from_u64a(ft + reach10);
    m128 st12 = load_m128_from_u64a(ft + reach12);
    m128 st14 = load_m128_from_u64a(ft + reach14);

    st2 = lshiftbyte_m128(st2, 2);
    st4 = lshiftbyte_m128(st4, 4);
    st6 = lshiftbyte_m128(st6, 6);

    *s = or128(*s, st0);
    *s = or128(*s, st2);
    *s = or128(*s, st4);
    *s = or128(*s, st6);

    *conf0 = movq(*s);
    *s = rshiftbyte_m128(*s, 8);
    *conf0 ^= ~0ULL;

    st10 = lshiftbyte_m128(st10, 2);
    st12 = lshiftbyte_m128(st12, 4);
    st14 = lshiftbyte_m128(st14, 6);

    *s = or128(*s, st8);
    *s = or128(*s, st10);
    *s = or128(*s, st12);
    *s = or128(*s, st14);

    *conf8 = movq(*s);
    *s = rshiftbyte_m128(*s, 8);
    *conf8 ^= ~0ULL;
}

static really_inline
void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
                       UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
                       const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
    assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);

    u64a reach0 = andn(domain_mask_flipped, itPtr);
    u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
    u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
    u64a reach12 = andn(domain_mask_flipped, itPtr + 12);

    m128 st0 = load_m128_from_u64a(ft + reach0);
    m128 st4 = load_m128_from_u64a(ft + reach4);
    m128 st8 = load_m128_from_u64a(ft + reach8);
    m128 st12 = load_m128_from_u64a(ft + reach12);

    st4 = lshiftbyte_m128(st4, 4);
    st12 = lshiftbyte_m128(st12, 4);

    *s = or128(*s, st0);
    *s = or128(*s, st4);
    *conf0 = movq(*s);
    *s = rshiftbyte_m128(*s, 8);
    *conf0 ^= ~0ULL;

    *s = or128(*s, st8);
    *s = or128(*s, st12);
    *conf8 = movq(*s);
    *s = rshiftbyte_m128(*s, 8);
    *conf8 ^= ~0ULL;
}
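
/*
 * The three get_conf_stride_* variants differ only in how many positions per
 * 16-byte block feed the shift-or state: stride 1 hashes all 16 positions,
 * stride 2 every other byte, stride 4 every fourth. The stride is baked into
 * the compiled FDR structure (fdr->stride, dispatched on in fdr_engine_exec()
 * below); larger strides perform half or a quarter of the table loads per
 * block at the cost of coarser candidate positions.
 */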

static really_inline
void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
                    const u32 *confBase, const struct FDR_Runtime_Args *a,
                    const u8 *ptr, u32 *last_match_id, struct zone *z) {
    const u8 bucket = 8;

    if (likely(!*conf)) {
        return;
    }

    /* ptr is currently referring to a location in the zone's buffer; we also
     * need a pointer in the original, main buffer for the final string
     * compare. */
    const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);

    const u8 *confLoc = ptr;

    do {
        u32 bit = findAndClearLSB_64(conf);
        u32 byte = bit / bucket + offset;
        u32 bitRem = bit % bucket;
        u32 idx = bitRem;
        u32 cf = confBase[idx];
        if (!cf) {
            continue;
        }
        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
                                        ((const u8 *)confBase + cf);
        if (!(fdrc->groups & *control)) {
            continue;
        }
        u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
        confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
                    last_match_id, confVal, conf, bit);
    } while (unlikely(!!*conf));
}

static really_inline
void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
#ifdef DEBUG
    DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
    DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
                 z->start, z->end, z->shift);
    DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n",
                 z->zone_pointer_adjust, z->floodPtr);
    DEBUG_PRINTF("zone buf:");
    for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) {
        if (i % 8 == 0) {
            printf("_");
        }
        if (z->buf[i]) {
            printf("%02x", z->buf[i]);
        } else {
            printf("..");
        }
    }
    printf("\n");
#endif
}

/**
 * \brief Updates attributes for a non-boundary region zone.
 */
static really_inline
void createMainZone(const u8 *flood, const u8 *begin, const u8 *end,
                    struct zone *z) {
    z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */
    z->start = begin;
    z->end = end;
    z->floodPtr = flood;
    z->shift = 0;
}
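
/*
 * Note: the main zone is the only zone that scans the caller's buffer in
 * place (zone_pointer_adjust == 0 and z->buf is unused), and it is the only
 * zone with flood detection armed. The boundary zones built below copy their
 * data into z->buf and park floodPtr at the end of the zone buffer, so the
 * flood check in FDR_MAIN_LOOP() can never fire for them.
 */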

/**
 * \brief Create zone for short cases (<= ITER_BYTES).
 *
 * For this case we need to copy everything into the zone's internal buffer.
 *
 * We need to ensure that we run over real data if it exists (in history or
 * before zone begin). We also need to ensure that the 8 bytes before any data
 * being matched can be read (to perform a conf hash).
 *
 * We also need to ensure that the data at z->end can be read.
 *
 * Hence, the zone consists of:
 * 16 bytes of history,
 * 1-24 bytes of data from the buffer (ending at end),
 * 1 byte of final padding
 */
static really_inline
void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin,
                     const u8 *end, struct zone *z) {
    /* the floodPtr for boundary zones is set to the end of the zone buf to
     * avoid the flood checks within boundary zones. */
    z->floodPtr = z->buf + ZONE_TOTAL_SIZE;

    ptrdiff_t z_len = end - begin;
    assert(z_len > 0);
    assert(z_len <= ITER_BYTES);

    z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */

    static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */

    /* we are guaranteed to always have 16 initialised bytes at the end of
     * the history buffer (they may be garbage coming from the stream state
     * preceding hbuf, but bytes that don't correspond to actual history
     * shouldn't affect computations). */
    *(m128 *)z->buf = loadu128(hend - sizeof(m128));

    /* The amount of data we have to copy from main buffer. */
    size_t copy_len = MIN((size_t)(end - buf),
                          ITER_BYTES + sizeof(CONF_TYPE));

    u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET;
    switch (copy_len) {
    case 1:
        *zone_data = *(end - 1);
        break;
    case 2:
        *(u16 *)zone_data = unaligned_load_u16(end - 2);
        break;
    case 3:
        *(u16 *)zone_data = unaligned_load_u16(end - 3);
        *(zone_data + 2) = *(end - 1);
        break;
    case 4:
        *(u32 *)zone_data = unaligned_load_u32(end - 4);
        break;
    case 5:
    case 6:
    case 7:
        /* perform copy with 2 overlapping 4-byte chunks from buf. */
        *(u32 *)zone_data = unaligned_load_u32(end - copy_len);
        unaligned_store_u32(zone_data + copy_len - sizeof(u32),
                            unaligned_load_u32(end - sizeof(u32)));
        break;
    case 8:
        *(u64a *)zone_data = unaligned_load_u64a(end - 8);
        break;
    case 9:
    case 10:
    case 11:
    case 12:
    case 13:
    case 14:
    case 15:
        /* perform copy with 2 overlapping 8-byte chunks from buf. */
        *(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
        unaligned_store_u64a(zone_data + copy_len - sizeof(u64a),
                             unaligned_load_u64a(end - sizeof(u64a)));
        break;
    case 16:
        /* copy 16 bytes from buf. */
        *(m128 *)zone_data = loadu128(end - 16);
        break;
    default:
        assert(copy_len <= sizeof(m128) + sizeof(u64a));
        /* perform copy with (potentially overlapping) 8-byte and 16-byte
         * chunks. */
        *(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
        storeu128(zone_data + copy_len - sizeof(m128),
                  loadu128(end - sizeof(m128)));
        break;
    }

    /* set the start and end location of the zone buf
     * to be scanned */
    u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len;
    assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES);

    /* write the post-padding byte; this is required for domain > 8 due to
     * overhang */
    assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64);
    *z_end = 0;

    z->end = z_end;
    z->start = z_end - ITER_BYTES;
    z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
    assert(z->start + z->shift == z_end - z_len);
}
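
/*
 * Worked example of the overlapping-chunk copies above (illustrative): for
 * copy_len == 6, the first u32 store writes destination bytes [0,4) from
 * end - 6, and the second writes bytes [2,6) from end - 4. The two stores
 * overlap by two bytes but together cover all six, so every length in 5..7
 * is handled with exactly two unaligned 4-byte operations instead of a byte
 * loop. The 8-byte and 16-byte cases use the same trick at larger widths.
 */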

/**
 * \brief Create a zone for the start region.
 *
 * This function requires that there is > ITER_BYTES of data in the buffer to
 * scan. The start zone itself is always responsible for scanning exactly
 * ITER_BYTES of data - there are no warmup/junk bytes scanned.
 *
 * This zone ensures that the byte at z->end can be read and corresponds to
 * the next byte of data.
 *
 * 8 bytes of history data are provided before z->start to allow proper hash
 * generation in streaming mode. If buf != begin, up to 8 bytes of data prior
 * to begin are also provided.
 *
 * Although we are not interested in bare literals which start before begin
 * if buf != begin, lookarounds associated with the literal may require
 * the data prior to begin for hash purposes.
 */
static really_inline
void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin,
                     struct zone *z) {
    assert(ITER_BYTES == sizeof(m128));
    assert(sizeof(CONF_TYPE) == 8);
    static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE);

    const u8 *end = begin + ITER_BYTES;

    /* set floodPtr to the end of zone buf to avoid checks in start zone */
    z->floodPtr = z->buf + ZONE_TOTAL_SIZE;

    z->shift = 0; /* we are processing ITER_BYTES of real data */

    /* we are guaranteed to always have 16 initialised bytes at the end of the
     * history buffer (they may be garbage coming from the stream state
     * preceding hbuf, but bytes that don't correspond to actual history
     * shouldn't affect computations). However, for start zones, history is
     * only required for conf hash purposes so we only need 8 bytes */
    unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a)));

    /* The amount of data we have to copy from main buffer. */
    size_t copy_len = MIN((size_t)(end - buf),
                          ITER_BYTES + sizeof(CONF_TYPE));
    assert(copy_len >= 16);

    /* copy the post-padding byte; this is required for domain > 8 due to
     * overhang. The start zone requires that there is data after the zone, so
     * it is safe to dereference end. */
    z->buf[ZONE_START_BEGIN + copy_len] = *end;

    /* set the start and end location of the zone buf to be scanned */
    u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len;
    z->end = z_end;
    z->start = z_end - ITER_BYTES;

    /* copy the first 8 bytes of the valid region */
    unaligned_store_u64a(z->buf + ZONE_START_BEGIN,
                         unaligned_load_u64a(end - copy_len));

    /* copy the last 16 bytes, may overlap with the previous 8 byte write */
    storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));

    z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);

    assert(ZONE_START_BEGIN + copy_len + 3 < 64);
}

/**
 * \brief Create a zone for the end region.
 *
 * This function requires that there is > ITER_BYTES of data in the buffer to
 * scan. The end zone is responsible for scanning the <= ITER_BYTES rump of
 * data and an optional full ITER_BYTES block. The main zone cannot handle the
 * last 3 bytes of the buffer, so when there are fewer than 3 bytes left for
 * the main zone to scan, the end zone takes over an additional full
 * ITER_BYTES from it; the main zone size is reduced by ITER_BYTES in this
 * case.
 *
 * This zone ensures that the byte at z->end can be read by filling it with a
 * padding character.
 *
 * Up to 8 bytes of data prior to begin are also provided for the purposes of
 * generating hashes. History is not copied, as all locations which require
 * history for generating a hash are the responsibility of the start zone.
 */
static really_inline
void createEndZone(const u8 *buf, const u8 *begin, const u8 *end,
                   struct zone *z) {
    /* the floodPtr for boundary zones is set to the end of the zone buf to
     * avoid the flood checks within boundary zones. */
    z->floodPtr = z->buf + ZONE_TOTAL_SIZE;

    ptrdiff_t z_len = end - begin;
    assert(z_len > 0);
    size_t iter_bytes_second = 0;
    size_t z_len_first = z_len;
    if (z_len > ITER_BYTES) {
        z_len_first = z_len - ITER_BYTES;
        iter_bytes_second = ITER_BYTES;
    }
    z->shift = ITER_BYTES - z_len_first;

    const u8 *end_first = end - iter_bytes_second;
    /* The amount of data we have to copy from main buffer for the
     * first iteration. */
    size_t copy_len_first = MIN((size_t)(end_first - buf),
                                ITER_BYTES + sizeof(CONF_TYPE));
    assert(copy_len_first >= 16);

    size_t total_copy_len = copy_len_first + iter_bytes_second;
    assert(total_copy_len + 3 < 64);

    /* write the post-padding byte; this is required for domain > 8 due to
     * overhang */
    z->buf[total_copy_len] = 0;

    /* set the start and end location of the zone buf
     * to be scanned */
    u8 *z_end = z->buf + total_copy_len;
    z->end = z_end;
    z->start = z_end - ITER_BYTES - iter_bytes_second;
    assert(z->start + z->shift == z_end - z_len);

    u8 *z_end_first = z_end - iter_bytes_second;
    /* copy the first 8 bytes of the valid region */
    unaligned_store_u64a(z->buf,
                         unaligned_load_u64a(end_first - copy_len_first));

    /* copy the last 16 bytes, may overlap with the previous 8 byte write */
    storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128)));
    if (iter_bytes_second) {
        storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
    }

    z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
}
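
/*
 * Sketch of the two cases handled above: if the rump is <= ITER_BYTES, the
 * end zone performs a single (shifted) iteration over it. If the rump is
 * larger, iter_bytes_second == ITER_BYTES and the zone buffer holds two
 * blocks - the copied tail of the main region followed by the final bytes -
 * so FDR_MAIN_LOOP() runs twice over this zone, with the shift suppressing
 * only already-scanned bytes in the first block.
 */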

/**
 * \brief Prepare zones.
 *
 * This function prepares zones with the actual buffer and some padding bytes.
 * The actual ITER_BYTES bytes in a zone are preceded by main buf and/or
 * history buf, and succeeded by padding bytes possibly from main buf,
 * if available.
 */
static really_inline
size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
                    size_t start, const u8 *flood, struct zone *zoneArr) {
    const u8 *ptr = buf + start;
    size_t remaining = len - start;

    if (remaining <= ITER_BYTES) {
        /* enough bytes to make only one zone */
        createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]);
        return 1;
    }

    /* enough bytes to make more than one zone */

    size_t numZone = 0;
    createStartZone(buf, hend, ptr, &zoneArr[numZone++]);
    ptr += ITER_BYTES;

    assert(ptr < buf + len);

    /* find the maximum buffer location that the main zone can scan
     * - must be a multiple of ITER_BYTES, and
     * - cannot contain the last 3 bytes (due to the 3 bytes read beyond the
     *   end of the buffer in the FDR main loop)
     */
    const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES);

    /* create a zone if a multiple of ITER_BYTES is found */
    if (main_end > ptr) {
        createMainZone(flood, ptr, main_end, &zoneArr[numZone++]);
        ptr = main_end;
    }

    /* create a zone with the rest of the data from the main buffer */
    createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]);
    return numZone;
}
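
/*
 * Worked example (illustrative): scanning a 100-byte buffer from offset 0,
 * prepareZones() builds a start zone over bytes [0,16), a main zone over
 * [16,96) (ROUNDDOWN_N(100 - 0 - 3, 16) == 96, keeping the last 3 bytes out
 * of the main loop's read range) and an end zone over [96,100) - three zones
 * in total, which is why ZONE_MAX is 3.
 */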

#define INVALID_MATCH_ID (~0U)

#define FDR_MAIN_LOOP(zz, s, get_conf_fn)                                   \
    do {                                                                    \
        const u8 *tryFloodDetect = zz->floodPtr;                            \
        const u8 *start_ptr = zz->start;                                    \
        const u8 *end_ptr = zz->end;                                        \
                                                                            \
        for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr;    \
             itPtr += ITER_BYTES) {                                         \
            if (unlikely(itPtr > tryFloodDetect)) {                         \
                tryFloodDetect = floodDetect(fdr, a, &itPtr,                \
                                             tryFloodDetect, &floodBackoff, \
                                             &control, ITER_BYTES);         \
                if (unlikely(control == HWLM_TERMINATE_MATCHING)) {         \
                    return HWLM_TERMINATED;                                 \
                }                                                           \
            }                                                               \
            __builtin_prefetch(itPtr + ITER_BYTES);                         \
            u64a conf0;                                                     \
            u64a conf8;                                                     \
            get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped,    \
                        ft, &conf0, &conf8, &s);                            \
            do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr,        \
                           &last_match_id, zz);                             \
            do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr,        \
                           &last_match_id, zz);                             \
            if (unlikely(control == HWLM_TERMINATE_MATCHING)) {             \
                return HWLM_TERMINATED;                                     \
            }                                                               \
        } /* end for loop */                                                \
    } while (0)

static never_inline
hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
                             const struct FDR_Runtime_Args *a,
                             hwlm_group_t control) {
    assert(ISALIGNED_CL(fdr));

    u32 floodBackoff = FLOOD_BACKOFF_START;
    u32 last_match_id = INVALID_MATCH_ID;
    u32 domain_mask_flipped = ~fdr->domainMask;
    u8 stride = fdr->stride;
    const u64a *ft =
        (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
    assert(ISALIGNED_CL(ft));
    const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset);
    assert(ISALIGNED_CL(confBase));
    struct zone zones[ZONE_MAX];
    assert(fdr->domain > 8 && fdr->domain < 16);

    size_t numZone = prepareZones(a->buf, a->len,
                                  a->buf_history + a->len_history,
                                  a->start_offset, a->firstFloodDetect,
                                  zones);
    assert(numZone <= ZONE_MAX);
    m128 state = getInitState(fdr, a->len_history, ft, &zones[0]);

    for (size_t curZone = 0; curZone < numZone; curZone++) {
        struct zone *z = &zones[curZone];
        dumpZoneInfo(z, curZone);

        /* When a zone contains less data than is processed in an iteration
         * of FDR_MAIN_LOOP(), we need to scan over some extra data.
         *
         * We have chosen to scan this extra data at the start of the
         * iteration. The extra data is either data we have already scanned
         * or garbage (if it is earlier than offset 0).
         *
         * As a result we need to shift the incoming state back so that it
         * will properly line up with the data being scanned.
         *
         * We also need to forbid reporting any matches in the data being
         * rescanned, as they have already been reported (or are over garbage,
         * though later stages should also provide that safety guarantee). */

        u8 shift = z->shift;

        state = variable_byte_shift_m128(state, shift);

        state = or128(state, load128(zone_or_mask[shift]));

        switch (stride) {
        case 1:
            FDR_MAIN_LOOP(z, state, get_conf_stride_1);
            break;
        case 2:
            FDR_MAIN_LOOP(z, state, get_conf_stride_2);
            break;
        case 4:
            FDR_MAIN_LOOP(z, state, get_conf_stride_4);
            break;
        default:
            break;
        }
    }

    return HWLM_SUCCESS;
}

#if defined(HAVE_AVX2)
#define ONLY_AVX2(func) func
#else
#define ONLY_AVX2(func) NULL
#endif

typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr,
                                    const struct FDR_Runtime_Args *a,
                                    hwlm_group_t control);

static const FDRFUNCTYPE funcs[] = {
    fdr_engine_exec,
    NULL, /* old: fast teddy */
    NULL, /* old: fast teddy */
    ONLY_AVX2(fdr_exec_fat_teddy_msks1),
    ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck),
    ONLY_AVX2(fdr_exec_fat_teddy_msks2),
    ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck),
    ONLY_AVX2(fdr_exec_fat_teddy_msks3),
    ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck),
    ONLY_AVX2(fdr_exec_fat_teddy_msks4),
    ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck),
    fdr_exec_teddy_msks1,
    fdr_exec_teddy_msks1_pck,
    fdr_exec_teddy_msks2,
    fdr_exec_teddy_msks2_pck,
    fdr_exec_teddy_msks3,
    fdr_exec_teddy_msks3_pck,
    fdr_exec_teddy_msks4,
    fdr_exec_teddy_msks4_pck,
};

#define FAKE_HISTORY_SIZE 16
static const u8 fake_history[FAKE_HISTORY_SIZE];

hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
                     size_t start, HWLMCallback cb,
                     struct hs_scratch *scratch, hwlm_group_t groups) {
    // We guarantee (for safezone construction) that it is safe to read 16
    // bytes before the end of the history buffer.
    const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE;

    const struct FDR_Runtime_Args a = {
        buf,
        len,
        hbuf,
        0,
        start,
        cb,
        scratch,
        nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
        0
    };
    if (unlikely(a.start_offset >= a.len)) {
        return HWLM_SUCCESS;
    } else {
        assert(funcs[fdr->engineID]);
        return funcs[fdr->engineID](fdr, &a, groups);
    }
}

hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
                              size_t hlen, const u8 *buf, size_t len,
                              size_t start, HWLMCallback cb,
                              struct hs_scratch *scratch,
                              hwlm_group_t groups) {
    struct FDR_Runtime_Args a = {
        buf,
        len,
        hbuf,
        hlen,
        start,
        cb,
        scratch,
        nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
        /* we are guaranteed to always have 16 initialised bytes at the end
         * of the history buffer (they may be garbage). */
        hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0
    };

    hwlm_error_t ret;
    if (unlikely(a.start_offset >= a.len)) {
        ret = HWLM_SUCCESS;
    } else {
        assert(funcs[fdr->engineID]);
        ret = funcs[fdr->engineID](fdr, &a, groups);
    }
    return ret;
}
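
/*
 * Illustrative block-mode call (hypothetical callback and data names; real
 * callers sit in the hwlm layer):
 *
 *   hwlm_error_t rv = fdrExec(fdr, data, data_len, 0, onMatch, scratch,
 *                             HWLM_ALL_GROUPS);
 *
 * fdrExec() substitutes the static fake_history buffer for real history so
 * that zone construction may always read 16 bytes "before" the scanned
 * buffer, even in block mode where no stream state exists.
 */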