path: root/contrib/libs/hyperscan/src/fdr/fdr.c
author      Ivan Blinkov <ivan@blinkov.ru>  2022-02-10 16:47:10 +0300
committer   Daniil Cherednik <dcherednik@yandex-team.ru>  2022-02-10 16:47:10 +0300
commit      1aeb9a455974457866f78722ad98114bafc84e8a (patch)
tree        e4340eaf1668684d83a0a58c36947c5def5350ad /contrib/libs/hyperscan/src/fdr/fdr.c
parent      bd5ef432f5cfb1e18851381329d94665a4c22470 (diff)
download    ydb-1aeb9a455974457866f78722ad98114bafc84e8a.tar.gz
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/fdr/fdr.c')
-rw-r--r--    contrib/libs/hyperscan/src/fdr/fdr.c    1620
1 file changed, 810 insertions, 810 deletions
diff --git a/contrib/libs/hyperscan/src/fdr/fdr.c b/contrib/libs/hyperscan/src/fdr/fdr.c
index d33756d358..8fd5bcb6fd 100644
--- a/contrib/libs/hyperscan/src/fdr/fdr.c
+++ b/contrib/libs/hyperscan/src/fdr/fdr.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@@ -27,818 +27,818 @@
*/
#include "fdr.h"
-#include "fdr_confirm.h"
-#include "fdr_confirm_runtime.h"
+#include "fdr_confirm.h"
+#include "fdr_confirm_runtime.h"
#include "fdr_internal.h"
-#include "fdr_loadval.h"
-#include "flood_runtime.h"
-#include "scratch.h"
-#include "teddy.h"
+#include "fdr_loadval.h"
+#include "flood_runtime.h"
+#include "scratch.h"
+#include "teddy.h"
#include "teddy_internal.h"
-#include "util/arch.h"
-#include "util/simd_utils.h"
-#include "util/uniform_ops.h"
-
-/** \brief number of bytes processed in each iteration */
-#define ITER_BYTES 16
-
-/** \brief total zone buffer size */
-#define ZONE_TOTAL_SIZE 64
-
-/** \brief maximum number of allowed zones */
-#define ZONE_MAX 3
-
-/** \brief zone information.
- *
- * Zone represents a region of data to scan in FDR.
- *
- * The incoming buffer is split into multiple zones to ensure two properties:
- * 1: that we can read 8 bytes behind to generate a hash safely
- * 2: that we can read the 3 bytes after the current byte (domain > 8)
- */
-struct zone {
- /** \brief copied buffer, used only when it is a boundary zone. */
- u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
-
- /** \brief shift amount for fdr state to avoid unwanted match. */
- u8 shift;
-
- /** \brief if boundary zone, start points into the zone buffer after the
- * pre-padding. Otherwise, points to the main buffer, appropriately. */
- const u8 *start;
-
- /** \brief if boundary zone, end points to the end of zone. Otherwise,
- * pointer to the main buffer, appropriately. */
- const u8 *end;
-
- /** \brief the amount to adjust to go from a pointer in the zones region
- * (between start and end) to a pointer in the original data buffer. */
- ptrdiff_t zone_pointer_adjust;
-
- /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
- * otherwise end of the zone buf. floodPtr always points inside the same
- * buffer as the start pointer. */
- const u8 *floodPtr;
-};
-
-static
-const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
- { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
- { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
- { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
-};
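
Each row s of zone_or_mask above has its first s bytes set to 0xff, forcing
the state bits that cover rescanned bytes so that no match can be reported
there; the final row is left clear. A minimal sketch that reproduces the
table (illustrative standalone code, not part of this file):

    #include <stdint.h>
    #include <string.h>

    #define ITER_BYTES_SK 16 /* local stand-in for ITER_BYTES */

    /* Rows 0..15 get their first s bytes set to 0xff; row 16 stays zero,
     * matching the table above. */
    static void build_zone_or_mask(uint8_t m[ITER_BYTES_SK + 1][ITER_BYTES_SK]) {
        memset(m, 0, (ITER_BYTES_SK + 1) * ITER_BYTES_SK);
        for (int s = 0; s < ITER_BYTES_SK; s++) {
            memset(m[s], 0xff, (size_t)s); /* mask off s rescanned bytes */
        }
    }
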
-
-/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
- * so we force its generation.
- */
-static really_inline
-u64a andn(const u32 a, const u8 *b) {
- u64a r;
-#if defined(HAVE_BMI) && !defined(NO_ASM)
- __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
-#else
- r = unaligned_load_u32(b) & ~a;
-#endif
- return r;
-}
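
A minimal standalone sketch of what the portable fallback above computes
(memcpy stands in for unaligned_load_u32; names are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Unaligned 32-bit load, then clear the bits of the flipped domain
     * mask so only in-domain bits survive. */
    static uint64_t andn_sketch(uint32_t domain_mask_flipped,
                                const unsigned char *b) {
        uint32_t v;
        memcpy(&v, b, sizeof(v)); /* safe unaligned read */
        return v & ~domain_mask_flipped;
    }
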
-
-/* generates an initial state mask based on the last byte-ish of history rather
- * than being all accepting. If there is no history to consider, the state is
- * generated based on the minimum length of each bucket in order to prevent
- * confirms.
- */
-static really_inline
-m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
- const struct zone *z) {
- m128 s;
- if (len_history) {
- /* +1: the zones ensure that we can read the byte at z->end */
- u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
- tmp &= fdr->domainMask;
- s = load_m128_from_u64a(ft + tmp);
- s = rshiftbyte_m128(s, 1);
- } else {
- s = fdr->start;
- }
- return s;
-}
-
-static really_inline
-void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
- UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
- const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
- /* +1: the zones ensure that we can read the byte at z->end */
- assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
- u64a reach0 = andn(domain_mask_flipped, itPtr);
- u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
- u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
- u64a reach3 = andn(domain_mask_flipped, itPtr + 3);
-
- m128 st0 = load_m128_from_u64a(ft + reach0);
- m128 st1 = load_m128_from_u64a(ft + reach1);
- m128 st2 = load_m128_from_u64a(ft + reach2);
- m128 st3 = load_m128_from_u64a(ft + reach3);
-
- u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
- u64a reach5 = andn(domain_mask_flipped, itPtr + 5);
- u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
- u64a reach7 = andn(domain_mask_flipped, itPtr + 7);
-
- m128 st4 = load_m128_from_u64a(ft + reach4);
- m128 st5 = load_m128_from_u64a(ft + reach5);
- m128 st6 = load_m128_from_u64a(ft + reach6);
- m128 st7 = load_m128_from_u64a(ft + reach7);
-
- st1 = lshiftbyte_m128(st1, 1);
- st2 = lshiftbyte_m128(st2, 2);
- st3 = lshiftbyte_m128(st3, 3);
- st4 = lshiftbyte_m128(st4, 4);
- st5 = lshiftbyte_m128(st5, 5);
- st6 = lshiftbyte_m128(st6, 6);
- st7 = lshiftbyte_m128(st7, 7);
-
- st0 = or128(st0, st1);
- st2 = or128(st2, st3);
- st4 = or128(st4, st5);
- st6 = or128(st6, st7);
- st0 = or128(st0, st2);
- st4 = or128(st4, st6);
- st0 = or128(st0, st4);
- *s = or128(*s, st0);
-
- *conf0 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf0 ^= ~0ULL;
-
- u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
- u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
- u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
- u64a reach11 = andn(domain_mask_flipped, itPtr + 11);
-
- m128 st8 = load_m128_from_u64a(ft + reach8);
- m128 st9 = load_m128_from_u64a(ft + reach9);
- m128 st10 = load_m128_from_u64a(ft + reach10);
- m128 st11 = load_m128_from_u64a(ft + reach11);
-
- u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
- u64a reach13 = andn(domain_mask_flipped, itPtr + 13);
- u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
- u64a reach15 = andn(domain_mask_flipped, itPtr + 15);
-
- m128 st12 = load_m128_from_u64a(ft + reach12);
- m128 st13 = load_m128_from_u64a(ft + reach13);
- m128 st14 = load_m128_from_u64a(ft + reach14);
- m128 st15 = load_m128_from_u64a(ft + reach15);
-
- st9 = lshiftbyte_m128(st9, 1);
- st10 = lshiftbyte_m128(st10, 2);
- st11 = lshiftbyte_m128(st11, 3);
- st12 = lshiftbyte_m128(st12, 4);
- st13 = lshiftbyte_m128(st13, 5);
- st14 = lshiftbyte_m128(st14, 6);
- st15 = lshiftbyte_m128(st15, 7);
-
- st8 = or128(st8, st9);
- st10 = or128(st10, st11);
- st12 = or128(st12, st13);
- st14 = or128(st14, st15);
- st8 = or128(st8, st10);
- st12 = or128(st12, st14);
- st8 = or128(st8, st12);
- *s = or128(*s, st8);
-
- *conf8 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf8 ^= ~0ULL;
-}
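
Each block above is a byte-wise shift-or step: the table entry for byte i is
shifted left by i bytes and OR-ed into the running state, so a zero bit that
survives in the low 8 bytes marks a possible literal end (the final XOR with
~0ULL flips those survivors into set bits for the confirm stage). A scalar
sketch of the combine, simplified to one-byte table entries (the real code
uses 64-bit entries and 128-bit state):

    #include <stdint.h>

    /* reach[i] is the bucket mask looked up for byte i; a 0 bit means the
     * corresponding bucket may end at that position. */
    static uint64_t shift_or_sketch(const uint8_t reach[8]) {
        uint64_t s = 0;
        for (int i = 0; i < 8; i++) {
            s |= (uint64_t)reach[i] << (8 * i); /* shift by i bytes */
        }
        return ~s; /* surviving zero bits become confirm candidates */
    }
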
-
-static really_inline
-void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
- UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
- const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
- assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
- u64a reach0 = andn(domain_mask_flipped, itPtr);
- u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
- u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
- u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
-
- m128 st0 = load_m128_from_u64a(ft + reach0);
- m128 st2 = load_m128_from_u64a(ft + reach2);
- m128 st4 = load_m128_from_u64a(ft + reach4);
- m128 st6 = load_m128_from_u64a(ft + reach6);
-
- u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
- u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
- u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
- u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
-
- m128 st8 = load_m128_from_u64a(ft + reach8);
- m128 st10 = load_m128_from_u64a(ft + reach10);
- m128 st12 = load_m128_from_u64a(ft + reach12);
- m128 st14 = load_m128_from_u64a(ft + reach14);
-
- st2 = lshiftbyte_m128(st2, 2);
- st4 = lshiftbyte_m128(st4, 4);
- st6 = lshiftbyte_m128(st6, 6);
-
- *s = or128(*s, st0);
- *s = or128(*s, st2);
- *s = or128(*s, st4);
- *s = or128(*s, st6);
-
- *conf0 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf0 ^= ~0ULL;
-
- st10 = lshiftbyte_m128(st10, 2);
- st12 = lshiftbyte_m128(st12, 4);
- st14 = lshiftbyte_m128(st14, 6);
-
- *s = or128(*s, st8);
- *s = or128(*s, st10);
- *s = or128(*s, st12);
- *s = or128(*s, st14);
-
- *conf8 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf8 ^= ~0ULL;
-}
-
-static really_inline
-void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
- UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
- const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
- assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
- u64a reach0 = andn(domain_mask_flipped, itPtr);
- u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
- u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
- u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
-
- m128 st0 = load_m128_from_u64a(ft + reach0);
- m128 st4 = load_m128_from_u64a(ft + reach4);
- m128 st8 = load_m128_from_u64a(ft + reach8);
- m128 st12 = load_m128_from_u64a(ft + reach12);
-
- st4 = lshiftbyte_m128(st4, 4);
- st12 = lshiftbyte_m128(st12, 4);
-
- *s = or128(*s, st0);
- *s = or128(*s, st4);
- *conf0 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf0 ^= ~0ULL;
-
- *s = or128(*s, st8);
- *s = or128(*s, st12);
- *conf8 = movq(*s);
- *s = rshiftbyte_m128(*s, 8);
- *conf8 ^= ~0ULL;
-}
-
-static really_inline
-void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
- const u32 *confBase, const struct FDR_Runtime_Args *a,
- const u8 *ptr, u32 *last_match_id, struct zone *z) {
- const u8 bucket = 8;
-
- if (likely(!*conf)) {
- return;
- }
-
- /* ptr currently refers to a location in the zone's buffer; we also
- * need a pointer in the original, main buffer for the final string compare.
- */
- const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
-
- const u8 *confLoc = ptr;
-
- do {
- u32 bit = findAndClearLSB_64(conf);
- u32 byte = bit / bucket + offset;
- u32 bitRem = bit % bucket;
- u32 idx = bitRem;
- u32 cf = confBase[idx];
- if (!cf) {
- continue;
- }
- const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
- ((const u8 *)confBase + cf);
- if (!(fdrc->groups & *control)) {
- continue;
- }
- u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
- confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
- last_match_id, confVal, conf, bit);
- } while (unlikely(!!*conf));
-}
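
The confirm loop above drains conf one set bit at a time; a self-contained
sketch of that find-and-clear-LSB pattern (using a GCC/Clang builtin where
Hyperscan uses its findAndClearLSB_64 helper):

    #include <stdint.h>
    #include <stdio.h>

    /* Pop each set bit of a 64-bit confirm word, lowest first, deriving
     * the byte offset and bucket the way do_confirm_fdr() does. */
    static void drain_conf_sketch(uint64_t conf) {
        while (conf) {
            unsigned bit = (unsigned)__builtin_ctzll(conf); /* lowest set bit */
            conf &= conf - 1;                               /* clear it */
            printf("byte %u, bucket %u\n", bit / 8, bit % 8);
        }
    }
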
-
-static really_inline
-void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
-#ifdef DEBUG
- DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
- DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
- z->start, z->end, z->shift);
- DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n",
- z->zone_pointer_adjust, z->floodPtr);
- DEBUG_PRINTF("zone buf:");
- for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) {
- if (i % 8 == 0) {
- printf("_");
- }
- if (z->buf[i]) {
- printf("%02x", z->buf[i]);
- } else {
- printf("..");
- }
- }
- printf("\n");
-#endif
-}
-
-/**
- * \brief Updates attributes for non-boundary region zone.
- */
-static really_inline
-void createMainZone(const u8 *flood, const u8 *begin, const u8 *end,
- struct zone *z) {
- z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */
- z->start = begin;
- z->end = end;
- z->floodPtr = flood;
- z->shift = 0;
-}
-
-/**
- * \brief Create zone for short cases (<= ITER_BYTES).
- *
- * For this case we need to copy everything into the zone's internal buffer.
- *
- * We need to ensure that we run over real data if it exists (in history or
- * before zone begin). We also need to ensure that the 8 bytes before any
- * data being matched can be read (to perform a conf hash).
- *
- * We also need to ensure that the data at z->end can be read.
- *
- * Hence, the zone consists of:
- * 16 bytes of history,
- * 1 - 24 bytes of data from the buffer (ending at end),
- * 1 byte of final padding
- */
-static really_inline
-void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin,
- const u8 *end, struct zone *z) {
- /* the floodPtr for BOUNDARY zones is set to the end of the zone buf to
- * avoid the flood checks in boundary zones. */
- z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
-
- ptrdiff_t z_len = end - begin;
- assert(z_len > 0);
- assert(z_len <= ITER_BYTES);
-
- z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */
-
- static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */
-
- /* we are guaranteed to always have 16 initialised bytes at the end of
- * the history buffer (they may be garbage coming from the stream state
- * preceding hbuf, but bytes that don't correspond to actual history
- * shouldn't affect computations). */
- *(m128 *)z->buf = loadu128(hend - sizeof(m128));
-
- /* The amount of data we have to copy from main buffer. */
- size_t copy_len = MIN((size_t)(end - buf),
- ITER_BYTES + sizeof(CONF_TYPE));
-
- u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET;
- switch (copy_len) {
- case 1:
- *zone_data = *(end - 1);
- break;
- case 2:
- *(u16 *)zone_data = unaligned_load_u16(end - 2);
- break;
- case 3:
- *(u16 *)zone_data = unaligned_load_u16(end - 3);
- *(zone_data + 2) = *(end - 1);
- break;
- case 4:
- *(u32 *)zone_data = unaligned_load_u32(end - 4);
- break;
- case 5:
- case 6:
- case 7:
- /* perform copy with 2 overlapping 4-byte chunks from buf. */
- *(u32 *)zone_data = unaligned_load_u32(end - copy_len);
- unaligned_store_u32(zone_data + copy_len - sizeof(u32),
- unaligned_load_u32(end - sizeof(u32)));
- break;
- case 8:
- *(u64a *)zone_data = unaligned_load_u64a(end - 8);
- break;
- case 9:
- case 10:
- case 11:
- case 12:
- case 13:
- case 14:
- case 15:
- /* perform copy with 2 overlapping 8-byte chunks from buf. */
- *(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
- unaligned_store_u64a(zone_data + copy_len - sizeof(u64a),
- unaligned_load_u64a(end - sizeof(u64a)));
- break;
- case 16:
- /* copy 16-bytes from buf. */
- *(m128 *)zone_data = loadu128(end - 16);
- break;
- default:
- assert(copy_len <= sizeof(m128) + sizeof(u64a));
-
- /* perform copy with (potentially overlapping) 8-byte and 16-byte chunks.
- */
- *(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
- storeu128(zone_data + copy_len - sizeof(m128),
- loadu128(end - sizeof(m128)));
- break;
- }
-
- /* set the start and end location of the zone buf
- * to be scanned */
- u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len;
- assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES);
-
- /* copy the post-padding byte; this is required for domain > 8 due to
- * overhang */
- assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64);
- *z_end = 0;
-
- z->end = z_end;
- z->start = z_end - ITER_BYTES;
- z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
- assert(z->start + z->shift == z_end - z_len);
-}
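
The switch above is a length-specialized copy; the 5-7 and 9-15 byte cases
use two overlapping chunks. A standalone sketch of that trick for the
8..16-byte range (hypothetical helper, not Hyperscan code):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Copy n bytes (8 <= n <= 16) as two possibly-overlapping 8-byte
     * chunks: the first 8 bytes, then the last 8 bytes. */
    static void copy_overlap_sketch(unsigned char *dst,
                                    const unsigned char *src, size_t n) {
        uint64_t head, tail;
        memcpy(&head, src, 8);
        memcpy(&tail, src + n - 8, 8); /* may overlap head */
        memcpy(dst, &head, 8);
        memcpy(dst + n - 8, &tail, 8);
    }
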
-
-/**
- * \brief Create a zone for the start region.
- *
- * This function requires that there is > ITER_BYTES of data in the buffer to
- * scan. The start zone itself is always responsible for scanning exactly
- * ITER_BYTES of data - there are no warmup/junk bytes scanned.
- *
- * This zone ensures that the byte at z->end can be read and corresponds to
- * the next byte of data.
- *
- * 8 bytes of history data are provided before z->start to allow proper hash
- * generation in streaming mode. If buf != begin, up to 8 bytes of data
- * prior to begin are also provided.
- *
- * Although we are not interested in bare literals which start before begin
- * if buf != begin, lookarounds associated with the literal may require
- * the data prior to begin for hash purposes.
- */
-static really_inline
-void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin,
- struct zone *z) {
- assert(ITER_BYTES == sizeof(m128));
- assert(sizeof(CONF_TYPE) == 8);
- static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE);
-
- const u8 *end = begin + ITER_BYTES;
-
- /* set floodPtr to the end of zone buf to avoid checks in start zone */
- z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
-
- z->shift = 0; /* we are processing ITER_BYTES of real data */
-
- /* we are guaranteed to always have 16 initialised bytes at the end of the
- * history buffer (they may be garbage coming from the stream state
- * preceding hbuf, but bytes that don't correspond to actual history
- * shouldn't affect computations). However, for start zones, history is only
- * required for conf hash purposes so we only need 8 bytes */
- unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a)));
-
- /* The amount of data we have to copy from main buffer. */
- size_t copy_len = MIN((size_t)(end - buf),
- ITER_BYTES + sizeof(CONF_TYPE));
- assert(copy_len >= 16);
-
- /* copy the post-padding byte; this is required for domain > 8 due to
- * overhang. The start zone requires that there is data after the zone,
- * so it is safe to dereference end */
- z->buf[ZONE_START_BEGIN + copy_len] = *end;
-
- /* set the start and end location of the zone buf to be scanned */
- u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len;
- z->end = z_end;
- z->start = z_end - ITER_BYTES;
-
- /* copy the first 8 bytes of the valid region */
- unaligned_store_u64a(z->buf + ZONE_START_BEGIN,
- unaligned_load_u64a(end - copy_len));
-
- /* copy the last 16 bytes, may overlap with the previous 8 byte write */
- storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
-
- z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
-
- assert(ZONE_START_BEGIN + copy_len + 3 < 64);
-}
-
-/**
- * \brief Create a zone for the end region.
- *
- * This function requires that there is > ITER_BYTES of data in the buffer to
- * scan. The end zone is responsible for scanning the <= ITER_BYTES rump of
- * data plus an optional extra ITER_BYTES. The main zone cannot handle the
- * last 3 bytes of the buffer, so the end zone must take over a full
- * ITER_BYTES from the main zone when fewer than 3 bytes remain to scan; the
- * main zone size is reduced by ITER_BYTES in this case.
- *
- * This zone ensures that the byte at z->end can be read by filling it with a
- * padding character.
- *
- * Up to 8 bytes of data prior to begin are also provided for the purposes of
- * generating hashes. History is not copied, as all locations which require
- * history for generating a hash are the responsibility of the start zone.
- */
-static really_inline
-void createEndZone(const u8 *buf, const u8 *begin, const u8 *end,
- struct zone *z) {
- /* the floodPtr for BOUNDARY zones is set to the end of the zone buf to
- * avoid the flood checks in boundary zones. */
- z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
-
- ptrdiff_t z_len = end - begin;
- assert(z_len > 0);
- size_t iter_bytes_second = 0;
- size_t z_len_first = z_len;
- if (z_len > ITER_BYTES) {
- z_len_first = z_len - ITER_BYTES;
- iter_bytes_second = ITER_BYTES;
- }
- z->shift = ITER_BYTES - z_len_first;
-
- const u8 *end_first = end - iter_bytes_second;
- /* The amount of data we have to copy from main buffer for the
- * first iteration. */
- size_t copy_len_first = MIN((size_t)(end_first - buf),
- ITER_BYTES + sizeof(CONF_TYPE));
- assert(copy_len_first >= 16);
-
- size_t total_copy_len = copy_len_first + iter_bytes_second;
- assert(total_copy_len + 3 < 64);
-
- /* copy the post-padding byte; this is required for domain > 8 due to
- * overhang */
- z->buf[total_copy_len] = 0;
-
- /* set the start and end location of the zone buf
- * to be scanned */
- u8 *z_end = z->buf + total_copy_len;
- z->end = z_end;
- z->start = z_end - ITER_BYTES - iter_bytes_second;
- assert(z->start + z->shift == z_end - z_len);
-
- u8 *z_end_first = z_end - iter_bytes_second;
- /* copy the first 8 bytes of the valid region */
- unaligned_store_u64a(z->buf,
- unaligned_load_u64a(end_first - copy_len_first));
-
- /* copy the last 16 bytes, may overlap with the previous 8 byte write */
- storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128)));
- if (iter_bytes_second) {
- storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
- }
-
- z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
-}
-
-/**
- * \brief Prepare zones.
- *
- * This function prepares zones from the actual buffer plus some padding
- * bytes. The actual ITER_BYTES bytes in each zone are preceded by the main
- * buf and/or history buf and followed by padding bytes, possibly taken from
- * the main buf, if available.
- */
-static really_inline
-size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
- size_t start, const u8 *flood, struct zone *zoneArr) {
- const u8 *ptr = buf + start;
- size_t remaining = len - start;
-
- if (remaining <= ITER_BYTES) {
- /* only enough bytes to make a single zone */
- createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]);
- return 1;
- }
-
- /* enough bytes to make more than one zone */
-
- size_t numZone = 0;
- createStartZone(buf, hend, ptr, &zoneArr[numZone++]);
- ptr += ITER_BYTES;
-
- assert(ptr < buf + len);
-
- /* find maximum buffer location that the main zone can scan
- * - must be a multiple of ITER_BYTES, and
- * - cannot contain the last 3 bytes (due to the 3 bytes read past the
- * end of the region in the FDR main loop)
- */
- const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES);
-
- /* create a main zone if at least one multiple of ITER_BYTES is available */
- if (main_end > ptr) {
- createMainZone(flood, ptr, main_end, &zoneArr[numZone++]);
- ptr = main_end;
- }
- /* create a zone with rest of the data from the main buffer */
- createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]);
- return numZone;
-}
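
To make the main-zone rounding concrete, a small sketch of the arithmetic
(assuming ROUNDDOWN_N(x, n) rounds x down to a multiple of n; names are
illustrative):

    #include <stddef.h>

    #define ITER_BYTES_SK 16 /* local stand-in for ITER_BYTES */

    /* Offset where the main zone must stop: the last 3 bytes are excluded
     * because the main loop reads up to 3 bytes ahead, and the length is
     * rounded down to a whole number of ITER_BYTES iterations. */
    static size_t main_end_offset_sketch(size_t start, size_t len) {
        size_t scannable = len - start - 3; /* caller ensures len - start > 16 */
        return start + (scannable / ITER_BYTES_SK) * ITER_BYTES_SK;
    }
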
-
-#define INVALID_MATCH_ID (~0U)
-
-#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \
- do { \
- const u8 *tryFloodDetect = zz->floodPtr; \
- const u8 *start_ptr = zz->start; \
- const u8 *end_ptr = zz->end; \
- \
- for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
- itPtr += ITER_BYTES) { \
- if (unlikely(itPtr > tryFloodDetect)) { \
- tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
- &floodBackoff, &control, \
- ITER_BYTES); \
- if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
- return HWLM_TERMINATED; \
- } \
- } \
- __builtin_prefetch(itPtr + ITER_BYTES); \
- u64a conf0; \
- u64a conf8; \
- get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \
- ft, &conf0, &conf8, &s); \
- do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \
- &last_match_id, zz); \
- do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \
- &last_match_id, zz); \
- if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
- return HWLM_TERMINATED; \
- } \
- } /* end for loop */ \
- } while (0) \
-
-static never_inline
-hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
- const struct FDR_Runtime_Args *a,
- hwlm_group_t control) {
- assert(ISALIGNED_CL(fdr));
-
- u32 floodBackoff = FLOOD_BACKOFF_START;
- u32 last_match_id = INVALID_MATCH_ID;
- u32 domain_mask_flipped = ~fdr->domainMask;
- u8 stride = fdr->stride;
- const u64a *ft =
- (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
- assert(ISALIGNED_CL(ft));
- const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset);
- assert(ISALIGNED_CL(confBase));
- struct zone zones[ZONE_MAX];
- assert(fdr->domain > 8 && fdr->domain < 16);
-
- size_t numZone = prepareZones(a->buf, a->len,
- a->buf_history + a->len_history,
- a->start_offset, a->firstFloodDetect, zones);
- assert(numZone <= ZONE_MAX);
- m128 state = getInitState(fdr, a->len_history, ft, &zones[0]);
-
- for (size_t curZone = 0; curZone < numZone; curZone++) {
- struct zone *z = &zones[curZone];
- dumpZoneInfo(z, curZone);
-
- /* When a zone contains less data than is processed in an iteration
- * of FDR_MAIN_LOOP(), we need to scan over some extra data.
- *
- * We have chosen to scan this extra data at the start of the
- * iteration. The extra data is either data we have already scanned or
- * garbage (if it is earlier than offset 0).
- *
- * As a result we need to shift the incoming state back so that it will
- * properly line up with the data being scanned.
- *
- * We also need to forbid reporting any matches in the data being
- * rescanned, as they have already been reported (or are over garbage, though
- * later stages should also provide that safety guarantee).
- */
-
- u8 shift = z->shift;
-
- state = variable_byte_shift_m128(state, shift);
-
- state = or128(state, load128(zone_or_mask[shift]));
-
- switch (stride) {
- case 1:
- FDR_MAIN_LOOP(z, state, get_conf_stride_1);
- break;
- case 2:
- FDR_MAIN_LOOP(z, state, get_conf_stride_2);
- break;
- case 4:
- FDR_MAIN_LOOP(z, state, get_conf_stride_4);
- break;
- default:
- break;
- }
- }
-
- return HWLM_SUCCESS;
-}
-
-#if defined(HAVE_AVX2)
-#define ONLY_AVX2(func) func
-#else
-#define ONLY_AVX2(func) NULL
-#endif
-
-typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr,
- const struct FDR_Runtime_Args *a,
- hwlm_group_t control);
-
-static const FDRFUNCTYPE funcs[] = {
- fdr_engine_exec,
- NULL, /* old: fast teddy */
- NULL, /* old: fast teddy */
- ONLY_AVX2(fdr_exec_fat_teddy_msks1),
- ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck),
- ONLY_AVX2(fdr_exec_fat_teddy_msks2),
- ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck),
- ONLY_AVX2(fdr_exec_fat_teddy_msks3),
- ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck),
- ONLY_AVX2(fdr_exec_fat_teddy_msks4),
- ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck),
- fdr_exec_teddy_msks1,
- fdr_exec_teddy_msks1_pck,
- fdr_exec_teddy_msks2,
- fdr_exec_teddy_msks2_pck,
- fdr_exec_teddy_msks3,
- fdr_exec_teddy_msks3_pck,
- fdr_exec_teddy_msks4,
- fdr_exec_teddy_msks4_pck,
-};
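
fdrExec() and fdrExecStreaming() below pick an engine by indexing this table
with fdr->engineID. A hedged sketch of that dispatch shape, with simplified
stand-in types (a NULL slot marks an engine compiled out, e.g. without AVX2):

    #include <stddef.h>

    typedef int (*engine_fn_sketch)(const void *eng, const void *args);

    /* Bounds-check the ID, reject empty slots, then call the engine. */
    static int dispatch_sketch(const engine_fn_sketch *table, size_t n,
                               size_t engine_id, const void *eng,
                               const void *args) {
        if (engine_id >= n || !table[engine_id]) {
            return -1; /* unsupported engine in this build */
        }
        return table[engine_id](eng, args);
    }
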
-
+#include "util/arch.h"
+#include "util/simd_utils.h"
+#include "util/uniform_ops.h"
+
+/** \brief number of bytes processed in each iteration */
+#define ITER_BYTES 16
+
+/** \brief total zone buffer size */
+#define ZONE_TOTAL_SIZE 64
+
+/** \brief maximum number of allowed zones */
+#define ZONE_MAX 3
+
+/** \brief zone information.
+ *
+ * Zone represents a region of data to scan in FDR.
+ *
+ * The incoming buffer is split into multiple zones to ensure two properties:
+ * 1: that we can read 8 bytes behind to generate a hash safely
+ * 2: that we can read the 3 bytes after the current byte (domain > 8)
+ */
+struct zone {
+ /** \brief copied buffer, used only when it is a boundary zone. */
+ u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE];
+
+ /** \brief shift amount for fdr state to avoid unwanted match. */
+ u8 shift;
+
+ /** \brief if boundary zone, start points into the zone buffer after the
+ * pre-padding. Otherwise, points to the main buffer, appropriately. */
+ const u8 *start;
+
+ /** \brief if boundary zone, end points to the end of zone. Otherwise,
+ * pointer to the main buffer, appropriately. */
+ const u8 *end;
+
+ /** \brief the amount to adjust to go from a pointer in the zones region
+ * (between start and end) to a pointer in the original data buffer. */
+ ptrdiff_t zone_pointer_adjust;
+
+ /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones,
+ * otherwise end of the zone buf. floodPtr always points inside the same
+ * buffer as the start pointer. */
+ const u8 *floodPtr;
+};
+
+static
+const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }
+};
+
+/* compilers don't reliably synthesize the 32-bit ANDN instruction here,
+ * so we force its generation.
+ */
+static really_inline
+u64a andn(const u32 a, const u8 *b) {
+ u64a r;
+#if defined(HAVE_BMI) && !defined(NO_ASM)
+ __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b));
+#else
+ r = unaligned_load_u32(b) & ~a;
+#endif
+ return r;
+}
+
+/* generates an initial state mask based on the last byte-ish of history rather
+ * than being all accepting. If there is no history to consider, the state is
+ * generated based on the minimum length of each bucket in order to prevent
+ * confirms.
+ */
+static really_inline
+m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,
+ const struct zone *z) {
+ m128 s;
+ if (len_history) {
+ /* +1: the zones ensure that we can read the byte at z->end */
+ u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
+ tmp &= fdr->domainMask;
+ s = load_m128_from_u64a(ft + tmp);
+ s = rshiftbyte_m128(s, 1);
+ } else {
+ s = fdr->start;
+ }
+ return s;
+}
+
+static really_inline
+void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
+ UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
+ const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+ /* +1: the zones ensure that we can read the byte at z->end */
+ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+ u64a reach0 = andn(domain_mask_flipped, itPtr);
+ u64a reach1 = andn(domain_mask_flipped, itPtr + 1);
+ u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
+ u64a reach3 = andn(domain_mask_flipped, itPtr + 3);
+
+ m128 st0 = load_m128_from_u64a(ft + reach0);
+ m128 st1 = load_m128_from_u64a(ft + reach1);
+ m128 st2 = load_m128_from_u64a(ft + reach2);
+ m128 st3 = load_m128_from_u64a(ft + reach3);
+
+ u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
+ u64a reach5 = andn(domain_mask_flipped, itPtr + 5);
+ u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
+ u64a reach7 = andn(domain_mask_flipped, itPtr + 7);
+
+ m128 st4 = load_m128_from_u64a(ft + reach4);
+ m128 st5 = load_m128_from_u64a(ft + reach5);
+ m128 st6 = load_m128_from_u64a(ft + reach6);
+ m128 st7 = load_m128_from_u64a(ft + reach7);
+
+ st1 = lshiftbyte_m128(st1, 1);
+ st2 = lshiftbyte_m128(st2, 2);
+ st3 = lshiftbyte_m128(st3, 3);
+ st4 = lshiftbyte_m128(st4, 4);
+ st5 = lshiftbyte_m128(st5, 5);
+ st6 = lshiftbyte_m128(st6, 6);
+ st7 = lshiftbyte_m128(st7, 7);
+
+ st0 = or128(st0, st1);
+ st2 = or128(st2, st3);
+ st4 = or128(st4, st5);
+ st6 = or128(st6, st7);
+ st0 = or128(st0, st2);
+ st4 = or128(st4, st6);
+ st0 = or128(st0, st4);
+ *s = or128(*s, st0);
+
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
+ u64a reach9 = andn(domain_mask_flipped, itPtr + 9);
+ u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
+ u64a reach11 = andn(domain_mask_flipped, itPtr + 11);
+
+ m128 st8 = load_m128_from_u64a(ft + reach8);
+ m128 st9 = load_m128_from_u64a(ft + reach9);
+ m128 st10 = load_m128_from_u64a(ft + reach10);
+ m128 st11 = load_m128_from_u64a(ft + reach11);
+
+ u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
+ u64a reach13 = andn(domain_mask_flipped, itPtr + 13);
+ u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
+ u64a reach15 = andn(domain_mask_flipped, itPtr + 15);
+
+ m128 st12 = load_m128_from_u64a(ft + reach12);
+ m128 st13 = load_m128_from_u64a(ft + reach13);
+ m128 st14 = load_m128_from_u64a(ft + reach14);
+ m128 st15 = load_m128_from_u64a(ft + reach15);
+
+ st9 = lshiftbyte_m128(st9, 1);
+ st10 = lshiftbyte_m128(st10, 2);
+ st11 = lshiftbyte_m128(st11, 3);
+ st12 = lshiftbyte_m128(st12, 4);
+ st13 = lshiftbyte_m128(st13, 5);
+ st14 = lshiftbyte_m128(st14, 6);
+ st15 = lshiftbyte_m128(st15, 7);
+
+ st8 = or128(st8, st9);
+ st10 = or128(st10, st11);
+ st12 = or128(st12, st13);
+ st14 = or128(st14, st15);
+ st8 = or128(st8, st10);
+ st12 = or128(st12, st14);
+ st8 = or128(st8, st12);
+ *s = or128(*s, st8);
+
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+}
+
+static really_inline
+void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
+ UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
+ const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+ u64a reach0 = andn(domain_mask_flipped, itPtr);
+ u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
+ u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
+ u64a reach6 = andn(domain_mask_flipped, itPtr + 6);
+
+ m128 st0 = load_m128_from_u64a(ft + reach0);
+ m128 st2 = load_m128_from_u64a(ft + reach2);
+ m128 st4 = load_m128_from_u64a(ft + reach4);
+ m128 st6 = load_m128_from_u64a(ft + reach6);
+
+ u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
+ u64a reach10 = andn(domain_mask_flipped, itPtr + 10);
+ u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
+ u64a reach14 = andn(domain_mask_flipped, itPtr + 14);
+
+ m128 st8 = load_m128_from_u64a(ft + reach8);
+ m128 st10 = load_m128_from_u64a(ft + reach10);
+ m128 st12 = load_m128_from_u64a(ft + reach12);
+ m128 st14 = load_m128_from_u64a(ft + reach14);
+
+ st2 = lshiftbyte_m128(st2, 2);
+ st4 = lshiftbyte_m128(st4, 4);
+ st6 = lshiftbyte_m128(st6, 6);
+
+ *s = or128(*s, st0);
+ *s = or128(*s, st2);
+ *s = or128(*s, st4);
+ *s = or128(*s, st6);
+
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ st10 = lshiftbyte_m128(st10, 2);
+ st12 = lshiftbyte_m128(st12, 4);
+ st14 = lshiftbyte_m128(st14, 6);
+
+ *s = or128(*s, st8);
+ *s = or128(*s, st10);
+ *s = or128(*s, st12);
+ *s = or128(*s, st14);
+
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+}
+
+static really_inline
+void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
+ UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
+ const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
+ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
+ u64a reach0 = andn(domain_mask_flipped, itPtr);
+ u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
+ u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
+ u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
+
+ m128 st0 = load_m128_from_u64a(ft + reach0);
+ m128 st4 = load_m128_from_u64a(ft + reach4);
+ m128 st8 = load_m128_from_u64a(ft + reach8);
+ m128 st12 = load_m128_from_u64a(ft + reach12);
+
+ st4 = lshiftbyte_m128(st4, 4);
+ st12 = lshiftbyte_m128(st12, 4);
+
+ *s = or128(*s, st0);
+ *s = or128(*s, st4);
+ *conf0 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf0 ^= ~0ULL;
+
+ *s = or128(*s, st8);
+ *s = or128(*s, st12);
+ *conf8 = movq(*s);
+ *s = rshiftbyte_m128(*s, 8);
+ *conf8 ^= ~0ULL;
+}
+
+static really_inline
+void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
+ const u32 *confBase, const struct FDR_Runtime_Args *a,
+ const u8 *ptr, u32 *last_match_id, struct zone *z) {
+ const u8 bucket = 8;
+
+ if (likely(!*conf)) {
+ return;
+ }
+
+ /* ptr currently refers to a location in the zone's buffer; we also
+ * need a pointer in the original, main buffer for the final string compare.
+ */
+ const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
+
+ const u8 *confLoc = ptr;
+
+ do {
+ u32 bit = findAndClearLSB_64(conf);
+ u32 byte = bit / bucket + offset;
+ u32 bitRem = bit % bucket;
+ u32 idx = bitRem;
+ u32 cf = confBase[idx];
+ if (!cf) {
+ continue;
+ }
+ const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
+ ((const u8 *)confBase + cf);
+ if (!(fdrc->groups & *control)) {
+ continue;
+ }
+ u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1);
+ confWithBit(fdrc, a, ptr_main - a->buf + byte, control,
+ last_match_id, confVal, conf, bit);
+ } while (unlikely(!!*conf));
+}
+
+static really_inline
+void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
+#ifdef DEBUG
+ DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
+ DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
+ z->start, z->end, z->shift);
+ DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n",
+ z->zone_pointer_adjust, z->floodPtr);
+ DEBUG_PRINTF("zone buf:");
+ for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) {
+ if (i % 8 == 0) {
+ printf("_");
+ }
+ if (z->buf[i]) {
+ printf("%02x", z->buf[i]);
+ } else {
+ printf("..");
+ }
+ }
+ printf("\n");
+#endif
+}
+
+/**
+ * \brief Updates attributes for non-boundary region zone.
+ */
+static really_inline
+void createMainZone(const u8 *flood, const u8 *begin, const u8 *end,
+ struct zone *z) {
+ z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */
+ z->start = begin;
+ z->end = end;
+ z->floodPtr = flood;
+ z->shift = 0;
+}
+
+/**
+ * \brief Create zone for short cases (<= ITER_BYTES).
+ *
+ * For this case we need to copy everything into the zone's internal buffer.
+ *
+ * We need to ensure that we run over real data if it exists (in history or
+ * before zone begin). We also need to ensure that the 8 bytes before any
+ * data being matched can be read (to perform a conf hash).
+ *
+ * We also need to ensure that the data at z->end can be read.
+ *
+ * Hence, the zone consists of:
+ * 16 bytes of history,
+ * 1 - 24 bytes of data from the buffer (ending at end),
+ * 1 byte of final padding
+ */
+static really_inline
+void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin,
+ const u8 *end, struct zone *z) {
+ /* the floodPtr for BOUNDARY zones is set to the end of the zone buf to
+ * avoid the flood checks in boundary zones. */
+ z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
+
+ ptrdiff_t z_len = end - begin;
+ assert(z_len > 0);
+ assert(z_len <= ITER_BYTES);
+
+ z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */
+
+ static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */
+
+ /* we are guaranteed to always have 16 initialised bytes at the end of
+ * the history buffer (they may be garbage coming from the stream state
+ * preceding hbuf, but bytes that don't correspond to actual history
+ * shouldn't affect computations). */
+ *(m128 *)z->buf = loadu128(hend - sizeof(m128));
+
+ /* The amount of data we have to copy from main buffer. */
+ size_t copy_len = MIN((size_t)(end - buf),
+ ITER_BYTES + sizeof(CONF_TYPE));
+
+ u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET;
+ switch (copy_len) {
+ case 1:
+ *zone_data = *(end - 1);
+ break;
+ case 2:
+ *(u16 *)zone_data = unaligned_load_u16(end - 2);
+ break;
+ case 3:
+ *(u16 *)zone_data = unaligned_load_u16(end - 3);
+ *(zone_data + 2) = *(end - 1);
+ break;
+ case 4:
+ *(u32 *)zone_data = unaligned_load_u32(end - 4);
+ break;
+ case 5:
+ case 6:
+ case 7:
+ /* perform copy with 2 overlapping 4-byte chunks from buf. */
+ *(u32 *)zone_data = unaligned_load_u32(end - copy_len);
+ unaligned_store_u32(zone_data + copy_len - sizeof(u32),
+ unaligned_load_u32(end - sizeof(u32)));
+ break;
+ case 8:
+ *(u64a *)zone_data = unaligned_load_u64a(end - 8);
+ break;
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ /* perform copy with 2 overlapping 8-byte chunks from buf. */
+ *(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
+ unaligned_store_u64a(zone_data + copy_len - sizeof(u64a),
+ unaligned_load_u64a(end - sizeof(u64a)));
+ break;
+ case 16:
+ /* copy 16-bytes from buf. */
+ *(m128 *)zone_data = loadu128(end - 16);
+ break;
+ default:
+ assert(copy_len <= sizeof(m128) + sizeof(u64a));
+
+ /* perform copy with (potentially overlapping) 8-byte and 16-byte chunks.
+ */
+ *(u64a *)zone_data = unaligned_load_u64a(end - copy_len);
+ storeu128(zone_data + copy_len - sizeof(m128),
+ loadu128(end - sizeof(m128)));
+ break;
+ }
+
+ /* set the start and end location of the zone buf
+ * to be scanned */
+ u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len;
+ assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES);
+
+ /* copy the post-padding byte; this is required for domain > 8 due to
+ * overhang */
+ assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64);
+ *z_end = 0;
+
+ z->end = z_end;
+ z->start = z_end - ITER_BYTES;
+ z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
+ assert(z->start + z->shift == z_end - z_len);
+}
+
+/**
+ * \brief Create a zone for the start region.
+ *
+ * This function requires that there is > ITER_BYTES of data in the buffer to
+ * scan. The start zone itself is always responsible for scanning exactly
+ * ITER_BYTES of data - there are no warmup/junk bytes scanned.
+ *
+ * This zone ensures that the byte at z->end can be read and corresponds to
+ * the next byte of data.
+ *
+ * 8 bytes of history data are provided before z->start to allow proper hash
+ * generation in streaming mode. If buf != begin, up to 8 bytes of data
+ * prior to begin are also provided.
+ *
+ * Although we are not interested in bare literals which start before begin
+ * if buf != begin, lookarounds associated with the literal may require
+ * the data prior to begin for hash purposes.
+ */
+static really_inline
+void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin,
+ struct zone *z) {
+ assert(ITER_BYTES == sizeof(m128));
+ assert(sizeof(CONF_TYPE) == 8);
+ static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE);
+
+ const u8 *end = begin + ITER_BYTES;
+
+ /* set floodPtr to the end of zone buf to avoid checks in start zone */
+ z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
+
+ z->shift = 0; /* we are processing ITER_BYTES of real data */
+
+ /* we are guaranteed to always have 16 initialised bytes at the end of the
+ * history buffer (they may be garbage coming from the stream state
+ * preceding hbuf, but bytes that don't correspond to actual history
+ * shouldn't affect computations). However, for start zones, history is only
+ * required for conf hash purposes so we only need 8 bytes */
+ unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a)));
+
+ /* The amount of data we have to copy from main buffer. */
+ size_t copy_len = MIN((size_t)(end - buf),
+ ITER_BYTES + sizeof(CONF_TYPE));
+ assert(copy_len >= 16);
+
+ /* copy the post-padding byte; this is required for domain > 8 due to
+ * overhang. The start zone requires that there is data after the zone,
+ * so it is safe to dereference end */
+ z->buf[ZONE_START_BEGIN + copy_len] = *end;
+
+ /* set the start and end location of the zone buf to be scanned */
+ u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len;
+ z->end = z_end;
+ z->start = z_end - ITER_BYTES;
+
+ /* copy the first 8 bytes of the valid region */
+ unaligned_store_u64a(z->buf + ZONE_START_BEGIN,
+ unaligned_load_u64a(end - copy_len));
+
+ /* copy the last 16 bytes, may overlap with the previous 8 byte write */
+ storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
+
+ z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
+
+ assert(ZONE_START_BEGIN + copy_len + 3 < 64);
+}
+
+/**
+ * \brief Create a zone for the end region.
+ *
+ * This function requires that there is > ITER_BYTES of data in the buffer to
+ * scan. The end zone is responsible for scanning the <= ITER_BYTES rump of
+ * data plus an optional extra ITER_BYTES. The main zone cannot handle the
+ * last 3 bytes of the buffer, so the end zone must take over a full
+ * ITER_BYTES from the main zone when fewer than 3 bytes remain to scan; the
+ * main zone size is reduced by ITER_BYTES in this case.
+ *
+ * This zone ensures that the byte at z->end can be read by filling it with a
+ * padding character.
+ *
+ * Up to 8 bytes of data prior to begin are also provided for the purposes of
+ * generating hashes. History is not copied, as all locations which require
+ * history for generating a hash are the responsibility of the start zone.
+ */
+static really_inline
+void createEndZone(const u8 *buf, const u8 *begin, const u8 *end,
+ struct zone *z) {
+ /* the floodPtr for BOUNDARY zones is set to the end of the zone buf to
+ * avoid the flood checks in boundary zones. */
+ z->floodPtr = z->buf + ZONE_TOTAL_SIZE;
+
+ ptrdiff_t z_len = end - begin;
+ assert(z_len > 0);
+ size_t iter_bytes_second = 0;
+ size_t z_len_first = z_len;
+ if (z_len > ITER_BYTES) {
+ z_len_first = z_len - ITER_BYTES;
+ iter_bytes_second = ITER_BYTES;
+ }
+ z->shift = ITER_BYTES - z_len_first;
+
+ const u8 *end_first = end - iter_bytes_second;
+ /* The amount of data we have to copy from main buffer for the
+ * first iteration. */
+ size_t copy_len_first = MIN((size_t)(end_first - buf),
+ ITER_BYTES + sizeof(CONF_TYPE));
+ assert(copy_len_first >= 16);
+
+ size_t total_copy_len = copy_len_first + iter_bytes_second;
+ assert(total_copy_len + 3 < 64);
+
+ /* copy the post-padding byte; this is required for domain > 8 due to
+ * overhang */
+ z->buf[total_copy_len] = 0;
+
+ /* set the start and end location of the zone buf
+ * to be scanned */
+ u8 *z_end = z->buf + total_copy_len;
+ z->end = z_end;
+ z->start = z_end - ITER_BYTES - iter_bytes_second;
+ assert(z->start + z->shift == z_end - z_len);
+
+ u8 *z_end_first = z_end - iter_bytes_second;
+ /* copy the first 8 bytes of the valid region */
+ unaligned_store_u64a(z->buf,
+ unaligned_load_u64a(end_first - copy_len_first));
+
+ /* copy the last 16 bytes, may overlap with the previous 8 byte write */
+ storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128)));
+ if (iter_bytes_second) {
+ storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128)));
+ }
+
+ z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end);
+}
+
+/**
+ * \brief Prepare zones.
+ *
+ * This function prepares zones from the actual buffer plus some padding
+ * bytes. The actual ITER_BYTES bytes in each zone are preceded by the main
+ * buf and/or history buf and followed by padding bytes, possibly taken from
+ * the main buf, if available.
+ */
+static really_inline
+size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
+ size_t start, const u8 *flood, struct zone *zoneArr) {
+ const u8 *ptr = buf + start;
+ size_t remaining = len - start;
+
+ if (remaining <= ITER_BYTES) {
+ /* only enough bytes to make a single zone */
+ createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]);
+ return 1;
+ }
+
+ /* enough bytes to make more than one zone */
+
+ size_t numZone = 0;
+ createStartZone(buf, hend, ptr, &zoneArr[numZone++]);
+ ptr += ITER_BYTES;
+
+ assert(ptr < buf + len);
+
+ /* find maximum buffer location that the main zone can scan
+ * - must be a multiple of ITER_BYTES, and
+ * - cannot contain the last 3 bytes (due to the 3 bytes read past the
+ * end of the region in the FDR main loop)
+ */
+ const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES);
+
+ /* create a main zone if at least one multiple of ITER_BYTES is available */
+ if (main_end > ptr) {
+ createMainZone(flood, ptr, main_end, &zoneArr[numZone++]);
+ ptr = main_end;
+ }
+ /* create a zone with rest of the data from the main buffer */
+ createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]);
+ return numZone;
+}
+
+#define INVALID_MATCH_ID (~0U)
+
+#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \
+ do { \
+ const u8 *tryFloodDetect = zz->floodPtr; \
+ const u8 *start_ptr = zz->start; \
+ const u8 *end_ptr = zz->end; \
+ \
+ for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \
+ itPtr += ITER_BYTES) { \
+ if (unlikely(itPtr > tryFloodDetect)) { \
+ tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\
+ &floodBackoff, &control, \
+ ITER_BYTES); \
+ if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
+ return HWLM_TERMINATED; \
+ } \
+ } \
+ __builtin_prefetch(itPtr + ITER_BYTES); \
+ u64a conf0; \
+ u64a conf8; \
+ get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \
+ ft, &conf0, &conf8, &s); \
+ do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \
+ &last_match_id, zz); \
+ do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \
+ &last_match_id, zz); \
+ if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \
+ return HWLM_TERMINATED; \
+ } \
+ } /* end for loop */ \
+ } while (0) \
+
+static never_inline
+hwlm_error_t fdr_engine_exec(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a,
+ hwlm_group_t control) {
+ assert(ISALIGNED_CL(fdr));
+
+ u32 floodBackoff = FLOOD_BACKOFF_START;
+ u32 last_match_id = INVALID_MATCH_ID;
+ u32 domain_mask_flipped = ~fdr->domainMask;
+ u8 stride = fdr->stride;
+ const u64a *ft =
+ (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
+ assert(ISALIGNED_CL(ft));
+ const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset);
+ assert(ISALIGNED_CL(confBase));
+ struct zone zones[ZONE_MAX];
+ assert(fdr->domain > 8 && fdr->domain < 16);
+
+ size_t numZone = prepareZones(a->buf, a->len,
+ a->buf_history + a->len_history,
+ a->start_offset, a->firstFloodDetect, zones);
+ assert(numZone <= ZONE_MAX);
+ m128 state = getInitState(fdr, a->len_history, ft, &zones[0]);
+
+ for (size_t curZone = 0; curZone < numZone; curZone++) {
+ struct zone *z = &zones[curZone];
+ dumpZoneInfo(z, curZone);
+
+ /* When a zone contains less data than is processed in an iteration
+ * of FDR_MAIN_LOOP(), we need to scan over some extra data.
+ *
+ * We have chosen to scan this extra data at the start of the
+ * iteration. The extra data is either data we have already scanned or
+ * garbage (if it is earlier than offset 0).
+ *
+ * As a result we need to shift the incoming state back so that it will
+ * properly line up with the data being scanned.
+ *
+ * We also need to forbid reporting any matches in the data being
+ * rescanned, as they have already been reported (or are over garbage, though
+ * later stages should also provide that safety guarantee).
+ */
+
+ u8 shift = z->shift;
+
+ state = variable_byte_shift_m128(state, shift);
+
+ state = or128(state, load128(zone_or_mask[shift]));
+
+ switch (stride) {
+ case 1:
+ FDR_MAIN_LOOP(z, state, get_conf_stride_1);
+ break;
+ case 2:
+ FDR_MAIN_LOOP(z, state, get_conf_stride_2);
+ break;
+ case 4:
+ FDR_MAIN_LOOP(z, state, get_conf_stride_4);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return HWLM_SUCCESS;
+}
+
+#if defined(HAVE_AVX2)
+#define ONLY_AVX2(func) func
+#else
+#define ONLY_AVX2(func) NULL
+#endif
+
+typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr,
+ const struct FDR_Runtime_Args *a,
+ hwlm_group_t control);
+
+static const FDRFUNCTYPE funcs[] = {
+ fdr_engine_exec,
+ NULL, /* old: fast teddy */
+ NULL, /* old: fast teddy */
+ ONLY_AVX2(fdr_exec_fat_teddy_msks1),
+ ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck),
+ ONLY_AVX2(fdr_exec_fat_teddy_msks2),
+ ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck),
+ ONLY_AVX2(fdr_exec_fat_teddy_msks3),
+ ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck),
+ ONLY_AVX2(fdr_exec_fat_teddy_msks4),
+ ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck),
+ fdr_exec_teddy_msks1,
+ fdr_exec_teddy_msks1_pck,
+ fdr_exec_teddy_msks2,
+ fdr_exec_teddy_msks2_pck,
+ fdr_exec_teddy_msks3,
+ fdr_exec_teddy_msks3_pck,
+ fdr_exec_teddy_msks4,
+ fdr_exec_teddy_msks4_pck,
+};
+
#define FAKE_HISTORY_SIZE 16
static const u8 fake_history[FAKE_HISTORY_SIZE];
-hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
- size_t start, HWLMCallback cb,
- struct hs_scratch *scratch, hwlm_group_t groups) {
- // We guarantee (for safezone construction) that it is safe to read 16
- // bytes before the end of the history buffer.
- const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE;
+hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
+ size_t start, HWLMCallback cb,
+ struct hs_scratch *scratch, hwlm_group_t groups) {
+ // We guarantee (for safezone construction) that it is safe to read 16
+ // bytes before the end of the history buffer.
+ const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE;
const struct FDR_Runtime_Args a = {
buf,
len,
- hbuf,
+ hbuf,
0,
start,
cb,
- scratch,
+ scratch,
nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
0
};
@@ -846,15 +846,15 @@ hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len,
return HWLM_SUCCESS;
} else {
assert(funcs[fdr->engineID]);
- return funcs[fdr->engineID](fdr, &a, groups);
+ return funcs[fdr->engineID](fdr, &a, groups);
}
}
hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
size_t hlen, const u8 *buf, size_t len,
- size_t start, HWLMCallback cb,
- struct hs_scratch *scratch,
- hwlm_group_t groups) {
+ size_t start, HWLMCallback cb,
+ struct hs_scratch *scratch,
+ hwlm_group_t groups) {
struct FDR_Runtime_Args a = {
buf,
len,
@@ -862,11 +862,11 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
hlen,
start,
cb,
- scratch,
+ scratch,
nextFloodDetect(buf, len, FLOOD_BACKOFF_START),
- /* we are guaranteed to always have 16 initialised bytes at the end of
- * the history buffer (they may be garbage). */
- hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0
+ /* we are guaranteed to always have 16 initialised bytes at the end of
+ * the history buffer (they may be garbage). */
+ hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0
};
hwlm_error_t ret;
@@ -874,7 +874,7 @@ hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf,
ret = HWLM_SUCCESS;
} else {
assert(funcs[fdr->engineID]);
- ret = funcs[fdr->engineID](fdr, &a, groups);
+ ret = funcs[fdr->engineID](fdr, &a, groups);
}
return ret;