/*
 * Copyright (c) 2016-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "sheng.h"

#include "accel.h"
#include "sheng_internal.h"
#include "nfa_api.h"
#include "nfa_api_queue.h"
#include "nfa_internal.h"
#include "util/bitutils.h"
#include "util/compare.h"
#include "util/join.h"
#include "util/simd_utils.h"

enum MatchMode {
    CALLBACK_OUTPUT,
    STOP_AT_MATCH,
    NO_MATCHES
};

static really_inline
const struct sheng *get_sheng(const struct NFA *n) {
    return (const struct sheng *)getImplNfa(n);
}

static really_inline
const struct sstate_aux *get_aux(const struct sheng *sh, u8 id) {
    u32 offset = sh->aux_offset - sizeof(struct NFA) +
            (id & SHENG_STATE_MASK) * sizeof(struct sstate_aux);
    DEBUG_PRINTF("Getting aux for state %u at offset %llu\n",
                 id & SHENG_STATE_MASK, (u64a)offset + sizeof(struct NFA));
    return (const struct sstate_aux *)((const char *) sh + offset);
}

static really_inline
const union AccelAux *get_accel(const struct sheng *sh, u8 id) {
    const struct sstate_aux *saux = get_aux(sh, id);
    DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel);
    const union AccelAux *aux = (const union AccelAux *)
            ((const char *)sh + saux->accel - sizeof(struct NFA));
    return aux;
}

static really_inline
const struct report_list *get_rl(const struct sheng *sh,
                                 const struct sstate_aux *aux) {
    DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept);
    return (const struct report_list *)
        ((const char *)sh + aux->accept - sizeof(struct NFA));
}

static really_inline
const struct report_list *get_eod_rl(const struct sheng *sh,
                                     const struct sstate_aux *aux) {
    DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept);
    return (const struct report_list *)
        ((const char *)sh + aux->accept_eod - sizeof(struct NFA));
}

static really_inline
char shengHasAccept(const struct sheng *sh, const struct sstate_aux *aux,
                    ReportID report) {
    assert(sh && aux);

    const struct report_list *rl = get_rl(sh, aux);
    assert(ISALIGNED_N(rl, 4));

    DEBUG_PRINTF("report list has %u entries\n", rl->count);

    for (u32 i = 0; i < rl->count; i++) {
        if (rl->report[i] == report) {
            DEBUG_PRINTF("reporting %u\n", rl->report[i]);
            return 1;
        }
    }

    return 0;
}

static really_inline
char fireSingleReport(NfaCallback cb, void *ctxt, ReportID r, u64a loc) {
    DEBUG_PRINTF("reporting %u\n", r);
    if (cb(0, loc, r, ctxt) == MO_HALT_MATCHING) {
        return MO_HALT_MATCHING; /* termination requested */
    }
    return MO_CONTINUE_MATCHING; /* continue execution */
}

static really_inline
char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt,
                 const u8 state, u64a loc, u8 *const cached_accept_state,
                 ReportID *const cached_accept_id, char eod) {
    DEBUG_PRINTF("reporting matches @ %llu\n", loc);

    if (!eod && state == *cached_accept_state) {
        DEBUG_PRINTF("reporting %u\n", *cached_accept_id);
        if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }

        return MO_CONTINUE_MATCHING; /* continue execution */
    }
    const struct sstate_aux *aux = get_aux(sh, state);
    const struct report_list *rl = eod ? get_eod_rl(sh, aux) : get_rl(sh, aux);
    assert(ISALIGNED(rl));

    DEBUG_PRINTF("report list has %u entries\n", rl->count);
    u32 count = rl->count;

    if (!eod && count == 1) {
        *cached_accept_state = state;
        *cached_accept_id = rl->report[0];

        DEBUG_PRINTF("reporting %u\n", rl->report[0]);
        if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }

        return MO_CONTINUE_MATCHING; /* continue execution */
    }

    for (u32 i = 0; i < count; i++) {
        DEBUG_PRINTF("reporting %u\n", rl->report[i]);
        if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }
    }
    return MO_CONTINUE_MATCHING; /* continue execution */
}

#if defined(HAVE_AVX512VBMI)
// Sheng32
static really_inline
const struct sheng32 *get_sheng32(const struct NFA *n) {
    return (const struct sheng32 *)getImplNfa(n);
}

static really_inline
const struct sstate_aux *get_aux32(const struct sheng32 *sh, u8 id) {
    u32 offset = sh->aux_offset - sizeof(struct NFA) +
            (id & SHENG32_STATE_MASK) * sizeof(struct sstate_aux);
    DEBUG_PRINTF("Getting aux for state %u at offset %llu\n",
                 id & SHENG32_STATE_MASK, (u64a)offset + sizeof(struct NFA));
    return (const struct sstate_aux *)((const char *) sh + offset);
}

static really_inline
const union AccelAux *get_accel32(const struct sheng32 *sh, u8 id) {
    const struct sstate_aux *saux = get_aux32(sh, id);
    DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel);
    const union AccelAux *aux = (const union AccelAux *)
            ((const char *)sh + saux->accel - sizeof(struct NFA));
    return aux;
}

static really_inline
const struct report_list *get_rl32(const struct sheng32 *sh,
                                   const struct sstate_aux *aux) {
    DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept);
    return (const struct report_list *)
        ((const char *)sh + aux->accept - sizeof(struct NFA));
}

static really_inline
const struct report_list *get_eod_rl32(const struct sheng32 *sh,
                                       const struct sstate_aux *aux) {
    DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept);
    return (const struct report_list *)
        ((const char *)sh + aux->accept_eod - sizeof(struct NFA));
}

static really_inline
char sheng32HasAccept(const struct sheng32 *sh, const struct sstate_aux *aux,
                      ReportID report) {
    assert(sh && aux);

    const struct report_list *rl = get_rl32(sh, aux);
    assert(ISALIGNED_N(rl, 4));

    DEBUG_PRINTF("report list has %u entries\n", rl->count);

    for (u32 i = 0; i < rl->count; i++) {
        if (rl->report[i] == report) {
            DEBUG_PRINTF("reporting %u\n", rl->report[i]);
            return 1;
        }
    }

    return 0;
}

static really_inline
char fireReports32(const struct sheng32 *sh, NfaCallback cb, void *ctxt,
                   const u8 state, u64a loc, u8 *const cached_accept_state,
                   ReportID *const cached_accept_id, char eod) {
    DEBUG_PRINTF("reporting matches @ %llu\n", loc);

    if (!eod && state == *cached_accept_state) {
        DEBUG_PRINTF("reporting %u\n", *cached_accept_id);
        if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }

        return MO_CONTINUE_MATCHING; /* continue execution */
    }
    const struct sstate_aux *aux = get_aux32(sh, state);
    const struct report_list *rl = eod ? get_eod_rl32(sh, aux) :
                                         get_rl32(sh, aux);
    assert(ISALIGNED(rl));

    DEBUG_PRINTF("report list has %u entries\n", rl->count);
    u32 count = rl->count;

    if (!eod && count == 1) {
        *cached_accept_state = state;
        *cached_accept_id = rl->report[0];

        DEBUG_PRINTF("reporting %u\n", rl->report[0]);
        if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }

        return MO_CONTINUE_MATCHING; /* continue execution */
    }

    for (u32 i = 0; i < count; i++) {
        DEBUG_PRINTF("reporting %u\n", rl->report[i]);
        if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }
    }
    return MO_CONTINUE_MATCHING; /* continue execution */
}

// Sheng64
static really_inline
const struct sheng64 *get_sheng64(const struct NFA *n) {
    return (const struct sheng64 *)getImplNfa(n);
}

static really_inline
const struct sstate_aux *get_aux64(const struct sheng64 *sh, u8 id) {
    u32 offset = sh->aux_offset - sizeof(struct NFA) +
            (id & SHENG64_STATE_MASK) * sizeof(struct sstate_aux);
    DEBUG_PRINTF("Getting aux for state %u at offset %llu\n",
                 id & SHENG64_STATE_MASK, (u64a)offset + sizeof(struct NFA));
    return (const struct sstate_aux *)((const char *) sh + offset);
}

static really_inline
const struct report_list *get_rl64(const struct sheng64 *sh,
                                   const struct sstate_aux *aux) {
    DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept);
    return (const struct report_list *)
        ((const char *)sh + aux->accept - sizeof(struct NFA));
}

static really_inline
const struct report_list *get_eod_rl64(const struct sheng64 *sh,
                                       const struct sstate_aux *aux) {
    DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept);
    return (const struct report_list *)
        ((const char *)sh + aux->accept_eod - sizeof(struct NFA));
}

static really_inline
char sheng64HasAccept(const struct sheng64 *sh, const struct sstate_aux *aux,
                      ReportID report) {
    assert(sh && aux);

    const struct report_list *rl = get_rl64(sh, aux);
    assert(ISALIGNED_N(rl, 4));

    DEBUG_PRINTF("report list has %u entries\n", rl->count);

    for (u32 i = 0; i < rl->count; i++) {
        if (rl->report[i] == report) {
            DEBUG_PRINTF("reporting %u\n", rl->report[i]);
            return 1;
        }
    }

    return 0;
}

static really_inline
char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt,
                   const u8 state, u64a loc, u8 *const cached_accept_state,
                   ReportID *const cached_accept_id, char eod) {
    DEBUG_PRINTF("reporting matches @ %llu\n", loc);

    if (!eod && state == *cached_accept_state) {
        DEBUG_PRINTF("reporting %u\n", *cached_accept_id);
        if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }

        return MO_CONTINUE_MATCHING; /* continue execution */
    }
    const struct sstate_aux *aux = get_aux64(sh, state);
    const struct report_list *rl = eod ? get_eod_rl64(sh, aux) :
                                         get_rl64(sh, aux);
    assert(ISALIGNED(rl));

    DEBUG_PRINTF("report list has %u entries\n", rl->count);
    u32 count = rl->count;

    if (!eod && count == 1) {
        *cached_accept_state = state;
        *cached_accept_id = rl->report[0];

        DEBUG_PRINTF("reporting %u\n", rl->report[0]);
        if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }

        return MO_CONTINUE_MATCHING; /* continue execution */
    }

    for (u32 i = 0; i < count; i++) {
        DEBUG_PRINTF("reporting %u\n", rl->report[i]);
        if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) {
            return MO_HALT_MATCHING; /* termination requested */
        }
    }
    return MO_CONTINUE_MATCHING; /* continue execution */
}
#endif // end of HAVE_AVX512VBMI

/* include Sheng function definitions */
#include "sheng_defs.h"

static really_inline
char runShengCb(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset,
                u8 *const cached_accept_state, ReportID *const cached_accept_id,
                const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die,
                u8 has_accel, u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die,
                 !!has_accel, !!single);
    int rv;
    /* scan and report all matches */
    if (can_die) {
        if (has_accel) {
            rv = sheng4_coda(state, cb, ctxt, sh, cached_accept_state,
                             cached_accept_id, single, offset, cur_buf, start,
                             end, scanned);
        } else {
            rv = sheng4_cod(state, cb, ctxt, sh, cached_accept_state,
                            cached_accept_id, single, offset, cur_buf, start,
                            end, scanned);
        }
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        rv = sheng_cod(state, cb, ctxt, sh, cached_accept_state,
                       cached_accept_id, single, offset, cur_buf, *scanned, end,
                       scanned);
    } else {
        if (has_accel) {
            rv = sheng4_coa(state, cb, ctxt, sh, cached_accept_state,
                            cached_accept_id, single, offset, cur_buf, start,
                            end, scanned);
        } else {
            rv = sheng4_co(state, cb, ctxt, sh, cached_accept_state,
                           cached_accept_id, single, offset, cur_buf, start,
                           end, scanned);
        }
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        rv = sheng_co(state, cb, ctxt, sh, cached_accept_state,
                      cached_accept_id, single, offset, cur_buf, *scanned, end,
                      scanned);
    }
    if (rv == MO_HALT_MATCHING) {
        return MO_DEAD;
    }
    return MO_ALIVE;
}

static really_inline
void runShengNm(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset,
                u8 *const cached_accept_state, ReportID *const cached_accept_id,
                const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die,
                u8 has_accel, u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die,
                 !!has_accel, !!single);
    /* just scan the buffer */
    if (can_die) {
        if (has_accel) {
            sheng4_nmda(state, cb, ctxt, sh, cached_accept_state,
                        cached_accept_id, single, offset, cur_buf, start, end,
                        scanned);
        } else {
            sheng4_nmd(state, cb, ctxt, sh, cached_accept_state,
                       cached_accept_id, single, offset, cur_buf, start, end,
                       scanned);
        }
        sheng_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                  single, offset, cur_buf, *scanned, end, scanned);
    } else {
        sheng4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                  single, offset, cur_buf, start, end, scanned);
        sheng_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                 single, offset, cur_buf, *scanned, end, scanned);
    }
}

static really_inline
char runShengSam(const struct sheng *sh, NfaCallback cb, void *ctxt,
                 u64a offset, u8 *const cached_accept_state,
                 ReportID *const cached_accept_id, const u8 *cur_buf,
                 const u8 *start, const u8 *end, u8 can_die, u8 has_accel,
                 u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die,
                 !!has_accel, !!single);
    int rv;
    /* scan until first match */
    if (can_die) {
        if (has_accel) {
            rv = sheng4_samda(state, cb, ctxt, sh, cached_accept_state,
                              cached_accept_id, single, offset, cur_buf, start,
                              end, scanned);
        } else {
            rv = sheng4_samd(state, cb, ctxt, sh, cached_accept_state,
                             cached_accept_id, single, offset, cur_buf, start,
                             end, scanned);
        }
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        /* if we stopped before we expected, we found a match */
        if (rv == MO_MATCHES_PENDING) {
            return MO_MATCHES_PENDING;
        }

        rv = sheng_samd(state, cb, ctxt, sh, cached_accept_state,
                        cached_accept_id, single, offset, cur_buf, *scanned,
                        end, scanned);
    } else {
        if (has_accel) {
            rv = sheng4_sama(state, cb, ctxt, sh, cached_accept_state,
                             cached_accept_id, single, offset, cur_buf, start,
                             end, scanned);
        } else {
            rv = sheng4_sam(state, cb, ctxt, sh, cached_accept_state,
                            cached_accept_id, single, offset, cur_buf, start,
                            end, scanned);
        }
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        /* if we stopped before we expected, we found a match */
        if (rv == MO_MATCHES_PENDING) {
            return MO_MATCHES_PENDING;
        }

        rv = sheng_sam(state, cb, ctxt, sh, cached_accept_state,
                       cached_accept_id, single, offset, cur_buf, *scanned, end,
                       scanned);
    }
    if (rv == MO_HALT_MATCHING) {
        return MO_DEAD;
    }
    /* if we stopped before we expected, we found a match */
    if (rv == MO_MATCHES_PENDING) {
        return MO_MATCHES_PENDING;
    }
    return MO_ALIVE;
}

static never_inline
char runSheng(const struct sheng *sh, struct mq *q, s64a b_end,
              enum MatchMode mode) {
    u8 state = *(u8 *)q->state;
    u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE;
    u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL;
    u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT;

    u8 cached_accept_state = 0;
    ReportID cached_accept_id = 0;

    DEBUG_PRINTF("starting Sheng execution in state %u\n",
                 state & SHENG_STATE_MASK);

    if (q->report_current) {
        DEBUG_PRINTF("reporting current pending matches\n");
        assert(sh);

        q->report_current = 0;

        int rv;
        if (single) {
            rv = fireSingleReport(q->cb, q->context, sh->report,
                                  q_cur_offset(q));
        } else {
            rv = fireReports(sh, q->cb, q->context, state, q_cur_offset(q),
                             &cached_accept_state, &cached_accept_id, 0);
        }
        if (rv == MO_HALT_MATCHING) {
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK);
            return MO_DEAD;
        }

        DEBUG_PRINTF("proceeding with matching\n");
    }

    assert(q_cur_type(q) == MQE_START);
    s64a start = q_cur_loc(q);

    DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start,
                 mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" :
                     mode == NO_MATCHES ? "NO MATCHES" :
                         mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???");

    DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q),
                 q_cur_type(q) == MQE_START ? "START" :
                     q_cur_type(q) == MQE_TOP ? "TOP" :
                         q_cur_type(q) == MQE_END ? "END" : "???");

    const u8* cur_buf;
    if (start < 0) {
        DEBUG_PRINTF("negative location, scanning history\n");
        DEBUG_PRINTF("min location: %zd\n", -q->hlength);
        cur_buf = q->history + q->hlength;
    } else {
        DEBUG_PRINTF("positive location, scanning buffer\n");
        DEBUG_PRINTF("max location: %lli\n", b_end);
        cur_buf = q->buffer;
    }

    /* if we our queue event is past our end */
    if (mode != NO_MATCHES && q_cur_loc(q) > b_end) {
        DEBUG_PRINTF("current location past buffer end\n");
        DEBUG_PRINTF("setting q location to %llu\n", b_end);
        DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK);
        q->items[q->cur].location = b_end;
        return MO_ALIVE;
    }

    q->cur++;

    s64a cur_start = start;

    while (1) {
        DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q),
                     q_cur_type(q) == MQE_START ? "START" :
                             q_cur_type(q) == MQE_TOP ? "TOP" :
                                     q_cur_type(q) == MQE_END ? "END" : "???");
        s64a end = q_cur_loc(q);
        if (mode != NO_MATCHES) {
            end = MIN(end, b_end);
        }
        assert(end <= (s64a) q->length);
        s64a cur_end = end;

        /* we may cross the border between history and current buffer */
        if (cur_start < 0) {
            cur_end = MIN(0, cur_end);
        }

        DEBUG_PRINTF("start: %lli end: %lli\n", start, end);

        /* don't scan zero length buffer */
        if (cur_start != cur_end) {
            const u8 * scanned = cur_buf;
            char rv;

            if (mode == NO_MATCHES) {
                runShengNm(sh, q->cb, q->context, q->offset,
                           &cached_accept_state, &cached_accept_id, cur_buf,
                           cur_buf + cur_start, cur_buf + cur_end, can_die,
                           has_accel, single, &scanned, &state);
            } else if (mode == CALLBACK_OUTPUT) {
                rv = runShengCb(sh, q->cb, q->context, q->offset,
                                &cached_accept_state, &cached_accept_id,
                                cur_buf, cur_buf + cur_start, cur_buf + cur_end,
                                can_die, has_accel, single, &scanned, &state);
                if (rv == MO_DEAD) {
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG_STATE_MASK);
                    return MO_DEAD;
                }
            } else if (mode == STOP_AT_MATCH) {
                rv = runShengSam(sh, q->cb, q->context, q->offset,
                                 &cached_accept_state, &cached_accept_id,
                                 cur_buf, cur_buf + cur_start,
                                 cur_buf + cur_end, can_die, has_accel, single,
                                 &scanned, &state);
                if (rv == MO_DEAD) {
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG_STATE_MASK);
                    return rv;
                } else if (rv == MO_MATCHES_PENDING) {
                    assert(q->cur);
                    DEBUG_PRINTF("found a match, setting q location to %zd\n",
                                 scanned - cur_buf + 1);
                    q->cur--;
                    q->items[q->cur].type = MQE_START;
                    q->items[q->cur].location =
                            scanned - cur_buf + 1; /* due to exiting early */
                    *(u8 *)q->state = state;
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG_STATE_MASK);
                    return rv;
                }
            } else {
                assert(!"invalid scanning mode!");
            }
            assert(scanned == cur_buf + cur_end);

            cur_start = cur_end;
        }

        /* if we our queue event is past our end */
        if (mode != NO_MATCHES && q_cur_loc(q) > b_end) {
            DEBUG_PRINTF("current location past buffer end\n");
            DEBUG_PRINTF("setting q location to %llu\n", b_end);
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK);
            q->cur--;
            q->items[q->cur].type = MQE_START;
            q->items[q->cur].location = b_end;
            *(u8 *)q->state = state;
            return MO_ALIVE;
        }

        /* crossing over into actual buffer */
        if (cur_start == 0) {
            DEBUG_PRINTF("positive location, scanning buffer\n");
            DEBUG_PRINTF("max offset: %lli\n", b_end);
            cur_buf = q->buffer;
        }

        /* continue scanning the same buffer */
        if (end != cur_end) {
            continue;
        }

        switch (q_cur_type(q)) {
        case MQE_END:
            *(u8 *)q->state = state;
            q->cur++;
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK);
            if (can_die) {
                return (state & SHENG_STATE_DEAD) ? MO_DEAD : MO_ALIVE;
            }
            return MO_ALIVE;
        case MQE_TOP:
            if (q->offset + cur_start == 0) {
                DEBUG_PRINTF("Anchored start, going to state %u\n",
                             sh->anchored);
                state = sh->anchored;
            } else {
                u8 new_state = get_aux(sh, state)->top;
                DEBUG_PRINTF("Top event %u->%u\n", state & SHENG_STATE_MASK,
                             new_state & SHENG_STATE_MASK);
                state = new_state;
            }
            break;
        default:
            assert(!"invalid queue event");
            break;
        }
        q->cur++;
    }
}

char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer,
                    size_t length, NfaCallback cb, void *context) {
    DEBUG_PRINTF("smallwrite Sheng\n");
    assert(n->type == SHENG_NFA);
    const struct sheng *sh = getImplNfa(n);
    u8 state = sh->anchored;
    u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE;
    u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL;
    u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT;
    u8 cached_accept_state = 0;
    ReportID cached_accept_id = 0;

    /* scan and report all matches */
    int rv;
    s64a end = length;
    const u8 *scanned;

    rv = runShengCb(sh, cb, context, offset, &cached_accept_state,
                    &cached_accept_id, buffer, buffer, buffer + end, can_die,
                    has_accel, single, &scanned, &state);
    if (rv == MO_DEAD) {
        DEBUG_PRINTF("exiting in state %u\n",
                     state & SHENG_STATE_MASK);
        return MO_DEAD;
    }

    DEBUG_PRINTF("%u\n", state & SHENG_STATE_MASK);

    const struct sstate_aux *aux = get_aux(sh, state);

    if (aux->accept_eod) {
        DEBUG_PRINTF("Reporting EOD matches\n");
        fireReports(sh, cb, context, state, end + offset, &cached_accept_state,
                    &cached_accept_id, 1);
    }

    return state & SHENG_STATE_DEAD ? MO_DEAD : MO_ALIVE;
}

char nfaExecSheng_Q(const struct NFA *n, struct mq *q, s64a end) {
    const struct sheng *sh = get_sheng(n);
    char rv = runSheng(sh, q, end, CALLBACK_OUTPUT);
    return rv;
}

char nfaExecSheng_Q2(const struct NFA *n, struct mq *q, s64a end) {
    const struct sheng *sh = get_sheng(n);
    char rv = runSheng(sh, q, end, STOP_AT_MATCH);
    return rv;
}

char nfaExecSheng_QR(const struct NFA *n, struct mq *q, ReportID report) {
    assert(q_cur_type(q) == MQE_START);

    const struct sheng *sh = get_sheng(n);
    char rv = runSheng(sh, q, 0 /* end */, NO_MATCHES);

    if (rv && nfaExecSheng_inAccept(n, report, q)) {
        return MO_MATCHES_PENDING;
    }
    return rv;
}

char nfaExecSheng_inAccept(const struct NFA *n, ReportID report, struct mq *q) {
    assert(n && q);

    const struct sheng *sh = get_sheng(n);
    u8 s = *(const u8 *)q->state;
    DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK));

    const struct sstate_aux *aux = get_aux(sh, s);

    if (!aux->accept) {
        return 0;
    }

    return shengHasAccept(sh, aux, report);
}

char nfaExecSheng_inAnyAccept(const struct NFA *n, struct mq *q) {
    assert(n && q);

    const struct sheng *sh = get_sheng(n);
    u8 s = *(const u8 *)q->state;
    DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK));

    const struct sstate_aux *aux = get_aux(sh, s);
    return !!aux->accept;
}

char nfaExecSheng_testEOD(const struct NFA *nfa, const char *state,
                          UNUSED const char *streamState, u64a offset,
                          NfaCallback cb, void *ctxt) {
    assert(nfa);

    const struct sheng *sh = get_sheng(nfa);
    u8 s = *(const u8 *)state;
    DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG_STATE_MASK));

    const struct sstate_aux *aux = get_aux(sh, s);

    if (!aux->accept_eod) {
        return MO_CONTINUE_MATCHING;
    }

    return fireReports(sh, cb, ctxt, s, offset, NULL, NULL, 1);
}

char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q) {
    const struct sheng *sh = (const struct sheng *)getImplNfa(n);
    NfaCallback cb = q->cb;
    void *ctxt = q->context;
    u8 s = *(u8 *)q->state;
    const struct sstate_aux *aux = get_aux(sh, s);
    u64a offset = q_cur_offset(q);
    u8 cached_state_id = 0;
    ReportID cached_report_id = 0;
    assert(q_cur_type(q) == MQE_START);

    if (aux->accept) {
        if (sh->flags & SHENG_FLAG_SINGLE_REPORT) {
            fireSingleReport(cb, ctxt, sh->report, offset);
        } else {
            fireReports(sh, cb, ctxt, s, offset, &cached_state_id,
                        &cached_report_id, 0);
        }
    }

    return 0;
}

char nfaExecSheng_initCompressedState(const struct NFA *nfa, u64a offset,
                                      void *state, UNUSED u8 key) {
    const struct sheng *sh = get_sheng(nfa);
    u8 *s = (u8 *)state;
    *s = offset ? sh->floating: sh->anchored;
    return !(*s & SHENG_STATE_DEAD);
}

char nfaExecSheng_queueInitState(const struct NFA *nfa, struct mq *q) {
    assert(nfa->scratchStateSize == 1);

    /* starting in floating state */
    const struct sheng *sh = get_sheng(nfa);
    *(u8 *)q->state = sh->floating;
    DEBUG_PRINTF("starting in floating state\n");
    return 0;
}

char nfaExecSheng_queueCompressState(UNUSED const struct NFA *nfa,
                                     const struct mq *q, UNUSED s64a loc) {
    void *dest = q->streamState;
    const void *src = q->state;
    assert(nfa->scratchStateSize == 1);
    assert(nfa->streamStateSize == 1);
    *(u8 *)dest = *(const u8 *)src;
    return 0;
}

char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest,
                              const void *src, UNUSED u64a offset,
                              UNUSED u8 key) {
    assert(nfa->scratchStateSize == 1);
    assert(nfa->streamStateSize == 1);
    *(u8 *)dest = *(const u8 *)src;
    return 0;
}

#if defined(HAVE_AVX512VBMI)
// Sheng32
static really_inline
char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt,
                  u64a offset, u8 *const cached_accept_state,
                  ReportID *const cached_accept_id, const u8 *cur_buf,
                  const u8 *start, const u8 *end, u8 can_die,
                  u8 has_accel, u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die,
                 !!has_accel, !!single);
    int rv;
    /* scan and report all matches */
    if (can_die) {
        if (has_accel) {
            rv = sheng32_4_coda(state, cb, ctxt, sh, cached_accept_state,
                                cached_accept_id, single, offset, cur_buf,
                                start, end, scanned);
        } else {
            rv = sheng32_4_cod(state, cb, ctxt, sh, cached_accept_state,
                               cached_accept_id, single, offset, cur_buf,
                               start, end, scanned);
        }
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        rv = sheng32_cod(state, cb, ctxt, sh, cached_accept_state,
                         cached_accept_id, single, offset, cur_buf,
                         *scanned, end, scanned);
    } else {
        if (has_accel) {
            rv = sheng32_4_coa(state, cb, ctxt, sh, cached_accept_state,
                               cached_accept_id, single, offset, cur_buf,
                               start, end, scanned);
        } else {
            rv = sheng32_4_co(state, cb, ctxt, sh, cached_accept_state,
                              cached_accept_id, single, offset, cur_buf,
                              start, end, scanned);
        }
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        rv = sheng32_co(state, cb, ctxt, sh, cached_accept_state,
                        cached_accept_id, single, offset, cur_buf,
                        *scanned, end, scanned);
    }
    if (rv == MO_HALT_MATCHING) {
        return MO_DEAD;
    }
    return MO_ALIVE;
}

static really_inline
void runSheng32Nm(const struct sheng32 *sh, NfaCallback cb, void *ctxt,
                  u64a offset, u8 *const cached_accept_state,
                  ReportID *const cached_accept_id, const u8 *cur_buf,
                  const u8 *start, const u8 *end, u8 can_die, u8 has_accel,
                  u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die,
                 !!has_accel, !!single);
    /* just scan the buffer */
    if (can_die) {
        if (has_accel) {
            sheng32_4_nmda(state, cb, ctxt, sh, cached_accept_state,
                           cached_accept_id, single, offset, cur_buf,
                           start, end, scanned);
        } else {
            sheng32_4_nmd(state, cb, ctxt, sh, cached_accept_state,
                          cached_accept_id, single, offset, cur_buf,
                          start, end, scanned);
        }
        sheng32_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                    single, offset, cur_buf, *scanned, end, scanned);
    } else {
        sheng32_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                     single, offset, cur_buf, start, end, scanned);
        sheng32_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                   single, offset, cur_buf, *scanned, end, scanned);
    }
}

static really_inline
char runSheng32Sam(const struct sheng32 *sh, NfaCallback cb, void *ctxt,
                   u64a offset, u8 *const cached_accept_state,
                   ReportID *const cached_accept_id, const u8 *cur_buf,
                   const u8 *start, const u8 *end, u8 can_die, u8 has_accel,
                   u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die,
                 !!has_accel, !!single);
    int rv;
    /* scan until first match */
    if (can_die) {
        if (has_accel) {
            rv = sheng32_4_samda(state, cb, ctxt, sh, cached_accept_state,
                                 cached_accept_id, single, offset, cur_buf,
                                 start, end, scanned);
        } else {
            rv = sheng32_4_samd(state, cb, ctxt, sh, cached_accept_state,
                                cached_accept_id, single, offset, cur_buf,
                                start, end, scanned);
        }
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        /* if we stopped before we expected, we found a match */
        if (rv == MO_MATCHES_PENDING) {
            return MO_MATCHES_PENDING;
        }

        rv = sheng32_samd(state, cb, ctxt, sh, cached_accept_state,
                          cached_accept_id, single, offset, cur_buf,
                          *scanned, end, scanned);
    } else {
        if (has_accel) {
            rv = sheng32_4_sama(state, cb, ctxt, sh, cached_accept_state,
                                cached_accept_id, single, offset, cur_buf,
                                start, end, scanned);
        } else {
            rv = sheng32_4_sam(state, cb, ctxt, sh, cached_accept_state,
                               cached_accept_id, single, offset, cur_buf,
                               start, end, scanned);
        }
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        /* if we stopped before we expected, we found a match */
        if (rv == MO_MATCHES_PENDING) {
            return MO_MATCHES_PENDING;
        }

        rv = sheng32_sam(state, cb, ctxt, sh, cached_accept_state,
                         cached_accept_id, single, offset, cur_buf,
                         *scanned, end, scanned);
    }
    if (rv == MO_HALT_MATCHING) {
        return MO_DEAD;
    }
    /* if we stopped before we expected, we found a match */
    if (rv == MO_MATCHES_PENDING) {
        return MO_MATCHES_PENDING;
    }
    return MO_ALIVE;
}

static never_inline
char runSheng32(const struct sheng32 *sh, struct mq *q, s64a b_end,
                enum MatchMode mode) {
    u8 state = *(u8 *)q->state;
    u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE;
    u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL;
    u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT;

    u8 cached_accept_state = 0;
    ReportID cached_accept_id = 0;

    DEBUG_PRINTF("starting Sheng32 execution in state %u\n",
                 state & SHENG32_STATE_MASK);

    if (q->report_current) {
        DEBUG_PRINTF("reporting current pending matches\n");
        assert(sh);

        q->report_current = 0;

        int rv;
        if (single) {
            rv = fireSingleReport(q->cb, q->context, sh->report,
                                  q_cur_offset(q));
        } else {
            rv = fireReports32(sh, q->cb, q->context, state, q_cur_offset(q),
                               &cached_accept_state, &cached_accept_id, 0);
        }
        if (rv == MO_HALT_MATCHING) {
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK);
            return MO_DEAD;
        }

        DEBUG_PRINTF("proceeding with matching\n");
    }

    assert(q_cur_type(q) == MQE_START);
    s64a start = q_cur_loc(q);

    DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start,
                 mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" :
                     mode == NO_MATCHES ? "NO MATCHES" :
                         mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???");

    DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q),
                 q_cur_type(q) == MQE_START ? "START" :
                     q_cur_type(q) == MQE_TOP ? "TOP" :
                         q_cur_type(q) == MQE_END ? "END" : "???");

    const u8* cur_buf;
    if (start < 0) {
        DEBUG_PRINTF("negative location, scanning history\n");
        DEBUG_PRINTF("min location: %zd\n", -q->hlength);
        cur_buf = q->history + q->hlength;
    } else {
        DEBUG_PRINTF("positive location, scanning buffer\n");
        DEBUG_PRINTF("max location: %lli\n", b_end);
        cur_buf = q->buffer;
    }

    /* if we our queue event is past our end */
    if (mode != NO_MATCHES && q_cur_loc(q) > b_end) {
        DEBUG_PRINTF("current location past buffer end\n");
        DEBUG_PRINTF("setting q location to %llu\n", b_end);
        DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK);
        q->items[q->cur].location = b_end;
        return MO_ALIVE;
    }

    q->cur++;

    s64a cur_start = start;

    while (1) {
        DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q),
                     q_cur_type(q) == MQE_START ? "START" :
                             q_cur_type(q) == MQE_TOP ? "TOP" :
                                     q_cur_type(q) == MQE_END ? "END" : "???");
        s64a end = q_cur_loc(q);
        if (mode != NO_MATCHES) {
            end = MIN(end, b_end);
        }
        assert(end <= (s64a) q->length);
        s64a cur_end = end;

        /* we may cross the border between history and current buffer */
        if (cur_start < 0) {
            cur_end = MIN(0, cur_end);
        }

        DEBUG_PRINTF("start: %lli end: %lli\n", start, end);

        /* don't scan zero length buffer */
        if (cur_start != cur_end) {
            const u8 * scanned = cur_buf;
            char rv;

            if (mode == NO_MATCHES) {
                runSheng32Nm(sh, q->cb, q->context, q->offset,
                             &cached_accept_state, &cached_accept_id, cur_buf,
                             cur_buf + cur_start, cur_buf + cur_end, can_die,
                             has_accel, single, &scanned, &state);
            } else if (mode == CALLBACK_OUTPUT) {
                rv = runSheng32Cb(sh, q->cb, q->context, q->offset,
                                  &cached_accept_state, &cached_accept_id,
                                  cur_buf, cur_buf + cur_start, cur_buf + cur_end,
                                  can_die, has_accel, single, &scanned, &state);
                if (rv == MO_DEAD) {
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG32_STATE_MASK);
                    return MO_DEAD;
                }
            } else if (mode == STOP_AT_MATCH) {
                rv = runSheng32Sam(sh, q->cb, q->context, q->offset,
                                   &cached_accept_state, &cached_accept_id,
                                   cur_buf, cur_buf + cur_start,
                                   cur_buf + cur_end, can_die, has_accel, single,
                                   &scanned, &state);
                if (rv == MO_DEAD) {
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG32_STATE_MASK);
                    return rv;
                } else if (rv == MO_MATCHES_PENDING) {
                    assert(q->cur);
                    DEBUG_PRINTF("found a match, setting q location to %zd\n",
                                 scanned - cur_buf + 1);
                    q->cur--;
                    q->items[q->cur].type = MQE_START;
                    q->items[q->cur].location =
                            scanned - cur_buf + 1; /* due to exiting early */
                    *(u8 *)q->state = state;
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG32_STATE_MASK);
                    return rv;
                }
            } else {
                assert(!"invalid scanning mode!");
            }
            assert(scanned == cur_buf + cur_end);

            cur_start = cur_end;
        }

        /* if we our queue event is past our end */
        if (mode != NO_MATCHES && q_cur_loc(q) > b_end) {
            DEBUG_PRINTF("current location past buffer end\n");
            DEBUG_PRINTF("setting q location to %llu\n", b_end);
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK);
            q->cur--;
            q->items[q->cur].type = MQE_START;
            q->items[q->cur].location = b_end;
            *(u8 *)q->state = state;
            return MO_ALIVE;
        }

        /* crossing over into actual buffer */
        if (cur_start == 0) {
            DEBUG_PRINTF("positive location, scanning buffer\n");
            DEBUG_PRINTF("max offset: %lli\n", b_end);
            cur_buf = q->buffer;
        }

        /* continue scanning the same buffer */
        if (end != cur_end) {
            continue;
        }

        switch (q_cur_type(q)) {
        case MQE_END:
            *(u8 *)q->state = state;
            q->cur++;
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK);
            if (can_die) {
                return (state & SHENG32_STATE_DEAD) ? MO_DEAD : MO_ALIVE;
            }
            return MO_ALIVE;
        case MQE_TOP:
            if (q->offset + cur_start == 0) {
                DEBUG_PRINTF("Anchored start, going to state %u\n",
                             sh->anchored);
                state = sh->anchored;
            } else {
                u8 new_state = get_aux32(sh, state)->top;
                DEBUG_PRINTF("Top event %u->%u\n", state & SHENG32_STATE_MASK,
                             new_state & SHENG32_STATE_MASK);
                state = new_state;
            }
            break;
        default:
            assert(!"invalid queue event");
            break;
        }
        q->cur++;
    }
}

char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer,
                      size_t length, NfaCallback cb, void *context) {
    DEBUG_PRINTF("smallwrite Sheng32\n");
    assert(n->type == SHENG_NFA_32);
    const struct sheng32 *sh = getImplNfa(n);
    u8 state = sh->anchored;
    u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE;
    u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL;
    u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT;
    u8 cached_accept_state = 0;
    ReportID cached_accept_id = 0;

    /* scan and report all matches */
    int rv;
    s64a end = length;
    const u8 *scanned;

    rv = runSheng32Cb(sh, cb, context, offset, &cached_accept_state,
                      &cached_accept_id, buffer, buffer, buffer + end, can_die,
                      has_accel, single, &scanned, &state);
    if (rv == MO_DEAD) {
        DEBUG_PRINTF("exiting in state %u\n",
                     state & SHENG32_STATE_MASK);
        return MO_DEAD;
    }

    DEBUG_PRINTF("%u\n", state & SHENG32_STATE_MASK);

    const struct sstate_aux *aux = get_aux32(sh, state);

    if (aux->accept_eod) {
        DEBUG_PRINTF("Reporting EOD matches\n");
        fireReports32(sh, cb, context, state, end + offset,
                      &cached_accept_state, &cached_accept_id, 1);
    }

    return state & SHENG32_STATE_DEAD ? MO_DEAD : MO_ALIVE;
}

char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end) {
    const struct sheng32 *sh = get_sheng32(n);
    char rv = runSheng32(sh, q, end, CALLBACK_OUTPUT);
    return rv;
}

char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end) {
    const struct sheng32 *sh = get_sheng32(n);
    char rv = runSheng32(sh, q, end, STOP_AT_MATCH);
    return rv;
}

char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report) {
    assert(q_cur_type(q) == MQE_START);

    const struct sheng32 *sh = get_sheng32(n);
    char rv = runSheng32(sh, q, 0 /* end */, NO_MATCHES);

    if (rv && nfaExecSheng32_inAccept(n, report, q)) {
        return MO_MATCHES_PENDING;
    }
    return rv;
}

char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report,
                             struct mq *q) {
    assert(n && q);

    const struct sheng32 *sh = get_sheng32(n);
    u8 s = *(const u8 *)q->state;
    DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK));

    const struct sstate_aux *aux = get_aux32(sh, s);

    if (!aux->accept) {
        return 0;
    }

    return sheng32HasAccept(sh, aux, report);
}

char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q) {
    assert(n && q);

    const struct sheng32 *sh = get_sheng32(n);
    u8 s = *(const u8 *)q->state;
    DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK));

    const struct sstate_aux *aux = get_aux32(sh, s);
    return !!aux->accept;
}

char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state,
                            UNUSED const char *streamState, u64a offset,
                            NfaCallback cb, void *ctxt) {
    assert(nfa);

    const struct sheng32 *sh = get_sheng32(nfa);
    u8 s = *(const u8 *)state;
    DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG32_STATE_MASK));

    const struct sstate_aux *aux = get_aux32(sh, s);

    if (!aux->accept_eod) {
        return MO_CONTINUE_MATCHING;
    }

    return fireReports32(sh, cb, ctxt, s, offset, NULL, NULL, 1);
}

char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q) {
    const struct sheng32 *sh = (const struct sheng32 *)getImplNfa(n);
    NfaCallback cb = q->cb;
    void *ctxt = q->context;
    u8 s = *(u8 *)q->state;
    const struct sstate_aux *aux = get_aux32(sh, s);
    u64a offset = q_cur_offset(q);
    u8 cached_state_id = 0;
    ReportID cached_report_id = 0;
    assert(q_cur_type(q) == MQE_START);

    if (aux->accept) {
        if (sh->flags & SHENG_FLAG_SINGLE_REPORT) {
            fireSingleReport(cb, ctxt, sh->report, offset);
        } else {
            fireReports32(sh, cb, ctxt, s, offset, &cached_state_id,
                          &cached_report_id, 0);
        }
    }

    return 0;
}

char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset,
                                        void *state, UNUSED u8 key) {
    const struct sheng32 *sh = get_sheng32(nfa);
    u8 *s = (u8 *)state;
    *s = offset ? sh->floating: sh->anchored;
    return !(*s & SHENG32_STATE_DEAD);
}

char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q) {
    assert(nfa->scratchStateSize == 1);

    /* starting in floating state */
    const struct sheng32 *sh = get_sheng32(nfa);
    *(u8 *)q->state = sh->floating;
    DEBUG_PRINTF("starting in floating state\n");
    return 0;
}

char nfaExecSheng32_queueCompressState(UNUSED const struct NFA *nfa,
                                       const struct mq *q, UNUSED s64a loc) {
    void *dest = q->streamState;
    const void *src = q->state;
    assert(nfa->scratchStateSize == 1);
    assert(nfa->streamStateSize == 1);
    *(u8 *)dest = *(const u8 *)src;
    return 0;
}

char nfaExecSheng32_expandState(UNUSED const struct NFA *nfa, void *dest,
                                const void *src, UNUSED u64a offset,
                                UNUSED u8 key) {
    assert(nfa->scratchStateSize == 1);
    assert(nfa->streamStateSize == 1);
    *(u8 *)dest = *(const u8 *)src;
    return 0;
}

// Sheng64
static really_inline
char runSheng64Cb(const struct sheng64 *sh, NfaCallback cb, void *ctxt,
                  u64a offset, u8 *const cached_accept_state,
                  ReportID *const cached_accept_id, const u8 *cur_buf,
                  const u8 *start, const u8 *end, u8 can_die,
                  u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single);
    int rv;
    /* scan and report all matches */
    if (can_die) {
        rv = sheng64_4_cod(state, cb, ctxt, sh, cached_accept_state,
                           cached_accept_id, single, offset, cur_buf,
                           start, end, scanned);
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        rv = sheng64_cod(state, cb, ctxt, sh, cached_accept_state,
                         cached_accept_id, single, offset, cur_buf,
                         *scanned, end, scanned);
    } else {
        rv = sheng64_4_co(state, cb, ctxt, sh, cached_accept_state,
                          cached_accept_id, single, offset, cur_buf,
                          start, end, scanned);
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        rv = sheng64_co(state, cb, ctxt, sh, cached_accept_state,
                        cached_accept_id, single, offset, cur_buf,
                        *scanned, end, scanned);
    }
    if (rv == MO_HALT_MATCHING) {
        return MO_DEAD;
    }
    return MO_ALIVE;
}

static really_inline
void runSheng64Nm(const struct sheng64 *sh, NfaCallback cb, void *ctxt,
                  u64a offset, u8 *const cached_accept_state,
                  ReportID *const cached_accept_id, const u8 *cur_buf,
                  const u8 *start, const u8 *end, u8 can_die,
                  u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single);
    /* just scan the buffer */
    if (can_die) {
        sheng64_4_nmd(state, cb, ctxt, sh, cached_accept_state,
                      cached_accept_id, single, offset, cur_buf,
                      start, end, scanned);
        sheng64_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                    single, offset, cur_buf, *scanned, end, scanned);
    } else {
        sheng64_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                     single, offset, cur_buf, start, end, scanned);
        sheng64_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id,
                   single, offset, cur_buf, *scanned, end, scanned);
    }
}

static really_inline
char runSheng64Sam(const struct sheng64 *sh, NfaCallback cb, void *ctxt,
                   u64a offset, u8 *const cached_accept_state,
                   ReportID *const cached_accept_id, const u8 *cur_buf,
                   const u8 *start, const u8 *end, u8 can_die,
                   u8 single, const u8 **scanned, u8 *state) {
    DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n",
                 (u64a)(end - start), offset);
    DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf),
                 (s64a)(end - cur_buf));
    DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single);
    int rv;
    /* scan until first match */
    if (can_die) {
        rv = sheng64_4_samd(state, cb, ctxt, sh, cached_accept_state,
                            cached_accept_id, single, offset, cur_buf,
                            start, end, scanned);
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        /* if we stopped before we expected, we found a match */
        if (rv == MO_MATCHES_PENDING) {
            return MO_MATCHES_PENDING;
        }

        rv = sheng64_samd(state, cb, ctxt, sh, cached_accept_state,
                          cached_accept_id, single, offset, cur_buf,
                          *scanned, end, scanned);
    } else {
        rv = sheng64_4_sam(state, cb, ctxt, sh, cached_accept_state,
                           cached_accept_id, single, offset, cur_buf,
                           start, end, scanned);
        if (rv == MO_HALT_MATCHING) {
            return MO_DEAD;
        }
        /* if we stopped before we expected, we found a match */
        if (rv == MO_MATCHES_PENDING) {
            return MO_MATCHES_PENDING;
        }

        rv = sheng64_sam(state, cb, ctxt, sh, cached_accept_state,
                         cached_accept_id, single, offset, cur_buf,
                         *scanned, end, scanned);
    }
    if (rv == MO_HALT_MATCHING) {
        return MO_DEAD;
    }
    /* if we stopped before we expected, we found a match */
    if (rv == MO_MATCHES_PENDING) {
        return MO_MATCHES_PENDING;
    }
    return MO_ALIVE;
}

static never_inline
char runSheng64(const struct sheng64 *sh, struct mq *q, s64a b_end,
                enum MatchMode mode) {
    u8 state = *(u8 *)q->state;
    u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE;
    u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT;

    u8 cached_accept_state = 0;
    ReportID cached_accept_id = 0;

    DEBUG_PRINTF("starting Sheng64 execution in state %u\n",
                 state & SHENG64_STATE_MASK);

    if (q->report_current) {
        DEBUG_PRINTF("reporting current pending matches\n");
        assert(sh);

        q->report_current = 0;

        int rv;
        if (single) {
            rv = fireSingleReport(q->cb, q->context, sh->report,
                                  q_cur_offset(q));
        } else {
            rv = fireReports64(sh, q->cb, q->context, state, q_cur_offset(q),
                               &cached_accept_state, &cached_accept_id, 0);
        }
        if (rv == MO_HALT_MATCHING) {
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK);
            return MO_DEAD;
        }

        DEBUG_PRINTF("proceeding with matching\n");
    }

    assert(q_cur_type(q) == MQE_START);
    s64a start = q_cur_loc(q);

    DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start,
                 mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" :
                     mode == NO_MATCHES ? "NO MATCHES" :
                         mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???");

    DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q),
                 q_cur_type(q) == MQE_START ? "START" :
                     q_cur_type(q) == MQE_TOP ? "TOP" :
                         q_cur_type(q) == MQE_END ? "END" : "???");

    const u8* cur_buf;
    if (start < 0) {
        DEBUG_PRINTF("negative location, scanning history\n");
        DEBUG_PRINTF("min location: %zd\n", -q->hlength);
        cur_buf = q->history + q->hlength;
    } else {
        DEBUG_PRINTF("positive location, scanning buffer\n");
        DEBUG_PRINTF("max location: %lli\n", b_end);
        cur_buf = q->buffer;
    }

    /* if we our queue event is past our end */
    if (mode != NO_MATCHES && q_cur_loc(q) > b_end) {
        DEBUG_PRINTF("current location past buffer end\n");
        DEBUG_PRINTF("setting q location to %llu\n", b_end);
        DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK);
        q->items[q->cur].location = b_end;
        return MO_ALIVE;
    }

    q->cur++;

    s64a cur_start = start;

    while (1) {
        DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q),
                     q_cur_type(q) == MQE_START ? "START" :
                             q_cur_type(q) == MQE_TOP ? "TOP" :
                                     q_cur_type(q) == MQE_END ? "END" : "???");
        s64a end = q_cur_loc(q);
        if (mode != NO_MATCHES) {
            end = MIN(end, b_end);
        }
        assert(end <= (s64a) q->length);
        s64a cur_end = end;

        /* we may cross the border between history and current buffer */
        if (cur_start < 0) {
            cur_end = MIN(0, cur_end);
        }

        DEBUG_PRINTF("start: %lli end: %lli\n", start, end);

        /* don't scan zero length buffer */
        if (cur_start != cur_end) {
            const u8 * scanned = cur_buf;
            char rv;

            if (mode == NO_MATCHES) {
                runSheng64Nm(sh, q->cb, q->context, q->offset,
                             &cached_accept_state, &cached_accept_id, cur_buf,
                             cur_buf + cur_start, cur_buf + cur_end, can_die,
                             single, &scanned, &state);
            } else if (mode == CALLBACK_OUTPUT) {
                rv = runSheng64Cb(sh, q->cb, q->context, q->offset,
                                  &cached_accept_state, &cached_accept_id,
                                  cur_buf, cur_buf + cur_start, cur_buf + cur_end,
                                  can_die, single, &scanned, &state);
                if (rv == MO_DEAD) {
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG64_STATE_MASK);
                    return MO_DEAD;
                }
            } else if (mode == STOP_AT_MATCH) {
                rv = runSheng64Sam(sh, q->cb, q->context, q->offset,
                                   &cached_accept_state, &cached_accept_id,
                                   cur_buf, cur_buf + cur_start,
                                   cur_buf + cur_end, can_die, single,
                                   &scanned, &state);
                if (rv == MO_DEAD) {
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG64_STATE_MASK);
                    return rv;
                } else if (rv == MO_MATCHES_PENDING) {
                    assert(q->cur);
                    DEBUG_PRINTF("found a match, setting q location to %zd\n",
                                 scanned - cur_buf + 1);
                    q->cur--;
                    q->items[q->cur].type = MQE_START;
                    q->items[q->cur].location =
                            scanned - cur_buf + 1; /* due to exiting early */
                    *(u8 *)q->state = state;
                    DEBUG_PRINTF("exiting in state %u\n",
                                 state & SHENG64_STATE_MASK);
                    return rv;
                }
            } else {
                assert(!"invalid scanning mode!");
            }
            assert(scanned == cur_buf + cur_end);

            cur_start = cur_end;
        }

        /* if we our queue event is past our end */
        if (mode != NO_MATCHES && q_cur_loc(q) > b_end) {
            DEBUG_PRINTF("current location past buffer end\n");
            DEBUG_PRINTF("setting q location to %llu\n", b_end);
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK);
            q->cur--;
            q->items[q->cur].type = MQE_START;
            q->items[q->cur].location = b_end;
            *(u8 *)q->state = state;
            return MO_ALIVE;
        }

        /* crossing over into actual buffer */
        if (cur_start == 0) {
            DEBUG_PRINTF("positive location, scanning buffer\n");
            DEBUG_PRINTF("max offset: %lli\n", b_end);
            cur_buf = q->buffer;
        }

        /* continue scanning the same buffer */
        if (end != cur_end) {
            continue;
        }

        switch (q_cur_type(q)) {
        case MQE_END:
            *(u8 *)q->state = state;
            q->cur++;
            DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK);
            if (can_die) {
                return (state & SHENG64_STATE_DEAD) ? MO_DEAD : MO_ALIVE;
            }
            return MO_ALIVE;
        case MQE_TOP:
            if (q->offset + cur_start == 0) {
                DEBUG_PRINTF("Anchored start, going to state %u\n",
                             sh->anchored);
                state = sh->anchored;
            } else {
                u8 new_state = get_aux64(sh, state)->top;
                DEBUG_PRINTF("Top event %u->%u\n", state & SHENG64_STATE_MASK,
                             new_state & SHENG64_STATE_MASK);
                state = new_state;
            }
            break;
        default:
            assert(!"invalid queue event");
            break;
        }
        q->cur++;
    }
}

char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer,
                      size_t length, NfaCallback cb, void *context) {
    DEBUG_PRINTF("smallwrite Sheng64\n");
    assert(n->type == SHENG_NFA_64);
    const struct sheng64 *sh = getImplNfa(n);
    u8 state = sh->anchored;
    u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE;
    u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT;
    u8 cached_accept_state = 0;
    ReportID cached_accept_id = 0;

    /* scan and report all matches */
    int rv;
    s64a end = length;
    const u8 *scanned;

    rv = runSheng64Cb(sh, cb, context, offset, &cached_accept_state,
                      &cached_accept_id, buffer, buffer, buffer + end, can_die,
                      single, &scanned, &state);
    if (rv == MO_DEAD) {
        DEBUG_PRINTF("exiting in state %u\n",
                     state & SHENG64_STATE_MASK);
        return MO_DEAD;
    }

    DEBUG_PRINTF("%u\n", state & SHENG64_STATE_MASK);

    const struct sstate_aux *aux = get_aux64(sh, state);

    if (aux->accept_eod) {
        DEBUG_PRINTF("Reporting EOD matches\n");
        fireReports64(sh, cb, context, state, end + offset,
                      &cached_accept_state, &cached_accept_id, 1);
    }

    return state & SHENG64_STATE_DEAD ? MO_DEAD : MO_ALIVE;
}

char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end) {
    const struct sheng64 *sh = get_sheng64(n);
    char rv = runSheng64(sh, q, end, CALLBACK_OUTPUT);
    return rv;
}

char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end) {
    const struct sheng64 *sh = get_sheng64(n);
    char rv = runSheng64(sh, q, end, STOP_AT_MATCH);
    return rv;
}

char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report) {
    assert(q_cur_type(q) == MQE_START);

    const struct sheng64 *sh = get_sheng64(n);
    char rv = runSheng64(sh, q, 0 /* end */, NO_MATCHES);

    if (rv && nfaExecSheng64_inAccept(n, report, q)) {
        return MO_MATCHES_PENDING;
    }
    return rv;
}

char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report,
                             struct mq *q) {
    assert(n && q);

    const struct sheng64 *sh = get_sheng64(n);
    u8 s = *(const u8 *)q->state;
    DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK));

    const struct sstate_aux *aux = get_aux64(sh, s);

    if (!aux->accept) {
        return 0;
    }

    return sheng64HasAccept(sh, aux, report);
}

char nfaExecSheng64_inAnyAccept(const struct NFA *n, struct mq *q) {
    assert(n && q);

    const struct sheng64 *sh = get_sheng64(n);
    u8 s = *(const u8 *)q->state;
    DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK));

    const struct sstate_aux *aux = get_aux64(sh, s);
    return !!aux->accept;
}

char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state,
                            UNUSED const char *streamState, u64a offset,
                            NfaCallback cb, void *ctxt) {
    assert(nfa);

    const struct sheng64 *sh = get_sheng64(nfa);
    u8 s = *(const u8 *)state;
    DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG64_STATE_MASK));

    const struct sstate_aux *aux = get_aux64(sh, s);

    if (!aux->accept_eod) {
        return MO_CONTINUE_MATCHING;
    }

    return fireReports64(sh, cb, ctxt, s, offset, NULL, NULL, 1);
}

char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q) {
    const struct sheng64 *sh = (const struct sheng64 *)getImplNfa(n);
    NfaCallback cb = q->cb;
    void *ctxt = q->context;
    u8 s = *(u8 *)q->state;
    const struct sstate_aux *aux = get_aux64(sh, s);
    u64a offset = q_cur_offset(q);
    u8 cached_state_id = 0;
    ReportID cached_report_id = 0;
    assert(q_cur_type(q) == MQE_START);

    if (aux->accept) {
        if (sh->flags & SHENG_FLAG_SINGLE_REPORT) {
            fireSingleReport(cb, ctxt, sh->report, offset);
        } else {
            fireReports64(sh, cb, ctxt, s, offset, &cached_state_id,
                          &cached_report_id, 0);
        }
    }

    return 0;
}

char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset,
                                        void *state, UNUSED u8 key) {
    const struct sheng64 *sh = get_sheng64(nfa);
    u8 *s = (u8 *)state;
    *s = offset ? sh->floating: sh->anchored;
    return !(*s & SHENG64_STATE_DEAD);
}

char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q) {
    assert(nfa->scratchStateSize == 1);

    /* starting in floating state */
    const struct sheng64 *sh = get_sheng64(nfa);
    *(u8 *)q->state = sh->floating;
    DEBUG_PRINTF("starting in floating state\n");
    return 0;
}

char nfaExecSheng64_queueCompressState(UNUSED const struct NFA *nfa,
                                       const struct mq *q, UNUSED s64a loc) {
    void *dest = q->streamState;
    const void *src = q->state;
    assert(nfa->scratchStateSize == 1);
    assert(nfa->streamStateSize == 1);
    *(u8 *)dest = *(const u8 *)src;
    return 0;
}

char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest,
                                const void *src, UNUSED u64a offset,
                                UNUSED u8 key) {
    assert(nfa->scratchStateSize == 1);
    assert(nfa->streamStateSize == 1);
    *(u8 *)dest = *(const u8 *)src;
    return 0;
}
#endif // end of HAVE_AVX512VBMI