diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/hyperscan/src/nfa/mpvcompile.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/hyperscan/src/nfa/mpvcompile.cpp')
-rw-r--r-- | contrib/libs/hyperscan/src/nfa/mpvcompile.cpp | 400 |
1 files changed, 400 insertions, 0 deletions
diff --git a/contrib/libs/hyperscan/src/nfa/mpvcompile.cpp b/contrib/libs/hyperscan/src/nfa/mpvcompile.cpp new file mode 100644 index 0000000000..8497c64870 --- /dev/null +++ b/contrib/libs/hyperscan/src/nfa/mpvcompile.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "mpvcompile.h" + +#include "mpv_internal.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "shufticompile.h" +#include "trufflecompile.h" +#include "util/alloc.h" +#include "util/multibit_build.h" +#include "util/order_check.h" +#include "util/report_manager.h" +#include "util/verify_types.h" + +#include <algorithm> +#include <iterator> +#include <map> + +#include <boost/range/adaptor/map.hpp> + +using namespace std; +using boost::adaptors::map_values; +using boost::adaptors::map_keys; + +namespace ue2 { + +namespace { +struct pcomp { + bool operator()(const raw_puff &a, const raw_puff &b) const { + return tie(a.repeats, a.unbounded, a.simple_exhaust, a.report) < + tie(b.repeats, b.unbounded, b.simple_exhaust, b.report); + } +}; + +struct ClusterKey { + explicit ClusterKey(const raw_puff &src) + : trigger_event(MQE_INVALID), reach(src.reach), + auto_restart(src.auto_restart) {} + ClusterKey(u32 event, const raw_puff &src) + : trigger_event(event), reach(src.reach), + auto_restart(src.auto_restart) {} + + u32 trigger_event; + CharReach reach; + bool auto_restart; + + bool operator<(const ClusterKey &b) const { + const ClusterKey &a = *this; + ORDER_CHECK(trigger_event); /* want triggered puffs first */ + ORDER_CHECK(auto_restart); + ORDER_CHECK(reach); + return false; + } +}; + +} // namespace + +static +void writePuffette(mpv_puffette *out, const raw_puff &rp, + const ReportManager &rm) { + DEBUG_PRINTF("outputting %u %d %u to %p\n", rp.repeats, (int)rp.unbounded, + rp.report, out); + out->repeats = rp.repeats; + out->unbounded = rp.unbounded; + out->simple_exhaust = rp.simple_exhaust; + out->report = rm.getProgramOffset(rp.report); +} + +static +void writeSentinel(mpv_puffette *out) { + DEBUG_PRINTF("outputting sentinel to %p\n", out); + memset(out, 0, sizeof(*out)); + out->report = INVALID_REPORT; +} + +static +void writeDeadPoint(mpv_kilopuff *out, const vector<raw_puff> &puffs) { + for (const auto &puff : puffs) { + if (puff.unbounded) { /* mpv can never die */ + out->dead_point = MPV_DEAD_VALUE; + return; + } + } + + out->dead_point = puffs.back().repeats + 1; +} + +static +size_t calcSize(const map<ClusterKey, vector<raw_puff>> &raw, + const vector<mpv_counter_info> &counters) { + size_t len = sizeof(NFA) + sizeof(mpv); + + len += sizeof(mpv_kilopuff) * raw.size(); /* need a kilopuff for each + distinct reach */ + + len += sizeof(mpv_counter_info) * counters.size(); + + len += sizeof(mpv_puffette); /* initial sent */ + + for (const vector<raw_puff> &puffs : raw | map_values) { + len += sizeof(mpv_puffette) * puffs.size(); + len += sizeof(mpv_puffette); /* terminal sent */ + } + + return len; +} + +static +void populateClusters(const vector<raw_puff> &puffs_in, + const vector<raw_puff> &triggered_puffs, + map<ClusterKey, vector<raw_puff>> *raw) { + map<ClusterKey, vector<raw_puff>> &puff_clusters = *raw; + + u32 e = MQE_TOP_FIRST; + for (const auto &puff : triggered_puffs) { + puff_clusters[ClusterKey(e, puff)].push_back(puff); + e++; + } + + for (const auto &puff : puffs_in) { + puff_clusters[ClusterKey(puff)].push_back(puff); + } + + + for (vector<raw_puff> &puffs : puff_clusters | map_values) { + sort(puffs.begin(), puffs.end(), pcomp()); + } +} + +static +void writeKiloPuff(const map<ClusterKey, vector<raw_puff>>::const_iterator &it, + const ReportManager &rm, u32 counter_offset, mpv *m, + mpv_kilopuff *kp, mpv_puffette **pa) { + const CharReach &reach = it->first.reach; + const vector<raw_puff> &puffs = it->second; + + kp->auto_restart = it->first.auto_restart; + + if (reach.all()) { + kp->type = MPV_DOT; + } else if (reach.count() == 255) { + kp->type = MPV_VERM; + size_t unset = (~reach).find_first(); + assert(unset != CharReach::npos); + kp->u.verm.c = (char)unset; + } else if (reach.count() == 1) { + kp->type = MPV_NVERM; + size_t set = reach.find_first(); + assert(set != CharReach::npos); + kp->u.verm.c = (char)set; + } else if (shuftiBuildMasks(~reach, (u8 *)&kp->u.shuf.mask_lo, + (u8 *)&kp->u.shuf.mask_hi) != -1) { + kp->type = MPV_SHUFTI; + } else { + kp->type = MPV_TRUFFLE; + truffleBuildMasks(~reach, (u8 *)&kp->u.truffle.mask1, + (u8 *)&kp->u.truffle.mask2); + } + + kp->count = verify_u32(puffs.size()); + kp->counter_offset = counter_offset; + + /* start of real puffette array */ + kp->puffette_offset = verify_u32((char *)*pa - (char *)m); + for (size_t i = 0; i < puffs.size(); i++) { + assert(!it->first.auto_restart || puffs[i].unbounded); + writePuffette(*pa + i, puffs[i], rm); + } + + *pa += puffs.size(); + writeSentinel(*pa); + ++*pa; + + writeDeadPoint(kp, puffs); +} + +static +void writeCoreNfa(NFA *nfa, u32 len, u32 min_width, u32 max_counter, + u32 streamStateSize, u32 scratchStateSize) { + assert(nfa); + + nfa->length = len; + nfa->nPositions = max_counter - 1; + nfa->type = MPV_NFA; + nfa->streamStateSize = streamStateSize; + assert(16 >= sizeof(mpv_decomp_kilo)); + nfa->scratchStateSize = scratchStateSize; + nfa->minWidth = min_width; +} + +static +void findCounterSize(map<ClusterKey, vector<raw_puff>>::const_iterator kp_it, + map<ClusterKey, vector<raw_puff>>::const_iterator kp_ite, + u64a *max_counter_out, u32 *counter_size) { + u32 max_counter = 0; /* max counter that we may need to know about is one + more than largest repeat */ + for (; kp_it != kp_ite; ++kp_it) { + max_counter = MAX(max_counter, kp_it->second.back().repeats + 1); + } + + if (max_counter < (1U << 8)) { + *counter_size = 1; + } else if (max_counter < (1U << 16)) { + *counter_size = 2; + } else if (max_counter < (1U << 24)) { + *counter_size = 3; + } else { + *counter_size = 4; + } + + *max_counter_out = max_counter; +} + +static +void fillCounterInfo(mpv_counter_info *out, u32 *curr_decomp_offset, + u32 *curr_comp_offset, + const map<ClusterKey, vector<raw_puff>> &kilopuffs, + map<ClusterKey, vector<raw_puff>>::const_iterator kp_it, + map<ClusterKey, vector<raw_puff>>::const_iterator kp_ite) { + + out->kilo_begin = distance(kilopuffs.begin(), kp_it); + out->kilo_end = distance(kilopuffs.begin(), kp_ite); + findCounterSize(kp_it, kp_ite, &out->max_counter, &out->counter_size); + out->counter_offset = *curr_decomp_offset; + *curr_decomp_offset += sizeof(u64a); + *curr_comp_offset += out->counter_size; +} + +static +void fillCounterInfos(vector<mpv_counter_info> *out, u32 *curr_decomp_offset, + u32 *curr_comp_offset, + const map<ClusterKey, vector<raw_puff>> &kilopuffs) { + /* first the triggered puffs */ + map<ClusterKey, vector<raw_puff>>::const_iterator it = kilopuffs.begin(); + while (it != kilopuffs.end() && it->first.trigger_event != MQE_INVALID) { + assert(!it->first.auto_restart); + assert(it->first.trigger_event + == MQE_TOP_FIRST + distance(kilopuffs.begin(), it)); + + out->push_back(mpv_counter_info()); + map<ClusterKey, vector<raw_puff>>::const_iterator it_o = it; + ++it; + fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset, + kilopuffs, it_o, it); + } + + /* we may have 2 sets of non triggered puffs: + * 1) always started with no auto_restart + * 2) always started with auto_restart + */ + map<ClusterKey, vector<raw_puff>>::const_iterator trig_ite = it; + while (it != kilopuffs.end() && !it->first.auto_restart) { + assert(it->first.trigger_event == MQE_INVALID); + + ++it; + } + if (it != trig_ite) { + out->push_back(mpv_counter_info()); + fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset, + kilopuffs, kilopuffs.begin(), it); + } + while (it != kilopuffs.end() && it->first.auto_restart) { + assert(it->first.trigger_event == MQE_INVALID); + + out->push_back(mpv_counter_info()); + map<ClusterKey, vector<raw_puff>>::const_iterator it_o = it; + ++it; + fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset, + kilopuffs, it_o, it); + } +} + +static +const mpv_counter_info &findCounter(const vector<mpv_counter_info> &counters, + u32 i) { + for (const auto &counter : counters) { + if (i >= counter.kilo_begin && i < counter.kilo_end) { + return counter; + } + } + assert(0); + return counters.front(); +} + +bytecode_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in, + const vector<raw_puff> &triggered_puffs, + const ReportManager &rm) { + assert(!puffs_in.empty() || !triggered_puffs.empty()); + u32 puffette_count = puffs_in.size() + triggered_puffs.size(); + + map<ClusterKey, vector<raw_puff>> puff_clusters; + populateClusters(puffs_in, triggered_puffs, &puff_clusters); + + u32 curr_comp_offset = 0; + + u32 curr_decomp_offset = sizeof(mpv_decomp_state); + curr_decomp_offset += 16 * puff_clusters.size(); + + vector<mpv_counter_info> counters; + fillCounterInfos(&counters, &curr_decomp_offset, &curr_comp_offset, + puff_clusters); + + u32 pq_offset = curr_decomp_offset; + curr_decomp_offset += sizeof(mpv_pq_item) * puff_clusters.size(); + + u32 rl_offset = curr_decomp_offset; + curr_decomp_offset += sizeof(ReportID) * puffette_count; + + u32 reporter_offset = curr_decomp_offset; + curr_decomp_offset += mmbit_size(puff_clusters.size()); + + u32 active_offset = curr_comp_offset; + curr_comp_offset += mmbit_size(puff_clusters.size()); + + u32 len = calcSize(puff_clusters, counters); + + DEBUG_PRINTF("%u puffs, len = %u\n", puffette_count, len); + + auto nfa = make_zeroed_bytecode_ptr<NFA>(len); + + mpv_puffette *pa_base = (mpv_puffette *) + ((char *)nfa.get() + sizeof(NFA) + sizeof(mpv) + + sizeof(mpv_kilopuff) * puff_clusters.size() + + sizeof(mpv_counter_info) * counters.size()); + mpv_puffette *pa = pa_base; + + writeSentinel(pa); + + ++pa; /* skip init sentinel */ + + u32 min_repeat = ~0U; + u32 max_counter = 0; /* max counter that we may need to know about is one + more than largest repeat */ + for (const vector<raw_puff> &puffs : puff_clusters | map_values) { + max_counter = max(max_counter, puffs.back().repeats + 1); + min_repeat = min(min_repeat, puffs.front().repeats); + } + + mpv *m = (mpv *)getMutableImplNfa(nfa.get()); + m->kilo_count = verify_u32(puff_clusters.size()); + m->counter_count = verify_u32(counters.size()); + m->puffette_count = puffette_count; + m->pq_offset = pq_offset; + m->reporter_offset = reporter_offset; + m->report_list_offset = rl_offset; + m->active_offset = active_offset; + m->top_kilo_begin = verify_u32(triggered_puffs.size()); + m->top_kilo_end = verify_u32(puff_clusters.size()); + + mpv_kilopuff *kp_begin = (mpv_kilopuff *)(m + 1); + mpv_kilopuff *kp = kp_begin; + for (auto it = puff_clusters.begin(); it != puff_clusters.end(); ++it) { + writeKiloPuff(it, rm, + findCounter(counters, kp - kp_begin).counter_offset, m, + kp, &pa); + ++kp; + } + assert((char *)pa == (char *)nfa.get() + len); + + mpv_counter_info *out_ci = (mpv_counter_info *)kp; + for (const auto &counter : counters) { + *out_ci = counter; + ++out_ci; + } + assert((char *)out_ci == (char *)pa_base); + + writeCoreNfa(nfa.get(), len, min_repeat, max_counter, curr_comp_offset, + curr_decomp_offset); + + return nfa; +} + +} // namespace ue2 |