aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/hyperscan/src/nfa/mpvcompile.cpp
diff options
context:
space:
mode:
authorDevtools Arcadia <arcadia-devtools@yandex-team.ru>2022-02-07 18:08:42 +0300
committerDevtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net>2022-02-07 18:08:42 +0300
commit1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch)
treee26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/hyperscan/src/nfa/mpvcompile.cpp
downloadydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/hyperscan/src/nfa/mpvcompile.cpp')
-rw-r--r--contrib/libs/hyperscan/src/nfa/mpvcompile.cpp400
1 files changed, 400 insertions, 0 deletions
diff --git a/contrib/libs/hyperscan/src/nfa/mpvcompile.cpp b/contrib/libs/hyperscan/src/nfa/mpvcompile.cpp
new file mode 100644
index 0000000000..8497c64870
--- /dev/null
+++ b/contrib/libs/hyperscan/src/nfa/mpvcompile.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mpvcompile.h"
+
+#include "mpv_internal.h"
+#include "nfa_api_queue.h"
+#include "nfa_internal.h"
+#include "shufticompile.h"
+#include "trufflecompile.h"
+#include "util/alloc.h"
+#include "util/multibit_build.h"
+#include "util/order_check.h"
+#include "util/report_manager.h"
+#include "util/verify_types.h"
+
+#include <algorithm>
+#include <iterator>
+#include <map>
+
+#include <boost/range/adaptor/map.hpp>
+
+using namespace std;
+using boost::adaptors::map_values;
+using boost::adaptors::map_keys;
+
+namespace ue2 {
+
+namespace {
+struct pcomp {
+ bool operator()(const raw_puff &a, const raw_puff &b) const {
+ return tie(a.repeats, a.unbounded, a.simple_exhaust, a.report) <
+ tie(b.repeats, b.unbounded, b.simple_exhaust, b.report);
+ }
+};
+
+struct ClusterKey {
+ explicit ClusterKey(const raw_puff &src)
+ : trigger_event(MQE_INVALID), reach(src.reach),
+ auto_restart(src.auto_restart) {}
+ ClusterKey(u32 event, const raw_puff &src)
+ : trigger_event(event), reach(src.reach),
+ auto_restart(src.auto_restart) {}
+
+ u32 trigger_event;
+ CharReach reach;
+ bool auto_restart;
+
+ bool operator<(const ClusterKey &b) const {
+ const ClusterKey &a = *this;
+ ORDER_CHECK(trigger_event); /* want triggered puffs first */
+ ORDER_CHECK(auto_restart);
+ ORDER_CHECK(reach);
+ return false;
+ }
+};
+
+} // namespace
+
+static
+void writePuffette(mpv_puffette *out, const raw_puff &rp,
+ const ReportManager &rm) {
+ DEBUG_PRINTF("outputting %u %d %u to %p\n", rp.repeats, (int)rp.unbounded,
+ rp.report, out);
+ out->repeats = rp.repeats;
+ out->unbounded = rp.unbounded;
+ out->simple_exhaust = rp.simple_exhaust;
+ out->report = rm.getProgramOffset(rp.report);
+}
+
+static
+void writeSentinel(mpv_puffette *out) {
+ DEBUG_PRINTF("outputting sentinel to %p\n", out);
+ memset(out, 0, sizeof(*out));
+ out->report = INVALID_REPORT;
+}
+
+static
+void writeDeadPoint(mpv_kilopuff *out, const vector<raw_puff> &puffs) {
+ for (const auto &puff : puffs) {
+ if (puff.unbounded) { /* mpv can never die */
+ out->dead_point = MPV_DEAD_VALUE;
+ return;
+ }
+ }
+
+ out->dead_point = puffs.back().repeats + 1;
+}
+
+static
+size_t calcSize(const map<ClusterKey, vector<raw_puff>> &raw,
+ const vector<mpv_counter_info> &counters) {
+ size_t len = sizeof(NFA) + sizeof(mpv);
+
+ len += sizeof(mpv_kilopuff) * raw.size(); /* need a kilopuff for each
+ distinct reach */
+
+ len += sizeof(mpv_counter_info) * counters.size();
+
+ len += sizeof(mpv_puffette); /* initial sent */
+
+ for (const vector<raw_puff> &puffs : raw | map_values) {
+ len += sizeof(mpv_puffette) * puffs.size();
+ len += sizeof(mpv_puffette); /* terminal sent */
+ }
+
+ return len;
+}
+
+static
+void populateClusters(const vector<raw_puff> &puffs_in,
+ const vector<raw_puff> &triggered_puffs,
+ map<ClusterKey, vector<raw_puff>> *raw) {
+ map<ClusterKey, vector<raw_puff>> &puff_clusters = *raw;
+
+ u32 e = MQE_TOP_FIRST;
+ for (const auto &puff : triggered_puffs) {
+ puff_clusters[ClusterKey(e, puff)].push_back(puff);
+ e++;
+ }
+
+ for (const auto &puff : puffs_in) {
+ puff_clusters[ClusterKey(puff)].push_back(puff);
+ }
+
+
+ for (vector<raw_puff> &puffs : puff_clusters | map_values) {
+ sort(puffs.begin(), puffs.end(), pcomp());
+ }
+}
+
+static
+void writeKiloPuff(const map<ClusterKey, vector<raw_puff>>::const_iterator &it,
+ const ReportManager &rm, u32 counter_offset, mpv *m,
+ mpv_kilopuff *kp, mpv_puffette **pa) {
+ const CharReach &reach = it->first.reach;
+ const vector<raw_puff> &puffs = it->second;
+
+ kp->auto_restart = it->first.auto_restart;
+
+ if (reach.all()) {
+ kp->type = MPV_DOT;
+ } else if (reach.count() == 255) {
+ kp->type = MPV_VERM;
+ size_t unset = (~reach).find_first();
+ assert(unset != CharReach::npos);
+ kp->u.verm.c = (char)unset;
+ } else if (reach.count() == 1) {
+ kp->type = MPV_NVERM;
+ size_t set = reach.find_first();
+ assert(set != CharReach::npos);
+ kp->u.verm.c = (char)set;
+ } else if (shuftiBuildMasks(~reach, (u8 *)&kp->u.shuf.mask_lo,
+ (u8 *)&kp->u.shuf.mask_hi) != -1) {
+ kp->type = MPV_SHUFTI;
+ } else {
+ kp->type = MPV_TRUFFLE;
+ truffleBuildMasks(~reach, (u8 *)&kp->u.truffle.mask1,
+ (u8 *)&kp->u.truffle.mask2);
+ }
+
+ kp->count = verify_u32(puffs.size());
+ kp->counter_offset = counter_offset;
+
+ /* start of real puffette array */
+ kp->puffette_offset = verify_u32((char *)*pa - (char *)m);
+ for (size_t i = 0; i < puffs.size(); i++) {
+ assert(!it->first.auto_restart || puffs[i].unbounded);
+ writePuffette(*pa + i, puffs[i], rm);
+ }
+
+ *pa += puffs.size();
+ writeSentinel(*pa);
+ ++*pa;
+
+ writeDeadPoint(kp, puffs);
+}
+
+static
+void writeCoreNfa(NFA *nfa, u32 len, u32 min_width, u32 max_counter,
+ u32 streamStateSize, u32 scratchStateSize) {
+ assert(nfa);
+
+ nfa->length = len;
+ nfa->nPositions = max_counter - 1;
+ nfa->type = MPV_NFA;
+ nfa->streamStateSize = streamStateSize;
+ assert(16 >= sizeof(mpv_decomp_kilo));
+ nfa->scratchStateSize = scratchStateSize;
+ nfa->minWidth = min_width;
+}
+
+static
+void findCounterSize(map<ClusterKey, vector<raw_puff>>::const_iterator kp_it,
+ map<ClusterKey, vector<raw_puff>>::const_iterator kp_ite,
+ u64a *max_counter_out, u32 *counter_size) {
+ u32 max_counter = 0; /* max counter that we may need to know about is one
+ more than largest repeat */
+ for (; kp_it != kp_ite; ++kp_it) {
+ max_counter = MAX(max_counter, kp_it->second.back().repeats + 1);
+ }
+
+ if (max_counter < (1U << 8)) {
+ *counter_size = 1;
+ } else if (max_counter < (1U << 16)) {
+ *counter_size = 2;
+ } else if (max_counter < (1U << 24)) {
+ *counter_size = 3;
+ } else {
+ *counter_size = 4;
+ }
+
+ *max_counter_out = max_counter;
+}
+
+static
+void fillCounterInfo(mpv_counter_info *out, u32 *curr_decomp_offset,
+ u32 *curr_comp_offset,
+ const map<ClusterKey, vector<raw_puff>> &kilopuffs,
+ map<ClusterKey, vector<raw_puff>>::const_iterator kp_it,
+ map<ClusterKey, vector<raw_puff>>::const_iterator kp_ite) {
+
+ out->kilo_begin = distance(kilopuffs.begin(), kp_it);
+ out->kilo_end = distance(kilopuffs.begin(), kp_ite);
+ findCounterSize(kp_it, kp_ite, &out->max_counter, &out->counter_size);
+ out->counter_offset = *curr_decomp_offset;
+ *curr_decomp_offset += sizeof(u64a);
+ *curr_comp_offset += out->counter_size;
+}
+
+static
+void fillCounterInfos(vector<mpv_counter_info> *out, u32 *curr_decomp_offset,
+ u32 *curr_comp_offset,
+ const map<ClusterKey, vector<raw_puff>> &kilopuffs) {
+ /* first the triggered puffs */
+ map<ClusterKey, vector<raw_puff>>::const_iterator it = kilopuffs.begin();
+ while (it != kilopuffs.end() && it->first.trigger_event != MQE_INVALID) {
+ assert(!it->first.auto_restart);
+ assert(it->first.trigger_event
+ == MQE_TOP_FIRST + distance(kilopuffs.begin(), it));
+
+ out->push_back(mpv_counter_info());
+ map<ClusterKey, vector<raw_puff>>::const_iterator it_o = it;
+ ++it;
+ fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset,
+ kilopuffs, it_o, it);
+ }
+
+ /* we may have 2 sets of non triggered puffs:
+ * 1) always started with no auto_restart
+ * 2) always started with auto_restart
+ */
+ map<ClusterKey, vector<raw_puff>>::const_iterator trig_ite = it;
+ while (it != kilopuffs.end() && !it->first.auto_restart) {
+ assert(it->first.trigger_event == MQE_INVALID);
+
+ ++it;
+ }
+ if (it != trig_ite) {
+ out->push_back(mpv_counter_info());
+ fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset,
+ kilopuffs, kilopuffs.begin(), it);
+ }
+ while (it != kilopuffs.end() && it->first.auto_restart) {
+ assert(it->first.trigger_event == MQE_INVALID);
+
+ out->push_back(mpv_counter_info());
+ map<ClusterKey, vector<raw_puff>>::const_iterator it_o = it;
+ ++it;
+ fillCounterInfo(&out->back(), curr_decomp_offset, curr_comp_offset,
+ kilopuffs, it_o, it);
+ }
+}
+
+static
+const mpv_counter_info &findCounter(const vector<mpv_counter_info> &counters,
+ u32 i) {
+ for (const auto &counter : counters) {
+ if (i >= counter.kilo_begin && i < counter.kilo_end) {
+ return counter;
+ }
+ }
+ assert(0);
+ return counters.front();
+}
+
+bytecode_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in,
+ const vector<raw_puff> &triggered_puffs,
+ const ReportManager &rm) {
+ assert(!puffs_in.empty() || !triggered_puffs.empty());
+ u32 puffette_count = puffs_in.size() + triggered_puffs.size();
+
+ map<ClusterKey, vector<raw_puff>> puff_clusters;
+ populateClusters(puffs_in, triggered_puffs, &puff_clusters);
+
+ u32 curr_comp_offset = 0;
+
+ u32 curr_decomp_offset = sizeof(mpv_decomp_state);
+ curr_decomp_offset += 16 * puff_clusters.size();
+
+ vector<mpv_counter_info> counters;
+ fillCounterInfos(&counters, &curr_decomp_offset, &curr_comp_offset,
+ puff_clusters);
+
+ u32 pq_offset = curr_decomp_offset;
+ curr_decomp_offset += sizeof(mpv_pq_item) * puff_clusters.size();
+
+ u32 rl_offset = curr_decomp_offset;
+ curr_decomp_offset += sizeof(ReportID) * puffette_count;
+
+ u32 reporter_offset = curr_decomp_offset;
+ curr_decomp_offset += mmbit_size(puff_clusters.size());
+
+ u32 active_offset = curr_comp_offset;
+ curr_comp_offset += mmbit_size(puff_clusters.size());
+
+ u32 len = calcSize(puff_clusters, counters);
+
+ DEBUG_PRINTF("%u puffs, len = %u\n", puffette_count, len);
+
+ auto nfa = make_zeroed_bytecode_ptr<NFA>(len);
+
+ mpv_puffette *pa_base = (mpv_puffette *)
+ ((char *)nfa.get() + sizeof(NFA) + sizeof(mpv)
+ + sizeof(mpv_kilopuff) * puff_clusters.size()
+ + sizeof(mpv_counter_info) * counters.size());
+ mpv_puffette *pa = pa_base;
+
+ writeSentinel(pa);
+
+ ++pa; /* skip init sentinel */
+
+ u32 min_repeat = ~0U;
+ u32 max_counter = 0; /* max counter that we may need to know about is one
+ more than largest repeat */
+ for (const vector<raw_puff> &puffs : puff_clusters | map_values) {
+ max_counter = max(max_counter, puffs.back().repeats + 1);
+ min_repeat = min(min_repeat, puffs.front().repeats);
+ }
+
+ mpv *m = (mpv *)getMutableImplNfa(nfa.get());
+ m->kilo_count = verify_u32(puff_clusters.size());
+ m->counter_count = verify_u32(counters.size());
+ m->puffette_count = puffette_count;
+ m->pq_offset = pq_offset;
+ m->reporter_offset = reporter_offset;
+ m->report_list_offset = rl_offset;
+ m->active_offset = active_offset;
+ m->top_kilo_begin = verify_u32(triggered_puffs.size());
+ m->top_kilo_end = verify_u32(puff_clusters.size());
+
+ mpv_kilopuff *kp_begin = (mpv_kilopuff *)(m + 1);
+ mpv_kilopuff *kp = kp_begin;
+ for (auto it = puff_clusters.begin(); it != puff_clusters.end(); ++it) {
+ writeKiloPuff(it, rm,
+ findCounter(counters, kp - kp_begin).counter_offset, m,
+ kp, &pa);
+ ++kp;
+ }
+ assert((char *)pa == (char *)nfa.get() + len);
+
+ mpv_counter_info *out_ci = (mpv_counter_info *)kp;
+ for (const auto &counter : counters) {
+ *out_ci = counter;
+ ++out_ci;
+ }
+ assert((char *)out_ci == (char *)pa_base);
+
+ writeCoreNfa(nfa.get(), len, min_repeat, max_counter, curr_comp_offset,
+ curr_decomp_offset);
+
+ return nfa;
+}
+
+} // namespace ue2