aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/hyperscan/src/nfagraph/ng_violet.cpp
diff options
context:
space:
mode:
authorIvan Blinkov <ivan@blinkov.ru>2022-02-10 16:47:10 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:47:10 +0300
commit1aeb9a455974457866f78722ad98114bafc84e8a (patch)
treee4340eaf1668684d83a0a58c36947c5def5350ad /contrib/libs/hyperscan/src/nfagraph/ng_violet.cpp
parentbd5ef432f5cfb1e18851381329d94665a4c22470 (diff)
downloadydb-1aeb9a455974457866f78722ad98114bafc84e8a.tar.gz
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/nfagraph/ng_violet.cpp')
-rw-r--r--contrib/libs/hyperscan/src/nfagraph/ng_violet.cpp6094
1 files changed, 3047 insertions, 3047 deletions
diff --git a/contrib/libs/hyperscan/src/nfagraph/ng_violet.cpp b/contrib/libs/hyperscan/src/nfagraph/ng_violet.cpp
index 685d452150..4eb4196da1 100644
--- a/contrib/libs/hyperscan/src/nfagraph/ng_violet.cpp
+++ b/contrib/libs/hyperscan/src/nfagraph/ng_violet.cpp
@@ -1,3068 +1,3068 @@
-/*
+/*
* Copyright (c) 2016-2018, Intel Corporation
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include "ng_violet.h"
-
-#include "grey.h"
-#include "ng_depth.h"
-#include "ng_dominators.h"
-#include "ng_dump.h"
-#include "ng_equivalence.h"
-#include "ng_holder.h"
-#include "ng_is_equal.h"
-#include "ng_literal_analysis.h"
-#include "ng_limex.h"
-#include "ng_mcclellan.h"
-#include "ng_netflow.h"
-#include "ng_prune.h"
-#include "ng_redundancy.h"
-#include "ng_region.h"
-#include "ng_reports.h"
-#include "ng_split.h"
-#include "ng_util.h"
-#include "ng_width.h"
-#include "nfa/rdfa.h"
-#include "rose/rose_build.h"
-#include "rose/rose_build_util.h"
-#include "rose/rose_in_dump.h"
-#include "rose/rose_in_graph.h"
-#include "rose/rose_in_util.h"
-#include "util/compare.h"
-#include "util/compile_context.h"
-#include "util/container.h"
-#include "util/flat_containers.h"
-#include "util/graph.h"
-#include "util/graph_range.h"
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include "ng_violet.h"
+
+#include "grey.h"
+#include "ng_depth.h"
+#include "ng_dominators.h"
+#include "ng_dump.h"
+#include "ng_equivalence.h"
+#include "ng_holder.h"
+#include "ng_is_equal.h"
+#include "ng_literal_analysis.h"
+#include "ng_limex.h"
+#include "ng_mcclellan.h"
+#include "ng_netflow.h"
+#include "ng_prune.h"
+#include "ng_redundancy.h"
+#include "ng_region.h"
+#include "ng_reports.h"
+#include "ng_split.h"
+#include "ng_util.h"
+#include "ng_width.h"
+#include "nfa/rdfa.h"
+#include "rose/rose_build.h"
+#include "rose/rose_build_util.h"
+#include "rose/rose_in_dump.h"
+#include "rose/rose_in_graph.h"
+#include "rose/rose_in_util.h"
+#include "util/compare.h"
+#include "util/compile_context.h"
+#include "util/container.h"
+#include "util/flat_containers.h"
+#include "util/graph.h"
+#include "util/graph_range.h"
#include "util/graph_small_color_map.h"
-#include "util/insertion_ordered.h"
-#include "util/make_unique.h"
-#include "util/order_check.h"
-#include "util/target_info.h"
-#include "util/ue2string.h"
-
-#include <set>
-#include <utility>
-#include <vector>
-#include <boost/dynamic_bitset.hpp>
-#include <boost/range/adaptor/map.hpp>
-
-#define STAGE_DEBUG_PRINTF DEBUG_PRINTF
-
-using namespace std;
-using boost::adaptors::map_values;
-
-namespace ue2 {
-
-/* createsAnchoredLHS() is conservative as the depths take into account
- * back edges that come from beyond the split point and would be missing after
- * the graph is split. */
-static
-bool createsAnchoredLHS(const NGHolder &g, const vector<NFAVertex> &vv,
- const vector<NFAVertexDepth> &depths,
- const Grey &grey, depth max_depth = depth::infinity()) {
- max_depth = min(max_depth, depth(grey.maxAnchoredRegion));
-
- for (auto v : vv) {
- /* avoid issues of self loops blowing out depths:
- * look at preds, add 1 */
- for (auto u : inv_adjacent_vertices_range(v, g)) {
- if (u == v) {
- continue;
- }
-
- u32 idx = g[u].index;
- assert(idx < depths.size());
- if (maxDistFromStartOfData(depths.at(idx)) >= max_depth) {
- return false;
- }
- }
- }
- return true;
-}
-
-/* createsTransientLHS() is conservative as the depths take into account
- * back edges that come from beyond the split point and would be missing after
- * the graph is split. */
-static
-bool createsTransientLHS(const NGHolder &g, const vector<NFAVertex> &vv,
- const vector<NFAVertexDepth> &depths,
- const Grey &grey) {
- const depth max_depth(grey.maxHistoryAvailable);
-
- for (auto v : vv) {
- /* avoid issues of self loops blowing out depths:
- * look at preds, add 1 */
- for (auto u : inv_adjacent_vertices_range(v, g)) {
- if (u == v) {
- continue;
- }
-
- u32 idx = g[u].index;
- assert(idx < depths.size());
- if (maxDistFromInit(depths.at(idx)) >= max_depth) {
- return false;
- }
- }
- }
- return true;
-}
-
+#include "util/insertion_ordered.h"
+#include "util/make_unique.h"
+#include "util/order_check.h"
+#include "util/target_info.h"
+#include "util/ue2string.h"
+
+#include <set>
+#include <utility>
+#include <vector>
+#include <boost/dynamic_bitset.hpp>
+#include <boost/range/adaptor/map.hpp>
+
+#define STAGE_DEBUG_PRINTF DEBUG_PRINTF
+
+using namespace std;
+using boost::adaptors::map_values;
+
+namespace ue2 {
+
+/* createsAnchoredLHS() is conservative as the depths take into account
+ * back edges that come from beyond the split point and would be missing after
+ * the graph is split. */
+static
+bool createsAnchoredLHS(const NGHolder &g, const vector<NFAVertex> &vv,
+ const vector<NFAVertexDepth> &depths,
+ const Grey &grey, depth max_depth = depth::infinity()) {
+ max_depth = min(max_depth, depth(grey.maxAnchoredRegion));
+
+ for (auto v : vv) {
+ /* avoid issues of self loops blowing out depths:
+ * look at preds, add 1 */
+ for (auto u : inv_adjacent_vertices_range(v, g)) {
+ if (u == v) {
+ continue;
+ }
+
+ u32 idx = g[u].index;
+ assert(idx < depths.size());
+ if (maxDistFromStartOfData(depths.at(idx)) >= max_depth) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/* createsTransientLHS() is conservative as the depths take into account
+ * back edges that come from beyond the split point and would be missing after
+ * the graph is split. */
+static
+bool createsTransientLHS(const NGHolder &g, const vector<NFAVertex> &vv,
+ const vector<NFAVertexDepth> &depths,
+ const Grey &grey) {
+ const depth max_depth(grey.maxHistoryAvailable);
+
+ for (auto v : vv) {
+ /* avoid issues of self loops blowing out depths:
+ * look at preds, add 1 */
+ for (auto u : inv_adjacent_vertices_range(v, g)) {
+ if (u == v) {
+ continue;
+ }
+
+ u32 idx = g[u].index;
+ assert(idx < depths.size());
+ if (maxDistFromInit(depths.at(idx)) >= max_depth) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
/**
* Counts the number of vertices that are reachable from the set of sources
* given.
*/
-static
+static
size_t count_reachable(const NGHolder &g, const vector<NFAVertex> &sources,
small_color_map<decltype(get(vertex_index, g))> &color_map) {
auto null_visitor = boost::make_dfs_visitor(boost::null_visitor());
color_map.fill(small_color::white);
-
+
for (auto v : sources) {
boost::depth_first_visit(g, v, null_visitor, color_map);
}
return color_map.count(small_color::black);
-}
-
-static
-size_t shorter_than(const set<ue2_literal> &s, size_t limit) {
- return count_if(s.begin(), s.end(),
- [&](const ue2_literal &a) { return a.length() < limit; });
-}
-
-static
-u32 min_len(const set<ue2_literal> &s) {
- u32 rv = ~0U;
-
- for (const auto &lit : s) {
- rv = min(rv, (u32)lit.length());
- }
-
- return rv;
-}
-
-static
-u32 min_period(const set<ue2_literal> &s) {
- u32 rv = ~0U;
-
- for (const auto &lit : s) {
- rv = min(rv, (u32)minStringPeriod(lit));
- }
- DEBUG_PRINTF("min period %u\n", rv);
- return rv;
-}
-
-namespace {
-/**
- * Information on a cut: vertices and literals.
- */
-struct VertLitInfo {
- VertLitInfo() {}
- VertLitInfo(NFAVertex v, const set<ue2_literal> &litlit, bool c_anch,
- bool c_tran = false)
- : vv(vector<NFAVertex>(1, v)), lit(litlit), creates_anchored(c_anch),
- creates_transient(c_tran) {}
- VertLitInfo(const vector<NFAVertex> &vv_in, const set<ue2_literal> &lit_in,
- bool c_anch)
- : vv(vv_in), lit(lit_in), creates_anchored(c_anch) {}
- vector<NFAVertex> vv;
- set<ue2_literal> lit;
-
- bool creates_anchored = false;
- bool creates_transient = false;
- double split_ratio = 0;
-};
-
-#define LAST_CHANCE_STRONG_LEN 1
-
-/**
- * \brief Comparator class for comparing different literal cuts.
- */
-class LitComparator {
-public:
- LitComparator(const NGHolder &g_in, bool sa, bool st, bool lc)
- : g(g_in), seeking_anchored(sa), seeking_transient(st),
- last_chance(lc) {}
- bool operator()(const unique_ptr<VertLitInfo> &a,
- const unique_ptr<VertLitInfo> &b) const {
- assert(a && b);
-
- if (seeking_anchored) {
- if (a->creates_anchored != b->creates_anchored) {
- return a->creates_anchored < b->creates_anchored;
- }
- }
-
- if (seeking_transient) {
- if (a->creates_transient != b->creates_transient) {
- return a->creates_transient < b->creates_transient;
- }
- }
-
- if (last_chance
- && min_len(a->lit) > LAST_CHANCE_STRONG_LEN
- && min_len(b->lit) > LAST_CHANCE_STRONG_LEN) {
- DEBUG_PRINTF("using split ratio %g , %g\n", a->split_ratio,
- b->split_ratio);
- return a->split_ratio < b->split_ratio;
- }
-
- u64a score_a = scoreSet(a->lit);
- u64a score_b = scoreSet(b->lit);
-
- if (score_a != score_b) {
- return score_a > score_b;
- }
-
- /* vertices should only be in one candidate cut */
- assert(a->vv == b->vv || a->vv.front() != b->vv.front());
- return g[a->vv.front()].index > g[b->vv.front()].index;
- }
-
-private:
- const NGHolder &g; /**< graph on which cuts are found */
-
- bool seeking_anchored;
- bool seeking_transient;
- bool last_chance;
-};
-}
-
-#define MIN_ANCHORED_LEN 2
-#define MIN_ANCHORED_DESPERATE_LEN 1
-
-/* anchored here means that the cut creates a 'usefully' anchored LHS */
-static
-bool validateRoseLiteralSetQuality(const set<ue2_literal> &s, u64a score,
- bool anchored, u32 min_allowed_floating_len,
- bool desperation, bool last_chance) {
- u32 min_allowed_len = anchored ? MIN_ANCHORED_LEN
- : min_allowed_floating_len;
- if (anchored && last_chance) {
- min_allowed_len = MIN_ANCHORED_DESPERATE_LEN;
- }
- if (last_chance) {
- desperation = true;
- }
-
- DEBUG_PRINTF("validating%s set, min allowed len %u\n",
- anchored ? " anchored" : "", min_allowed_len);
-
- assert(none_of(begin(s), end(s), bad_mixed_sensitivity));
-
- if (score >= NO_LITERAL_AT_EDGE_SCORE) {
- DEBUG_PRINTF("candidate is too bad %llu/%zu\n", score, s.size());
- return false;
- }
-
- assert(!s.empty());
- if (s.empty()) {
- DEBUG_PRINTF("candidate is too bad/something went wrong\n");
- return false;
- }
-
- u32 s_min_len = min_len(s);
- u32 s_min_period = min_period(s);
- size_t short_count = shorter_than(s, 5);
-
- DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u"
- " short_count=%zu desp=%d\n",
- dumpString(*s.begin()).c_str(), score, s.size(), s_min_len,
- s_min_period, short_count, (int)desperation);
-
- bool ok = true;
-
- if (s.size() > 10 /* magic number is magic */
- || s_min_len < min_allowed_len
- || (s_min_period <= 1 && min_allowed_len != 1)) {
- DEBUG_PRINTF("candidate may be bad\n");
- ok = false;
- }
-
- if (!ok && desperation
- && s.size() <= 20 /* more magic numbers are magical */
- && (s_min_len > 5 || (s_min_len > 2 && short_count <= 10))
- && s_min_period > 1) {
- DEBUG_PRINTF("candidate is ok\n");
- ok = true;
- }
-
- if (!ok && desperation
- && s.size() <= 50 /* more magic numbers are magical */
- && s_min_len > 10
- && s_min_period > 1) {
- DEBUG_PRINTF("candidate is ok\n");
- ok = true;
- }
-
- if (!ok) {
- DEBUG_PRINTF("candidate is too shitty\n");
- return false;
- }
-
- return true;
-}
-
-static UNUSED
-void dumpRoseLiteralSet(const set<ue2_literal> &s) {
- for (UNUSED const auto &lit : s) {
- DEBUG_PRINTF(" lit: %s\n", dumpString(lit).c_str());
- }
-}
-
-static
-void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored,
- const vector<NFAVertexDepth> *depths,
- const set<NFAVertex> &a_dom,
- vector<unique_ptr<VertLitInfo>> *lits,
- u32 min_allowed_len, bool desperation,
- bool last_chance, const CompileContext &cc) {
- assert(depths || !seeking_anchored);
-
- map<NFAVertex, u64a> scores;
- map<NFAVertex, unique_ptr<VertLitInfo>> lit_info;
- set<ue2_literal> s;
-
- for (auto v : a_dom) {
- s = getLiteralSet(g, v, true); /* RHS will take responsibility for any
- revisits to the target vertex */
-
- if (s.empty()) {
- DEBUG_PRINTF("candidate is too shitty\n");
- continue;
- }
-
- DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
- dumpRoseLiteralSet(s);
- u64a score = sanitizeAndCompressAndScore(s);
-
- bool anchored = false;
- if (seeking_anchored) {
- anchored = createsAnchoredLHS(g, {v}, *depths, cc.grey);
- }
-
- if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
- desperation, last_chance)) {
- continue;
- }
-
- DEBUG_PRINTF("candidate is a candidate\n");
- scores[v] = score;
- lit_info[v] = std::make_unique<VertLitInfo>(v, s, anchored);
- }
-
- /* try to filter out cases where appending some characters produces worse
- * literals. Only bother to look back one byte, TODO make better */
- for (auto u : a_dom) {
- if (out_degree(u, g) != 1 || !scores[u]) {
- continue;
- }
- NFAVertex v = *adjacent_vertices(u, g).first;
- if (contains(scores, v) && scores[v] >= scores[u]) {
- DEBUG_PRINTF("killing off v as score %llu >= %llu\n",
- scores[v], scores[u]);
- lit_info.erase(v);
- }
- }
-
- lits->reserve(lit_info.size());
- for (auto &m : lit_info) {
- lits->push_back(move(m.second));
- }
- DEBUG_PRINTF("%zu candidate literal sets\n", lits->size());
-}
-
-static
-void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored,
- const vector<NFAVertexDepth> *depths,
- const set<NFAVertex> &bad,
- const set<NFAVertex> *allowed,
- vector<unique_ptr<VertLitInfo>> *lits,
- u32 min_allowed_len, bool desperation,
- bool last_chance, const CompileContext &cc) {
- /* This allows us to get more places to split the graph as we are not
- limited to points where there is a single vertex to split at. */
-
- assert(depths || !seeking_anchored);
-
- /* TODO: operate over 'proto-regions' which ignore back edges */
- auto regions = assignRegions(g);
-
- set<u32> mand, optional;
- map<u32, vector<NFAVertex> > exits;
-
- for (auto v : vertices_range(g)) {
- u32 region = regions[v];
- if (is_any_start(v, g) || region == 0) {
- continue;
- }
-
- if (is_any_accept(v, g)) {
- continue;
- }
-
- if (!generates_callbacks(g) && is_match_vertex(v, g)) {
- /* we cannot leave a completely vacuous infix */
- continue;
- }
-
- if (isRegionExit(g, v, regions)) {
- exits[region].push_back(v);
- }
-
- if (isRegionEntry(g, v, regions)) {
- // Determine whether this region is mandatory or optional. We only
- // need to do this check for the first entry vertex we encounter
- // for this region.
- if (!contains(mand, region) && !contains(optional, region)) {
- if (isOptionalRegion(g, v, regions)) {
- optional.insert(region);
- } else {
- mand.insert(region);
- }
- }
- }
- }
-
- for (const auto &m : exits) {
- if (false) {
- next_cand:
- continue;
- }
-
- const u32 region = m.first;
- const vector<NFAVertex> &vv = m.second;
- assert(!vv.empty());
-
- if (!contains(mand, region)) {
- continue;
- }
-
- for (auto v : vv) {
- /* if an exit is in bad, the region is already handled well
- * by getSimpleRoseLiterals or is otherwise bad */
- if (contains(bad, v)) {
- goto next_cand;
- }
- /* if we are only allowed to consider some vertices, v must be in
- the list; */
- if (allowed && !contains(*allowed, v)) {
- goto next_cand;
- }
- }
-
- /* the final region may not have a neat exit. validate that all exits
- * have an edge to each accept or none do */
- bool edge_to_a = edge(vv[0], g.accept, g).second;
- bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second;
- const auto &reports = g[vv[0]].reports;
- for (auto v : vv) {
- if (edge_to_a != edge(v, g.accept, g).second) {
- goto next_cand;
- }
-
- if (edge_to_aeod != edge(v, g.acceptEod, g).second) {
- goto next_cand;
- }
-
- if (g[v].reports != reports) {
- goto next_cand;
- }
- }
-
- DEBUG_PRINTF("inspecting region %u\n", region);
- set<ue2_literal> s;
- for (auto v : vv) {
- DEBUG_PRINTF(" exit vertex: %zu\n", g[v].index);
- /* Note: RHS can not be depended on to take all subsequent revisits
- * to this vertex */
- set<ue2_literal> ss = getLiteralSet(g, v, false);
- if (ss.empty()) {
- DEBUG_PRINTF("candidate is too shitty\n");
- goto next_cand;
- }
- insert(&s, ss);
- }
-
- assert(!s.empty());
-
- DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
- dumpRoseLiteralSet(s);
- u64a score = sanitizeAndCompressAndScore(s);
-
- DEBUG_PRINTF("|candidate literal set| = %zu\n", s.size());
- dumpRoseLiteralSet(s);
-
- bool anchored = false;
- if (seeking_anchored) {
- anchored = createsAnchoredLHS(g, vv, *depths, cc.grey);
- }
-
- if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
- desperation, last_chance)) {
- goto next_cand;
- }
-
- DEBUG_PRINTF("candidate is a candidate\n");
- lits->push_back(std::make_unique<VertLitInfo>(vv, s, anchored));
- }
-}
-
-static
-void filterCandPivots(const NGHolder &g, const set<NFAVertex> &cand_raw,
- set<NFAVertex> *out) {
- for (auto u : cand_raw) {
- const CharReach &u_cr = g[u].char_reach;
- if (u_cr.count() > 40) {
- continue; /* too wide to be plausible */
- }
-
- if (u_cr.count() > 2) {
- /* include u as a candidate as successor may have backed away from
- * expanding through it */
- out->insert(u);
- continue;
- }
-
- NFAVertex v = getSoleDestVertex(g, u);
- if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) {
- const CharReach &v_cr = g[v].char_reach;
- if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
- continue; /* v will always generate better literals */
- }
- }
-
- out->insert(u);
- }
-}
-
-/* cand_raw is the candidate set before filtering points which are clearly
- * a bad idea. */
-static
-void getCandidatePivots(const NGHolder &g, set<NFAVertex> *cand,
- set<NFAVertex> *cand_raw) {
- auto dominators = findDominators(g);
-
- set<NFAVertex> accepts;
-
- for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
- if (is_special(v, g)) {
- continue;
- }
- accepts.insert(v);
- }
- for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
- if (is_special(v, g)) {
- continue;
- }
- accepts.insert(v);
- }
-
- assert(!accepts.empty());
-
- vector<NFAVertex> dom_trace;
- auto ait = accepts.begin();
- assert(ait != accepts.end());
- NFAVertex curr = *ait;
- while (curr && !is_special(curr, g)) {
- dom_trace.push_back(curr);
- curr = dominators[curr];
- }
- reverse(dom_trace.begin(), dom_trace.end());
- for (++ait; ait != accepts.end(); ++ait) {
- curr = *ait;
- vector<NFAVertex> dom_trace2;
- while (curr && !is_special(curr, g)) {
- dom_trace2.push_back(curr);
- curr = dominators[curr];
- }
- reverse(dom_trace2.begin(), dom_trace2.end());
- auto dti = dom_trace.begin(), dtie = dom_trace.end();
- auto dtj = dom_trace2.begin(), dtje = dom_trace2.end();
- while (dti != dtie && dtj != dtje && *dti == *dtj) {
- ++dti;
- ++dtj;
- }
- dom_trace.erase(dti, dtie);
- }
-
- cand_raw->insert(dom_trace.begin(), dom_trace.end());
-
- filterCandPivots(g, *cand_raw, cand);
-}
-
-static
-unique_ptr<VertLitInfo> findBestSplit(const NGHolder &g,
- const vector<NFAVertexDepth> *depths,
- bool for_prefix, u32 min_len,
- const set<NFAVertex> *allowed_cand,
- const set<NFAVertex> *disallowed_cand,
- bool last_chance,
- const CompileContext &cc) {
- assert(!for_prefix || depths);
-
- /* look for a single simple split point */
- set<NFAVertex> cand;
- set<NFAVertex> cand_raw;
-
- getCandidatePivots(g, &cand, &cand_raw);
-
- if (allowed_cand) {
- set<NFAVertex> cand2;
- set<NFAVertex> cand2_raw;
- set_intersection(allowed_cand->begin(), allowed_cand->end(),
- cand.begin(), cand.end(),
- inserter(cand2, cand2.begin()));
-
- set_intersection(allowed_cand->begin(), allowed_cand->end(),
- cand_raw.begin(), cand_raw.end(),
- inserter(cand2_raw, cand2_raw.begin()));
-
- cand = std::move(cand2);
- cand_raw = std::move(cand2_raw);
- }
- if (disallowed_cand) {
- DEBUG_PRINTF("%zu disallowed candidates\n", disallowed_cand->size());
- DEBUG_PRINTF("|old cand| = %zu\n", cand.size());
- erase_all(&cand, *disallowed_cand);
- insert(&cand_raw, *disallowed_cand);
- }
-
- if (!generates_callbacks(g)) {
- /* not output exposed so must leave some RHS */
- for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) {
- cand.erase(v);
- cand_raw.erase(v);
- }
-
- for (NFAVertex v : inv_adjacent_vertices_range(g.acceptEod, g)) {
- cand.erase(v);
- cand_raw.erase(v);
- }
- }
-
- DEBUG_PRINTF("|cand| = %zu\n", cand.size());
-
- bool seeking_anchored = for_prefix;
- bool seeking_transient = for_prefix;
-
- bool desperation = for_prefix && cc.streaming;
-
- vector<unique_ptr<VertLitInfo>> lits; /**< sorted list of potential cuts */
-
- getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len,
- desperation, last_chance, cc);
- getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand,
- &lits, min_len, desperation, last_chance, cc);
-
- if (lits.empty()) {
- DEBUG_PRINTF("no literals found\n");
- return nullptr;
- }
-
- if (seeking_transient) {
- for (auto &a : lits) {
- a->creates_transient
- = createsTransientLHS(g, a->vv, *depths, cc.grey);
- }
- }
-
- if (last_chance) {
+}
+
+static
+size_t shorter_than(const set<ue2_literal> &s, size_t limit) {
+ return count_if(s.begin(), s.end(),
+ [&](const ue2_literal &a) { return a.length() < limit; });
+}
+
+static
+u32 min_len(const set<ue2_literal> &s) {
+ u32 rv = ~0U;
+
+ for (const auto &lit : s) {
+ rv = min(rv, (u32)lit.length());
+ }
+
+ return rv;
+}
+
+static
+u32 min_period(const set<ue2_literal> &s) {
+ u32 rv = ~0U;
+
+ for (const auto &lit : s) {
+ rv = min(rv, (u32)minStringPeriod(lit));
+ }
+ DEBUG_PRINTF("min period %u\n", rv);
+ return rv;
+}
+
+namespace {
+/**
+ * Information on a cut: vertices and literals.
+ */
+struct VertLitInfo {
+ VertLitInfo() {}
+ VertLitInfo(NFAVertex v, const set<ue2_literal> &litlit, bool c_anch,
+ bool c_tran = false)
+ : vv(vector<NFAVertex>(1, v)), lit(litlit), creates_anchored(c_anch),
+ creates_transient(c_tran) {}
+ VertLitInfo(const vector<NFAVertex> &vv_in, const set<ue2_literal> &lit_in,
+ bool c_anch)
+ : vv(vv_in), lit(lit_in), creates_anchored(c_anch) {}
+ vector<NFAVertex> vv;
+ set<ue2_literal> lit;
+
+ bool creates_anchored = false;
+ bool creates_transient = false;
+ double split_ratio = 0;
+};
+
+#define LAST_CHANCE_STRONG_LEN 1
+
+/**
+ * \brief Comparator class for comparing different literal cuts.
+ */
+class LitComparator {
+public:
+ LitComparator(const NGHolder &g_in, bool sa, bool st, bool lc)
+ : g(g_in), seeking_anchored(sa), seeking_transient(st),
+ last_chance(lc) {}
+ bool operator()(const unique_ptr<VertLitInfo> &a,
+ const unique_ptr<VertLitInfo> &b) const {
+ assert(a && b);
+
+ if (seeking_anchored) {
+ if (a->creates_anchored != b->creates_anchored) {
+ return a->creates_anchored < b->creates_anchored;
+ }
+ }
+
+ if (seeking_transient) {
+ if (a->creates_transient != b->creates_transient) {
+ return a->creates_transient < b->creates_transient;
+ }
+ }
+
+ if (last_chance
+ && min_len(a->lit) > LAST_CHANCE_STRONG_LEN
+ && min_len(b->lit) > LAST_CHANCE_STRONG_LEN) {
+ DEBUG_PRINTF("using split ratio %g , %g\n", a->split_ratio,
+ b->split_ratio);
+ return a->split_ratio < b->split_ratio;
+ }
+
+ u64a score_a = scoreSet(a->lit);
+ u64a score_b = scoreSet(b->lit);
+
+ if (score_a != score_b) {
+ return score_a > score_b;
+ }
+
+ /* vertices should only be in one candidate cut */
+ assert(a->vv == b->vv || a->vv.front() != b->vv.front());
+ return g[a->vv.front()].index > g[b->vv.front()].index;
+ }
+
+private:
+ const NGHolder &g; /**< graph on which cuts are found */
+
+ bool seeking_anchored;
+ bool seeking_transient;
+ bool last_chance;
+};
+}
+
+#define MIN_ANCHORED_LEN 2
+#define MIN_ANCHORED_DESPERATE_LEN 1
+
+/* anchored here means that the cut creates a 'usefully' anchored LHS */
+static
+bool validateRoseLiteralSetQuality(const set<ue2_literal> &s, u64a score,
+ bool anchored, u32 min_allowed_floating_len,
+ bool desperation, bool last_chance) {
+ u32 min_allowed_len = anchored ? MIN_ANCHORED_LEN
+ : min_allowed_floating_len;
+ if (anchored && last_chance) {
+ min_allowed_len = MIN_ANCHORED_DESPERATE_LEN;
+ }
+ if (last_chance) {
+ desperation = true;
+ }
+
+ DEBUG_PRINTF("validating%s set, min allowed len %u\n",
+ anchored ? " anchored" : "", min_allowed_len);
+
+ assert(none_of(begin(s), end(s), bad_mixed_sensitivity));
+
+ if (score >= NO_LITERAL_AT_EDGE_SCORE) {
+ DEBUG_PRINTF("candidate is too bad %llu/%zu\n", score, s.size());
+ return false;
+ }
+
+ assert(!s.empty());
+ if (s.empty()) {
+ DEBUG_PRINTF("candidate is too bad/something went wrong\n");
+ return false;
+ }
+
+ u32 s_min_len = min_len(s);
+ u32 s_min_period = min_period(s);
+ size_t short_count = shorter_than(s, 5);
+
+ DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u"
+ " short_count=%zu desp=%d\n",
+ dumpString(*s.begin()).c_str(), score, s.size(), s_min_len,
+ s_min_period, short_count, (int)desperation);
+
+ bool ok = true;
+
+ if (s.size() > 10 /* magic number is magic */
+ || s_min_len < min_allowed_len
+ || (s_min_period <= 1 && min_allowed_len != 1)) {
+ DEBUG_PRINTF("candidate may be bad\n");
+ ok = false;
+ }
+
+ if (!ok && desperation
+ && s.size() <= 20 /* more magic numbers are magical */
+ && (s_min_len > 5 || (s_min_len > 2 && short_count <= 10))
+ && s_min_period > 1) {
+ DEBUG_PRINTF("candidate is ok\n");
+ ok = true;
+ }
+
+ if (!ok && desperation
+ && s.size() <= 50 /* more magic numbers are magical */
+ && s_min_len > 10
+ && s_min_period > 1) {
+ DEBUG_PRINTF("candidate is ok\n");
+ ok = true;
+ }
+
+ if (!ok) {
+ DEBUG_PRINTF("candidate is too shitty\n");
+ return false;
+ }
+
+ return true;
+}
+
+static UNUSED
+void dumpRoseLiteralSet(const set<ue2_literal> &s) {
+ for (UNUSED const auto &lit : s) {
+ DEBUG_PRINTF(" lit: %s\n", dumpString(lit).c_str());
+ }
+}
+
+static
+void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored,
+ const vector<NFAVertexDepth> *depths,
+ const set<NFAVertex> &a_dom,
+ vector<unique_ptr<VertLitInfo>> *lits,
+ u32 min_allowed_len, bool desperation,
+ bool last_chance, const CompileContext &cc) {
+ assert(depths || !seeking_anchored);
+
+ map<NFAVertex, u64a> scores;
+ map<NFAVertex, unique_ptr<VertLitInfo>> lit_info;
+ set<ue2_literal> s;
+
+ for (auto v : a_dom) {
+ s = getLiteralSet(g, v, true); /* RHS will take responsibility for any
+ revisits to the target vertex */
+
+ if (s.empty()) {
+ DEBUG_PRINTF("candidate is too shitty\n");
+ continue;
+ }
+
+ DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
+ dumpRoseLiteralSet(s);
+ u64a score = sanitizeAndCompressAndScore(s);
+
+ bool anchored = false;
+ if (seeking_anchored) {
+ anchored = createsAnchoredLHS(g, {v}, *depths, cc.grey);
+ }
+
+ if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
+ desperation, last_chance)) {
+ continue;
+ }
+
+ DEBUG_PRINTF("candidate is a candidate\n");
+ scores[v] = score;
+ lit_info[v] = std::make_unique<VertLitInfo>(v, s, anchored);
+ }
+
+ /* try to filter out cases where appending some characters produces worse
+ * literals. Only bother to look back one byte, TODO make better */
+ for (auto u : a_dom) {
+ if (out_degree(u, g) != 1 || !scores[u]) {
+ continue;
+ }
+ NFAVertex v = *adjacent_vertices(u, g).first;
+ if (contains(scores, v) && scores[v] >= scores[u]) {
+ DEBUG_PRINTF("killing off v as score %llu >= %llu\n",
+ scores[v], scores[u]);
+ lit_info.erase(v);
+ }
+ }
+
+ lits->reserve(lit_info.size());
+ for (auto &m : lit_info) {
+ lits->push_back(move(m.second));
+ }
+ DEBUG_PRINTF("%zu candidate literal sets\n", lits->size());
+}
+
+static
+void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored,
+ const vector<NFAVertexDepth> *depths,
+ const set<NFAVertex> &bad,
+ const set<NFAVertex> *allowed,
+ vector<unique_ptr<VertLitInfo>> *lits,
+ u32 min_allowed_len, bool desperation,
+ bool last_chance, const CompileContext &cc) {
+ /* This allows us to get more places to split the graph as we are not
+ limited to points where there is a single vertex to split at. */
+
+ assert(depths || !seeking_anchored);
+
+ /* TODO: operate over 'proto-regions' which ignore back edges */
+ auto regions = assignRegions(g);
+
+ set<u32> mand, optional;
+ map<u32, vector<NFAVertex> > exits;
+
+ for (auto v : vertices_range(g)) {
+ u32 region = regions[v];
+ if (is_any_start(v, g) || region == 0) {
+ continue;
+ }
+
+ if (is_any_accept(v, g)) {
+ continue;
+ }
+
+ if (!generates_callbacks(g) && is_match_vertex(v, g)) {
+ /* we cannot leave a completely vacuous infix */
+ continue;
+ }
+
+ if (isRegionExit(g, v, regions)) {
+ exits[region].push_back(v);
+ }
+
+ if (isRegionEntry(g, v, regions)) {
+ // Determine whether this region is mandatory or optional. We only
+ // need to do this check for the first entry vertex we encounter
+ // for this region.
+ if (!contains(mand, region) && !contains(optional, region)) {
+ if (isOptionalRegion(g, v, regions)) {
+ optional.insert(region);
+ } else {
+ mand.insert(region);
+ }
+ }
+ }
+ }
+
+ for (const auto &m : exits) {
+ if (false) {
+ next_cand:
+ continue;
+ }
+
+ const u32 region = m.first;
+ const vector<NFAVertex> &vv = m.second;
+ assert(!vv.empty());
+
+ if (!contains(mand, region)) {
+ continue;
+ }
+
+ for (auto v : vv) {
+ /* if an exit is in bad, the region is already handled well
+ * by getSimpleRoseLiterals or is otherwise bad */
+ if (contains(bad, v)) {
+ goto next_cand;
+ }
+ /* if we are only allowed to consider some vertices, v must be in
+ the list; */
+ if (allowed && !contains(*allowed, v)) {
+ goto next_cand;
+ }
+ }
+
+ /* the final region may not have a neat exit. validate that all exits
+ * have an edge to each accept or none do */
+ bool edge_to_a = edge(vv[0], g.accept, g).second;
+ bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second;
+ const auto &reports = g[vv[0]].reports;
+ for (auto v : vv) {
+ if (edge_to_a != edge(v, g.accept, g).second) {
+ goto next_cand;
+ }
+
+ if (edge_to_aeod != edge(v, g.acceptEod, g).second) {
+ goto next_cand;
+ }
+
+ if (g[v].reports != reports) {
+ goto next_cand;
+ }
+ }
+
+ DEBUG_PRINTF("inspecting region %u\n", region);
+ set<ue2_literal> s;
+ for (auto v : vv) {
+ DEBUG_PRINTF(" exit vertex: %zu\n", g[v].index);
+ /* Note: RHS can not be depended on to take all subsequent revisits
+ * to this vertex */
+ set<ue2_literal> ss = getLiteralSet(g, v, false);
+ if (ss.empty()) {
+ DEBUG_PRINTF("candidate is too shitty\n");
+ goto next_cand;
+ }
+ insert(&s, ss);
+ }
+
+ assert(!s.empty());
+
+ DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size());
+ dumpRoseLiteralSet(s);
+ u64a score = sanitizeAndCompressAndScore(s);
+
+ DEBUG_PRINTF("|candidate literal set| = %zu\n", s.size());
+ dumpRoseLiteralSet(s);
+
+ bool anchored = false;
+ if (seeking_anchored) {
+ anchored = createsAnchoredLHS(g, vv, *depths, cc.grey);
+ }
+
+ if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len,
+ desperation, last_chance)) {
+ goto next_cand;
+ }
+
+ DEBUG_PRINTF("candidate is a candidate\n");
+ lits->push_back(std::make_unique<VertLitInfo>(vv, s, anchored));
+ }
+}
+
+static
+void filterCandPivots(const NGHolder &g, const set<NFAVertex> &cand_raw,
+ set<NFAVertex> *out) {
+ for (auto u : cand_raw) {
+ const CharReach &u_cr = g[u].char_reach;
+ if (u_cr.count() > 40) {
+ continue; /* too wide to be plausible */
+ }
+
+ if (u_cr.count() > 2) {
+ /* include u as a candidate as successor may have backed away from
+ * expanding through it */
+ out->insert(u);
+ continue;
+ }
+
+ NFAVertex v = getSoleDestVertex(g, u);
+ if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) {
+ const CharReach &v_cr = g[v].char_reach;
+ if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
+ continue; /* v will always generate better literals */
+ }
+ }
+
+ out->insert(u);
+ }
+}
+
+/* cand_raw is the candidate set before filtering points which are clearly
+ * a bad idea. */
+static
+void getCandidatePivots(const NGHolder &g, set<NFAVertex> *cand,
+ set<NFAVertex> *cand_raw) {
+ auto dominators = findDominators(g);
+
+ set<NFAVertex> accepts;
+
+ for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
+ if (is_special(v, g)) {
+ continue;
+ }
+ accepts.insert(v);
+ }
+ for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
+ if (is_special(v, g)) {
+ continue;
+ }
+ accepts.insert(v);
+ }
+
+ assert(!accepts.empty());
+
+ vector<NFAVertex> dom_trace;
+ auto ait = accepts.begin();
+ assert(ait != accepts.end());
+ NFAVertex curr = *ait;
+ while (curr && !is_special(curr, g)) {
+ dom_trace.push_back(curr);
+ curr = dominators[curr];
+ }
+ reverse(dom_trace.begin(), dom_trace.end());
+ for (++ait; ait != accepts.end(); ++ait) {
+ curr = *ait;
+ vector<NFAVertex> dom_trace2;
+ while (curr && !is_special(curr, g)) {
+ dom_trace2.push_back(curr);
+ curr = dominators[curr];
+ }
+ reverse(dom_trace2.begin(), dom_trace2.end());
+ auto dti = dom_trace.begin(), dtie = dom_trace.end();
+ auto dtj = dom_trace2.begin(), dtje = dom_trace2.end();
+ while (dti != dtie && dtj != dtje && *dti == *dtj) {
+ ++dti;
+ ++dtj;
+ }
+ dom_trace.erase(dti, dtie);
+ }
+
+ cand_raw->insert(dom_trace.begin(), dom_trace.end());
+
+ filterCandPivots(g, *cand_raw, cand);
+}
+
+static
+unique_ptr<VertLitInfo> findBestSplit(const NGHolder &g,
+ const vector<NFAVertexDepth> *depths,
+ bool for_prefix, u32 min_len,
+ const set<NFAVertex> *allowed_cand,
+ const set<NFAVertex> *disallowed_cand,
+ bool last_chance,
+ const CompileContext &cc) {
+ assert(!for_prefix || depths);
+
+ /* look for a single simple split point */
+ set<NFAVertex> cand;
+ set<NFAVertex> cand_raw;
+
+ getCandidatePivots(g, &cand, &cand_raw);
+
+ if (allowed_cand) {
+ set<NFAVertex> cand2;
+ set<NFAVertex> cand2_raw;
+ set_intersection(allowed_cand->begin(), allowed_cand->end(),
+ cand.begin(), cand.end(),
+ inserter(cand2, cand2.begin()));
+
+ set_intersection(allowed_cand->begin(), allowed_cand->end(),
+ cand_raw.begin(), cand_raw.end(),
+ inserter(cand2_raw, cand2_raw.begin()));
+
+ cand = std::move(cand2);
+ cand_raw = std::move(cand2_raw);
+ }
+ if (disallowed_cand) {
+ DEBUG_PRINTF("%zu disallowed candidates\n", disallowed_cand->size());
+ DEBUG_PRINTF("|old cand| = %zu\n", cand.size());
+ erase_all(&cand, *disallowed_cand);
+ insert(&cand_raw, *disallowed_cand);
+ }
+
+ if (!generates_callbacks(g)) {
+ /* not output exposed so must leave some RHS */
+ for (NFAVertex v : inv_adjacent_vertices_range(g.accept, g)) {
+ cand.erase(v);
+ cand_raw.erase(v);
+ }
+
+ for (NFAVertex v : inv_adjacent_vertices_range(g.acceptEod, g)) {
+ cand.erase(v);
+ cand_raw.erase(v);
+ }
+ }
+
+ DEBUG_PRINTF("|cand| = %zu\n", cand.size());
+
+ bool seeking_anchored = for_prefix;
+ bool seeking_transient = for_prefix;
+
+ bool desperation = for_prefix && cc.streaming;
+
+ vector<unique_ptr<VertLitInfo>> lits; /**< sorted list of potential cuts */
+
+ getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len,
+ desperation, last_chance, cc);
+ getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand,
+ &lits, min_len, desperation, last_chance, cc);
+
+ if (lits.empty()) {
+ DEBUG_PRINTF("no literals found\n");
+ return nullptr;
+ }
+
+ if (seeking_transient) {
+ for (auto &a : lits) {
+ a->creates_transient
+ = createsTransientLHS(g, a->vv, *depths, cc.grey);
+ }
+ }
+
+ if (last_chance) {
const size_t num_verts = num_vertices(g);
auto color_map = make_small_color_map(g);
- for (auto &a : lits) {
+ for (auto &a : lits) {
size_t num_reachable = count_reachable(g, a->vv, color_map);
double ratio = (double)num_reachable / (double)num_verts;
a->split_ratio = ratio > 0.5 ? 1 - ratio : ratio;
- }
- }
-
- auto cmp = LitComparator(g, seeking_anchored, seeking_transient,
- last_chance);
-
- unique_ptr<VertLitInfo> best = move(lits.back());
- lits.pop_back();
- while (!lits.empty()) {
- if (cmp(best, lits.back())) {
- best = move(lits.back());
- }
- lits.pop_back();
- }
-
- DEBUG_PRINTF("best is '%s' %zu a%d t%d\n",
- dumpString(*best->lit.begin()).c_str(),
- g[best->vv.front()].index,
- depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0,
- depths ? (int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0);
-
- return best;
-}
-
-static
-void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ,
- bool overhang_ok, flat_set<NFAEdge> &bad) {
- DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n",
- num_vertices(h), succ.length());
-
- using EdgeSet = boost::dynamic_bitset<>;
-
- const size_t edge_count = num_edges(h);
- EdgeSet bad_edges(edge_count);
-
- unordered_map<NFAVertex, EdgeSet> curr;
- for (const auto &e : in_edges_range(h.accept, h)) {
- auto &path_set = curr[source(e, h)];
- if (path_set.empty()) {
- path_set.resize(edge_count);
- }
- path_set.set(h[e].index);
- }
-
- unordered_map<NFAVertex, EdgeSet> next;
- for (auto it = succ.rbegin(); it != succ.rend(); ++it) {
- for (const auto &path : curr) {
- NFAVertex u = path.first;
- const auto &path_set = path.second;
- if (u == h.start && overhang_ok) {
- DEBUG_PRINTF("poisoning early %zu [overhang]\n",
- path_set.count());
- bad_edges |= path_set;
- continue;
- }
- if (overlaps(h[u].char_reach, *it)) {
- for (const auto &e : in_edges_range(u, h)) {
- auto &new_path_set = next[source(e, h)];
- if (new_path_set.empty()) {
- new_path_set.resize(edge_count);
- }
- new_path_set |= path_set;
- new_path_set.set(h[e].index);
- }
- }
- }
- DEBUG_PRINTF("succ char matches at %zu paths\n", next.size());
- assert(overhang_ok || !curr.empty());
- swap(curr, next);
- next.clear();
- }
-
- assert(overhang_ok || !curr.empty());
- for (const auto &path : curr) {
- bad_edges |= path.second;
- DEBUG_PRINTF("poisoning %zu vertices\n", path.second.count());
- }
-
- for (const auto &e : edges_range(h)) {
- if (bad_edges.test(h[e].index)) {
- bad.insert(e);
- }
- }
-}
-
-static
-void poisonForGoodPrefix(const NGHolder &h,
- const vector<NFAVertexDepth> &depths,
- flat_set<NFAEdge> &bad, const Grey &grey) {
- for (const auto &v : vertices_range(h)) {
- if (!createsAnchoredLHS(h, {v}, depths, grey)
- && !createsTransientLHS(h, {v}, depths, grey)) {
- insert(&bad, in_edges_range(v, h));
- }
- }
-}
-
-static UNUSED
-bool is_any_accept_type(RoseInVertexType t) {
- return t == RIV_ACCEPT || t == RIV_ACCEPT_EOD;
-}
-
-static
-flat_set<NFAEdge> poisonEdges(const NGHolder &h,
- const vector<NFAVertexDepth> *depths,
- const RoseInGraph &vg, const vector<RoseInEdge> &ee,
- bool for_prefix, const Grey &grey) {
- DEBUG_PRINTF("poisoning edges %zu successor edges\n", ee.size());
-
- /* poison edges covered by successor literal */
-
- set<pair<ue2_literal, bool> > succs;
- for (const RoseInEdge &ve : ee) {
- if (vg[target(ve, vg)].type != RIV_LITERAL) {
- /* nothing to poison in suffixes/outfixes */
- assert(generates_callbacks(h));
- assert(is_any_accept_type(vg[target(ve, vg)].type));
- continue;
- }
- succs.insert({vg[target(ve, vg)].s,
- vg[source(ve, vg)].type == RIV_LITERAL});
-
- }
-
- DEBUG_PRINTF("poisoning edges %zu successor literals\n", succs.size());
-
- flat_set<NFAEdge> bad;
- for (const auto &p : succs) {
- poisonFromSuccessor(h, p.first, p.second, bad);
- }
-
- /* poison edges which don't significantly improve a prefix */
-
- if (for_prefix) {
- poisonForGoodPrefix(h, *depths, bad, grey);
- }
-
- return bad;
-}
-
-static
-set<NFAVertex> poisonVertices(const NGHolder &h, const RoseInGraph &vg,
- const vector<RoseInEdge> &ee, const Grey &grey) {
- flat_set<NFAEdge> bad_edges = poisonEdges(h, nullptr, vg, ee, false, grey);
- set<NFAVertex> bad_vertices;
- for (const NFAEdge &e : bad_edges) {
- bad_vertices.insert(target(e, h));
- DEBUG_PRINTF("bad: %zu->%zu\n", h[source(e, h)].index,
- h[target(e, h)].index);
- }
-
- return bad_vertices;
-}
-
-static
-unique_ptr<VertLitInfo> findBestNormalSplit(const NGHolder &g,
- const RoseInGraph &vg,
- const vector<RoseInEdge> &ee,
- const CompileContext &cc) {
- assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
- set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
-
- return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength,
- nullptr, &bad_vertices, false, cc);
-}
-
-static
-unique_ptr<VertLitInfo> findBestLastChanceSplit(const NGHolder &g,
- const RoseInGraph &vg,
- const vector<RoseInEdge> &ee,
- const CompileContext &cc) {
- assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
- set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
-
- return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength,
- nullptr, &bad_vertices, true, cc);
-}
-
-static
-unique_ptr<VertLitInfo> findSimplePrefixSplit(const NGHolder &g,
- const CompileContext &cc) {
- DEBUG_PRINTF("looking for simple prefix split\n");
- bool anchored = !proper_out_degree(g.startDs, g);
- NFAVertex u = anchored ? g.start : g.startDs;
-
- if (out_degree(u, g) != 2) { /* startDs + succ */
- return nullptr;
- }
-
- NFAVertex v = NGHolder::null_vertex();
- for (NFAVertex t : adjacent_vertices_range(u, g)) {
- if (t != g.startDs) {
- assert(!v);
- v = t;
- }
- }
- assert(v);
-
- if (!anchored) {
- if (out_degree(g.start, g) > 2) {
- return nullptr;
- }
- if (out_degree(g.start, g) == 2 && !edge(g.start, v, g).second) {
- return nullptr;
- }
- }
-
- NFAVertex best_v = NGHolder::null_vertex();
- ue2_literal best_lit;
-
- u32 limit = cc.grey.maxHistoryAvailable;
- if (anchored) {
- LIMIT_TO_AT_MOST(&limit, cc.grey.maxAnchoredRegion);
- }
-
- ue2_literal curr_lit;
- for (u32 i = 0; i < limit; i++) {
- const auto &v_cr = g[v].char_reach;
- if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
- curr_lit.push_back(v_cr.find_first(), v_cr.isCaselessChar());
- } else {
- curr_lit.clear();
- }
-
- if (curr_lit.length() > best_lit.length()) {
- best_lit = curr_lit;
- best_v = v;
- }
-
- if (out_degree(v, g) != 1) {
- break;
- }
- v = *adjacent_vertices(v, g).first;
- }
-
- if (best_lit.length() < cc.grey.minRoseLiteralLength) {
- return nullptr;
- }
-
- set<ue2_literal> best_lit_set({best_lit});
- if (bad_mixed_sensitivity(best_lit)) {
- sanitizeAndCompressAndScore(best_lit_set);
- }
-
- return ue2::make_unique<VertLitInfo>(best_v, best_lit_set, anchored, true);
-}
-
-static
-unique_ptr<VertLitInfo> findBestPrefixSplit(const NGHolder &g,
- const vector<NFAVertexDepth> &depths,
- const RoseInGraph &vg,
- const vector<RoseInEdge> &ee,
- bool last_chance,
- const CompileContext &cc) {
- assert(g.kind == NFA_PREFIX || g.kind == NFA_OUTFIX);
- set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
- auto rv = findBestSplit(g, &depths, true, cc.grey.minRoseLiteralLength,
- nullptr, &bad_vertices, last_chance, cc);
-
- /* large back edges may prevent us identifying anchored or transient cases
- * properly - use a simple walk instead */
- if (!rv || !(rv->creates_transient || rv->creates_anchored)) {
- auto rv2 = findSimplePrefixSplit(g, cc);
- if (rv2) {
- return rv2;
- }
- }
-
- return rv;
-}
-
-static
-unique_ptr<VertLitInfo> findBestCleanSplit(const NGHolder &g,
- const CompileContext &cc) {
- assert(g.kind != NFA_PREFIX);
- set<NFAVertex> cleanSplits;
- for (NFAVertex v : vertices_range(g)) {
- if (!g[v].char_reach.all() || !edge(v, v, g).second) {
- continue;
- }
- insert(&cleanSplits, inv_adjacent_vertices(v, g));
- cleanSplits.erase(v);
- }
- cleanSplits.erase(g.start);
- if (cleanSplits.empty()) {
- return nullptr;
- }
- return findBestSplit(g, nullptr, false, cc.grey.violetEarlyCleanLiteralLen,
- &cleanSplits, nullptr, false, cc);
-}
-
-static
-bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) {
- set<NFAVertex> curr, next;
- curr.insert(g.accept);
-
- for (auto it = lit.rbegin(); it != lit.rend(); ++it) {
- next.clear();
-
- for (auto v : curr) {
- for (auto u : inv_adjacent_vertices_range(v, g)) {
- if (u == g.start) {
- if (overhang_ok) {
- DEBUG_PRINTF("bail\n");
- return true;
- } else {
- continue; /* it is not possible for a lhs literal to
- * overhang the start */
- }
- }
-
- const CharReach &cr = g[u].char_reach;
- if (!overlaps(*it, cr)) {
- continue;
- }
-
- next.insert(u);
- }
- }
-
- curr.swap(next);
- }
-
- return !curr.empty();
-}
-
-static
-bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg,
- const vector<RoseInEdge> &ee, const VertLitInfo &split) {
- const vector<NFAVertex> &splitters = split.vv;
- assert(!splitters.empty());
-
- shared_ptr<NGHolder> lhs = make_shared<NGHolder>();
- shared_ptr<NGHolder> rhs = make_shared<NGHolder>();
-
- unordered_map<NFAVertex, NFAVertex> lhs_map;
- unordered_map<NFAVertex, NFAVertex> rhs_map;
-
- splitGraph(base_graph, splitters, lhs.get(), &lhs_map, rhs.get(), &rhs_map);
- DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n",
- to_string(base_graph.kind).c_str(), num_vertices(base_graph),
- to_string(lhs->kind).c_str(), num_vertices(*lhs),
- to_string(rhs->kind).c_str(), num_vertices(*rhs));
-
- bool suffix = generates_callbacks(base_graph);
-
- if (is_triggered(base_graph)) {
- /* if we are already guarded, check if the split reduces the size of
- * the problem before continuing with the split */
- if (num_vertices(*lhs) >= num_vertices(base_graph)
- && !(suffix && isVacuous(*rhs))) {
- DEBUG_PRINTF("split's lhs is no smaller\n");
- return false;
- }
-
- if (num_vertices(*rhs) >= num_vertices(base_graph)) {
- DEBUG_PRINTF("split's rhs is no smaller\n");
- return false;
- }
- }
-
- bool do_accept = false;
- bool do_accept_eod = false;
- assert(rhs);
- if (isVacuous(*rhs) && suffix) {
- if (edge(rhs->start, rhs->accept, *rhs).second) {
- DEBUG_PRINTF("rhs has a cliche\n");
- do_accept = true;
- remove_edge(rhs->start, rhs->accept, *rhs);
- }
-
- if (edge(rhs->start, rhs->acceptEod, *rhs).second) {
- DEBUG_PRINTF("rhs has an eod cliche\n");
- do_accept_eod = true;
- remove_edge(rhs->start, rhs->acceptEod, *rhs);
- }
-
- renumber_edges(*rhs);
- }
-
- /* check if we still have a useful graph left over */
- bool do_norm = out_degree(rhs->start, *rhs) != 1;
-
- set<ReportID> splitter_reports;
- for (auto v : splitters) {
- insert(&splitter_reports, base_graph[v].reports);
- }
-
- /* find the targets of each source vertex; insertion_ordered_map used to
- * preserve deterministic ordering */
- insertion_ordered_map<RoseInVertex, vector<RoseInVertex>> images;
- for (const RoseInEdge &e : ee) {
- RoseInVertex src = source(e, vg);
- RoseInVertex dest = target(e, vg);
- images[src].push_back(dest);
- remove_edge(e, vg);
- }
-
- map<vector<RoseInVertex>, vector<RoseInVertex>> verts_by_image;
-
- for (const auto &m : images) {
- const auto &u = m.first;
- const auto &image = m.second;
-
- if (contains(verts_by_image, image)) {
- for (RoseInVertex v : verts_by_image[image]) {
- add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
- }
- continue;
- }
-
- for (const auto &lit : split.lit) {
- assert(!bad_mixed_sensitivity(lit));
-
- /* don't allow overhang in can_match() as literals should
- * correspond to the edge graph being split; overhanging the graph
- * would indicate a false path.*/
- if (!can_match(*lhs, lit, false)) {
- DEBUG_PRINTF("'%s' did not match lhs\n",
- escapeString(lit).c_str());
- continue;
- }
-
- DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str());
- auto v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
- add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
-
- /* work out delay later */
- if (do_accept) {
- DEBUG_PRINTF("rhs has a cliche\n");
- auto tt = add_vertex(RoseInVertexProps::makeAccept(
- splitter_reports), vg);
- add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
- }
-
- if (do_accept_eod) {
- DEBUG_PRINTF("rhs has an eod cliche\n");
- auto tt = add_vertex(RoseInVertexProps::makeAcceptEod(
- splitter_reports), vg);
- add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
- }
-
- if (do_norm) {
- assert(out_degree(rhs->start, *rhs) > 1);
- for (RoseInVertex dest : image) {
- add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg);
- }
- }
- verts_by_image[image].push_back(v);
- }
- }
-
- assert(hasCorrectlyNumberedVertices(*rhs));
- assert(hasCorrectlyNumberedEdges(*rhs));
- assert(isCorrectlyTopped(*rhs));
- assert(hasCorrectlyNumberedVertices(*lhs));
- assert(hasCorrectlyNumberedEdges(*lhs));
- assert(isCorrectlyTopped(*lhs));
-
- return true;
-}
-
-#define MAX_NETFLOW_CUT_WIDTH 40 /* magic number is magic */
-#define MAX_LEN_2_LITERALS_PER_CUT 3
-
-static
-bool checkValidNetflowLits(NGHolder &h, const vector<u64a> &scores,
- const map<NFAEdge, set<ue2_literal>> &cut_lits,
- u32 min_allowed_length) {
- DEBUG_PRINTF("cut width %zu; min allowed %u\n", cut_lits.size(),
- min_allowed_length);
- if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) {
- return false;
- }
-
- u32 len_2_count = 0;
-
- for (const auto &cut : cut_lits) {
- if (scores[h[cut.first].index] >= NO_LITERAL_AT_EDGE_SCORE) {
- DEBUG_PRINTF("cut uses a forbidden edge\n");
- return false;
- }
-
- if (min_len(cut.second) < min_allowed_length) {
- DEBUG_PRINTF("cut uses a bad literal\n");
- return false;
- }
-
- for (const auto &lit : cut.second) {
- if (lit.length() == 2) {
- len_2_count++;
- }
- }
- }
-
- if (len_2_count > MAX_LEN_2_LITERALS_PER_CUT) {
- return false;
- }
-
- return true;
-}
-
-static
-void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
- const vector<RoseInEdge> &to_cut,
- const vector<NFAEdge> &cut,
- const map<NFAEdge, set<ue2_literal>> &cut_lits) {
- DEBUG_PRINTF("splitting %s (%zu vertices)\n", to_string(h.kind).c_str(),
- num_vertices(h));
-
- /* create literal vertices and connect preds */
- unordered_set<RoseInVertex> done_sources;
- map<RoseInVertex, vector<pair<RoseInVertex, NFAVertex>>> verts_by_source;
- for (const RoseInEdge &ve : to_cut) {
- assert(&h == &*vg[ve].graph);
- RoseInVertex src = source(ve, vg);
- if (!done_sources.insert(src).second) {
- continue; /* already processed */
- }
-
- /* iterate over cut for determinism */
- for (const auto &e : cut) {
- NFAVertex prev_v = source(e, h);
- NFAVertex pivot = target(e, h);
-
- DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index);
- unordered_map<NFAVertex, NFAVertex> temp_map;
- shared_ptr<NGHolder> new_lhs = make_shared<NGHolder>();
- splitLHS(h, pivot, new_lhs.get(), &temp_map);
-
- /* want to cut off paths to pivot from things other than the pivot -
- * makes a more svelte graphy */
- clear_in_edges(temp_map[pivot], *new_lhs);
- NFAEdge pivot_edge = add_edge(temp_map[prev_v], temp_map[pivot],
- *new_lhs);
- if (is_triggered(h) && prev_v == h.start) {
- (*new_lhs)[pivot_edge].tops.insert(DEFAULT_TOP);
- }
-
- pruneUseless(*new_lhs, false);
- renumber_vertices(*new_lhs);
- renumber_edges(*new_lhs);
-
- DEBUG_PRINTF(" into lhs %s (%zu vertices)\n",
- to_string(new_lhs->kind).c_str(),
- num_vertices(*new_lhs));
-
- assert(hasCorrectlyNumberedVertices(*new_lhs));
- assert(hasCorrectlyNumberedEdges(*new_lhs));
- assert(isCorrectlyTopped(*new_lhs));
-
- const set<ue2_literal> &lits = cut_lits.at(e);
- for (const auto &lit : lits) {
- if (!can_match(*new_lhs, lit, is_triggered(h))) {
- continue;
- }
-
- RoseInVertex v
- = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
-
- /* if this is a prefix/infix an edge directly to accept should
- * represent a false path as we have poisoned vertices covered
- * by the literals. */
- if (generates_callbacks(h)) {
- if (edge(pivot, h.accept, h).second) {
- DEBUG_PRINTF("adding acceptEod\n");
- /* literal has a direct connection to accept */
- const flat_set<ReportID> &reports = h[pivot].reports;
- auto tt = add_vertex(
- RoseInVertexProps::makeAccept(reports), vg);
- add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
- }
-
- if (edge(pivot, h.acceptEod, h).second) {
- assert(generates_callbacks(h));
- DEBUG_PRINTF("adding acceptEod\n");
- /* literal has a direct connection to accept */
- const flat_set<ReportID> &reports = h[pivot].reports;
- auto tt = add_vertex(
- RoseInVertexProps::makeAcceptEod(reports), vg);
- add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
- }
- }
-
- add_edge(src, v, RoseInEdgeProps(new_lhs, 0), vg);
- verts_by_source[src].push_back({v, pivot});
- }
- }
- }
-
- /* wire the literal vertices up to successors */
- map<vector<NFAVertex>, shared_ptr<NGHolder> > done_rhs;
- for (const RoseInEdge &ve : to_cut) {
- RoseInVertex src = source(ve, vg);
- RoseInVertex dest = target(ve, vg);
-
- /* iterate over cut for determinism */
- for (const auto &elem : verts_by_source[src]) {
- NFAVertex pivot = elem.second;
- RoseInVertex v = elem.first;
-
- vector<NFAVertex> adj;
- insert(&adj, adj.end(), adjacent_vertices(pivot, h));
- /* we can ignore presence of accept, accepteod in adj as it is best
- effort */
-
- if (!contains(done_rhs, adj)) {
- unordered_map<NFAVertex, NFAVertex> temp_map;
- shared_ptr<NGHolder> new_rhs = make_shared<NGHolder>();
- splitRHS(h, adj, new_rhs.get(), &temp_map);
- remove_edge(new_rhs->start, new_rhs->accept, *new_rhs);
- remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs);
- renumber_edges(*new_rhs);
- DEBUG_PRINTF(" into rhs %s (%zu vertices)\n",
- to_string(new_rhs->kind).c_str(),
- num_vertices(*new_rhs));
- done_rhs.emplace(adj, new_rhs);
- assert(isCorrectlyTopped(*new_rhs));
- }
-
- assert(done_rhs[adj].get());
- shared_ptr<NGHolder> new_rhs = done_rhs[adj];
-
- assert(hasCorrectlyNumberedVertices(*new_rhs));
- assert(hasCorrectlyNumberedEdges(*new_rhs));
- assert(isCorrectlyTopped(*new_rhs));
-
- if (vg[dest].type == RIV_LITERAL
- && !can_match(*new_rhs, vg[dest].s, true)) {
- continue;
- }
-
- if (out_degree(new_rhs->start, *new_rhs) != 1) {
- add_edge(v, dest, RoseInEdgeProps(new_rhs, 0), vg);
- }
- }
-
- remove_edge(ve, vg);
- }
-}
-
-static
-bool doNetflowCut(NGHolder &h,
- const vector<NFAVertexDepth> *depths,
- RoseInGraph &vg,
- const vector<RoseInEdge> &ee, bool for_prefix,
- const Grey &grey, u32 min_allowed_length = 0U) {
- ENSURE_AT_LEAST(&min_allowed_length, grey.minRoseNetflowLiteralLength);
-
- DEBUG_PRINTF("doing netflow cut\n");
- /* TODO: we should really get literals/scores from the full graph as this
- * allows us to overlap with previous cuts. */
- assert(!ee.empty());
- assert(&h == &*vg[ee.front()].graph);
- assert(!for_prefix || depths);
-
- if (num_edges(h) > grey.maxRoseNetflowEdges) {
- /* We have a limit on this because scoring edges and running netflow
- * gets very slow for big graphs. */
- DEBUG_PRINTF("too many edges, skipping netflow cut\n");
- return false;
- }
-
- assert(hasCorrectlyNumberedVertices(h));
- assert(hasCorrectlyNumberedEdges(h));
-
- auto known_bad = poisonEdges(h, depths, vg, ee, for_prefix, grey);
-
- /* Step 1: Get scores for all edges */
- vector<u64a> scores = scoreEdges(h, known_bad); /* scores by edge_index */
-
- /* Step 2: Find cutset based on scores */
- vector<NFAEdge> cut = findMinCut(h, scores);
-
- /* Step 3: Get literals corresponding to cut edges */
- map<NFAEdge, set<ue2_literal>> cut_lits;
- for (const auto &e : cut) {
- set<ue2_literal> lits = getLiteralSet(h, e);
- sanitizeAndCompressAndScore(lits);
-
- cut_lits[e] = lits;
- }
-
- /* if literals are underlength bail or if it involves a forbidden edge*/
- if (!checkValidNetflowLits(h, scores, cut_lits, min_allowed_length)) {
- return false;
- }
- DEBUG_PRINTF("splitting\n");
-
- /* Step 4: Split graph based on cuts */
- splitEdgesByCut(h, vg, ee, cut, cut_lits);
-
- return true;
-}
-
-static
-bool deanchorIfNeeded(NGHolder &g) {
- DEBUG_PRINTF("hi\n");
- if (proper_out_degree(g.startDs, g)) {
- return false;
- }
-
- /* look for a non-special dot with a loop following start */
- set<NFAVertex> succ_g;
- insert(&succ_g, adjacent_vertices(g.start, g));
- succ_g.erase(g.startDs);
-
- for (auto v : adjacent_vertices_range(g.start, g)) {
- DEBUG_PRINTF("inspecting cand %zu || = %zu\n", g[v].index,
- g[v].char_reach.count());
-
- if (v == g.startDs || !g[v].char_reach.all()) {
- continue;
- }
-
- set<NFAVertex> succ_v;
- insert(&succ_v, adjacent_vertices(v, g));
-
- if (succ_v == succ_g) {
- DEBUG_PRINTF("found ^.*\n");
- for (auto succ : adjacent_vertices_range(g.start, g)) {
- if (succ == g.startDs) {
- continue;
- }
- add_edge(g.startDs, succ, g);
- }
- clear_vertex(v, g);
- remove_vertex(v, g);
- renumber_vertices(g);
- return true;
- }
-
- if (succ_g.size() == 1 && hasSelfLoop(v, g)) {
- DEBUG_PRINTF("found ^.+\n");
- add_edge(g.startDs, v, g);
- remove_edge(v, v, g);
- return true;
- }
- }
-
- return false;
-}
-
-static
-RoseInGraph populateTrivialGraph(const NGHolder &h) {
- RoseInGraph g;
- shared_ptr<NGHolder> root_g = cloneHolder(h);
- bool orig_anch = isAnchored(*root_g);
- orig_anch |= deanchorIfNeeded(*root_g);
-
- DEBUG_PRINTF("orig_anch %d\n", (int)orig_anch);
-
- auto start = add_vertex(RoseInVertexProps::makeStart(orig_anch), g);
- auto accept = add_vertex(RoseInVertexProps::makeAccept(set<ReportID>()), g);
-
- add_edge(start, accept, RoseInEdgeProps(root_g, 0), g);
-
- return g;
-}
-
-static
-void avoidOutfixes(RoseInGraph &vg, bool last_chance,
- const CompileContext &cc) {
- STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n");
- assert(num_vertices(vg) == 2);
- assert(num_edges(vg) == 1);
-
- RoseInEdge e = *edges(vg).first;
-
- NGHolder &h = *vg[e].graph;
- assert(isCorrectlyTopped(h));
-
- renumber_vertices(h);
- renumber_edges(h);
-
- unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, {e}, cc);
-
- if (split && splitRoseEdge(h, vg, {e}, *split)) {
- DEBUG_PRINTF("split on simple literal\n");
- return;
- }
-
- if (last_chance) {
- /* look for a prefix split as it allows us to accept very weak anchored
- * literals. */
- auto depths = calcDepths(h);
-
- split = findBestPrefixSplit(h, depths, vg, {e}, last_chance, cc);
-
- if (split && splitRoseEdge(h, vg, {e}, *split)) {
- DEBUG_PRINTF("split on simple literal\n");
- return;
- }
- }
-
- doNetflowCut(h, nullptr, vg, {e}, false, cc.grey);
-}
-
-static
-void removeRedundantPrefixes(RoseInGraph &g) {
- STAGE_DEBUG_PRINTF("REMOVING REDUNDANT PREFIXES\n");
-
- for (const RoseInEdge &e : edges_range(g)) {
- RoseInVertex s = source(e, g);
- RoseInVertex t = target(e, g);
-
- if (g[s].type != RIV_START || g[t].type != RIV_LITERAL) {
- continue;
- }
-
- if (!g[e].graph) {
- continue;
- }
-
- assert(!g[t].delay);
- const ue2_literal &lit = g[t].s;
-
- if (!literalIsWholeGraph(*g[e].graph, lit)) {
- DEBUG_PRINTF("not whole graph\n");
- continue;
- }
-
- if (!isFloating(*g[e].graph)) {
- DEBUG_PRINTF("not floating\n");
- continue;
- }
- g[e].graph.reset();
- }
-}
-
-static
-u32 maxDelay(const CompileContext &cc) {
- if (!cc.streaming) {
- return MO_INVALID_IDX;
- }
- return cc.grey.maxHistoryAvailable;
-}
-
-static
-void removeRedundantLiteralsFromPrefixes(RoseInGraph &g,
- const CompileContext &cc) {
- STAGE_DEBUG_PRINTF("REMOVING LITERALS FROM PREFIXES\n");
-
- vector<RoseInEdge> to_anchor;
- for (const RoseInEdge &e : edges_range(g)) {
- RoseInVertex s = source(e, g);
- RoseInVertex t = target(e, g);
-
- if (g[s].type != RIV_START && g[s].type != RIV_ANCHORED_START) {
- continue;
- }
-
- if (g[t].type != RIV_LITERAL) {
- continue;
- }
-
- if (!g[e].graph) {
- continue;
- }
-
- if (g[e].graph_lag) {
- /* already removed redundant parts of literals */
- continue;
- }
-
- if (g[e].dfa) {
- /* if we removed any more states, we would need to rebuild the
- * the dfa which can be time consuming. */
- continue;
- }
-
- assert(!g[t].delay);
- const ue2_literal &lit = g[t].s;
-
- DEBUG_PRINTF("removing states for literal: %s\n",
- dumpString(lit).c_str());
-
- unique_ptr<NGHolder> h = cloneHolder(*g[e].graph);
- const u32 max_delay = maxDelay(cc);
-
- u32 delay = removeTrailingLiteralStates(*h, lit, max_delay,
- false /* can't overhang start */);
-
- DEBUG_PRINTF("got delay %u (max allowed %u)\n", delay, max_delay);
-
- if (edge(h->startDs, h->accept, *h).second) {
- /* we should have delay == lit.length(), but in really complex
- * cases we may fail to identify that we can remove the whole
- * graph. Regardless, the fact that sds is wired to accept means the
- * graph serves no purpose. */
- DEBUG_PRINTF("whole graph\n");
- g[e].graph.reset();
- continue;
- }
-
- if (delay == lit.length() && edge(h->start, h->accept, *h).second
- && num_vertices(*h) == N_SPECIALS) {
- to_anchor.push_back(e);
- continue;
- }
-
- /* if we got here we should still have an interesting graph */
- assert(delay == max_delay || num_vertices(*h) > N_SPECIALS);
-
- if (delay && delay != MO_INVALID_IDX) {
- DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, h.get());
-
- g[e].graph = move(h);
- g[e].graph_lag = delay;
- }
- }
-
- if (!to_anchor.empty()) {
- RoseInVertex anch = add_vertex(RoseInVertexProps::makeStart(true), g);
-
- for (RoseInEdge e : to_anchor) {
- DEBUG_PRINTF("rehoming to anchor\n");
- RoseInVertex v = target(e, g);
- add_edge(anch, v, g);
- remove_edge(e, g);
- }
- }
-}
-
-static
-bool isStarCliche(const NGHolder &g) {
- DEBUG_PRINTF("checking graph with %zu vertices\n", num_vertices(g));
-
- bool nonspecials_seen = false;
-
- for (auto v : vertices_range(g)) {
- if (is_special(v, g)) {
- continue;
- }
-
- if (nonspecials_seen) {
- return false;
- }
- nonspecials_seen = true;
-
- if (!g[v].char_reach.all()) {
- return false;
- }
-
- if (!hasSelfLoop(v, g)) {
- return false;
- }
- if (!edge(v, g.accept, g).second) {
- return false;
- }
- }
-
- if (!nonspecials_seen) {
- return false;
- }
-
- if (!edge(g.start, g.accept, g).second) {
- return false;
- }
-
- return true;
-}
-
-static
-void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig,
- const vector<RoseInEdge> &ee,
- const CompileContext &cc) {
- /* TODO: This could be better by not creating a separate graph for each
- * successor literal. This would require using distinct report ids and also
- * taking into account overlap of successor literals. */
-
- set<ue2_literal> preds;
- set<ue2_literal> succs;
- for (const RoseInEdge &e : ee) {
- RoseInVertex u = source(e, ig);
- assert(ig[u].type == RIV_LITERAL);
- assert(!ig[u].delay);
- preds.insert(ig[u].s);
-
- RoseInVertex v = target(e, ig);
- assert(ig[v].type == RIV_LITERAL);
- assert(!ig[v].delay);
- succs.insert(ig[v].s);
-
- if (ig[e].graph_lag) {
- /* already removed redundant parts of literals */
- return;
- }
-
- assert(!ig[e].dfa);
- }
-
- map<ue2_literal, pair<shared_ptr<NGHolder>, u32> > graphs; /* + delay */
-
- for (const ue2_literal &right : succs) {
- size_t max_overlap = 0;
- for (const ue2_literal &left : preds) {
- size_t overlap = maxOverlap(left, right, 0);
- ENSURE_AT_LEAST(&max_overlap, overlap);
- }
-
- u32 max_allowed_delay = right.length() - max_overlap;
-
- if (cc.streaming) {
- LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable);
- }
-
- if (!max_allowed_delay) {
- continue;
- }
-
- shared_ptr<NGHolder> h_new = cloneHolder(h);
-
- u32 delay = removeTrailingLiteralStates(*h_new, right,
- max_allowed_delay);
-
- if (delay == MO_INVALID_IDX) {
- /* successor literal could not match infix -> ignore false path */
- assert(0);
- continue;
- }
-
- if (!delay) {
- /* unable to trim graph --> no point swapping to new holder */
- continue;
- }
-
- assert(isCorrectlyTopped(*h_new));
- graphs[right] = make_pair(h_new, delay);
- }
-
- for (const RoseInEdge &e : ee) {
- RoseInVertex v = target(e, ig);
- const ue2_literal &succ = ig[v].s;
- if (!contains(graphs, succ)) {
- continue;
- }
-
- ig[e].graph = graphs[succ].first;
- ig[e].graph_lag = graphs[succ].second;
-
- if (isStarCliche(*ig[e].graph)) {
- DEBUG_PRINTF("is a X star!\n");
- ig[e].graph.reset();
- ig[e].graph_lag = 0;
- }
- }
-}
-
-static
-void removeRedundantLiteralsFromInfixes(RoseInGraph &g,
- const CompileContext &cc) {
- insertion_ordered_map<NGHolder *, vector<RoseInEdge>> infixes;
-
- for (const RoseInEdge &e : edges_range(g)) {
- RoseInVertex s = source(e, g);
- RoseInVertex t = target(e, g);
-
- if (g[s].type != RIV_LITERAL || g[t].type != RIV_LITERAL) {
- continue;
- }
-
- if (!g[e].graph) {
- continue;
- }
-
- assert(!g[t].delay);
- if (g[e].dfa) {
- /* if we removed any more states, we would need to rebuild the
- * the dfa which can be time consuming. */
- continue;
- }
-
- NGHolder *h = g[e].graph.get();
- infixes[h].push_back(e);
- }
-
- for (const auto &m : infixes) {
- NGHolder *h = m.first;
- const auto &edges = m.second;
- removeRedundantLiteralsFromInfix(*h, g, edges, cc);
- }
-}
-
-static
-void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) {
- removeRedundantLiteralsFromPrefixes(g, cc);
- removeRedundantLiteralsFromInfixes(g, cc);
-}
-
-static
-RoseInVertex getStart(RoseInGraph &vg) {
- for (RoseInVertex v : vertices_range(vg)) {
- if (vg[v].type == RIV_START || vg[v].type == RIV_ANCHORED_START) {
- return v;
- }
- }
- assert(0);
- return RoseInGraph::null_vertex();
-}
-
-/**
- * Finds the initial accept vertex created to which suffix/outfixes are
- * attached.
- */
-static
-RoseInVertex getPrimaryAccept(RoseInGraph &vg) {
- for (RoseInVertex v : vertices_range(vg)) {
- if (vg[v].type == RIV_ACCEPT && vg[v].reports.empty()) {
- return v;
- }
- }
- assert(0);
- return RoseInGraph::null_vertex();
-}
-
-static
-bool willBeTransient(const depth &max_depth, const CompileContext &cc) {
- if (!cc.streaming) {
- return max_depth <= depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH);
- } else {
- return max_depth <= depth(cc.grey.maxHistoryAvailable + 1);
- }
-}
-
-static
-bool willBeAnchoredTable(const depth &max_depth, const Grey &grey) {
- return max_depth <= depth(grey.maxAnchoredRegion);
-}
-
-static
-unique_ptr<NGHolder> make_chain(u32 count) {
- assert(count);
-
- auto rv = std::make_unique<NGHolder>(NFA_INFIX);
-
- NGHolder &h = *rv;
-
- NFAVertex u = h.start;
- for (u32 i = 0; i < count; i++) {
- NFAVertex v = add_vertex(h);
- h[v].char_reach = CharReach::dot();
- add_edge(u, v, h);
- u = v;
- }
- h[u].reports.insert(0);
- add_edge(u, h.accept, h);
-
- setTops(h);
-
- return rv;
-}
-
-#define SHORT_TRIGGER_LEN 16
-
-static
-bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg,
- const vector<RoseInEdge> &ee,
- const CompileContext &cc) {
- /* check max width and literal lengths to see if possible */
- size_t min_lit = (size_t)~0ULL;
- for (const RoseInEdge &e : ee) {
- RoseInVertex v = target(e, vg);
- LIMIT_TO_AT_MOST(&min_lit, vg[v].s.length());
- }
-
- if (min_lit <= SHORT_TRIGGER_LEN || min_lit >= UINT_MAX) {
- return false;
- }
-
- depth max_width = findMaxWidth(h);
-
- u32 delta = min_lit - SHORT_TRIGGER_LEN;
-
- if (!willBeTransient(max_width - depth(delta), cc)
- && !willBeAnchoredTable(max_width - depth(delta), cc.grey)) {
- return false;
- }
-
- DEBUG_PRINTF("candidate for splitting long literal (len %zu)\n", min_lit);
- DEBUG_PRINTF("delta = %u\n", delta);
-
- /* try split */
- map<RoseInVertex, shared_ptr<NGHolder> > graphs;
- for (const RoseInEdge &e : ee) {
- RoseInVertex v = target(e, vg);
-
- shared_ptr<NGHolder> h_new = cloneHolder(h);
-
- u32 delay = removeTrailingLiteralStates(*h_new, vg[v].s, delta);
-
- DEBUG_PRINTF("delay %u\n", delay);
-
- if (delay != delta) {
- DEBUG_PRINTF("unable to trim literal\n");
- return false;
- }
-
- if (in_degree(v, vg) != 1) {
- DEBUG_PRINTF("complicated\n");
- return false;
- }
-
- DEBUG_PRINTF("new mw = %u\n", (u32)findMaxWidth(*h_new));
- assert(willBeTransient(findMaxWidth(*h_new), cc)
- || willBeAnchoredTable(findMaxWidth(*h_new), cc.grey));
-
- assert(isCorrectlyTopped(*h_new));
- graphs[v] = h_new;
- }
-
- /* add .{repeats} from prefixes to long literals */
- for (const RoseInEdge &e : ee) {
- RoseInVertex s = source(e, vg);
- RoseInVertex t = target(e, vg);
-
- remove_edge(e, vg);
- const ue2_literal &orig_lit = vg[t].s;
-
- ue2_literal lit(orig_lit.begin(), orig_lit.end() - delta);
-
- ue2_literal lit2(orig_lit.end() - delta, orig_lit.end());
-
- assert(lit.length() + delta == orig_lit.length());
-
- vg[t].s = lit2;
-
- RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
- add_edge(s, v, RoseInEdgeProps(graphs[t], 0), vg);
- add_edge(v, t, RoseInEdgeProps(make_chain(delta), 0), vg);
- }
-
- DEBUG_PRINTF("success\n");
- /* TODO: alter split point to avoid pathological splits */
- return true;
-}
-
-static
-void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit,
- u32 delay, const vector<NFAVertex> &preds) {
- assert(delay <= lit.length());
- assert(isCorrectlyTopped(g));
- DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay);
-
- NFAVertex prev = g.accept;
- auto it = lit.rbegin();
- while (delay--) {
- NFAVertex curr = add_vertex(g);
- assert(it != lit.rend());
- g[curr].char_reach = *it;
- add_edge(curr, prev, g);
- ++it;
- prev = curr;
- }
-
- for (auto v : preds) {
- NFAEdge e = add_edge_if_not_present(v, prev, g);
- if (v == g.start && is_triggered(g)) {
- g[e].tops.insert(DEFAULT_TOP);
- }
- }
-
- // Every predecessor of accept must have a report.
- set_report(g, 0);
-
- renumber_vertices(g);
- renumber_edges(g);
- assert(allMatchStatesHaveReports(g));
- assert(isCorrectlyTopped(g));
-}
-
-static
-void restoreTrailingLiteralStates(NGHolder &g,
- const vector<pair<ue2_literal, u32>> &lits) {
- vector<NFAVertex> preds;
- insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g));
- clear_in_edges(g.accept, g);
-
- for (auto v : preds) {
- g[v].reports.clear(); /* clear report from old accepts */
- }
-
- for (const auto &p : lits) {
- const ue2_literal &lit = p.first;
- u32 delay = p.second;
-
- restoreTrailingLiteralStates(g, lit, delay, preds);
- }
-}
-
-static
-bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
- const CompileContext &cc) {
- DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n", &h,
- num_vertices(h));
- assert(isCorrectlyTopped(h));
-
- renumber_vertices(h);
- renumber_edges(h);
-
- auto depths = calcDepths(h);
-
- /* If the reason the prefix is not transient is due to a very long literal
- * following, we can make it transient by restricting ourselves to using
- * just the head of the literal. */
- if (makeTransientFromLongLiteral(h, vg, ee, cc)) {
- return true;
- }
-
- auto split = findBestPrefixSplit(h, depths, vg, ee, false, cc);
-
- if (split && (split->creates_transient || split->creates_anchored)
- && splitRoseEdge(h, vg, ee, *split)) {
- DEBUG_PRINTF("split on simple literal\n");
- return true;
- }
-
- /* large back edges may prevent us identifing anchored or transient cases
- * properly - use a simple walk instead */
-
- if (doNetflowCut(h, &depths, vg, ee, true, cc.grey)) {
- return true;
- }
-
- if (split && splitRoseEdge(h, vg, ee, *split)) {
- /* use the simple split even though it doesn't create a transient
- * prefix */
- DEBUG_PRINTF("split on simple literal\n");
- return true;
- }
-
- /* look for netflow cuts which don't produce good prefixes */
- if (doNetflowCut(h, &depths, vg, ee, false, cc.grey)) {
- return true;
- }
-
- if (ee.size() > 1) {
- DEBUG_PRINTF("split the prefix apart based on succ literals\n");
- unordered_map<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> >,
+ }
+ }
+
+ auto cmp = LitComparator(g, seeking_anchored, seeking_transient,
+ last_chance);
+
+ unique_ptr<VertLitInfo> best = move(lits.back());
+ lits.pop_back();
+ while (!lits.empty()) {
+ if (cmp(best, lits.back())) {
+ best = move(lits.back());
+ }
+ lits.pop_back();
+ }
+
+ DEBUG_PRINTF("best is '%s' %zu a%d t%d\n",
+ dumpString(*best->lit.begin()).c_str(),
+ g[best->vv.front()].index,
+ depths ? (int)createsAnchoredLHS(g, best->vv, *depths, cc.grey) : 0,
+ depths ? (int)createsTransientLHS(g, best->vv, *depths, cc.grey) : 0);
+
+ return best;
+}
+
+static
+void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ,
+ bool overhang_ok, flat_set<NFAEdge> &bad) {
+ DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n",
+ num_vertices(h), succ.length());
+
+ using EdgeSet = boost::dynamic_bitset<>;
+
+ const size_t edge_count = num_edges(h);
+ EdgeSet bad_edges(edge_count);
+
+ unordered_map<NFAVertex, EdgeSet> curr;
+ for (const auto &e : in_edges_range(h.accept, h)) {
+ auto &path_set = curr[source(e, h)];
+ if (path_set.empty()) {
+ path_set.resize(edge_count);
+ }
+ path_set.set(h[e].index);
+ }
+
+ unordered_map<NFAVertex, EdgeSet> next;
+ for (auto it = succ.rbegin(); it != succ.rend(); ++it) {
+ for (const auto &path : curr) {
+ NFAVertex u = path.first;
+ const auto &path_set = path.second;
+ if (u == h.start && overhang_ok) {
+ DEBUG_PRINTF("poisoning early %zu [overhang]\n",
+ path_set.count());
+ bad_edges |= path_set;
+ continue;
+ }
+ if (overlaps(h[u].char_reach, *it)) {
+ for (const auto &e : in_edges_range(u, h)) {
+ auto &new_path_set = next[source(e, h)];
+ if (new_path_set.empty()) {
+ new_path_set.resize(edge_count);
+ }
+ new_path_set |= path_set;
+ new_path_set.set(h[e].index);
+ }
+ }
+ }
+ DEBUG_PRINTF("succ char matches at %zu paths\n", next.size());
+ assert(overhang_ok || !curr.empty());
+ swap(curr, next);
+ next.clear();
+ }
+
+ assert(overhang_ok || !curr.empty());
+ for (const auto &path : curr) {
+ bad_edges |= path.second;
+ DEBUG_PRINTF("poisoning %zu vertices\n", path.second.count());
+ }
+
+ for (const auto &e : edges_range(h)) {
+ if (bad_edges.test(h[e].index)) {
+ bad.insert(e);
+ }
+ }
+}
+
+static
+void poisonForGoodPrefix(const NGHolder &h,
+ const vector<NFAVertexDepth> &depths,
+ flat_set<NFAEdge> &bad, const Grey &grey) {
+ for (const auto &v : vertices_range(h)) {
+ if (!createsAnchoredLHS(h, {v}, depths, grey)
+ && !createsTransientLHS(h, {v}, depths, grey)) {
+ insert(&bad, in_edges_range(v, h));
+ }
+ }
+}
+
+static UNUSED
+bool is_any_accept_type(RoseInVertexType t) {
+ return t == RIV_ACCEPT || t == RIV_ACCEPT_EOD;
+}
+
+static
+flat_set<NFAEdge> poisonEdges(const NGHolder &h,
+ const vector<NFAVertexDepth> *depths,
+ const RoseInGraph &vg, const vector<RoseInEdge> &ee,
+ bool for_prefix, const Grey &grey) {
+ DEBUG_PRINTF("poisoning edges %zu successor edges\n", ee.size());
+
+ /* poison edges covered by successor literal */
+
+ set<pair<ue2_literal, bool> > succs;
+ for (const RoseInEdge &ve : ee) {
+ if (vg[target(ve, vg)].type != RIV_LITERAL) {
+ /* nothing to poison in suffixes/outfixes */
+ assert(generates_callbacks(h));
+ assert(is_any_accept_type(vg[target(ve, vg)].type));
+ continue;
+ }
+ succs.insert({vg[target(ve, vg)].s,
+ vg[source(ve, vg)].type == RIV_LITERAL});
+
+ }
+
+ DEBUG_PRINTF("poisoning edges %zu successor literals\n", succs.size());
+
+ flat_set<NFAEdge> bad;
+ for (const auto &p : succs) {
+ poisonFromSuccessor(h, p.first, p.second, bad);
+ }
+
+ /* poison edges which don't significantly improve a prefix */
+
+ if (for_prefix) {
+ poisonForGoodPrefix(h, *depths, bad, grey);
+ }
+
+ return bad;
+}
+
+static
+set<NFAVertex> poisonVertices(const NGHolder &h, const RoseInGraph &vg,
+ const vector<RoseInEdge> &ee, const Grey &grey) {
+ flat_set<NFAEdge> bad_edges = poisonEdges(h, nullptr, vg, ee, false, grey);
+ set<NFAVertex> bad_vertices;
+ for (const NFAEdge &e : bad_edges) {
+ bad_vertices.insert(target(e, h));
+ DEBUG_PRINTF("bad: %zu->%zu\n", h[source(e, h)].index,
+ h[target(e, h)].index);
+ }
+
+ return bad_vertices;
+}
+
+static
+unique_ptr<VertLitInfo> findBestNormalSplit(const NGHolder &g,
+ const RoseInGraph &vg,
+ const vector<RoseInEdge> &ee,
+ const CompileContext &cc) {
+ assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
+ set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
+
+ return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength,
+ nullptr, &bad_vertices, false, cc);
+}
+
+static
+unique_ptr<VertLitInfo> findBestLastChanceSplit(const NGHolder &g,
+ const RoseInGraph &vg,
+ const vector<RoseInEdge> &ee,
+ const CompileContext &cc) {
+ assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX);
+ set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
+
+ return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength,
+ nullptr, &bad_vertices, true, cc);
+}
+
+static
+unique_ptr<VertLitInfo> findSimplePrefixSplit(const NGHolder &g,
+ const CompileContext &cc) {
+ DEBUG_PRINTF("looking for simple prefix split\n");
+ bool anchored = !proper_out_degree(g.startDs, g);
+ NFAVertex u = anchored ? g.start : g.startDs;
+
+ if (out_degree(u, g) != 2) { /* startDs + succ */
+ return nullptr;
+ }
+
+ NFAVertex v = NGHolder::null_vertex();
+ for (NFAVertex t : adjacent_vertices_range(u, g)) {
+ if (t != g.startDs) {
+ assert(!v);
+ v = t;
+ }
+ }
+ assert(v);
+
+ if (!anchored) {
+ if (out_degree(g.start, g) > 2) {
+ return nullptr;
+ }
+ if (out_degree(g.start, g) == 2 && !edge(g.start, v, g).second) {
+ return nullptr;
+ }
+ }
+
+ NFAVertex best_v = NGHolder::null_vertex();
+ ue2_literal best_lit;
+
+ u32 limit = cc.grey.maxHistoryAvailable;
+ if (anchored) {
+ LIMIT_TO_AT_MOST(&limit, cc.grey.maxAnchoredRegion);
+ }
+
+ ue2_literal curr_lit;
+ for (u32 i = 0; i < limit; i++) {
+ const auto &v_cr = g[v].char_reach;
+ if (v_cr.count() == 1 || v_cr.isCaselessChar()) {
+ curr_lit.push_back(v_cr.find_first(), v_cr.isCaselessChar());
+ } else {
+ curr_lit.clear();
+ }
+
+ if (curr_lit.length() > best_lit.length()) {
+ best_lit = curr_lit;
+ best_v = v;
+ }
+
+ if (out_degree(v, g) != 1) {
+ break;
+ }
+ v = *adjacent_vertices(v, g).first;
+ }
+
+ if (best_lit.length() < cc.grey.minRoseLiteralLength) {
+ return nullptr;
+ }
+
+ set<ue2_literal> best_lit_set({best_lit});
+ if (bad_mixed_sensitivity(best_lit)) {
+ sanitizeAndCompressAndScore(best_lit_set);
+ }
+
+ return ue2::make_unique<VertLitInfo>(best_v, best_lit_set, anchored, true);
+}
+
+static
+unique_ptr<VertLitInfo> findBestPrefixSplit(const NGHolder &g,
+ const vector<NFAVertexDepth> &depths,
+ const RoseInGraph &vg,
+ const vector<RoseInEdge> &ee,
+ bool last_chance,
+ const CompileContext &cc) {
+ assert(g.kind == NFA_PREFIX || g.kind == NFA_OUTFIX);
+ set<NFAVertex> bad_vertices = poisonVertices(g, vg, ee, cc.grey);
+ auto rv = findBestSplit(g, &depths, true, cc.grey.minRoseLiteralLength,
+ nullptr, &bad_vertices, last_chance, cc);
+
+ /* large back edges may prevent us identifying anchored or transient cases
+ * properly - use a simple walk instead */
+ if (!rv || !(rv->creates_transient || rv->creates_anchored)) {
+ auto rv2 = findSimplePrefixSplit(g, cc);
+ if (rv2) {
+ return rv2;
+ }
+ }
+
+ return rv;
+}
+
+static
+unique_ptr<VertLitInfo> findBestCleanSplit(const NGHolder &g,
+ const CompileContext &cc) {
+ assert(g.kind != NFA_PREFIX);
+ set<NFAVertex> cleanSplits;
+ for (NFAVertex v : vertices_range(g)) {
+ if (!g[v].char_reach.all() || !edge(v, v, g).second) {
+ continue;
+ }
+ insert(&cleanSplits, inv_adjacent_vertices(v, g));
+ cleanSplits.erase(v);
+ }
+ cleanSplits.erase(g.start);
+ if (cleanSplits.empty()) {
+ return nullptr;
+ }
+ return findBestSplit(g, nullptr, false, cc.grey.violetEarlyCleanLiteralLen,
+ &cleanSplits, nullptr, false, cc);
+}
+
+static
+bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) {
+ set<NFAVertex> curr, next;
+ curr.insert(g.accept);
+
+ for (auto it = lit.rbegin(); it != lit.rend(); ++it) {
+ next.clear();
+
+ for (auto v : curr) {
+ for (auto u : inv_adjacent_vertices_range(v, g)) {
+ if (u == g.start) {
+ if (overhang_ok) {
+ DEBUG_PRINTF("bail\n");
+ return true;
+ } else {
+ continue; /* it is not possible for a lhs literal to
+ * overhang the start */
+ }
+ }
+
+ const CharReach &cr = g[u].char_reach;
+ if (!overlaps(*it, cr)) {
+ continue;
+ }
+
+ next.insert(u);
+ }
+ }
+
+ curr.swap(next);
+ }
+
+ return !curr.empty();
+}
+
+static
+bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg,
+ const vector<RoseInEdge> &ee, const VertLitInfo &split) {
+ const vector<NFAVertex> &splitters = split.vv;
+ assert(!splitters.empty());
+
+ shared_ptr<NGHolder> lhs = make_shared<NGHolder>();
+ shared_ptr<NGHolder> rhs = make_shared<NGHolder>();
+
+ unordered_map<NFAVertex, NFAVertex> lhs_map;
+ unordered_map<NFAVertex, NFAVertex> rhs_map;
+
+ splitGraph(base_graph, splitters, lhs.get(), &lhs_map, rhs.get(), &rhs_map);
+ DEBUG_PRINTF("split %s:%zu into %s:%zu + %s:%zu\n",
+ to_string(base_graph.kind).c_str(), num_vertices(base_graph),
+ to_string(lhs->kind).c_str(), num_vertices(*lhs),
+ to_string(rhs->kind).c_str(), num_vertices(*rhs));
+
+ bool suffix = generates_callbacks(base_graph);
+
+ if (is_triggered(base_graph)) {
+ /* if we are already guarded, check if the split reduces the size of
+ * the problem before continuing with the split */
+ if (num_vertices(*lhs) >= num_vertices(base_graph)
+ && !(suffix && isVacuous(*rhs))) {
+ DEBUG_PRINTF("split's lhs is no smaller\n");
+ return false;
+ }
+
+ if (num_vertices(*rhs) >= num_vertices(base_graph)) {
+ DEBUG_PRINTF("split's rhs is no smaller\n");
+ return false;
+ }
+ }
+
+ bool do_accept = false;
+ bool do_accept_eod = false;
+ assert(rhs);
+ if (isVacuous(*rhs) && suffix) {
+ if (edge(rhs->start, rhs->accept, *rhs).second) {
+ DEBUG_PRINTF("rhs has a cliche\n");
+ do_accept = true;
+ remove_edge(rhs->start, rhs->accept, *rhs);
+ }
+
+ if (edge(rhs->start, rhs->acceptEod, *rhs).second) {
+ DEBUG_PRINTF("rhs has an eod cliche\n");
+ do_accept_eod = true;
+ remove_edge(rhs->start, rhs->acceptEod, *rhs);
+ }
+
+ renumber_edges(*rhs);
+ }
+
+ /* check if we still have a useful graph left over */
+ bool do_norm = out_degree(rhs->start, *rhs) != 1;
+
+ set<ReportID> splitter_reports;
+ for (auto v : splitters) {
+ insert(&splitter_reports, base_graph[v].reports);
+ }
+
+ /* find the targets of each source vertex; insertion_ordered_map used to
+ * preserve deterministic ordering */
+ insertion_ordered_map<RoseInVertex, vector<RoseInVertex>> images;
+ for (const RoseInEdge &e : ee) {
+ RoseInVertex src = source(e, vg);
+ RoseInVertex dest = target(e, vg);
+ images[src].push_back(dest);
+ remove_edge(e, vg);
+ }
+
+ map<vector<RoseInVertex>, vector<RoseInVertex>> verts_by_image;
+
+ for (const auto &m : images) {
+ const auto &u = m.first;
+ const auto &image = m.second;
+
+ if (contains(verts_by_image, image)) {
+ for (RoseInVertex v : verts_by_image[image]) {
+ add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
+ }
+ continue;
+ }
+
+ for (const auto &lit : split.lit) {
+ assert(!bad_mixed_sensitivity(lit));
+
+ /* don't allow overhang in can_match() as literals should
+ * correspond to the edge graph being split; overhanging the graph
+ * would indicate a false path.*/
+ if (!can_match(*lhs, lit, false)) {
+ DEBUG_PRINTF("'%s' did not match lhs\n",
+ escapeString(lit).c_str());
+ continue;
+ }
+
+ DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str());
+ auto v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
+ add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
+
+ /* work out delay later */
+ if (do_accept) {
+ DEBUG_PRINTF("rhs has a cliche\n");
+ auto tt = add_vertex(RoseInVertexProps::makeAccept(
+ splitter_reports), vg);
+ add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+ }
+
+ if (do_accept_eod) {
+ DEBUG_PRINTF("rhs has an eod cliche\n");
+ auto tt = add_vertex(RoseInVertexProps::makeAcceptEod(
+ splitter_reports), vg);
+ add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+ }
+
+ if (do_norm) {
+ assert(out_degree(rhs->start, *rhs) > 1);
+ for (RoseInVertex dest : image) {
+ add_edge(v, dest, RoseInEdgeProps(rhs, 0U), vg);
+ }
+ }
+ verts_by_image[image].push_back(v);
+ }
+ }
+
+ assert(hasCorrectlyNumberedVertices(*rhs));
+ assert(hasCorrectlyNumberedEdges(*rhs));
+ assert(isCorrectlyTopped(*rhs));
+ assert(hasCorrectlyNumberedVertices(*lhs));
+ assert(hasCorrectlyNumberedEdges(*lhs));
+ assert(isCorrectlyTopped(*lhs));
+
+ return true;
+}
+
+#define MAX_NETFLOW_CUT_WIDTH 40 /* magic number is magic */
+#define MAX_LEN_2_LITERALS_PER_CUT 3
+
+static
+bool checkValidNetflowLits(NGHolder &h, const vector<u64a> &scores,
+ const map<NFAEdge, set<ue2_literal>> &cut_lits,
+ u32 min_allowed_length) {
+ DEBUG_PRINTF("cut width %zu; min allowed %u\n", cut_lits.size(),
+ min_allowed_length);
+ if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) {
+ return false;
+ }
+
+ u32 len_2_count = 0;
+
+ for (const auto &cut : cut_lits) {
+ if (scores[h[cut.first].index] >= NO_LITERAL_AT_EDGE_SCORE) {
+ DEBUG_PRINTF("cut uses a forbidden edge\n");
+ return false;
+ }
+
+ if (min_len(cut.second) < min_allowed_length) {
+ DEBUG_PRINTF("cut uses a bad literal\n");
+ return false;
+ }
+
+ for (const auto &lit : cut.second) {
+ if (lit.length() == 2) {
+ len_2_count++;
+ }
+ }
+ }
+
+ if (len_2_count > MAX_LEN_2_LITERALS_PER_CUT) {
+ return false;
+ }
+
+ return true;
+}
+
+static
+void splitEdgesByCut(NGHolder &h, RoseInGraph &vg,
+ const vector<RoseInEdge> &to_cut,
+ const vector<NFAEdge> &cut,
+ const map<NFAEdge, set<ue2_literal>> &cut_lits) {
+ DEBUG_PRINTF("splitting %s (%zu vertices)\n", to_string(h.kind).c_str(),
+ num_vertices(h));
+
+ /* create literal vertices and connect preds */
+ unordered_set<RoseInVertex> done_sources;
+ map<RoseInVertex, vector<pair<RoseInVertex, NFAVertex>>> verts_by_source;
+ for (const RoseInEdge &ve : to_cut) {
+ assert(&h == &*vg[ve].graph);
+ RoseInVertex src = source(ve, vg);
+ if (!done_sources.insert(src).second) {
+ continue; /* already processed */
+ }
+
+ /* iterate over cut for determinism */
+ for (const auto &e : cut) {
+ NFAVertex prev_v = source(e, h);
+ NFAVertex pivot = target(e, h);
+
+ DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index);
+ unordered_map<NFAVertex, NFAVertex> temp_map;
+ shared_ptr<NGHolder> new_lhs = make_shared<NGHolder>();
+ splitLHS(h, pivot, new_lhs.get(), &temp_map);
+
+ /* want to cut off paths to pivot from things other than the pivot -
+ * makes a more svelte graphy */
+ clear_in_edges(temp_map[pivot], *new_lhs);
+ NFAEdge pivot_edge = add_edge(temp_map[prev_v], temp_map[pivot],
+ *new_lhs);
+ if (is_triggered(h) && prev_v == h.start) {
+ (*new_lhs)[pivot_edge].tops.insert(DEFAULT_TOP);
+ }
+
+ pruneUseless(*new_lhs, false);
+ renumber_vertices(*new_lhs);
+ renumber_edges(*new_lhs);
+
+ DEBUG_PRINTF(" into lhs %s (%zu vertices)\n",
+ to_string(new_lhs->kind).c_str(),
+ num_vertices(*new_lhs));
+
+ assert(hasCorrectlyNumberedVertices(*new_lhs));
+ assert(hasCorrectlyNumberedEdges(*new_lhs));
+ assert(isCorrectlyTopped(*new_lhs));
+
+ const set<ue2_literal> &lits = cut_lits.at(e);
+ for (const auto &lit : lits) {
+ if (!can_match(*new_lhs, lit, is_triggered(h))) {
+ continue;
+ }
+
+ RoseInVertex v
+ = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
+
+ /* if this is a prefix/infix an edge directly to accept should
+ * represent a false path as we have poisoned vertices covered
+ * by the literals. */
+ if (generates_callbacks(h)) {
+ if (edge(pivot, h.accept, h).second) {
+ DEBUG_PRINTF("adding acceptEod\n");
+ /* literal has a direct connection to accept */
+ const flat_set<ReportID> &reports = h[pivot].reports;
+ auto tt = add_vertex(
+ RoseInVertexProps::makeAccept(reports), vg);
+ add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+ }
+
+ if (edge(pivot, h.acceptEod, h).second) {
+ assert(generates_callbacks(h));
+ DEBUG_PRINTF("adding acceptEod\n");
+ /* literal has a direct connection to accept */
+ const flat_set<ReportID> &reports = h[pivot].reports;
+ auto tt = add_vertex(
+ RoseInVertexProps::makeAcceptEod(reports), vg);
+ add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+ }
+ }
+
+ add_edge(src, v, RoseInEdgeProps(new_lhs, 0), vg);
+ verts_by_source[src].push_back({v, pivot});
+ }
+ }
+ }
+
+ /* wire the literal vertices up to successors */
+ map<vector<NFAVertex>, shared_ptr<NGHolder> > done_rhs;
+ for (const RoseInEdge &ve : to_cut) {
+ RoseInVertex src = source(ve, vg);
+ RoseInVertex dest = target(ve, vg);
+
+ /* iterate over cut for determinism */
+ for (const auto &elem : verts_by_source[src]) {
+ NFAVertex pivot = elem.second;
+ RoseInVertex v = elem.first;
+
+ vector<NFAVertex> adj;
+ insert(&adj, adj.end(), adjacent_vertices(pivot, h));
+ /* we can ignore presence of accept, accepteod in adj as it is best
+ effort */
+
+ if (!contains(done_rhs, adj)) {
+ unordered_map<NFAVertex, NFAVertex> temp_map;
+ shared_ptr<NGHolder> new_rhs = make_shared<NGHolder>();
+ splitRHS(h, adj, new_rhs.get(), &temp_map);
+ remove_edge(new_rhs->start, new_rhs->accept, *new_rhs);
+ remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs);
+ renumber_edges(*new_rhs);
+ DEBUG_PRINTF(" into rhs %s (%zu vertices)\n",
+ to_string(new_rhs->kind).c_str(),
+ num_vertices(*new_rhs));
+ done_rhs.emplace(adj, new_rhs);
+ assert(isCorrectlyTopped(*new_rhs));
+ }
+
+ assert(done_rhs[adj].get());
+ shared_ptr<NGHolder> new_rhs = done_rhs[adj];
+
+ assert(hasCorrectlyNumberedVertices(*new_rhs));
+ assert(hasCorrectlyNumberedEdges(*new_rhs));
+ assert(isCorrectlyTopped(*new_rhs));
+
+ if (vg[dest].type == RIV_LITERAL
+ && !can_match(*new_rhs, vg[dest].s, true)) {
+ continue;
+ }
+
+ if (out_degree(new_rhs->start, *new_rhs) != 1) {
+ add_edge(v, dest, RoseInEdgeProps(new_rhs, 0), vg);
+ }
+ }
+
+ remove_edge(ve, vg);
+ }
+}
+
+static
+bool doNetflowCut(NGHolder &h,
+ const vector<NFAVertexDepth> *depths,
+ RoseInGraph &vg,
+ const vector<RoseInEdge> &ee, bool for_prefix,
+ const Grey &grey, u32 min_allowed_length = 0U) {
+ ENSURE_AT_LEAST(&min_allowed_length, grey.minRoseNetflowLiteralLength);
+
+ DEBUG_PRINTF("doing netflow cut\n");
+ /* TODO: we should really get literals/scores from the full graph as this
+ * allows us to overlap with previous cuts. */
+ assert(!ee.empty());
+ assert(&h == &*vg[ee.front()].graph);
+ assert(!for_prefix || depths);
+
+ if (num_edges(h) > grey.maxRoseNetflowEdges) {
+ /* We have a limit on this because scoring edges and running netflow
+ * gets very slow for big graphs. */
+ DEBUG_PRINTF("too many edges, skipping netflow cut\n");
+ return false;
+ }
+
+ assert(hasCorrectlyNumberedVertices(h));
+ assert(hasCorrectlyNumberedEdges(h));
+
+ auto known_bad = poisonEdges(h, depths, vg, ee, for_prefix, grey);
+
+ /* Step 1: Get scores for all edges */
+ vector<u64a> scores = scoreEdges(h, known_bad); /* scores by edge_index */
+
+ /* Step 2: Find cutset based on scores */
+ vector<NFAEdge> cut = findMinCut(h, scores);
+
+ /* Step 3: Get literals corresponding to cut edges */
+ map<NFAEdge, set<ue2_literal>> cut_lits;
+ for (const auto &e : cut) {
+ set<ue2_literal> lits = getLiteralSet(h, e);
+ sanitizeAndCompressAndScore(lits);
+
+ cut_lits[e] = lits;
+ }
+
+ /* if literals are underlength bail or if it involves a forbidden edge*/
+ if (!checkValidNetflowLits(h, scores, cut_lits, min_allowed_length)) {
+ return false;
+ }
+ DEBUG_PRINTF("splitting\n");
+
+ /* Step 4: Split graph based on cuts */
+ splitEdgesByCut(h, vg, ee, cut, cut_lits);
+
+ return true;
+}
+
+static
+bool deanchorIfNeeded(NGHolder &g) {
+ DEBUG_PRINTF("hi\n");
+ if (proper_out_degree(g.startDs, g)) {
+ return false;
+ }
+
+ /* look for a non-special dot with a loop following start */
+ set<NFAVertex> succ_g;
+ insert(&succ_g, adjacent_vertices(g.start, g));
+ succ_g.erase(g.startDs);
+
+ for (auto v : adjacent_vertices_range(g.start, g)) {
+ DEBUG_PRINTF("inspecting cand %zu || = %zu\n", g[v].index,
+ g[v].char_reach.count());
+
+ if (v == g.startDs || !g[v].char_reach.all()) {
+ continue;
+ }
+
+ set<NFAVertex> succ_v;
+ insert(&succ_v, adjacent_vertices(v, g));
+
+ if (succ_v == succ_g) {
+ DEBUG_PRINTF("found ^.*\n");
+ for (auto succ : adjacent_vertices_range(g.start, g)) {
+ if (succ == g.startDs) {
+ continue;
+ }
+ add_edge(g.startDs, succ, g);
+ }
+ clear_vertex(v, g);
+ remove_vertex(v, g);
+ renumber_vertices(g);
+ return true;
+ }
+
+ if (succ_g.size() == 1 && hasSelfLoop(v, g)) {
+ DEBUG_PRINTF("found ^.+\n");
+ add_edge(g.startDs, v, g);
+ remove_edge(v, v, g);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static
+RoseInGraph populateTrivialGraph(const NGHolder &h) {
+ RoseInGraph g;
+ shared_ptr<NGHolder> root_g = cloneHolder(h);
+ bool orig_anch = isAnchored(*root_g);
+ orig_anch |= deanchorIfNeeded(*root_g);
+
+ DEBUG_PRINTF("orig_anch %d\n", (int)orig_anch);
+
+ auto start = add_vertex(RoseInVertexProps::makeStart(orig_anch), g);
+ auto accept = add_vertex(RoseInVertexProps::makeAccept(set<ReportID>()), g);
+
+ add_edge(start, accept, RoseInEdgeProps(root_g, 0), g);
+
+ return g;
+}
+
+static
+void avoidOutfixes(RoseInGraph &vg, bool last_chance,
+ const CompileContext &cc) {
+ STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n");
+ assert(num_vertices(vg) == 2);
+ assert(num_edges(vg) == 1);
+
+ RoseInEdge e = *edges(vg).first;
+
+ NGHolder &h = *vg[e].graph;
+ assert(isCorrectlyTopped(h));
+
+ renumber_vertices(h);
+ renumber_edges(h);
+
+ unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, {e}, cc);
+
+ if (split && splitRoseEdge(h, vg, {e}, *split)) {
+ DEBUG_PRINTF("split on simple literal\n");
+ return;
+ }
+
+ if (last_chance) {
+ /* look for a prefix split as it allows us to accept very weak anchored
+ * literals. */
+ auto depths = calcDepths(h);
+
+ split = findBestPrefixSplit(h, depths, vg, {e}, last_chance, cc);
+
+ if (split && splitRoseEdge(h, vg, {e}, *split)) {
+ DEBUG_PRINTF("split on simple literal\n");
+ return;
+ }
+ }
+
+ doNetflowCut(h, nullptr, vg, {e}, false, cc.grey);
+}
+
+static
+void removeRedundantPrefixes(RoseInGraph &g) {
+ STAGE_DEBUG_PRINTF("REMOVING REDUNDANT PREFIXES\n");
+
+ for (const RoseInEdge &e : edges_range(g)) {
+ RoseInVertex s = source(e, g);
+ RoseInVertex t = target(e, g);
+
+ if (g[s].type != RIV_START || g[t].type != RIV_LITERAL) {
+ continue;
+ }
+
+ if (!g[e].graph) {
+ continue;
+ }
+
+ assert(!g[t].delay);
+ const ue2_literal &lit = g[t].s;
+
+ if (!literalIsWholeGraph(*g[e].graph, lit)) {
+ DEBUG_PRINTF("not whole graph\n");
+ continue;
+ }
+
+ if (!isFloating(*g[e].graph)) {
+ DEBUG_PRINTF("not floating\n");
+ continue;
+ }
+ g[e].graph.reset();
+ }
+}
+
+static
+u32 maxDelay(const CompileContext &cc) {
+ if (!cc.streaming) {
+ return MO_INVALID_IDX;
+ }
+ return cc.grey.maxHistoryAvailable;
+}
+
+static
+void removeRedundantLiteralsFromPrefixes(RoseInGraph &g,
+ const CompileContext &cc) {
+ STAGE_DEBUG_PRINTF("REMOVING LITERALS FROM PREFIXES\n");
+
+ vector<RoseInEdge> to_anchor;
+ for (const RoseInEdge &e : edges_range(g)) {
+ RoseInVertex s = source(e, g);
+ RoseInVertex t = target(e, g);
+
+ if (g[s].type != RIV_START && g[s].type != RIV_ANCHORED_START) {
+ continue;
+ }
+
+ if (g[t].type != RIV_LITERAL) {
+ continue;
+ }
+
+ if (!g[e].graph) {
+ continue;
+ }
+
+ if (g[e].graph_lag) {
+ /* already removed redundant parts of literals */
+ continue;
+ }
+
+ if (g[e].dfa) {
+ /* if we removed any more states, we would need to rebuild the
+ * the dfa which can be time consuming. */
+ continue;
+ }
+
+ assert(!g[t].delay);
+ const ue2_literal &lit = g[t].s;
+
+ DEBUG_PRINTF("removing states for literal: %s\n",
+ dumpString(lit).c_str());
+
+ unique_ptr<NGHolder> h = cloneHolder(*g[e].graph);
+ const u32 max_delay = maxDelay(cc);
+
+ u32 delay = removeTrailingLiteralStates(*h, lit, max_delay,
+ false /* can't overhang start */);
+
+ DEBUG_PRINTF("got delay %u (max allowed %u)\n", delay, max_delay);
+
+ if (edge(h->startDs, h->accept, *h).second) {
+ /* we should have delay == lit.length(), but in really complex
+ * cases we may fail to identify that we can remove the whole
+ * graph. Regardless, the fact that sds is wired to accept means the
+ * graph serves no purpose. */
+ DEBUG_PRINTF("whole graph\n");
+ g[e].graph.reset();
+ continue;
+ }
+
+ if (delay == lit.length() && edge(h->start, h->accept, *h).second
+ && num_vertices(*h) == N_SPECIALS) {
+ to_anchor.push_back(e);
+ continue;
+ }
+
+ /* if we got here we should still have an interesting graph */
+ assert(delay == max_delay || num_vertices(*h) > N_SPECIALS);
+
+ if (delay && delay != MO_INVALID_IDX) {
+ DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, h.get());
+
+ g[e].graph = move(h);
+ g[e].graph_lag = delay;
+ }
+ }
+
+ if (!to_anchor.empty()) {
+ RoseInVertex anch = add_vertex(RoseInVertexProps::makeStart(true), g);
+
+ for (RoseInEdge e : to_anchor) {
+ DEBUG_PRINTF("rehoming to anchor\n");
+ RoseInVertex v = target(e, g);
+ add_edge(anch, v, g);
+ remove_edge(e, g);
+ }
+ }
+}
+
+static
+bool isStarCliche(const NGHolder &g) {
+ DEBUG_PRINTF("checking graph with %zu vertices\n", num_vertices(g));
+
+ bool nonspecials_seen = false;
+
+ for (auto v : vertices_range(g)) {
+ if (is_special(v, g)) {
+ continue;
+ }
+
+ if (nonspecials_seen) {
+ return false;
+ }
+ nonspecials_seen = true;
+
+ if (!g[v].char_reach.all()) {
+ return false;
+ }
+
+ if (!hasSelfLoop(v, g)) {
+ return false;
+ }
+ if (!edge(v, g.accept, g).second) {
+ return false;
+ }
+ }
+
+ if (!nonspecials_seen) {
+ return false;
+ }
+
+ if (!edge(g.start, g.accept, g).second) {
+ return false;
+ }
+
+ return true;
+}
+
+static
+void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig,
+ const vector<RoseInEdge> &ee,
+ const CompileContext &cc) {
+ /* TODO: This could be better by not creating a separate graph for each
+ * successor literal. This would require using distinct report ids and also
+ * taking into account overlap of successor literals. */
+
+ set<ue2_literal> preds;
+ set<ue2_literal> succs;
+ for (const RoseInEdge &e : ee) {
+ RoseInVertex u = source(e, ig);
+ assert(ig[u].type == RIV_LITERAL);
+ assert(!ig[u].delay);
+ preds.insert(ig[u].s);
+
+ RoseInVertex v = target(e, ig);
+ assert(ig[v].type == RIV_LITERAL);
+ assert(!ig[v].delay);
+ succs.insert(ig[v].s);
+
+ if (ig[e].graph_lag) {
+ /* already removed redundant parts of literals */
+ return;
+ }
+
+ assert(!ig[e].dfa);
+ }
+
+ map<ue2_literal, pair<shared_ptr<NGHolder>, u32> > graphs; /* + delay */
+
+ for (const ue2_literal &right : succs) {
+ size_t max_overlap = 0;
+ for (const ue2_literal &left : preds) {
+ size_t overlap = maxOverlap(left, right, 0);
+ ENSURE_AT_LEAST(&max_overlap, overlap);
+ }
+
+ u32 max_allowed_delay = right.length() - max_overlap;
+
+ if (cc.streaming) {
+ LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable);
+ }
+
+ if (!max_allowed_delay) {
+ continue;
+ }
+
+ shared_ptr<NGHolder> h_new = cloneHolder(h);
+
+ u32 delay = removeTrailingLiteralStates(*h_new, right,
+ max_allowed_delay);
+
+ if (delay == MO_INVALID_IDX) {
+ /* successor literal could not match infix -> ignore false path */
+ assert(0);
+ continue;
+ }
+
+ if (!delay) {
+ /* unable to trim graph --> no point swapping to new holder */
+ continue;
+ }
+
+ assert(isCorrectlyTopped(*h_new));
+ graphs[right] = make_pair(h_new, delay);
+ }
+
+ for (const RoseInEdge &e : ee) {
+ RoseInVertex v = target(e, ig);
+ const ue2_literal &succ = ig[v].s;
+ if (!contains(graphs, succ)) {
+ continue;
+ }
+
+ ig[e].graph = graphs[succ].first;
+ ig[e].graph_lag = graphs[succ].second;
+
+ if (isStarCliche(*ig[e].graph)) {
+ DEBUG_PRINTF("is a X star!\n");
+ ig[e].graph.reset();
+ ig[e].graph_lag = 0;
+ }
+ }
+}
+
+static
+void removeRedundantLiteralsFromInfixes(RoseInGraph &g,
+ const CompileContext &cc) {
+ insertion_ordered_map<NGHolder *, vector<RoseInEdge>> infixes;
+
+ for (const RoseInEdge &e : edges_range(g)) {
+ RoseInVertex s = source(e, g);
+ RoseInVertex t = target(e, g);
+
+ if (g[s].type != RIV_LITERAL || g[t].type != RIV_LITERAL) {
+ continue;
+ }
+
+ if (!g[e].graph) {
+ continue;
+ }
+
+ assert(!g[t].delay);
+ if (g[e].dfa) {
+ /* if we removed any more states, we would need to rebuild the
+ * the dfa which can be time consuming. */
+ continue;
+ }
+
+ NGHolder *h = g[e].graph.get();
+ infixes[h].push_back(e);
+ }
+
+ for (const auto &m : infixes) {
+ NGHolder *h = m.first;
+ const auto &edges = m.second;
+ removeRedundantLiteralsFromInfix(*h, g, edges, cc);
+ }
+}
+
+static
+void removeRedundantLiterals(RoseInGraph &g, const CompileContext &cc) {
+ removeRedundantLiteralsFromPrefixes(g, cc);
+ removeRedundantLiteralsFromInfixes(g, cc);
+}
+
+static
+RoseInVertex getStart(RoseInGraph &vg) {
+ for (RoseInVertex v : vertices_range(vg)) {
+ if (vg[v].type == RIV_START || vg[v].type == RIV_ANCHORED_START) {
+ return v;
+ }
+ }
+ assert(0);
+ return RoseInGraph::null_vertex();
+}
+
+/**
+ * Finds the initial accept vertex created to which suffix/outfixes are
+ * attached.
+ */
+static
+RoseInVertex getPrimaryAccept(RoseInGraph &vg) {
+ for (RoseInVertex v : vertices_range(vg)) {
+ if (vg[v].type == RIV_ACCEPT && vg[v].reports.empty()) {
+ return v;
+ }
+ }
+ assert(0);
+ return RoseInGraph::null_vertex();
+}
+
+static
+bool willBeTransient(const depth &max_depth, const CompileContext &cc) {
+ if (!cc.streaming) {
+ return max_depth <= depth(ROSE_BLOCK_TRANSIENT_MAX_WIDTH);
+ } else {
+ return max_depth <= depth(cc.grey.maxHistoryAvailable + 1);
+ }
+}
+
+static
+bool willBeAnchoredTable(const depth &max_depth, const Grey &grey) {
+ return max_depth <= depth(grey.maxAnchoredRegion);
+}
+
+static
+unique_ptr<NGHolder> make_chain(u32 count) {
+ assert(count);
+
+ auto rv = std::make_unique<NGHolder>(NFA_INFIX);
+
+ NGHolder &h = *rv;
+
+ NFAVertex u = h.start;
+ for (u32 i = 0; i < count; i++) {
+ NFAVertex v = add_vertex(h);
+ h[v].char_reach = CharReach::dot();
+ add_edge(u, v, h);
+ u = v;
+ }
+ h[u].reports.insert(0);
+ add_edge(u, h.accept, h);
+
+ setTops(h);
+
+ return rv;
+}
+
+#define SHORT_TRIGGER_LEN 16
+
+static
+bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg,
+ const vector<RoseInEdge> &ee,
+ const CompileContext &cc) {
+ /* check max width and literal lengths to see if possible */
+ size_t min_lit = (size_t)~0ULL;
+ for (const RoseInEdge &e : ee) {
+ RoseInVertex v = target(e, vg);
+ LIMIT_TO_AT_MOST(&min_lit, vg[v].s.length());
+ }
+
+ if (min_lit <= SHORT_TRIGGER_LEN || min_lit >= UINT_MAX) {
+ return false;
+ }
+
+ depth max_width = findMaxWidth(h);
+
+ u32 delta = min_lit - SHORT_TRIGGER_LEN;
+
+ if (!willBeTransient(max_width - depth(delta), cc)
+ && !willBeAnchoredTable(max_width - depth(delta), cc.grey)) {
+ return false;
+ }
+
+ DEBUG_PRINTF("candidate for splitting long literal (len %zu)\n", min_lit);
+ DEBUG_PRINTF("delta = %u\n", delta);
+
+ /* try split */
+ map<RoseInVertex, shared_ptr<NGHolder> > graphs;
+ for (const RoseInEdge &e : ee) {
+ RoseInVertex v = target(e, vg);
+
+ shared_ptr<NGHolder> h_new = cloneHolder(h);
+
+ u32 delay = removeTrailingLiteralStates(*h_new, vg[v].s, delta);
+
+ DEBUG_PRINTF("delay %u\n", delay);
+
+ if (delay != delta) {
+ DEBUG_PRINTF("unable to trim literal\n");
+ return false;
+ }
+
+ if (in_degree(v, vg) != 1) {
+ DEBUG_PRINTF("complicated\n");
+ return false;
+ }
+
+ DEBUG_PRINTF("new mw = %u\n", (u32)findMaxWidth(*h_new));
+ assert(willBeTransient(findMaxWidth(*h_new), cc)
+ || willBeAnchoredTable(findMaxWidth(*h_new), cc.grey));
+
+ assert(isCorrectlyTopped(*h_new));
+ graphs[v] = h_new;
+ }
+
+ /* add .{repeats} from prefixes to long literals */
+ for (const RoseInEdge &e : ee) {
+ RoseInVertex s = source(e, vg);
+ RoseInVertex t = target(e, vg);
+
+ remove_edge(e, vg);
+ const ue2_literal &orig_lit = vg[t].s;
+
+ ue2_literal lit(orig_lit.begin(), orig_lit.end() - delta);
+
+ ue2_literal lit2(orig_lit.end() - delta, orig_lit.end());
+
+ assert(lit.length() + delta == orig_lit.length());
+
+ vg[t].s = lit2;
+
+ RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
+ add_edge(s, v, RoseInEdgeProps(graphs[t], 0), vg);
+ add_edge(v, t, RoseInEdgeProps(make_chain(delta), 0), vg);
+ }
+
+ DEBUG_PRINTF("success\n");
+ /* TODO: alter split point to avoid pathological splits */
+ return true;
+}
+
+static
+void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit,
+ u32 delay, const vector<NFAVertex> &preds) {
+ assert(delay <= lit.length());
+ assert(isCorrectlyTopped(g));
+ DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay);
+
+ NFAVertex prev = g.accept;
+ auto it = lit.rbegin();
+ while (delay--) {
+ NFAVertex curr = add_vertex(g);
+ assert(it != lit.rend());
+ g[curr].char_reach = *it;
+ add_edge(curr, prev, g);
+ ++it;
+ prev = curr;
+ }
+
+ for (auto v : preds) {
+ NFAEdge e = add_edge_if_not_present(v, prev, g);
+ if (v == g.start && is_triggered(g)) {
+ g[e].tops.insert(DEFAULT_TOP);
+ }
+ }
+
+ // Every predecessor of accept must have a report.
+ set_report(g, 0);
+
+ renumber_vertices(g);
+ renumber_edges(g);
+ assert(allMatchStatesHaveReports(g));
+ assert(isCorrectlyTopped(g));
+}
+
+static
+void restoreTrailingLiteralStates(NGHolder &g,
+ const vector<pair<ue2_literal, u32>> &lits) {
+ vector<NFAVertex> preds;
+ insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g));
+ clear_in_edges(g.accept, g);
+
+ for (auto v : preds) {
+ g[v].reports.clear(); /* clear report from old accepts */
+ }
+
+ for (const auto &p : lits) {
+ const ue2_literal &lit = p.first;
+ u32 delay = p.second;
+
+ restoreTrailingLiteralStates(g, lit, delay, preds);
+ }
+}
+
+static
+bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
+ const CompileContext &cc) {
+ DEBUG_PRINTF("trying to improve prefix %p, %zu verts\n", &h,
+ num_vertices(h));
+ assert(isCorrectlyTopped(h));
+
+ renumber_vertices(h);
+ renumber_edges(h);
+
+ auto depths = calcDepths(h);
+
+ /* If the reason the prefix is not transient is due to a very long literal
+ * following, we can make it transient by restricting ourselves to using
+ * just the head of the literal. */
+ if (makeTransientFromLongLiteral(h, vg, ee, cc)) {
+ return true;
+ }
+
+ auto split = findBestPrefixSplit(h, depths, vg, ee, false, cc);
+
+ if (split && (split->creates_transient || split->creates_anchored)
+ && splitRoseEdge(h, vg, ee, *split)) {
+ DEBUG_PRINTF("split on simple literal\n");
+ return true;
+ }
+
+ /* large back edges may prevent us identifing anchored or transient cases
+ * properly - use a simple walk instead */
+
+ if (doNetflowCut(h, &depths, vg, ee, true, cc.grey)) {
+ return true;
+ }
+
+ if (split && splitRoseEdge(h, vg, ee, *split)) {
+ /* use the simple split even though it doesn't create a transient
+ * prefix */
+ DEBUG_PRINTF("split on simple literal\n");
+ return true;
+ }
+
+ /* look for netflow cuts which don't produce good prefixes */
+ if (doNetflowCut(h, &depths, vg, ee, false, cc.grey)) {
+ return true;
+ }
+
+ if (ee.size() > 1) {
+ DEBUG_PRINTF("split the prefix apart based on succ literals\n");
+ unordered_map<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> >,
NGHolderHasher, NGHolderEqual> trimmed;
-
- for (const auto &e : ee) {
- shared_ptr<NGHolder> hh = cloneHolder(h);
- auto succ_lit = vg[target(e, vg)].s;
- assert(isCorrectlyTopped(*hh));
- u32 delay = removeTrailingLiteralStates(*hh, succ_lit,
- succ_lit.length(),
- false /* can't overhang start */);
- if (!delay) {
- DEBUG_PRINTF("could not remove any literal, skip over\n");
- continue;
- }
-
- assert(isCorrectlyTopped(*hh));
- trimmed[hh].emplace_back(e, delay);
- }
-
- if (trimmed.size() == 1) {
- return false;
- }
-
- /* shift the contents to a vector so we can modify the graphs without
- * violating the map's invariants. */
- vector<pair<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> > > >
- trimmed_vec(trimmed.begin(), trimmed.end());
- trimmed.clear();
- for (auto &elem : trimmed_vec) {
- shared_ptr<NGHolder> &hp = elem.first;
- vector<pair<ue2_literal, u32>> succ_lits;
-
- for (const auto &edge_delay : elem.second) {
- const RoseInEdge &e = edge_delay.first;
- u32 delay = edge_delay.second;
- auto lit = vg[target(e, vg)].s;
-
- vg[e].graph = hp;
- assert(delay <= lit.length());
- succ_lits.emplace_back(lit, delay);
- }
- restoreTrailingLiteralStates(*hp, succ_lits);
- }
- return true;
- }
-
- return false;
-}
-
-#define MAX_FIND_BETTER_PREFIX_GEN 4
-#define MAX_FIND_BETTER_PREFIX_COUNT 100
-
-static
-void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) {
- STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n");
- RoseInVertex start = getStart(vg);
-
- insertion_ordered_map<NGHolder *, vector<RoseInEdge>> prefixes;
- bool changed;
- u32 gen = 0;
- do {
- DEBUG_PRINTF("gen %u\n", gen);
- changed = false;
- prefixes.clear();
-
- /* find prefixes */
- for (const RoseInEdge &e : out_edges_range(start, vg)) {
- /* outfixes shouldn't have made it this far */
- assert(vg[target(e, vg)].type == RIV_LITERAL);
- if (vg[e].graph) {
- NGHolder *h = vg[e].graph.get();
- prefixes[h].push_back(e);
- }
- }
-
- if (prefixes.size() > MAX_FIND_BETTER_PREFIX_COUNT) {
- break;
- }
-
- /* look for bad prefixes and try to split */
- for (const auto &m : prefixes) {
- NGHolder *h = m.first;
- const auto &edges = m.second;
- depth max_width = findMaxWidth(*h);
- if (willBeTransient(max_width, cc)
- || willBeAnchoredTable(max_width, cc.grey)) {
- continue;
- }
-
- changed = improvePrefix(*h, vg, edges, cc);
- }
- } while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN);
-}
-
-#define STRONG_LITERAL_LENGTH 20
-#define MAX_EXTRACT_STRONG_LITERAL_GRAPHS 10
-
-static
-bool extractStrongLiteral(NGHolder &h, RoseInGraph &vg,
- const vector<RoseInEdge> &ee,
- const CompileContext &cc) {
- DEBUG_PRINTF("looking for string literal\n");
- unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
-
- if (split && min_len(split->lit) >= STRONG_LITERAL_LENGTH) {
- DEBUG_PRINTF("splitting simple literal\n");
- return splitRoseEdge(h, vg, ee, *split);
- }
-
- return false;
-}
-
-static
-void extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) {
- if (!cc.grey.violetExtractStrongLiterals) {
- return;
- }
-
- STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n");
-
- unordered_set<NGHolder *> stuck;
- insertion_ordered_map<NGHolder *, vector<RoseInEdge>> edges_by_graph;
- bool changed;
-
- do {
- changed = false;
-
- edges_by_graph.clear();
- for (const RoseInEdge &ve : edges_range(vg)) {
- if (vg[source(ve, vg)].type != RIV_LITERAL) {
- continue;
- }
-
- if (vg[ve].graph) {
- NGHolder *h = vg[ve].graph.get();
- edges_by_graph[h].push_back(ve);
- }
- }
-
- if (edges_by_graph.size() > MAX_EXTRACT_STRONG_LITERAL_GRAPHS) {
- DEBUG_PRINTF("too many graphs, stopping\n");
- return;
- }
-
- for (const auto &m : edges_by_graph) {
- NGHolder *g = m.first;
- const auto &edges = m.second;
- if (contains(stuck, g)) {
- DEBUG_PRINTF("already known to be bad\n");
- continue;
- }
- bool rv = extractStrongLiteral(*g, vg, edges, cc);
- if (rv) {
- changed = true;
- } else {
- stuck.insert(g);
- }
- }
- } while (changed);
-}
-
-#define INFIX_STRONG_GUARD_LEN 8
-#define INFIX_MIN_SPLIT_LITERAL_LEN 12
-
-static
-bool improveInfix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
- const CompileContext &cc) {
- unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
-
- if (split && min_len(split->lit) >= INFIX_MIN_SPLIT_LITERAL_LEN
- && splitRoseEdge(h, vg, ee, *split)) {
- DEBUG_PRINTF("splitting simple literal\n");
- return true;
- }
-
- DEBUG_PRINTF("trying for a netflow cut\n");
- /* look for netflow cuts which don't produce good prefixes */
- bool rv = doNetflowCut(h, nullptr, vg, ee, false, cc.grey, 8);
-
- DEBUG_PRINTF("did netfow cut? = %d\n", (int)rv);
-
- return rv;
-}
-
-/**
- * Infixes which are weakly guarded can, in effect, act like prefixes as they
- * will often be live. We should try to split these infixes further if they
- * contain strong literals so that we are at least running smaller weak infixes
- * which can hopeful be accelerated/miracled.
- */
-static
-void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) {
- if (!cc.grey.violetAvoidWeakInfixes) {
- return;
- }
- STAGE_DEBUG_PRINTF("IMPROVE WEAK INFIXES\n");
-
- RoseInVertex start = getStart(vg);
-
- unordered_set<NGHolder *> weak;
-
- for (RoseInVertex vv : adjacent_vertices_range(start, vg)) {
- /* outfixes shouldn't have made it this far */
- assert(vg[vv].type == RIV_LITERAL);
- if (vg[vv].s.length() >= INFIX_STRONG_GUARD_LEN) {
- continue;
- }
-
- for (const RoseInEdge &e : out_edges_range(vv, vg)) {
- if (vg[target(e, vg)].type != RIV_LITERAL || !vg[e].graph) {
- continue;
- }
-
- NGHolder *h = vg[e].graph.get();
- DEBUG_PRINTF("'%s' guards %p\n", dumpString(vg[vv].s).c_str(), h);
- weak.insert(h);
- }
- }
-
- insertion_ordered_map<NGHolder *, vector<RoseInEdge>> weak_edges;
- for (const RoseInEdge &ve : edges_range(vg)) {
- NGHolder *h = vg[ve].graph.get();
- if (contains(weak, h)) {
- weak_edges[h].push_back(ve);
- }
- }
-
- for (const auto &m : weak_edges) {
- NGHolder *h = m.first;
- const auto &edges = m.second;
- improveInfix(*h, vg, edges, cc);
- }
-}
-
-static
-void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg,
- const vector<RoseInEdge> &ee, const VertLitInfo &split,
- bool eod, const flat_set<ReportID> &reports) {
- const vector<NFAVertex> &splitters = split.vv;
- assert(!splitters.empty());
-
- shared_ptr<NGHolder> lhs = make_shared<NGHolder>();
- unordered_map<NFAVertex, NFAVertex> v_map;
- cloneHolder(*lhs, base_graph, &v_map);
- lhs->kind = NFA_INFIX;
- clear_in_edges(lhs->accept, *lhs);
- clear_in_edges(lhs->acceptEod, *lhs);
- add_edge(lhs->accept, lhs->acceptEod, *lhs);
- clearReports(*lhs);
- for (NFAVertex v : splitters) {
- NFAEdge e = add_edge(v_map[v], lhs->accept, *lhs);
- if (v == base_graph.start) {
- (*lhs)[e].tops.insert(DEFAULT_TOP);
- }
- (*lhs)[v_map[v]].reports.insert(0);
-
- }
- pruneUseless(*lhs);
- assert(isCorrectlyTopped(*lhs));
-
- /* create literal vertices and connect preds */
- for (const auto &lit : split.lit) {
- if (!can_match(*lhs, lit, is_triggered(*lhs))) {
- continue;
- }
-
- DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str());
- RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
-
- RoseInVertex tt;
- if (eod) {
- DEBUG_PRINTF("doing eod\n");
- tt = add_vertex(RoseInVertexProps::makeAcceptEod(reports), vg);
- } else {
- DEBUG_PRINTF("doing non-eod\n");
- tt = add_vertex(RoseInVertexProps::makeAccept(reports), vg);
- }
- add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
-
- for (const RoseInEdge &e : ee) {
- RoseInVertex u = source(e, vg);
- assert(!edge(u, v, vg).second);
- add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
- }
- }
-}
-
-#define MIN_SUFFIX_LEN 6
-
-static
-bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg,
- const vector<RoseInEdge> &suffix_edges,
- const CompileContext &cc) {
- DEBUG_PRINTF("inspecting suffix : %p on %zu edges\n", &h,
- suffix_edges.size());
- /*
- * We would, in general, rather not have output exposed engines because
- * once they are triggered, they must be run while infixes only have to run
- * if the successor literal is seen. Matches from output exposed engines
- * also have to be placed in a priority queue and interleaved with matches
- * from other sources.
- *
- * Note:
- * - if the LHS is extremely unlikely we may be better off leaving
- * a suffix unguarded.
- *
- * - limited width suffixes may be less bad as they won't be continuously
- * active, we may want to have (a) stronger controls on if we want to pick
- * a trailing literal in these cases and/or (b) look also for literals
- * near accept as well as right on accept
- *
- * TODO: improve heuristics, splitting logic.
- */
-
- /* we may do multiple splits corresponding to different report behaviour */
- set<NFAVertex> seen;
- map<pair<bool, flat_set<ReportID> >, VertLitInfo> by_reports; /* eod, rep */
-
- for (NFAVertex v : inv_adjacent_vertices_range(h.accept, h)) {
- set<ue2_literal> ss = getLiteralSet(h, v, false);
- if (ss.empty()) {
- DEBUG_PRINTF("candidate is too shitty\n");
- return false;
- }
-
- VertLitInfo &vli = by_reports[make_pair(false, h[v].reports)];
- insert(&vli.lit, ss);
- vli.vv.push_back(v);
- seen.insert(v);
- }
-
- seen.insert(h.accept);
- for (NFAVertex v : inv_adjacent_vertices_range(h.acceptEod, h)) {
- if (contains(seen, v)) {
- continue;
- }
-
- set<ue2_literal> ss = getLiteralSet(h, v, false);
- if (ss.empty()) {
- DEBUG_PRINTF("candidate is too shitty\n");
- return false;
- }
-
- VertLitInfo &vli = by_reports[make_pair(true, h[v].reports)];
- insert(&vli.lit, ss);
- vli.vv.push_back(v);
- }
-
- assert(!by_reports.empty());
-
- /* TODO: how strong a min len do we want here ? */
- u32 min_len = cc.grey.minRoseLiteralLength;
- ENSURE_AT_LEAST(&min_len, MIN_SUFFIX_LEN);
-
- for (auto &vli : by_reports | map_values) {
- u64a score = sanitizeAndCompressAndScore(vli.lit);
-
- if (vli.lit.empty()
- || !validateRoseLiteralSetQuality(vli.lit, score, false, min_len,
- false, false)) {
- return false;
- }
- }
-
- for (const auto &info : by_reports) {
- DEBUG_PRINTF("splitting on simple literals\n");
- splitEdgesForSuffix(h, vg, suffix_edges, info.second,
- info.first.first /* eod */,
- info.first.second /* reports */);
- }
-
- for (const RoseInEdge &e : suffix_edges) {
- remove_edge(e, vg);
- }
- return true;
-}
-
-static
-void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) {
- if (!cc.grey.violetAvoidSuffixes) {
- return;
- }
-
- STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n");
-
- RoseInVertex accept = getPrimaryAccept(vg);
-
- insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> suffixes;
-
- /* find suffixes */
- for (const RoseInEdge &e : in_edges_range(accept, vg)) {
- /* outfixes shouldn't have made it this far */
- assert(vg[source(e, vg)].type == RIV_LITERAL);
- assert(vg[e].graph); /* non suffix paths should be wired to other
- accepts */
- const NGHolder *h = vg[e].graph.get();
- suffixes[h].push_back(e);
- }
-
- /* look at suffixes and try to split */
- for (const auto &m : suffixes) {
- const NGHolder *h = m.first;
- const auto &edges = m.second;
- replaceSuffixWithInfix(*h, vg, edges, cc);
- }
-}
-
-static
-bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) {
- if (out_degree(h.start, h) != 3) {
- return false;
- }
-
- NFAVertex v = NGHolder::null_vertex();
- NFAVertex ds = NGHolder::null_vertex();
-
- for (NFAVertex a : adjacent_vertices_range(h.start, h)) {
- if (a == h.startDs) {
- continue;
- }
- if (h[a].char_reach.all()) {
- ds = a;
- if (out_degree(ds, h) != 2 || !edge(ds, ds, h).second) {
- return false;
- }
- } else {
- v = a;
- }
- }
-
- if (!v || !ds || !edge(ds, v, h).second) {
- return false;
- }
-
- if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) {
- return false;
- }
-
- ue2_literal lit;
- lit.push_back(h[v].char_reach.find_first(),
- h[v].char_reach.isCaselessChar());
- while (out_degree(v, h) == 1) {
- NFAVertex vv = *adjacent_vertices(v, h).first;
- if (h[vv].char_reach.count() != 1
- && !h[vv].char_reach.isCaselessChar()) {
- break;
- }
-
- v = vv;
-
- lit.push_back(h[v].char_reach.find_first(),
- h[v].char_reach.isCaselessChar());
- }
-
- if (is_match_vertex(v, h) && h.kind != NFA_SUFFIX) {
- /* we have rediscovered the post-infix literal */
- return false;
- }
-
- if (bad_mixed_sensitivity(lit)) {
- make_nocase(&lit);
- }
-
- DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str());
- out->vv = {v};
- out->lit = {lit};
- return true;
-}
-
-static
-bool lookForDoubleCut(const NGHolder &h, const vector<RoseInEdge> &ee,
- RoseInGraph &vg, const Grey &grey) {
- VertLitInfo info;
- if (!leadingDotStartLiteral(h, &info)
- || min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
- return false;
- }
- DEBUG_PRINTF("performing split\n");
- return splitRoseEdge(h, vg, ee, {info});
-}
-
-static
-void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) {
- if (!cc.grey.violetDoubleCut) {
- return;
- }
-
- insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> right_edges;
- for (const RoseInEdge &ve : edges_range(vg)) {
- if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) {
- const NGHolder *h = vg[ve].graph.get();
- right_edges[h].push_back(ve);
- }
- }
-
- for (const auto &m : right_edges) {
- const NGHolder *h = m.first;
- const auto &edges = m.second;
- lookForDoubleCut(*h, edges, vg, cc.grey);
- }
-}
-
-static
-pair<NFAVertex, ue2_literal> findLiteralBefore(const NGHolder &h, NFAVertex v) {
- ue2_literal lit;
- if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) {
- return {v, std::move(lit) };
- }
- lit.push_back(h[v].char_reach.find_first(),
- h[v].char_reach.isCaselessChar());
-
- while (in_degree(v, h) == 1) {
- NFAVertex vv = *inv_adjacent_vertices(v, h).first;
- if (h[vv].char_reach.count() != 1
- && !h[vv].char_reach.isCaselessChar()) {
- break;
- }
-
- lit.push_back(h[vv].char_reach.find_first(),
- h[vv].char_reach.isCaselessChar());
- v = vv;
- }
-
- return {v, std::move(lit) };
-}
-
-static
-bool lookForDotStarPred(NFAVertex v, const NGHolder &h,
- NFAVertex *u, NFAVertex *ds) {
- *u = NGHolder::null_vertex();
- *ds = NGHolder::null_vertex();
- for (NFAVertex a : inv_adjacent_vertices_range(v, h)) {
- if (h[a].char_reach.all()) {
- if (!edge(a, a, h).second) {
- return false;
- }
-
- if (*ds) {
- return false;
- }
-
- *ds = a;
- } else {
- if (*u) {
- return false;
- }
- *u = a;
- }
- }
-
- if (!*u || !*ds) {
- return false;
- }
-
- return true;
-}
-
-static
-bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) {
- /* Note: there is no delay yet - so the final literal is the already
- * discovered successor literal - we are in fact interested in the literal
- * before it. */
-
- if (in_degree(h.accept, h) != 1) {
- return false;
- }
-
- if (in_degree(h.acceptEod, h) != 1) {
- assert(0);
- return false;
- }
-
- NFAVertex v
- = findLiteralBefore(h, *inv_adjacent_vertices(h.accept, h).first).first;
-
- NFAVertex u;
- NFAVertex ds;
-
- if (!lookForDotStarPred(v, h, &u, &ds)) {
- return false;
- }
-
- v = u;
- auto rv = findLiteralBefore(h, v);
-
- if (!lookForDotStarPred(v, h, &u, &ds)) {
- return false;
- }
-
- ue2_literal lit = reverse_literal(rv.second);
- DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str());
-
- if (bad_mixed_sensitivity(lit)) {
- make_nocase(&lit);
- }
-
- out->vv = {v};
- out->lit = {lit};
- return true;
-}
-
-static
-bool lookForTrailingLiteralDotStar(const NGHolder &h,
- const vector<RoseInEdge> &ee,
- RoseInGraph &vg, const Grey &grey) {
- VertLitInfo info;
- if (!trailingDotStarLiteral(h, &info)
- || min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
- return false;
- }
- DEBUG_PRINTF("performing split\n");
- return splitRoseEdge(h, vg, ee, info);
-}
-
-/* In streaming mode, active engines have to be caught up at stream boundaries
- * and have to be stored in stream state, so we prefer to decompose patterns
- * in to literals with no state between them if possible. */
-static
-void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) {
- if (!cc.grey.violetLiteralChains) {
- return;
- }
-
- insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> right_edges;
- bool changed;
- do {
- changed = false;
-
- right_edges.clear();
- for (const RoseInEdge &ve : edges_range(vg)) {
- if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) {
- const NGHolder *h = vg[ve].graph.get();
- right_edges[h].push_back(ve);
- }
- }
-
- for (const auto &m : right_edges) {
- const NGHolder *h = m.first;
- const vector<RoseInEdge> &ee = m.second;
- bool rv = lookForDoubleCut(*h, ee, vg, cc.grey);
- if (!rv && h->kind != NFA_SUFFIX) {
- rv = lookForTrailingLiteralDotStar(*h, ee, vg, cc.grey);
- }
- changed |= rv;
- }
- } while (changed);
-}
-
-static
-bool lookForCleanSplit(const NGHolder &h, const vector<RoseInEdge> &ee,
- RoseInGraph &vg, const CompileContext &cc) {
- unique_ptr<VertLitInfo> split = findBestCleanSplit(h, cc);
-
- if (split) {
- return splitRoseEdge(h, vg, {ee}, *split);
- }
-
- return false;
-}
-
-#define MAX_DESIRED_CLEAN_SPLIT_DEPTH 4
-
-static
-void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) {
- u32 gen = 0;
-
- insertion_ordered_set<RoseInVertex> prev({getStart(vg)});
- insertion_ordered_set<RoseInVertex> curr;
-
- while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) {
- curr.clear();
- for (RoseInVertex u : prev) {
- for (auto v : adjacent_vertices_range(u, vg)) {
- curr.insert(v);
- }
- }
-
- insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> rightfixes;
- for (RoseInVertex v : curr) {
- for (const RoseInEdge &e : out_edges_range(v, vg)) {
- if (vg[e].graph) {
- NGHolder *h = vg[e].graph.get();
- rightfixes[h].push_back(e);
- }
- }
- }
-
- for (const auto &m : rightfixes) {
- const NGHolder *h = m.first;
- const auto &edges = m.second;
- lookForCleanSplit(*h, edges, vg, cc);
- }
-
- prev = std::move(curr);
- gen++;
- }
-}
-
-static
-void rehomeEodSuffixes(RoseInGraph &vg) {
- // Find edges to accept with EOD-anchored graphs that we can move over to
- // acceptEod.
- vector<RoseInEdge> acc_edges;
- for (const auto &e : edges_range(vg)) {
- if (vg[target(e, vg)].type != RIV_ACCEPT) {
- continue;
- }
- if (vg[e].haig || !vg[e].graph) {
- continue;
- }
-
- const NGHolder &h = *vg[e].graph;
-
- if (in_degree(h.accept, h)) {
- DEBUG_PRINTF("graph isn't eod anchored\n");
- continue;
- }
-
- acc_edges.push_back(e);
- }
-
- for (const RoseInEdge &e : acc_edges) {
- // Move this edge from accept to acceptEod
- RoseInVertex w = add_vertex(RoseInVertexProps::makeAcceptEod(), vg);
- add_edge(source(e, vg), w, vg[e], vg);
- remove_edge(e, vg);
- }
-
- /* old accept vertices will be tidied up by final pruneUseless() call */
-}
-
-static
-bool tryForEarlyDfa(const NGHolder &h, const CompileContext &cc) {
- switch (h.kind) {
- case NFA_OUTFIX: /* 'prefix' of eod */
- case NFA_PREFIX:
- return cc.grey.earlyMcClellanPrefix;
- case NFA_INFIX:
- return cc.grey.earlyMcClellanInfix;
- case NFA_SUFFIX:
- return cc.grey.earlyMcClellanSuffix;
- default:
- DEBUG_PRINTF("kind %u\n", (u32)h.kind);
- assert(0);
- return false;
- }
-}
-
-static
-vector<vector<CharReach>> getDfaTriggers(RoseInGraph &vg,
- const vector<RoseInEdge> &edges,
- bool *single_trigger) {
- vector<vector<CharReach>> triggers;
- u32 min_offset = ~0U;
- u32 max_offset = 0;
- for (const auto &e : edges) {
- RoseInVertex s = source(e, vg);
- if (vg[s].type == RIV_LITERAL) {
- triggers.push_back(as_cr_seq(vg[s].s));
- }
- ENSURE_AT_LEAST(&max_offset, vg[s].max_offset);
- LIMIT_TO_AT_MOST(&min_offset, vg[s].min_offset);
- }
-
- *single_trigger = min_offset == max_offset;
- DEBUG_PRINTF("trigger offset (%u, %u)\n", min_offset, max_offset);
-
- return triggers;
-}
-
-static
-bool doEarlyDfa(RoseBuild &rose, RoseInGraph &vg, NGHolder &h,
- const vector<RoseInEdge> &edges, bool final_chance,
- const ReportManager &rm, const CompileContext &cc) {
- DEBUG_PRINTF("trying for dfa\n");
-
- bool single_trigger;
- for (const auto &e : edges) {
- if (vg[target(e, vg)].type == RIV_ACCEPT_EOD) {
- /* TODO: support eod prefixes */
- return false;
- }
- }
-
- auto triggers = getDfaTriggers(vg, edges, &single_trigger);
-
- /* TODO: literal delay things */
- if (!generates_callbacks(h)) {
- set_report(h, rose.getNewNfaReport());
- }
-
- shared_ptr<raw_dfa> dfa = buildMcClellan(h, &rm, single_trigger, triggers,
- cc.grey, final_chance);
-
- if (!dfa) {
- return false;
- }
-
- DEBUG_PRINTF("dfa ok\n");
- for (const auto &e : edges) {
- vg[e].dfa = dfa;
- }
-
- return true;
-}
-
-#define MAX_EDGES_FOR_IMPLEMENTABILITY 50
-
-static
-bool splitForImplementability(RoseInGraph &vg, NGHolder &h,
- const vector<RoseInEdge> &edges,
- const CompileContext &cc) {
- vector<pair<ue2_literal, u32>> succ_lits;
- DEBUG_PRINTF("trying to split %s with %zu vertices on %zu edges\n",
- to_string(h.kind).c_str(), num_vertices(h), edges.size());
-
- if (edges.size() > MAX_EDGES_FOR_IMPLEMENTABILITY) {
- return false;
- }
-
- if (!generates_callbacks(h)) {
- for (const auto &e : edges) {
- const auto &lit = vg[target(e, vg)].s;
- u32 delay = vg[e].graph_lag;
- vg[e].graph_lag = 0;
-
- assert(delay <= lit.length());
- succ_lits.emplace_back(lit, delay);
- }
- restoreTrailingLiteralStates(h, succ_lits);
- }
-
- unique_ptr<VertLitInfo> split;
- bool last_chance = true;
- if (h.kind == NFA_PREFIX) {
- auto depths = calcDepths(h);
-
- split = findBestPrefixSplit(h, depths, vg, edges, last_chance, cc);
- } else {
- split = findBestLastChanceSplit(h, vg, edges, cc);
- }
-
- if (split && splitRoseEdge(h, vg, edges, *split)) {
- DEBUG_PRINTF("split on simple literal\n");
- return true;
- }
-
- DEBUG_PRINTF("trying to netflow\n");
- bool rv = doNetflowCut(h, nullptr, vg, edges, false, cc.grey);
- DEBUG_PRINTF("done\n");
-
- return rv;
-}
-
-#define MAX_IMPLEMENTABLE_SPLITS 50
-
-bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes,
- bool final_chance, const ReportManager &rm,
- const CompileContext &cc) {
- DEBUG_PRINTF("checking for impl %d\n", final_chance);
- bool changed = false;
- bool need_to_recalc = false;
- u32 added_count = 0;
- unordered_set<shared_ptr<NGHolder>> good; /* known to be implementable */
- do {
- changed = false;
- DEBUG_PRINTF("added %u\n", added_count);
- insertion_ordered_map<shared_ptr<NGHolder>,
- vector<RoseInEdge>> edges_by_graph;
- for (const RoseInEdge &ve : edges_range(vg)) {
- if (vg[ve].graph && !vg[ve].dfa) {
- auto &h = vg[ve].graph;
- edges_by_graph[h].push_back(ve);
- }
- }
- for (auto &m : edges_by_graph) {
- auto &h = m.first;
- if (contains(good, h)) {
- continue;
- }
- reduceGraphEquivalences(*h, cc);
- if (isImplementableNFA(*h, &rm, cc)) {
- good.insert(h);
- continue;
- }
-
- const auto &edges = m.second;
-
- if (tryForEarlyDfa(*h, cc) &&
- doEarlyDfa(rose, vg, *h, edges, final_chance, rm, cc)) {
- continue;
- }
-
- DEBUG_PRINTF("eek\n");
- if (!allow_changes) {
- return false;
- }
-
- if (splitForImplementability(vg, *h, edges, cc)) {
- added_count++;
- if (added_count > MAX_IMPLEMENTABLE_SPLITS) {
- DEBUG_PRINTF("added_count hit limit\n");
- return false;
- }
- changed = true;
- continue;
- }
-
- return false;
- }
-
- assert(added_count <= MAX_IMPLEMENTABLE_SPLITS);
-
- if (changed) {
- removeRedundantLiterals(vg, cc);
- pruneUseless(vg);
- need_to_recalc = true;
- }
- } while (changed);
-
- if (need_to_recalc) {
- renumber_vertices(vg);
- calcVertexOffsets(vg);
- }
-
- DEBUG_PRINTF("ok!\n");
- return true;
-}
-
-static
-RoseInGraph doInitialVioletTransform(const NGHolder &h, bool last_chance,
- const CompileContext &cc) {
- assert(!can_never_match(h));
-
- RoseInGraph vg = populateTrivialGraph(h);
-
- if (!cc.grey.allowViolet) {
- return vg;
- }
-
- /* Avoid running the Violet analysis at all on graphs with no vertices with
- * small reach, since we will not be able to extract any literals. */
- if (!hasNarrowReachVertex(h)) {
- DEBUG_PRINTF("fail, no vertices with small reach\n");
- return vg;
- }
-
- DEBUG_PRINTF("hello world\n");
-
- /* Step 1: avoid outfixes as we always have to run them. */
- avoidOutfixes(vg, last_chance, cc);
-
- if (num_vertices(vg) <= 2) {
- return vg; /* unable to transform pattern */
- }
-
- removeRedundantPrefixes(vg);
- dumpPreRoseGraph(vg, cc.grey, "pre_prefix_rose.dot");
-
- /* Step 2: avoid non-transient prefixes (esp in streaming mode) */
- findBetterPrefixes(vg, cc);
-
- dumpPreRoseGraph(vg, cc.grey, "post_prefix_rose.dot");
-
- extractStrongLiterals(vg, cc);
- dumpPreRoseGraph(vg, cc.grey, "post_extract_rose.dot");
- improveWeakInfixes(vg, cc);
- dumpPreRoseGraph(vg, cc.grey, "post_infix_rose.dot");
-
- /* Step 3: avoid output exposed engines if there is a strong trailing
- literal) */
- avoidSuffixes(vg, cc);
-
- /* Step 4: look for infixes/suffixes with leading .*literals
- * This can reduce the amount of work a heavily picked literal has to do and
- * reduce the amount of state used as .* is handled internally to rose. */
- lookForDoubleCut(vg, cc);
-
- if (cc.streaming) {
- lookForCleanEarlySplits(vg, cc);
- decomposeLiteralChains(vg, cc);
- }
-
- rehomeEodSuffixes(vg);
- removeRedundantLiterals(vg, cc);
-
- pruneUseless(vg);
- dumpPreRoseGraph(vg, cc.grey);
- renumber_vertices(vg);
- calcVertexOffsets(vg);
-
- return vg;
-}
-
-bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter,
- bool last_chance, const ReportManager &rm,
- const CompileContext &cc) {
- auto vg = doInitialVioletTransform(h, last_chance, cc);
- if (num_vertices(vg) <= 2) {
- return false;
- }
-
- /* Step 5: avoid unimplementable, or overly large engines if possible */
- if (!ensureImplementable(rose, vg, last_chance, last_chance, rm, cc)) {
- return false;
- }
- dumpPreRoseGraph(vg, cc.grey, "post_ensure_rose.dot");
-
- /* Step 6: send to rose */
- bool rv = rose.addRose(vg, prefilter);
- DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail");
- return rv;
-}
-
-bool checkViolet(const ReportManager &rm, const NGHolder &h, bool prefilter,
- const CompileContext &cc) {
- auto vg = doInitialVioletTransform(h, true, cc);
- if (num_vertices(vg) <= 2) {
- return false;
- }
-
- bool rv = roseCheckRose(vg, prefilter, rm, cc);
- DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail");
- return rv;
-}
-
-}
+
+ for (const auto &e : ee) {
+ shared_ptr<NGHolder> hh = cloneHolder(h);
+ auto succ_lit = vg[target(e, vg)].s;
+ assert(isCorrectlyTopped(*hh));
+ u32 delay = removeTrailingLiteralStates(*hh, succ_lit,
+ succ_lit.length(),
+ false /* can't overhang start */);
+ if (!delay) {
+ DEBUG_PRINTF("could not remove any literal, skip over\n");
+ continue;
+ }
+
+ assert(isCorrectlyTopped(*hh));
+ trimmed[hh].emplace_back(e, delay);
+ }
+
+ if (trimmed.size() == 1) {
+ return false;
+ }
+
+ /* shift the contents to a vector so we can modify the graphs without
+ * violating the map's invariants. */
+ vector<pair<shared_ptr<NGHolder>, vector<pair<RoseInEdge, u32> > > >
+ trimmed_vec(trimmed.begin(), trimmed.end());
+ trimmed.clear();
+ for (auto &elem : trimmed_vec) {
+ shared_ptr<NGHolder> &hp = elem.first;
+ vector<pair<ue2_literal, u32>> succ_lits;
+
+ for (const auto &edge_delay : elem.second) {
+ const RoseInEdge &e = edge_delay.first;
+ u32 delay = edge_delay.second;
+ auto lit = vg[target(e, vg)].s;
+
+ vg[e].graph = hp;
+ assert(delay <= lit.length());
+ succ_lits.emplace_back(lit, delay);
+ }
+ restoreTrailingLiteralStates(*hp, succ_lits);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+#define MAX_FIND_BETTER_PREFIX_GEN 4
+#define MAX_FIND_BETTER_PREFIX_COUNT 100
+
+static
+void findBetterPrefixes(RoseInGraph &vg, const CompileContext &cc) {
+ STAGE_DEBUG_PRINTF("FIND BETTER PREFIXES\n");
+ RoseInVertex start = getStart(vg);
+
+ insertion_ordered_map<NGHolder *, vector<RoseInEdge>> prefixes;
+ bool changed;
+ u32 gen = 0;
+ do {
+ DEBUG_PRINTF("gen %u\n", gen);
+ changed = false;
+ prefixes.clear();
+
+ /* find prefixes */
+ for (const RoseInEdge &e : out_edges_range(start, vg)) {
+ /* outfixes shouldn't have made it this far */
+ assert(vg[target(e, vg)].type == RIV_LITERAL);
+ if (vg[e].graph) {
+ NGHolder *h = vg[e].graph.get();
+ prefixes[h].push_back(e);
+ }
+ }
+
+ if (prefixes.size() > MAX_FIND_BETTER_PREFIX_COUNT) {
+ break;
+ }
+
+ /* look for bad prefixes and try to split */
+ for (const auto &m : prefixes) {
+ NGHolder *h = m.first;
+ const auto &edges = m.second;
+ depth max_width = findMaxWidth(*h);
+ if (willBeTransient(max_width, cc)
+ || willBeAnchoredTable(max_width, cc.grey)) {
+ continue;
+ }
+
+ changed = improvePrefix(*h, vg, edges, cc);
+ }
+ } while (changed && gen++ < MAX_FIND_BETTER_PREFIX_GEN);
+}
+
+#define STRONG_LITERAL_LENGTH 20
+#define MAX_EXTRACT_STRONG_LITERAL_GRAPHS 10
+
+static
+bool extractStrongLiteral(NGHolder &h, RoseInGraph &vg,
+ const vector<RoseInEdge> &ee,
+ const CompileContext &cc) {
+ DEBUG_PRINTF("looking for string literal\n");
+ unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
+
+ if (split && min_len(split->lit) >= STRONG_LITERAL_LENGTH) {
+ DEBUG_PRINTF("splitting simple literal\n");
+ return splitRoseEdge(h, vg, ee, *split);
+ }
+
+ return false;
+}
+
+static
+void extractStrongLiterals(RoseInGraph &vg, const CompileContext &cc) {
+ if (!cc.grey.violetExtractStrongLiterals) {
+ return;
+ }
+
+ STAGE_DEBUG_PRINTF("EXTRACT STRONG LITERALS\n");
+
+ unordered_set<NGHolder *> stuck;
+ insertion_ordered_map<NGHolder *, vector<RoseInEdge>> edges_by_graph;
+ bool changed;
+
+ do {
+ changed = false;
+
+ edges_by_graph.clear();
+ for (const RoseInEdge &ve : edges_range(vg)) {
+ if (vg[source(ve, vg)].type != RIV_LITERAL) {
+ continue;
+ }
+
+ if (vg[ve].graph) {
+ NGHolder *h = vg[ve].graph.get();
+ edges_by_graph[h].push_back(ve);
+ }
+ }
+
+ if (edges_by_graph.size() > MAX_EXTRACT_STRONG_LITERAL_GRAPHS) {
+ DEBUG_PRINTF("too many graphs, stopping\n");
+ return;
+ }
+
+ for (const auto &m : edges_by_graph) {
+ NGHolder *g = m.first;
+ const auto &edges = m.second;
+ if (contains(stuck, g)) {
+ DEBUG_PRINTF("already known to be bad\n");
+ continue;
+ }
+ bool rv = extractStrongLiteral(*g, vg, edges, cc);
+ if (rv) {
+ changed = true;
+ } else {
+ stuck.insert(g);
+ }
+ }
+ } while (changed);
+}
+
+#define INFIX_STRONG_GUARD_LEN 8
+#define INFIX_MIN_SPLIT_LITERAL_LEN 12
+
+static
+bool improveInfix(NGHolder &h, RoseInGraph &vg, const vector<RoseInEdge> &ee,
+ const CompileContext &cc) {
+ unique_ptr<VertLitInfo> split = findBestNormalSplit(h, vg, ee, cc);
+
+ if (split && min_len(split->lit) >= INFIX_MIN_SPLIT_LITERAL_LEN
+ && splitRoseEdge(h, vg, ee, *split)) {
+ DEBUG_PRINTF("splitting simple literal\n");
+ return true;
+ }
+
+ DEBUG_PRINTF("trying for a netflow cut\n");
+ /* look for netflow cuts which don't produce good prefixes */
+ bool rv = doNetflowCut(h, nullptr, vg, ee, false, cc.grey, 8);
+
+ DEBUG_PRINTF("did netfow cut? = %d\n", (int)rv);
+
+ return rv;
+}
+
+/**
+ * Infixes which are weakly guarded can, in effect, act like prefixes as they
+ * will often be live. We should try to split these infixes further if they
+ * contain strong literals so that we are at least running smaller weak infixes
+ * which can hopeful be accelerated/miracled.
+ */
+static
+void improveWeakInfixes(RoseInGraph &vg, const CompileContext &cc) {
+ if (!cc.grey.violetAvoidWeakInfixes) {
+ return;
+ }
+ STAGE_DEBUG_PRINTF("IMPROVE WEAK INFIXES\n");
+
+ RoseInVertex start = getStart(vg);
+
+ unordered_set<NGHolder *> weak;
+
+ for (RoseInVertex vv : adjacent_vertices_range(start, vg)) {
+ /* outfixes shouldn't have made it this far */
+ assert(vg[vv].type == RIV_LITERAL);
+ if (vg[vv].s.length() >= INFIX_STRONG_GUARD_LEN) {
+ continue;
+ }
+
+ for (const RoseInEdge &e : out_edges_range(vv, vg)) {
+ if (vg[target(e, vg)].type != RIV_LITERAL || !vg[e].graph) {
+ continue;
+ }
+
+ NGHolder *h = vg[e].graph.get();
+ DEBUG_PRINTF("'%s' guards %p\n", dumpString(vg[vv].s).c_str(), h);
+ weak.insert(h);
+ }
+ }
+
+ insertion_ordered_map<NGHolder *, vector<RoseInEdge>> weak_edges;
+ for (const RoseInEdge &ve : edges_range(vg)) {
+ NGHolder *h = vg[ve].graph.get();
+ if (contains(weak, h)) {
+ weak_edges[h].push_back(ve);
+ }
+ }
+
+ for (const auto &m : weak_edges) {
+ NGHolder *h = m.first;
+ const auto &edges = m.second;
+ improveInfix(*h, vg, edges, cc);
+ }
+}
+
+static
+void splitEdgesForSuffix(const NGHolder &base_graph, RoseInGraph &vg,
+ const vector<RoseInEdge> &ee, const VertLitInfo &split,
+ bool eod, const flat_set<ReportID> &reports) {
+ const vector<NFAVertex> &splitters = split.vv;
+ assert(!splitters.empty());
+
+ shared_ptr<NGHolder> lhs = make_shared<NGHolder>();
+ unordered_map<NFAVertex, NFAVertex> v_map;
+ cloneHolder(*lhs, base_graph, &v_map);
+ lhs->kind = NFA_INFIX;
+ clear_in_edges(lhs->accept, *lhs);
+ clear_in_edges(lhs->acceptEod, *lhs);
+ add_edge(lhs->accept, lhs->acceptEod, *lhs);
+ clearReports(*lhs);
+ for (NFAVertex v : splitters) {
+ NFAEdge e = add_edge(v_map[v], lhs->accept, *lhs);
+ if (v == base_graph.start) {
+ (*lhs)[e].tops.insert(DEFAULT_TOP);
+ }
+ (*lhs)[v_map[v]].reports.insert(0);
+
+ }
+ pruneUseless(*lhs);
+ assert(isCorrectlyTopped(*lhs));
+
+ /* create literal vertices and connect preds */
+ for (const auto &lit : split.lit) {
+ if (!can_match(*lhs, lit, is_triggered(*lhs))) {
+ continue;
+ }
+
+ DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str());
+ RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), vg);
+
+ RoseInVertex tt;
+ if (eod) {
+ DEBUG_PRINTF("doing eod\n");
+ tt = add_vertex(RoseInVertexProps::makeAcceptEod(reports), vg);
+ } else {
+ DEBUG_PRINTF("doing non-eod\n");
+ tt = add_vertex(RoseInVertexProps::makeAccept(reports), vg);
+ }
+ add_edge(v, tt, RoseInEdgeProps(0U, 0U), vg);
+
+ for (const RoseInEdge &e : ee) {
+ RoseInVertex u = source(e, vg);
+ assert(!edge(u, v, vg).second);
+ add_edge(u, v, RoseInEdgeProps(lhs, 0U), vg);
+ }
+ }
+}
+
+#define MIN_SUFFIX_LEN 6
+
+static
+bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg,
+ const vector<RoseInEdge> &suffix_edges,
+ const CompileContext &cc) {
+ DEBUG_PRINTF("inspecting suffix : %p on %zu edges\n", &h,
+ suffix_edges.size());
+ /*
+ * We would, in general, rather not have output exposed engines because
+ * once they are triggered, they must be run while infixes only have to run
+ * if the successor literal is seen. Matches from output exposed engines
+ * also have to be placed in a priority queue and interleaved with matches
+ * from other sources.
+ *
+ * Note:
+ * - if the LHS is extremely unlikely we may be better off leaving
+ * a suffix unguarded.
+ *
+ * - limited width suffixes may be less bad as they won't be continuously
+ * active, we may want to have (a) stronger controls on if we want to pick
+ * a trailing literal in these cases and/or (b) look also for literals
+ * near accept as well as right on accept
+ *
+ * TODO: improve heuristics, splitting logic.
+ */
+
+ /* we may do multiple splits corresponding to different report behaviour */
+ set<NFAVertex> seen;
+ map<pair<bool, flat_set<ReportID> >, VertLitInfo> by_reports; /* eod, rep */
+
+ for (NFAVertex v : inv_adjacent_vertices_range(h.accept, h)) {
+ set<ue2_literal> ss = getLiteralSet(h, v, false);
+ if (ss.empty()) {
+ DEBUG_PRINTF("candidate is too shitty\n");
+ return false;
+ }
+
+ VertLitInfo &vli = by_reports[make_pair(false, h[v].reports)];
+ insert(&vli.lit, ss);
+ vli.vv.push_back(v);
+ seen.insert(v);
+ }
+
+ seen.insert(h.accept);
+ for (NFAVertex v : inv_adjacent_vertices_range(h.acceptEod, h)) {
+ if (contains(seen, v)) {
+ continue;
+ }
+
+ set<ue2_literal> ss = getLiteralSet(h, v, false);
+ if (ss.empty()) {
+ DEBUG_PRINTF("candidate is too shitty\n");
+ return false;
+ }
+
+ VertLitInfo &vli = by_reports[make_pair(true, h[v].reports)];
+ insert(&vli.lit, ss);
+ vli.vv.push_back(v);
+ }
+
+ assert(!by_reports.empty());
+
+ /* TODO: how strong a min len do we want here ? */
+ u32 min_len = cc.grey.minRoseLiteralLength;
+ ENSURE_AT_LEAST(&min_len, MIN_SUFFIX_LEN);
+
+ for (auto &vli : by_reports | map_values) {
+ u64a score = sanitizeAndCompressAndScore(vli.lit);
+
+ if (vli.lit.empty()
+ || !validateRoseLiteralSetQuality(vli.lit, score, false, min_len,
+ false, false)) {
+ return false;
+ }
+ }
+
+ for (const auto &info : by_reports) {
+ DEBUG_PRINTF("splitting on simple literals\n");
+ splitEdgesForSuffix(h, vg, suffix_edges, info.second,
+ info.first.first /* eod */,
+ info.first.second /* reports */);
+ }
+
+ for (const RoseInEdge &e : suffix_edges) {
+ remove_edge(e, vg);
+ }
+ return true;
+}
+
+static
+void avoidSuffixes(RoseInGraph &vg, const CompileContext &cc) {
+ if (!cc.grey.violetAvoidSuffixes) {
+ return;
+ }
+
+ STAGE_DEBUG_PRINTF("AVOID SUFFIXES\n");
+
+ RoseInVertex accept = getPrimaryAccept(vg);
+
+ insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> suffixes;
+
+ /* find suffixes */
+ for (const RoseInEdge &e : in_edges_range(accept, vg)) {
+ /* outfixes shouldn't have made it this far */
+ assert(vg[source(e, vg)].type == RIV_LITERAL);
+ assert(vg[e].graph); /* non suffix paths should be wired to other
+ accepts */
+ const NGHolder *h = vg[e].graph.get();
+ suffixes[h].push_back(e);
+ }
+
+ /* look at suffixes and try to split */
+ for (const auto &m : suffixes) {
+ const NGHolder *h = m.first;
+ const auto &edges = m.second;
+ replaceSuffixWithInfix(*h, vg, edges, cc);
+ }
+}
+
+static
+bool leadingDotStartLiteral(const NGHolder &h, VertLitInfo *out) {
+ if (out_degree(h.start, h) != 3) {
+ return false;
+ }
+
+ NFAVertex v = NGHolder::null_vertex();
+ NFAVertex ds = NGHolder::null_vertex();
+
+ for (NFAVertex a : adjacent_vertices_range(h.start, h)) {
+ if (a == h.startDs) {
+ continue;
+ }
+ if (h[a].char_reach.all()) {
+ ds = a;
+ if (out_degree(ds, h) != 2 || !edge(ds, ds, h).second) {
+ return false;
+ }
+ } else {
+ v = a;
+ }
+ }
+
+ if (!v || !ds || !edge(ds, v, h).second) {
+ return false;
+ }
+
+ if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) {
+ return false;
+ }
+
+ ue2_literal lit;
+ lit.push_back(h[v].char_reach.find_first(),
+ h[v].char_reach.isCaselessChar());
+ while (out_degree(v, h) == 1) {
+ NFAVertex vv = *adjacent_vertices(v, h).first;
+ if (h[vv].char_reach.count() != 1
+ && !h[vv].char_reach.isCaselessChar()) {
+ break;
+ }
+
+ v = vv;
+
+ lit.push_back(h[v].char_reach.find_first(),
+ h[v].char_reach.isCaselessChar());
+ }
+
+ if (is_match_vertex(v, h) && h.kind != NFA_SUFFIX) {
+ /* we have rediscovered the post-infix literal */
+ return false;
+ }
+
+ if (bad_mixed_sensitivity(lit)) {
+ make_nocase(&lit);
+ }
+
+ DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str());
+ out->vv = {v};
+ out->lit = {lit};
+ return true;
+}
+
+static
+bool lookForDoubleCut(const NGHolder &h, const vector<RoseInEdge> &ee,
+ RoseInGraph &vg, const Grey &grey) {
+ VertLitInfo info;
+ if (!leadingDotStartLiteral(h, &info)
+ || min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
+ return false;
+ }
+ DEBUG_PRINTF("performing split\n");
+ return splitRoseEdge(h, vg, ee, {info});
+}
+
+static
+void lookForDoubleCut(RoseInGraph &vg, const CompileContext &cc) {
+ if (!cc.grey.violetDoubleCut) {
+ return;
+ }
+
+ insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> right_edges;
+ for (const RoseInEdge &ve : edges_range(vg)) {
+ if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) {
+ const NGHolder *h = vg[ve].graph.get();
+ right_edges[h].push_back(ve);
+ }
+ }
+
+ for (const auto &m : right_edges) {
+ const NGHolder *h = m.first;
+ const auto &edges = m.second;
+ lookForDoubleCut(*h, edges, vg, cc.grey);
+ }
+}
+
+static
+pair<NFAVertex, ue2_literal> findLiteralBefore(const NGHolder &h, NFAVertex v) {
+ ue2_literal lit;
+ if (h[v].char_reach.count() != 1 && !h[v].char_reach.isCaselessChar()) {
+ return {v, std::move(lit) };
+ }
+ lit.push_back(h[v].char_reach.find_first(),
+ h[v].char_reach.isCaselessChar());
+
+ while (in_degree(v, h) == 1) {
+ NFAVertex vv = *inv_adjacent_vertices(v, h).first;
+ if (h[vv].char_reach.count() != 1
+ && !h[vv].char_reach.isCaselessChar()) {
+ break;
+ }
+
+ lit.push_back(h[vv].char_reach.find_first(),
+ h[vv].char_reach.isCaselessChar());
+ v = vv;
+ }
+
+ return {v, std::move(lit) };
+}
+
+static
+bool lookForDotStarPred(NFAVertex v, const NGHolder &h,
+ NFAVertex *u, NFAVertex *ds) {
+ *u = NGHolder::null_vertex();
+ *ds = NGHolder::null_vertex();
+ for (NFAVertex a : inv_adjacent_vertices_range(v, h)) {
+ if (h[a].char_reach.all()) {
+ if (!edge(a, a, h).second) {
+ return false;
+ }
+
+ if (*ds) {
+ return false;
+ }
+
+ *ds = a;
+ } else {
+ if (*u) {
+ return false;
+ }
+ *u = a;
+ }
+ }
+
+ if (!*u || !*ds) {
+ return false;
+ }
+
+ return true;
+}
+
+static
+bool trailingDotStarLiteral(const NGHolder &h, VertLitInfo *out) {
+ /* Note: there is no delay yet - so the final literal is the already
+ * discovered successor literal - we are in fact interested in the literal
+ * before it. */
+
+ if (in_degree(h.accept, h) != 1) {
+ return false;
+ }
+
+ if (in_degree(h.acceptEod, h) != 1) {
+ assert(0);
+ return false;
+ }
+
+ NFAVertex v
+ = findLiteralBefore(h, *inv_adjacent_vertices(h.accept, h).first).first;
+
+ NFAVertex u;
+ NFAVertex ds;
+
+ if (!lookForDotStarPred(v, h, &u, &ds)) {
+ return false;
+ }
+
+ v = u;
+ auto rv = findLiteralBefore(h, v);
+
+ if (!lookForDotStarPred(v, h, &u, &ds)) {
+ return false;
+ }
+
+ ue2_literal lit = reverse_literal(rv.second);
+ DEBUG_PRINTF("%zu found %s\n", h[v].index, dumpString(lit).c_str());
+
+ if (bad_mixed_sensitivity(lit)) {
+ make_nocase(&lit);
+ }
+
+ out->vv = {v};
+ out->lit = {lit};
+ return true;
+}
+
+static
+bool lookForTrailingLiteralDotStar(const NGHolder &h,
+ const vector<RoseInEdge> &ee,
+ RoseInGraph &vg, const Grey &grey) {
+ VertLitInfo info;
+ if (!trailingDotStarLiteral(h, &info)
+ || min_len(info.lit) < grey.violetDoubleCutLiteralLen) {
+ return false;
+ }
+ DEBUG_PRINTF("performing split\n");
+ return splitRoseEdge(h, vg, ee, info);
+}
+
+/* In streaming mode, active engines have to be caught up at stream boundaries
+ * and have to be stored in stream state, so we prefer to decompose patterns
+ * in to literals with no state between them if possible. */
+static
+void decomposeLiteralChains(RoseInGraph &vg, const CompileContext &cc) {
+ if (!cc.grey.violetLiteralChains) {
+ return;
+ }
+
+ insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> right_edges;
+ bool changed;
+ do {
+ changed = false;
+
+ right_edges.clear();
+ for (const RoseInEdge &ve : edges_range(vg)) {
+ if (vg[ve].graph && vg[source(ve, vg)].type == RIV_LITERAL) {
+ const NGHolder *h = vg[ve].graph.get();
+ right_edges[h].push_back(ve);
+ }
+ }
+
+ for (const auto &m : right_edges) {
+ const NGHolder *h = m.first;
+ const vector<RoseInEdge> &ee = m.second;
+ bool rv = lookForDoubleCut(*h, ee, vg, cc.grey);
+ if (!rv && h->kind != NFA_SUFFIX) {
+ rv = lookForTrailingLiteralDotStar(*h, ee, vg, cc.grey);
+ }
+ changed |= rv;
+ }
+ } while (changed);
+}
+
+static
+bool lookForCleanSplit(const NGHolder &h, const vector<RoseInEdge> &ee,
+ RoseInGraph &vg, const CompileContext &cc) {
+ unique_ptr<VertLitInfo> split = findBestCleanSplit(h, cc);
+
+ if (split) {
+ return splitRoseEdge(h, vg, {ee}, *split);
+ }
+
+ return false;
+}
+
+#define MAX_DESIRED_CLEAN_SPLIT_DEPTH 4
+
+static
+void lookForCleanEarlySplits(RoseInGraph &vg, const CompileContext &cc) {
+ u32 gen = 0;
+
+ insertion_ordered_set<RoseInVertex> prev({getStart(vg)});
+ insertion_ordered_set<RoseInVertex> curr;
+
+ while (gen < MAX_DESIRED_CLEAN_SPLIT_DEPTH) {
+ curr.clear();
+ for (RoseInVertex u : prev) {
+ for (auto v : adjacent_vertices_range(u, vg)) {
+ curr.insert(v);
+ }
+ }
+
+ insertion_ordered_map<const NGHolder *, vector<RoseInEdge>> rightfixes;
+ for (RoseInVertex v : curr) {
+ for (const RoseInEdge &e : out_edges_range(v, vg)) {
+ if (vg[e].graph) {
+ NGHolder *h = vg[e].graph.get();
+ rightfixes[h].push_back(e);
+ }
+ }
+ }
+
+ for (const auto &m : rightfixes) {
+ const NGHolder *h = m.first;
+ const auto &edges = m.second;
+ lookForCleanSplit(*h, edges, vg, cc);
+ }
+
+ prev = std::move(curr);
+ gen++;
+ }
+}
+
+static
+void rehomeEodSuffixes(RoseInGraph &vg) {
+ // Find edges to accept with EOD-anchored graphs that we can move over to
+ // acceptEod.
+ vector<RoseInEdge> acc_edges;
+ for (const auto &e : edges_range(vg)) {
+ if (vg[target(e, vg)].type != RIV_ACCEPT) {
+ continue;
+ }
+ if (vg[e].haig || !vg[e].graph) {
+ continue;
+ }
+
+ const NGHolder &h = *vg[e].graph;
+
+ if (in_degree(h.accept, h)) {
+ DEBUG_PRINTF("graph isn't eod anchored\n");
+ continue;
+ }
+
+ acc_edges.push_back(e);
+ }
+
+ for (const RoseInEdge &e : acc_edges) {
+ // Move this edge from accept to acceptEod
+ RoseInVertex w = add_vertex(RoseInVertexProps::makeAcceptEod(), vg);
+ add_edge(source(e, vg), w, vg[e], vg);
+ remove_edge(e, vg);
+ }
+
+ /* old accept vertices will be tidied up by final pruneUseless() call */
+}
+
+static
+bool tryForEarlyDfa(const NGHolder &h, const CompileContext &cc) {
+ switch (h.kind) {
+ case NFA_OUTFIX: /* 'prefix' of eod */
+ case NFA_PREFIX:
+ return cc.grey.earlyMcClellanPrefix;
+ case NFA_INFIX:
+ return cc.grey.earlyMcClellanInfix;
+ case NFA_SUFFIX:
+ return cc.grey.earlyMcClellanSuffix;
+ default:
+ DEBUG_PRINTF("kind %u\n", (u32)h.kind);
+ assert(0);
+ return false;
+ }
+}
+
+static
+vector<vector<CharReach>> getDfaTriggers(RoseInGraph &vg,
+ const vector<RoseInEdge> &edges,
+ bool *single_trigger) {
+ vector<vector<CharReach>> triggers;
+ u32 min_offset = ~0U;
+ u32 max_offset = 0;
+ for (const auto &e : edges) {
+ RoseInVertex s = source(e, vg);
+ if (vg[s].type == RIV_LITERAL) {
+ triggers.push_back(as_cr_seq(vg[s].s));
+ }
+ ENSURE_AT_LEAST(&max_offset, vg[s].max_offset);
+ LIMIT_TO_AT_MOST(&min_offset, vg[s].min_offset);
+ }
+
+ *single_trigger = min_offset == max_offset;
+ DEBUG_PRINTF("trigger offset (%u, %u)\n", min_offset, max_offset);
+
+ return triggers;
+}
+
+static
+bool doEarlyDfa(RoseBuild &rose, RoseInGraph &vg, NGHolder &h,
+ const vector<RoseInEdge> &edges, bool final_chance,
+ const ReportManager &rm, const CompileContext &cc) {
+ DEBUG_PRINTF("trying for dfa\n");
+
+ bool single_trigger;
+ for (const auto &e : edges) {
+ if (vg[target(e, vg)].type == RIV_ACCEPT_EOD) {
+ /* TODO: support eod prefixes */
+ return false;
+ }
+ }
+
+ auto triggers = getDfaTriggers(vg, edges, &single_trigger);
+
+ /* TODO: literal delay things */
+ if (!generates_callbacks(h)) {
+ set_report(h, rose.getNewNfaReport());
+ }
+
+ shared_ptr<raw_dfa> dfa = buildMcClellan(h, &rm, single_trigger, triggers,
+ cc.grey, final_chance);
+
+ if (!dfa) {
+ return false;
+ }
+
+ DEBUG_PRINTF("dfa ok\n");
+ for (const auto &e : edges) {
+ vg[e].dfa = dfa;
+ }
+
+ return true;
+}
+
+#define MAX_EDGES_FOR_IMPLEMENTABILITY 50
+
+static
+bool splitForImplementability(RoseInGraph &vg, NGHolder &h,
+ const vector<RoseInEdge> &edges,
+ const CompileContext &cc) {
+ vector<pair<ue2_literal, u32>> succ_lits;
+ DEBUG_PRINTF("trying to split %s with %zu vertices on %zu edges\n",
+ to_string(h.kind).c_str(), num_vertices(h), edges.size());
+
+ if (edges.size() > MAX_EDGES_FOR_IMPLEMENTABILITY) {
+ return false;
+ }
+
+ if (!generates_callbacks(h)) {
+ for (const auto &e : edges) {
+ const auto &lit = vg[target(e, vg)].s;
+ u32 delay = vg[e].graph_lag;
+ vg[e].graph_lag = 0;
+
+ assert(delay <= lit.length());
+ succ_lits.emplace_back(lit, delay);
+ }
+ restoreTrailingLiteralStates(h, succ_lits);
+ }
+
+ unique_ptr<VertLitInfo> split;
+ bool last_chance = true;
+ if (h.kind == NFA_PREFIX) {
+ auto depths = calcDepths(h);
+
+ split = findBestPrefixSplit(h, depths, vg, edges, last_chance, cc);
+ } else {
+ split = findBestLastChanceSplit(h, vg, edges, cc);
+ }
+
+ if (split && splitRoseEdge(h, vg, edges, *split)) {
+ DEBUG_PRINTF("split on simple literal\n");
+ return true;
+ }
+
+ DEBUG_PRINTF("trying to netflow\n");
+ bool rv = doNetflowCut(h, nullptr, vg, edges, false, cc.grey);
+ DEBUG_PRINTF("done\n");
+
+ return rv;
+}
+
+#define MAX_IMPLEMENTABLE_SPLITS 50
+
+bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes,
+ bool final_chance, const ReportManager &rm,
+ const CompileContext &cc) {
+ DEBUG_PRINTF("checking for impl %d\n", final_chance);
+ bool changed = false;
+ bool need_to_recalc = false;
+ u32 added_count = 0;
+ unordered_set<shared_ptr<NGHolder>> good; /* known to be implementable */
+ do {
+ changed = false;
+ DEBUG_PRINTF("added %u\n", added_count);
+ insertion_ordered_map<shared_ptr<NGHolder>,
+ vector<RoseInEdge>> edges_by_graph;
+ for (const RoseInEdge &ve : edges_range(vg)) {
+ if (vg[ve].graph && !vg[ve].dfa) {
+ auto &h = vg[ve].graph;
+ edges_by_graph[h].push_back(ve);
+ }
+ }
+ for (auto &m : edges_by_graph) {
+ auto &h = m.first;
+ if (contains(good, h)) {
+ continue;
+ }
+ reduceGraphEquivalences(*h, cc);
+ if (isImplementableNFA(*h, &rm, cc)) {
+ good.insert(h);
+ continue;
+ }
+
+ const auto &edges = m.second;
+
+ if (tryForEarlyDfa(*h, cc) &&
+ doEarlyDfa(rose, vg, *h, edges, final_chance, rm, cc)) {
+ continue;
+ }
+
+ DEBUG_PRINTF("eek\n");
+ if (!allow_changes) {
+ return false;
+ }
+
+ if (splitForImplementability(vg, *h, edges, cc)) {
+ added_count++;
+ if (added_count > MAX_IMPLEMENTABLE_SPLITS) {
+ DEBUG_PRINTF("added_count hit limit\n");
+ return false;
+ }
+ changed = true;
+ continue;
+ }
+
+ return false;
+ }
+
+ assert(added_count <= MAX_IMPLEMENTABLE_SPLITS);
+
+ if (changed) {
+ removeRedundantLiterals(vg, cc);
+ pruneUseless(vg);
+ need_to_recalc = true;
+ }
+ } while (changed);
+
+ if (need_to_recalc) {
+ renumber_vertices(vg);
+ calcVertexOffsets(vg);
+ }
+
+ DEBUG_PRINTF("ok!\n");
+ return true;
+}
+
+static
+RoseInGraph doInitialVioletTransform(const NGHolder &h, bool last_chance,
+ const CompileContext &cc) {
+ assert(!can_never_match(h));
+
+ RoseInGraph vg = populateTrivialGraph(h);
+
+ if (!cc.grey.allowViolet) {
+ return vg;
+ }
+
+ /* Avoid running the Violet analysis at all on graphs with no vertices with
+ * small reach, since we will not be able to extract any literals. */
+ if (!hasNarrowReachVertex(h)) {
+ DEBUG_PRINTF("fail, no vertices with small reach\n");
+ return vg;
+ }
+
+ DEBUG_PRINTF("hello world\n");
+
+ /* Step 1: avoid outfixes as we always have to run them. */
+ avoidOutfixes(vg, last_chance, cc);
+
+ if (num_vertices(vg) <= 2) {
+ return vg; /* unable to transform pattern */
+ }
+
+ removeRedundantPrefixes(vg);
+ dumpPreRoseGraph(vg, cc.grey, "pre_prefix_rose.dot");
+
+ /* Step 2: avoid non-transient prefixes (esp in streaming mode) */
+ findBetterPrefixes(vg, cc);
+
+ dumpPreRoseGraph(vg, cc.grey, "post_prefix_rose.dot");
+
+ extractStrongLiterals(vg, cc);
+ dumpPreRoseGraph(vg, cc.grey, "post_extract_rose.dot");
+ improveWeakInfixes(vg, cc);
+ dumpPreRoseGraph(vg, cc.grey, "post_infix_rose.dot");
+
+ /* Step 3: avoid output exposed engines if there is a strong trailing
+ literal) */
+ avoidSuffixes(vg, cc);
+
+ /* Step 4: look for infixes/suffixes with leading .*literals
+ * This can reduce the amount of work a heavily picked literal has to do and
+ * reduce the amount of state used as .* is handled internally to rose. */
+ lookForDoubleCut(vg, cc);
+
+ if (cc.streaming) {
+ lookForCleanEarlySplits(vg, cc);
+ decomposeLiteralChains(vg, cc);
+ }
+
+ rehomeEodSuffixes(vg);
+ removeRedundantLiterals(vg, cc);
+
+ pruneUseless(vg);
+ dumpPreRoseGraph(vg, cc.grey);
+ renumber_vertices(vg);
+ calcVertexOffsets(vg);
+
+ return vg;
+}
+
+bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter,
+ bool last_chance, const ReportManager &rm,
+ const CompileContext &cc) {
+ auto vg = doInitialVioletTransform(h, last_chance, cc);
+ if (num_vertices(vg) <= 2) {
+ return false;
+ }
+
+ /* Step 5: avoid unimplementable, or overly large engines if possible */
+ if (!ensureImplementable(rose, vg, last_chance, last_chance, rm, cc)) {
+ return false;
+ }
+ dumpPreRoseGraph(vg, cc.grey, "post_ensure_rose.dot");
+
+ /* Step 6: send to rose */
+ bool rv = rose.addRose(vg, prefilter);
+ DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail");
+ return rv;
+}
+
+bool checkViolet(const ReportManager &rm, const NGHolder &h, bool prefilter,
+ const CompileContext &cc) {
+ auto vg = doInitialVioletTransform(h, true, cc);
+ if (num_vertices(vg) <= 2) {
+ return false;
+ }
+
+ bool rv = roseCheckRose(vg, prefilter, rm, cc);
+ DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail");
+ return rv;
+}
+
+}