diff options
author | Ivan Blinkov <ivan@blinkov.ru> | 2022-02-10 16:47:11 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:11 +0300 |
commit | 5b283123c882433dafbaf6b338adeea16c1a0ea0 (patch) | |
tree | 339adc63bce23800021202ae4a8328a843dc447a /contrib/libs/hyperscan/src/nfagraph/ng_fuzzy.cpp | |
parent | 1aeb9a455974457866f78722ad98114bafc84e8a (diff) | |
download | ydb-5b283123c882433dafbaf6b338adeea16c1a0ea0.tar.gz |
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/nfagraph/ng_fuzzy.cpp')
-rw-r--r-- | contrib/libs/hyperscan/src/nfagraph/ng_fuzzy.cpp | 1414 |
1 files changed, 707 insertions, 707 deletions
diff --git a/contrib/libs/hyperscan/src/nfagraph/ng_fuzzy.cpp b/contrib/libs/hyperscan/src/nfagraph/ng_fuzzy.cpp index 9dd19f05de..78fd862919 100644 --- a/contrib/libs/hyperscan/src/nfagraph/ng_fuzzy.cpp +++ b/contrib/libs/hyperscan/src/nfagraph/ng_fuzzy.cpp @@ -1,707 +1,707 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Graph fuzzer for approximate matching - */ - -#include "ng_fuzzy.h" - -#include "ng.h" -#include "ng_depth.h" -#include "ng_util.h" - -#include <map> -#include <vector> -using namespace std; - -namespace ue2 { - -// returns all successors up to a given depth in a vector of sets, indexed by -// zero-based depth from source vertex -static -vector<flat_set<NFAVertex>> gatherSuccessorsByDepth(const NGHolder &g, - NFAVertex src, u32 depth) { - vector<flat_set<NFAVertex>> result(depth); - flat_set<NFAVertex> cur, next; - - assert(depth > 0); - - // populate current set of successors - for (auto v : adjacent_vertices_range(src, g)) { - // ignore self-loops - if (src == v) { - continue; - } - DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); - cur.insert(v); - } - result[0] = cur; - - for (unsigned d = 1; d < depth; d++) { - // collect all successors for all current level vertices - for (auto v : cur) { - // don't go past special nodes - if (is_special(v, g)) { - continue; - } - - for (auto succ : adjacent_vertices_range(v, g)) { - // ignore self-loops - if (v == succ) { - continue; - } - DEBUG_PRINTF("Node %zu depth %u\n", g[succ].index, d + 1); - next.insert(succ); - } - } - result[d] = next; - next.swap(cur); - next.clear(); - } - - return result; -} - -// returns all predecessors up to a given depth in a vector of sets, indexed by -// zero-based depth from source vertex -static -vector<flat_set<NFAVertex>> gatherPredecessorsByDepth(const NGHolder &g, - NFAVertex src, - u32 depth) { - vector<flat_set<NFAVertex>> result(depth); - flat_set<NFAVertex> cur, next; - - assert(depth > 0); - - // populate current set of successors - for (auto v : inv_adjacent_vertices_range(src, g)) { - // ignore self-loops - if (src == v) { - continue; - } - DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); - cur.insert(v); - } - result[0] = cur; - - for (unsigned d = 1; d < depth; d++) { - // collect all successors for all current level vertices - for (auto v : cur) { - for (auto pred : inv_adjacent_vertices_range(v, g)) { - // ignore self-loops - if (v == pred) { - continue; - } - DEBUG_PRINTF("Node %zu depth %u\n", g[pred].index, d + 1); - next.insert(pred); - } - } - result[d] = next; - next.swap(cur); - next.clear(); - } - - return result; -} - -/* - * This struct produces a fuzzed graph; that is, a graph that is able to match - * the original pattern, as well as input data within a certain edit distance. - * Construct the struct, then call fuzz_graph() to transform the graph. - * - * Terminology used: - * - Shadow vertices: vertices mirroring the original graph at various edit - * distances - * - Shadow graph level: edit distance of a particular shadow graph - * - Helpers: dot vertices assigned to shadow vertices, used for insert/replace - */ -struct ShadowGraph { - NGHolder &g; - u32 edit_distance; - bool hamming; - map<pair<NFAVertex, u32>, NFAVertex> shadow_map; - map<pair<NFAVertex, u32>, NFAVertex> helper_map; - map<NFAVertex, NFAVertex> clones; - // edge creation is deferred - vector<pair<NFAVertex, NFAVertex>> edges_to_be_added; - flat_set<NFAVertex> orig; - - ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in) - : g(g_in), edit_distance(ed_in), hamming(hamm_in) {} - - void fuzz_graph() { - if (edit_distance == 0) { - return; - } - - DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance, - hamming ? "true" : "false"); - - // step 1: prepare the vertices, helpers and shadows according to - // the original graph - prepare_graph(); - - // step 2: add shadow and helper nodes - build_shadow_graph(); - - // step 3: set up reports for newly created vertices (and make clones - // if necessary) - if (!hamming) { - create_reports(); - } - - // step 4: wire up shadow graph and helpers for insert/replace/remove - connect_shadow_graph(); - - // step 5: commit all the edge wirings - DEBUG_PRINTF("Committing edge wirings\n"); - for (const auto &p : edges_to_be_added) { - add_edge_if_not_present(p.first, p.second, g); - } - - DEBUG_PRINTF("Done!\n"); - } - -private: - const NFAVertex& get_clone(const NFAVertex &v) { - return contains(clones, v) ? - clones[v] : v; - } - - void connect_to_clones(const NFAVertex &u, const NFAVertex &v) { - const NFAVertex &clone_u = get_clone(u); - const NFAVertex &clone_v = get_clone(v); - - edges_to_be_added.emplace_back(u, v); - DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[u].index, g[v].index); - - // do not connect clones to accepts, we do it during cloning - if (is_any_accept(clone_v, g)) { - return; - } - edges_to_be_added.emplace_back(clone_u, clone_v); - DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[clone_u].index, - g[clone_v].index); - } - - void prepare_graph() { - DEBUG_PRINTF("Building shadow graphs\n"); - - for (auto v : vertices_range(g)) { - // all level 0 vertices are their own helpers and their own shadows - helper_map[make_pair(v, 0)] = v; - shadow_map[make_pair(v, 0)] = v; - - // find special nodes - if (is_any_accept(v, g)) { - DEBUG_PRINTF("Node %zu is a special node\n", g[v].index); - for (unsigned edit = 1; edit <= edit_distance; edit++) { - // all accepts are their own shadows and helpers at all - // levels - shadow_map[make_pair(v, edit)] = v; - helper_map[make_pair(v, edit)] = v; - } - continue; - } - DEBUG_PRINTF("Node %zu is to be shadowed\n", g[v].index); - orig.insert(v); - } - } - - void build_shadow_graph() { - for (auto v : orig) { - DEBUG_PRINTF("Adding shadow/helper nodes for node %zu\n", - g[v].index); - for (unsigned dist = 1; dist <= edit_distance; dist++) { - auto shadow_v = v; - - // start and startDs cannot have shadows but do have helpers - if (!is_any_start(v, g)) { - shadow_v = clone_vertex(g, v); - DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n", - g[shadow_v].index, dist); - } - shadow_map[make_pair(v, dist)] = shadow_v; - - // if there's nowhere to go from this vertex, no helper needed - if (proper_out_degree(v, g) < 1) { - DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n", - g[shadow_v].index, dist); - helper_map[make_pair(v, dist)] = shadow_v; - continue; - } - - // start and startDs only have helpers for insert, so not Hamming - if (hamming && is_any_start(v, g)) { - DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n", - g[shadow_v].index, dist); - helper_map[make_pair(v, dist)] = shadow_v; - continue; - } - - auto helper_v = clone_vertex(g, v); - DEBUG_PRINTF("New helper node ID: %zu (level %u)\n", - g[helper_v].index, dist); - - // this is a helper, so make it a dot - g[helper_v].char_reach = CharReach::dot(); - // do not copy virtual start's assert flags - if (is_virtual_start(v, g)) { - DEBUG_PRINTF("Helper node ID is virtual start: %zu (level %u)\n", - g[helper_v].index, dist); - g[helper_v].assert_flags = 0; - } - helper_map[make_pair(v, dist)] = helper_v; - } - } - } - - // wire up successors according to the original graph, wire helpers - // to shadow successors (insert/replace) - void connect_succs(NFAVertex v, u32 dist) { - DEBUG_PRINTF("Wiring up successors for node %zu shadow level %u\n", - g[v].index, dist); - const auto &cur_shadow_v = shadow_map[make_pair(v, dist)]; - const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; - - // multiple insert - if (!hamming && dist > 1) { - const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)]; - connect_to_clones(prev_level_helper, cur_shadow_helper); - } - - for (auto orig_dst : adjacent_vertices_range(v, g)) { - const auto &shadow_dst = shadow_map[make_pair(orig_dst, dist)]; - - connect_to_clones(cur_shadow_v, shadow_dst); - - // ignore startDs for insert/replace - if (orig_dst == g.startDs) { - continue; - } - - connect_to_clones(cur_shadow_helper, shadow_dst); - } - } - - // wire up predecessors according to the original graph, wire - // predecessors to helpers (replace), wire predecessor helpers to - // helpers (multiple replace) - void connect_preds(NFAVertex v, u32 dist) { - DEBUG_PRINTF("Wiring up predecessors for node %zu shadow level %u\n", - g[v].index, dist); - const auto &cur_shadow_v = shadow_map[make_pair(v, dist)]; - const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; - - auto orig_src_vertices = inv_adjacent_vertices_range(v, g); - for (auto orig_src : orig_src_vertices) { - // ignore edges from start to startDs - if (v == g.startDs && orig_src == g.start) { - continue; - } - // ignore self-loops for replace - if (orig_src != v) { - // do not wire a replace node for start vertices if we - // have a virtual start - if (is_virtual_start(v, g) && is_any_start(orig_src, g)) { - continue; - } - - if (dist) { - const auto &prev_level_src = - shadow_map[make_pair(orig_src, dist - 1)]; - const auto &prev_level_helper = - helper_map[make_pair(orig_src, dist - 1)]; - - connect_to_clones(prev_level_src, cur_shadow_helper); - connect_to_clones(prev_level_helper, cur_shadow_helper); - } - } - // wire predecessor according to original graph - const auto &shadow_src = shadow_map[make_pair(orig_src, dist)]; - - connect_to_clones(shadow_src, cur_shadow_v); - } - } - - // wire up previous level helper to current shadow (insert) - void connect_helpers(NFAVertex v, u32 dist) { - DEBUG_PRINTF("Wiring up helpers for node %zu shadow level %u\n", - g[v].index, dist); - const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; - auto prev_level_v = shadow_map[make_pair(v, dist - 1)]; - - connect_to_clones(prev_level_v, cur_shadow_helper); - } - - /* - * wiring edges for removal is a special case. - * - * when wiring edges for removal, as well as wiring up immediate - * predecessors to immediate successors, we also need to wire up more - * distant successors to their respective shadow graph levels. - * - * for example, consider graph start->a->b->c->d->accept. - * - * at edit distance 1, we need remove edges start->b, a->c, b->d, and - * c->accept, all going from original graph (level 0) to shadow graph - * level 1. - * - * at edit distance 2, we also need edges start->c, a->d and b->accept, - * all going from level 0 to shadow graph level 2. - * - * this is propagated to all shadow levels; that is, given edit - * distance 3, we will have edges from shadow levels 0->1, 0->2, - * 0->3, 1->2, 1->3, and 2->3. - * - * therefore, we wire them in steps: first wire with step 1 (0->1, 1->2, - * 2->3) at depth 1, then wire with step 2 (0->2, 1->3) at depth 2, etc. - * - * we also have to wire helpers to their removal successors, to - * accommodate for a replace followed by a remove, on all shadow levels. - * - * and finally, we also have to wire source shadows into removal - * successor helpers on a level above, to accommodate for a remove - * followed by a replace. - */ - void connect_removals(NFAVertex v) { - DEBUG_PRINTF("Wiring up remove edges for node %zu\n", g[v].index); - - // vertices returned by this function don't include self-loops - auto dst_vertices_by_depth = - gatherSuccessorsByDepth(g, v, edit_distance); - auto orig_src_vertices = inv_adjacent_vertices_range(v, g); - for (auto orig_src : orig_src_vertices) { - // ignore self-loops - if (orig_src == v) { - continue; - } - for (unsigned step = 1; step <= edit_distance; step++) { - for (unsigned dist = step; dist <= edit_distance; dist++) { - auto &dst_vertices = dst_vertices_by_depth[step - 1]; - for (auto &orig_dst : dst_vertices) { - const auto &shadow_src = - shadow_map[make_pair(orig_src, dist - step)]; - const auto &shadow_helper = - helper_map[make_pair(orig_src, dist - step)]; - const auto &shadow_dst = - shadow_map[make_pair(orig_dst, dist)]; - - // removal - connect_to_clones(shadow_src, shadow_dst); - - // removal from helper vertex - connect_to_clones(shadow_helper, shadow_dst); - - // removal into helper, requires additional edit - if ((dist + 1) <= edit_distance) { - const auto &next_level_helper = - helper_map[make_pair(orig_dst, dist + 1)]; - - connect_to_clones(shadow_src, next_level_helper); - } - } - } - } - } - } - - void connect_shadow_graph() { - DEBUG_PRINTF("Wiring up the graph\n"); - - for (auto v : orig) { - - DEBUG_PRINTF("Wiring up edges for node %zu\n", g[v].index); - - for (unsigned dist = 0; dist <= edit_distance; dist++) { - - // handle insert/replace - connect_succs(v, dist); - - // handle replace/multiple insert - connect_preds(v, dist); - - // handle helpers - if (!hamming && dist > 0) { - connect_helpers(v, dist); - } - } - - // handle removals - if (!hamming) { - connect_removals(v); - } - } - } - - void connect_to_targets(NFAVertex src, const flat_set<NFAVertex> &targets) { - for (auto dst : targets) { - DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[src].index, - g[dst].index); - edges_to_be_added.emplace_back(src, dst); - } - } - - // create a clone of the vertex, but overwrite its report set - void create_clone(NFAVertex v, const flat_set<ReportID> &reports, - unsigned max_edit_distance, - const flat_set<NFAVertex> &targets) { - // some vertices may have the same reports, but different successors; - // therefore, we may need to connect them multiple times, but still only - // clone once - bool needs_cloning = !contains(clones, v); - - DEBUG_PRINTF("Cloning node %zu\n", g[v].index); - // go through all shadows and helpers, including - // original vertex - for (unsigned d = 0; d < max_edit_distance; d++) { - auto shadow_v = shadow_map[make_pair(v, d)]; - auto helper_v = helper_map[make_pair(v, d)]; - - NFAVertex new_shadow_v, new_helper_v; - - // make sure we don't clone the same vertex twice - if (needs_cloning) { - new_shadow_v = clone_vertex(g, shadow_v); - DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n", - g[new_shadow_v].index, d); - clones[shadow_v] = new_shadow_v; - } else { - new_shadow_v = clones[shadow_v]; - } - g[new_shadow_v].reports = reports; - - connect_to_targets(new_shadow_v, targets); - - if (shadow_v == helper_v) { - continue; - } - if (needs_cloning) { - new_helper_v = clone_vertex(g, helper_v); - DEBUG_PRINTF("New helper node ID: %zu (level %u)\n", - g[new_helper_v].index, d); - clones[helper_v] = new_helper_v; - } else { - new_helper_v = clones[helper_v]; - } - g[new_helper_v].reports = reports; - - connect_to_targets(new_helper_v, targets); - } - } - - void write_reports(NFAVertex v, const flat_set<ReportID> &reports, - unsigned max_edit_distance, - const flat_set<NFAVertex> &targets) { - // we're overwriting reports, but we're not losing any - // information as we already cached all the different report - // sets, so vertices having different reports will be cloned and set up - // with the correct report set - - // go through all shadows and helpers, including original - // vertex - for (unsigned d = 0; d < max_edit_distance; d++) { - auto shadow_v = shadow_map[make_pair(v, d)]; - auto helper_v = helper_map[make_pair(v, d)]; - DEBUG_PRINTF("Setting up reports for shadow node: %zu " - "(level %u)\n", - g[shadow_v].index, d); - DEBUG_PRINTF("Setting up reports for helper node: %zu " - "(level %u)\n", - g[helper_v].index, d); - g[shadow_v].reports = reports; - g[helper_v].reports = reports; - - connect_to_targets(shadow_v, targets); - connect_to_targets(helper_v, targets); - } - } - - /* - * we may have multiple report sets per graph. that means, whenever we - * construct additional paths through the graph (alternations, removals), we - * have to account for the fact that some vertices are predecessors to - * vertices with different report sets. - * - * whenever that happens, we have to clone the paths for both report sets, - * and set up these new vertices with their respective report sets as well. - * - * in order to do that, we first have to get all the predecessors for accept - * and acceptEod vertices. then, go through them one by one, and take note - * of the report lists. the first report set we find, wins, the rest we - * clone. - * - * we also have to do this in two passes, because there may be vertices that - * are predecessors to vertices with different report sets, so to avoid - * overwriting reports we will be caching reports info instead. - */ - void create_reports() { - map<flat_set<ReportID>, flat_set<NFAVertex>> reports_to_vertices; - flat_set<NFAVertex> accepts{g.accept, g.acceptEod}; - - // gather reports info from all vertices connected to accept - for (auto accept : accepts) { - for (auto src : inv_adjacent_vertices_range(accept, g)) { - // skip special vertices - if (is_special(src, g)) { - continue; - } - reports_to_vertices[g[src].reports].insert(src); - } - } - - // we expect to see at most two report sets - assert(reports_to_vertices.size() > 0 && - reports_to_vertices.size() <= 2); - - // set up all reports - bool clone = false; - for (auto &pair : reports_to_vertices) { - const auto &reports = pair.first; - const auto &vertices = pair.second; - - for (auto src : vertices) { - // get all predecessors up to edit distance - auto src_vertices_by_depth = - gatherPredecessorsByDepth(g, src, edit_distance); - - // find which accepts source vertex connects to - flat_set<NFAVertex> targets; - for (const auto &accept : accepts) { - NFAEdge e = edge(src, accept, g); - if (e) { - targets.insert(accept); - } - } - assert(targets.size()); - - for (unsigned d = 0; d < src_vertices_by_depth.size(); d++) { - const auto &preds = src_vertices_by_depth[d]; - for (auto v : preds) { - // only clone a node if it already contains reports - if (clone && !g[v].reports.empty()) { - create_clone(v, reports, edit_distance - d, - targets); - } else { - write_reports(v, reports, edit_distance - d, - targets); - } - } - } - } - // clone vertices only if it's not our first report set - clone = true; - } - } -}; - -// check if we will edit our way into a vacuous pattern -static -bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) { - auto depths = calcRevDepths(g); - - depth min_depth = depth::infinity(); - auto idx = g[g.start].index; - - // check distance from start to accept/acceptEod - if (depths[idx].toAccept.min.is_finite()) { - min_depth = min(depths[idx].toAccept.min, min_depth); - } - if (depths[idx].toAcceptEod.min.is_finite()) { - min_depth = min(depths[idx].toAcceptEod.min, min_depth); - } - - idx = g[g.startDs].index; - - // check distance from startDs to accept/acceptEod - if (depths[idx].toAccept.min.is_finite()) { - min_depth = min(depths[idx].toAccept.min, min_depth); - } - if (depths[idx].toAcceptEod.min.is_finite()) { - min_depth = min(depths[idx].toAcceptEod.min, min_depth); - } - - assert(min_depth.is_finite()); - - // now, check if we can edit our way into a vacuous pattern - if (min_depth <= (u64a) edit_distance + 1) { - DEBUG_PRINTF("Pattern will turn vacuous if approximately matched\n"); - return true; - } - return false; -} - -void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming, - bool utf8, const Grey &grey) { - if (edit_distance == 0) { - return; - } - if (!grey.allowApproximateMatching) { - throw CompileError("Approximate matching is disabled."); - } - if (edit_distance > grey.maxEditDistance) { - throw CompileError("Edit distance is too big."); - } - if (utf8) { - throw CompileError("UTF-8 is disallowed for approximate matching."); - } - // graph isn't fuzzable if there are edge assertions anywhere in the graph - for (auto e : edges_range(g)) { - if (g[e].assert_flags) { - throw CompileError("Zero-width assertions are disallowed for " - "approximate matching."); - } - } - if (!hamming && will_turn_vacuous(g, edit_distance)) { - throw CompileError("Approximate matching patterns that reduce to " - "vacuous patterns are disallowed."); - } -} - -void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming, - const Grey &grey) { - if (edit_distance == 0) { - return; - } - - assert(grey.allowApproximateMatching); - assert(grey.maxEditDistance >= edit_distance); - - ShadowGraph sg(g, edit_distance, hamming); - sg.fuzz_graph(); - - // For safety, enforce limit on actual vertex count. - if (num_vertices(g) > grey.limitApproxMatchingVertices) { - DEBUG_PRINTF("built %zu vertices > limit of %u\n", num_vertices(g), - grey.limitApproxMatchingVertices); - throw ResourceLimitError(); - } -} - -} // namespace ue2 +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Graph fuzzer for approximate matching + */ + +#include "ng_fuzzy.h" + +#include "ng.h" +#include "ng_depth.h" +#include "ng_util.h" + +#include <map> +#include <vector> +using namespace std; + +namespace ue2 { + +// returns all successors up to a given depth in a vector of sets, indexed by +// zero-based depth from source vertex +static +vector<flat_set<NFAVertex>> gatherSuccessorsByDepth(const NGHolder &g, + NFAVertex src, u32 depth) { + vector<flat_set<NFAVertex>> result(depth); + flat_set<NFAVertex> cur, next; + + assert(depth > 0); + + // populate current set of successors + for (auto v : adjacent_vertices_range(src, g)) { + // ignore self-loops + if (src == v) { + continue; + } + DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); + cur.insert(v); + } + result[0] = cur; + + for (unsigned d = 1; d < depth; d++) { + // collect all successors for all current level vertices + for (auto v : cur) { + // don't go past special nodes + if (is_special(v, g)) { + continue; + } + + for (auto succ : adjacent_vertices_range(v, g)) { + // ignore self-loops + if (v == succ) { + continue; + } + DEBUG_PRINTF("Node %zu depth %u\n", g[succ].index, d + 1); + next.insert(succ); + } + } + result[d] = next; + next.swap(cur); + next.clear(); + } + + return result; +} + +// returns all predecessors up to a given depth in a vector of sets, indexed by +// zero-based depth from source vertex +static +vector<flat_set<NFAVertex>> gatherPredecessorsByDepth(const NGHolder &g, + NFAVertex src, + u32 depth) { + vector<flat_set<NFAVertex>> result(depth); + flat_set<NFAVertex> cur, next; + + assert(depth > 0); + + // populate current set of successors + for (auto v : inv_adjacent_vertices_range(src, g)) { + // ignore self-loops + if (src == v) { + continue; + } + DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); + cur.insert(v); + } + result[0] = cur; + + for (unsigned d = 1; d < depth; d++) { + // collect all successors for all current level vertices + for (auto v : cur) { + for (auto pred : inv_adjacent_vertices_range(v, g)) { + // ignore self-loops + if (v == pred) { + continue; + } + DEBUG_PRINTF("Node %zu depth %u\n", g[pred].index, d + 1); + next.insert(pred); + } + } + result[d] = next; + next.swap(cur); + next.clear(); + } + + return result; +} + +/* + * This struct produces a fuzzed graph; that is, a graph that is able to match + * the original pattern, as well as input data within a certain edit distance. + * Construct the struct, then call fuzz_graph() to transform the graph. + * + * Terminology used: + * - Shadow vertices: vertices mirroring the original graph at various edit + * distances + * - Shadow graph level: edit distance of a particular shadow graph + * - Helpers: dot vertices assigned to shadow vertices, used for insert/replace + */ +struct ShadowGraph { + NGHolder &g; + u32 edit_distance; + bool hamming; + map<pair<NFAVertex, u32>, NFAVertex> shadow_map; + map<pair<NFAVertex, u32>, NFAVertex> helper_map; + map<NFAVertex, NFAVertex> clones; + // edge creation is deferred + vector<pair<NFAVertex, NFAVertex>> edges_to_be_added; + flat_set<NFAVertex> orig; + + ShadowGraph(NGHolder &g_in, u32 ed_in, bool hamm_in) + : g(g_in), edit_distance(ed_in), hamming(hamm_in) {} + + void fuzz_graph() { + if (edit_distance == 0) { + return; + } + + DEBUG_PRINTF("edit distance = %u hamming = %s\n", edit_distance, + hamming ? "true" : "false"); + + // step 1: prepare the vertices, helpers and shadows according to + // the original graph + prepare_graph(); + + // step 2: add shadow and helper nodes + build_shadow_graph(); + + // step 3: set up reports for newly created vertices (and make clones + // if necessary) + if (!hamming) { + create_reports(); + } + + // step 4: wire up shadow graph and helpers for insert/replace/remove + connect_shadow_graph(); + + // step 5: commit all the edge wirings + DEBUG_PRINTF("Committing edge wirings\n"); + for (const auto &p : edges_to_be_added) { + add_edge_if_not_present(p.first, p.second, g); + } + + DEBUG_PRINTF("Done!\n"); + } + +private: + const NFAVertex& get_clone(const NFAVertex &v) { + return contains(clones, v) ? + clones[v] : v; + } + + void connect_to_clones(const NFAVertex &u, const NFAVertex &v) { + const NFAVertex &clone_u = get_clone(u); + const NFAVertex &clone_v = get_clone(v); + + edges_to_be_added.emplace_back(u, v); + DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[u].index, g[v].index); + + // do not connect clones to accepts, we do it during cloning + if (is_any_accept(clone_v, g)) { + return; + } + edges_to_be_added.emplace_back(clone_u, clone_v); + DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[clone_u].index, + g[clone_v].index); + } + + void prepare_graph() { + DEBUG_PRINTF("Building shadow graphs\n"); + + for (auto v : vertices_range(g)) { + // all level 0 vertices are their own helpers and their own shadows + helper_map[make_pair(v, 0)] = v; + shadow_map[make_pair(v, 0)] = v; + + // find special nodes + if (is_any_accept(v, g)) { + DEBUG_PRINTF("Node %zu is a special node\n", g[v].index); + for (unsigned edit = 1; edit <= edit_distance; edit++) { + // all accepts are their own shadows and helpers at all + // levels + shadow_map[make_pair(v, edit)] = v; + helper_map[make_pair(v, edit)] = v; + } + continue; + } + DEBUG_PRINTF("Node %zu is to be shadowed\n", g[v].index); + orig.insert(v); + } + } + + void build_shadow_graph() { + for (auto v : orig) { + DEBUG_PRINTF("Adding shadow/helper nodes for node %zu\n", + g[v].index); + for (unsigned dist = 1; dist <= edit_distance; dist++) { + auto shadow_v = v; + + // start and startDs cannot have shadows but do have helpers + if (!is_any_start(v, g)) { + shadow_v = clone_vertex(g, v); + DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n", + g[shadow_v].index, dist); + } + shadow_map[make_pair(v, dist)] = shadow_v; + + // if there's nowhere to go from this vertex, no helper needed + if (proper_out_degree(v, g) < 1) { + DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n", + g[shadow_v].index, dist); + helper_map[make_pair(v, dist)] = shadow_v; + continue; + } + + // start and startDs only have helpers for insert, so not Hamming + if (hamming && is_any_start(v, g)) { + DEBUG_PRINTF("No helper for node ID: %zu (level %u)\n", + g[shadow_v].index, dist); + helper_map[make_pair(v, dist)] = shadow_v; + continue; + } + + auto helper_v = clone_vertex(g, v); + DEBUG_PRINTF("New helper node ID: %zu (level %u)\n", + g[helper_v].index, dist); + + // this is a helper, so make it a dot + g[helper_v].char_reach = CharReach::dot(); + // do not copy virtual start's assert flags + if (is_virtual_start(v, g)) { + DEBUG_PRINTF("Helper node ID is virtual start: %zu (level %u)\n", + g[helper_v].index, dist); + g[helper_v].assert_flags = 0; + } + helper_map[make_pair(v, dist)] = helper_v; + } + } + } + + // wire up successors according to the original graph, wire helpers + // to shadow successors (insert/replace) + void connect_succs(NFAVertex v, u32 dist) { + DEBUG_PRINTF("Wiring up successors for node %zu shadow level %u\n", + g[v].index, dist); + const auto &cur_shadow_v = shadow_map[make_pair(v, dist)]; + const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; + + // multiple insert + if (!hamming && dist > 1) { + const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)]; + connect_to_clones(prev_level_helper, cur_shadow_helper); + } + + for (auto orig_dst : adjacent_vertices_range(v, g)) { + const auto &shadow_dst = shadow_map[make_pair(orig_dst, dist)]; + + connect_to_clones(cur_shadow_v, shadow_dst); + + // ignore startDs for insert/replace + if (orig_dst == g.startDs) { + continue; + } + + connect_to_clones(cur_shadow_helper, shadow_dst); + } + } + + // wire up predecessors according to the original graph, wire + // predecessors to helpers (replace), wire predecessor helpers to + // helpers (multiple replace) + void connect_preds(NFAVertex v, u32 dist) { + DEBUG_PRINTF("Wiring up predecessors for node %zu shadow level %u\n", + g[v].index, dist); + const auto &cur_shadow_v = shadow_map[make_pair(v, dist)]; + const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; + + auto orig_src_vertices = inv_adjacent_vertices_range(v, g); + for (auto orig_src : orig_src_vertices) { + // ignore edges from start to startDs + if (v == g.startDs && orig_src == g.start) { + continue; + } + // ignore self-loops for replace + if (orig_src != v) { + // do not wire a replace node for start vertices if we + // have a virtual start + if (is_virtual_start(v, g) && is_any_start(orig_src, g)) { + continue; + } + + if (dist) { + const auto &prev_level_src = + shadow_map[make_pair(orig_src, dist - 1)]; + const auto &prev_level_helper = + helper_map[make_pair(orig_src, dist - 1)]; + + connect_to_clones(prev_level_src, cur_shadow_helper); + connect_to_clones(prev_level_helper, cur_shadow_helper); + } + } + // wire predecessor according to original graph + const auto &shadow_src = shadow_map[make_pair(orig_src, dist)]; + + connect_to_clones(shadow_src, cur_shadow_v); + } + } + + // wire up previous level helper to current shadow (insert) + void connect_helpers(NFAVertex v, u32 dist) { + DEBUG_PRINTF("Wiring up helpers for node %zu shadow level %u\n", + g[v].index, dist); + const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; + auto prev_level_v = shadow_map[make_pair(v, dist - 1)]; + + connect_to_clones(prev_level_v, cur_shadow_helper); + } + + /* + * wiring edges for removal is a special case. + * + * when wiring edges for removal, as well as wiring up immediate + * predecessors to immediate successors, we also need to wire up more + * distant successors to their respective shadow graph levels. + * + * for example, consider graph start->a->b->c->d->accept. + * + * at edit distance 1, we need remove edges start->b, a->c, b->d, and + * c->accept, all going from original graph (level 0) to shadow graph + * level 1. + * + * at edit distance 2, we also need edges start->c, a->d and b->accept, + * all going from level 0 to shadow graph level 2. + * + * this is propagated to all shadow levels; that is, given edit + * distance 3, we will have edges from shadow levels 0->1, 0->2, + * 0->3, 1->2, 1->3, and 2->3. + * + * therefore, we wire them in steps: first wire with step 1 (0->1, 1->2, + * 2->3) at depth 1, then wire with step 2 (0->2, 1->3) at depth 2, etc. + * + * we also have to wire helpers to their removal successors, to + * accommodate for a replace followed by a remove, on all shadow levels. + * + * and finally, we also have to wire source shadows into removal + * successor helpers on a level above, to accommodate for a remove + * followed by a replace. + */ + void connect_removals(NFAVertex v) { + DEBUG_PRINTF("Wiring up remove edges for node %zu\n", g[v].index); + + // vertices returned by this function don't include self-loops + auto dst_vertices_by_depth = + gatherSuccessorsByDepth(g, v, edit_distance); + auto orig_src_vertices = inv_adjacent_vertices_range(v, g); + for (auto orig_src : orig_src_vertices) { + // ignore self-loops + if (orig_src == v) { + continue; + } + for (unsigned step = 1; step <= edit_distance; step++) { + for (unsigned dist = step; dist <= edit_distance; dist++) { + auto &dst_vertices = dst_vertices_by_depth[step - 1]; + for (auto &orig_dst : dst_vertices) { + const auto &shadow_src = + shadow_map[make_pair(orig_src, dist - step)]; + const auto &shadow_helper = + helper_map[make_pair(orig_src, dist - step)]; + const auto &shadow_dst = + shadow_map[make_pair(orig_dst, dist)]; + + // removal + connect_to_clones(shadow_src, shadow_dst); + + // removal from helper vertex + connect_to_clones(shadow_helper, shadow_dst); + + // removal into helper, requires additional edit + if ((dist + 1) <= edit_distance) { + const auto &next_level_helper = + helper_map[make_pair(orig_dst, dist + 1)]; + + connect_to_clones(shadow_src, next_level_helper); + } + } + } + } + } + } + + void connect_shadow_graph() { + DEBUG_PRINTF("Wiring up the graph\n"); + + for (auto v : orig) { + + DEBUG_PRINTF("Wiring up edges for node %zu\n", g[v].index); + + for (unsigned dist = 0; dist <= edit_distance; dist++) { + + // handle insert/replace + connect_succs(v, dist); + + // handle replace/multiple insert + connect_preds(v, dist); + + // handle helpers + if (!hamming && dist > 0) { + connect_helpers(v, dist); + } + } + + // handle removals + if (!hamming) { + connect_removals(v); + } + } + } + + void connect_to_targets(NFAVertex src, const flat_set<NFAVertex> &targets) { + for (auto dst : targets) { + DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[src].index, + g[dst].index); + edges_to_be_added.emplace_back(src, dst); + } + } + + // create a clone of the vertex, but overwrite its report set + void create_clone(NFAVertex v, const flat_set<ReportID> &reports, + unsigned max_edit_distance, + const flat_set<NFAVertex> &targets) { + // some vertices may have the same reports, but different successors; + // therefore, we may need to connect them multiple times, but still only + // clone once + bool needs_cloning = !contains(clones, v); + + DEBUG_PRINTF("Cloning node %zu\n", g[v].index); + // go through all shadows and helpers, including + // original vertex + for (unsigned d = 0; d < max_edit_distance; d++) { + auto shadow_v = shadow_map[make_pair(v, d)]; + auto helper_v = helper_map[make_pair(v, d)]; + + NFAVertex new_shadow_v, new_helper_v; + + // make sure we don't clone the same vertex twice + if (needs_cloning) { + new_shadow_v = clone_vertex(g, shadow_v); + DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n", + g[new_shadow_v].index, d); + clones[shadow_v] = new_shadow_v; + } else { + new_shadow_v = clones[shadow_v]; + } + g[new_shadow_v].reports = reports; + + connect_to_targets(new_shadow_v, targets); + + if (shadow_v == helper_v) { + continue; + } + if (needs_cloning) { + new_helper_v = clone_vertex(g, helper_v); + DEBUG_PRINTF("New helper node ID: %zu (level %u)\n", + g[new_helper_v].index, d); + clones[helper_v] = new_helper_v; + } else { + new_helper_v = clones[helper_v]; + } + g[new_helper_v].reports = reports; + + connect_to_targets(new_helper_v, targets); + } + } + + void write_reports(NFAVertex v, const flat_set<ReportID> &reports, + unsigned max_edit_distance, + const flat_set<NFAVertex> &targets) { + // we're overwriting reports, but we're not losing any + // information as we already cached all the different report + // sets, so vertices having different reports will be cloned and set up + // with the correct report set + + // go through all shadows and helpers, including original + // vertex + for (unsigned d = 0; d < max_edit_distance; d++) { + auto shadow_v = shadow_map[make_pair(v, d)]; + auto helper_v = helper_map[make_pair(v, d)]; + DEBUG_PRINTF("Setting up reports for shadow node: %zu " + "(level %u)\n", + g[shadow_v].index, d); + DEBUG_PRINTF("Setting up reports for helper node: %zu " + "(level %u)\n", + g[helper_v].index, d); + g[shadow_v].reports = reports; + g[helper_v].reports = reports; + + connect_to_targets(shadow_v, targets); + connect_to_targets(helper_v, targets); + } + } + + /* + * we may have multiple report sets per graph. that means, whenever we + * construct additional paths through the graph (alternations, removals), we + * have to account for the fact that some vertices are predecessors to + * vertices with different report sets. + * + * whenever that happens, we have to clone the paths for both report sets, + * and set up these new vertices with their respective report sets as well. + * + * in order to do that, we first have to get all the predecessors for accept + * and acceptEod vertices. then, go through them one by one, and take note + * of the report lists. the first report set we find, wins, the rest we + * clone. + * + * we also have to do this in two passes, because there may be vertices that + * are predecessors to vertices with different report sets, so to avoid + * overwriting reports we will be caching reports info instead. + */ + void create_reports() { + map<flat_set<ReportID>, flat_set<NFAVertex>> reports_to_vertices; + flat_set<NFAVertex> accepts{g.accept, g.acceptEod}; + + // gather reports info from all vertices connected to accept + for (auto accept : accepts) { + for (auto src : inv_adjacent_vertices_range(accept, g)) { + // skip special vertices + if (is_special(src, g)) { + continue; + } + reports_to_vertices[g[src].reports].insert(src); + } + } + + // we expect to see at most two report sets + assert(reports_to_vertices.size() > 0 && + reports_to_vertices.size() <= 2); + + // set up all reports + bool clone = false; + for (auto &pair : reports_to_vertices) { + const auto &reports = pair.first; + const auto &vertices = pair.second; + + for (auto src : vertices) { + // get all predecessors up to edit distance + auto src_vertices_by_depth = + gatherPredecessorsByDepth(g, src, edit_distance); + + // find which accepts source vertex connects to + flat_set<NFAVertex> targets; + for (const auto &accept : accepts) { + NFAEdge e = edge(src, accept, g); + if (e) { + targets.insert(accept); + } + } + assert(targets.size()); + + for (unsigned d = 0; d < src_vertices_by_depth.size(); d++) { + const auto &preds = src_vertices_by_depth[d]; + for (auto v : preds) { + // only clone a node if it already contains reports + if (clone && !g[v].reports.empty()) { + create_clone(v, reports, edit_distance - d, + targets); + } else { + write_reports(v, reports, edit_distance - d, + targets); + } + } + } + } + // clone vertices only if it's not our first report set + clone = true; + } + } +}; + +// check if we will edit our way into a vacuous pattern +static +bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) { + auto depths = calcRevDepths(g); + + depth min_depth = depth::infinity(); + auto idx = g[g.start].index; + + // check distance from start to accept/acceptEod + if (depths[idx].toAccept.min.is_finite()) { + min_depth = min(depths[idx].toAccept.min, min_depth); + } + if (depths[idx].toAcceptEod.min.is_finite()) { + min_depth = min(depths[idx].toAcceptEod.min, min_depth); + } + + idx = g[g.startDs].index; + + // check distance from startDs to accept/acceptEod + if (depths[idx].toAccept.min.is_finite()) { + min_depth = min(depths[idx].toAccept.min, min_depth); + } + if (depths[idx].toAcceptEod.min.is_finite()) { + min_depth = min(depths[idx].toAcceptEod.min, min_depth); + } + + assert(min_depth.is_finite()); + + // now, check if we can edit our way into a vacuous pattern + if (min_depth <= (u64a) edit_distance + 1) { + DEBUG_PRINTF("Pattern will turn vacuous if approximately matched\n"); + return true; + } + return false; +} + +void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool hamming, + bool utf8, const Grey &grey) { + if (edit_distance == 0) { + return; + } + if (!grey.allowApproximateMatching) { + throw CompileError("Approximate matching is disabled."); + } + if (edit_distance > grey.maxEditDistance) { + throw CompileError("Edit distance is too big."); + } + if (utf8) { + throw CompileError("UTF-8 is disallowed for approximate matching."); + } + // graph isn't fuzzable if there are edge assertions anywhere in the graph + for (auto e : edges_range(g)) { + if (g[e].assert_flags) { + throw CompileError("Zero-width assertions are disallowed for " + "approximate matching."); + } + } + if (!hamming && will_turn_vacuous(g, edit_distance)) { + throw CompileError("Approximate matching patterns that reduce to " + "vacuous patterns are disallowed."); + } +} + +void make_fuzzy(NGHolder &g, u32 edit_distance, bool hamming, + const Grey &grey) { + if (edit_distance == 0) { + return; + } + + assert(grey.allowApproximateMatching); + assert(grey.maxEditDistance >= edit_distance); + + ShadowGraph sg(g, edit_distance, hamming); + sg.fuzz_graph(); + + // For safety, enforce limit on actual vertex count. + if (num_vertices(g) > grey.limitApproxMatchingVertices) { + DEBUG_PRINTF("built %zu vertices > limit of %u\n", num_vertices(g), + grey.limitApproxMatchingVertices); + throw ResourceLimitError(); + } +} + +} // namespace ue2 |