aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/hyperscan/src/nfagraph/ng_asserts.cpp
diff options
context:
space:
mode:
authorbnagaev <bnagaev@yandex-team.ru>2022-02-10 16:47:04 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:47:04 +0300
commitd6449ba66291ff0c0d352c82e6eb3efb4c8a7e8d (patch)
treed5dca6d44593f5e52556a1cc7b1ab0386e096ebe /contrib/libs/hyperscan/src/nfagraph/ng_asserts.cpp
parent1861d4c1402bb2c67a3e6b43b51706081b74508a (diff)
downloadydb-d6449ba66291ff0c0d352c82e6eb3efb4c8a7e8d.tar.gz
Restoring authorship annotation for <bnagaev@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/nfagraph/ng_asserts.cpp')
-rw-r--r--contrib/libs/hyperscan/src/nfagraph/ng_asserts.cpp1006
1 files changed, 503 insertions, 503 deletions
diff --git a/contrib/libs/hyperscan/src/nfagraph/ng_asserts.cpp b/contrib/libs/hyperscan/src/nfagraph/ng_asserts.cpp
index 8812afadb7..24d4ecace1 100644
--- a/contrib/libs/hyperscan/src/nfagraph/ng_asserts.cpp
+++ b/contrib/libs/hyperscan/src/nfagraph/ng_asserts.cpp
@@ -1,558 +1,558 @@
-/*
+/*
* Copyright (c) 2015-2017, Intel Corporation
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/** \file
- * \brief Resolve special assert vertices.
- *
- * The assert resolution algorithm proceeds by iterating over those edges with
- * assertion flags, considering source and target vertices of each edge. If a
- * vertex has a superset of the reachability demanded by the assertion on the
- * edge, it is split into alternatives providing the word and non-word paths
- * through that vertex.
- *
- * A great deal of the complexity in the resolveAsserts pass is devoted to
- * handling these assertions when the UCP flag is specified (meaning \\w and \\W
- * are implemented with Unicode properties, rather than their ASCII
- * interpretation) and the prefiltering flag is also used. Complete,
- * non-prefiltering UCP support is not available yet.
- */
-#include "ng_asserts.h"
-
-#include "ng.h"
-#include "ng_prune.h"
-#include "ng_redundancy.h"
-#include "ng_util.h"
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Resolve special assert vertices.
+ *
+ * The assert resolution algorithm proceeds by iterating over those edges with
+ * assertion flags, considering source and target vertices of each edge. If a
+ * vertex has a superset of the reachability demanded by the assertion on the
+ * edge, it is split into alternatives providing the word and non-word paths
+ * through that vertex.
+ *
+ * A great deal of the complexity in the resolveAsserts pass is devoted to
+ * handling these assertions when the UCP flag is specified (meaning \\w and \\W
+ * are implemented with Unicode properties, rather than their ASCII
+ * interpretation) and the prefiltering flag is also used. Complete,
+ * non-prefiltering UCP support is not available yet.
+ */
+#include "ng_asserts.h"
+
+#include "ng.h"
+#include "ng_prune.h"
+#include "ng_redundancy.h"
+#include "ng_util.h"
#include "compiler/compiler.h"
-#include "parser/position.h" // for POS flags
-#include "util/bitutils.h" // for findAndClearLSB_32
-#include "util/boundary_reports.h"
-#include "util/container.h"
-#include "util/compile_context.h"
-#include "util/compile_error.h"
-#include "util/graph_range.h"
-#include "util/report_manager.h"
-#include "util/unicode_def.h"
-
-#include <queue>
-
-using namespace std;
-
-namespace ue2 {
-
-/** \brief Hard limit on the maximum number of vertices we'll clone before we
- * throw up our hands and report 'Pattern too large.' */
-static const size_t MAX_CLONED_VERTICES = 2048;
-
-/** \brief The definition of \\w, since we use it everywhere in here. */
-static const CharReach CHARREACH_WORD(CharReach('a', 'z') |
- CharReach('A', 'Z') | CharReach('0', '9') | CharReach('_'));
-
-/** \brief \\W is the inverse of \\w */
-static const CharReach CHARREACH_NONWORD(~CHARREACH_WORD);
-
-/** \brief Prefiltering definition of \\w for UCP mode.
- *
- * Includes all high bytes as to capture all non-ASCII, however depending on
- * direction only continuers or starters are strictly required - as the input
- * is well-formed, this laxness will not cost us. */
-static const CharReach CHARREACH_WORD_UCP_PRE(CHARREACH_WORD
- | CharReach(128, 255));
-
-/** \brief Prefiltering definition of \\W for UCP Mode.
- *
- * (non-word already includes high bytes) */
-static const CharReach CHARREACH_NONWORD_UCP_PRE(CHARREACH_NONWORD);
-
-/** \brief Find all the edges with assertion flags. */
-static
-vector<NFAEdge> getAsserts(const NGHolder &g) {
- vector<NFAEdge> out;
- for (const auto &e : edges_range(g)) {
- if (g[e].assert_flags) {
- out.push_back(e);
- }
- }
- return out;
-}
-
-static
-void addToSplit(const NGHolder &g, NFAVertex v, map<u32, NFAVertex> *to_split) {
+#include "parser/position.h" // for POS flags
+#include "util/bitutils.h" // for findAndClearLSB_32
+#include "util/boundary_reports.h"
+#include "util/container.h"
+#include "util/compile_context.h"
+#include "util/compile_error.h"
+#include "util/graph_range.h"
+#include "util/report_manager.h"
+#include "util/unicode_def.h"
+
+#include <queue>
+
+using namespace std;
+
+namespace ue2 {
+
+/** \brief Hard limit on the maximum number of vertices we'll clone before we
+ * throw up our hands and report 'Pattern too large.' */
+static const size_t MAX_CLONED_VERTICES = 2048;
+
+/** \brief The definition of \\w, since we use it everywhere in here. */
+static const CharReach CHARREACH_WORD(CharReach('a', 'z') |
+ CharReach('A', 'Z') | CharReach('0', '9') | CharReach('_'));
+
+/** \brief \\W is the inverse of \\w */
+static const CharReach CHARREACH_NONWORD(~CHARREACH_WORD);
+
+/** \brief Prefiltering definition of \\w for UCP mode.
+ *
+ * Includes all high bytes as to capture all non-ASCII, however depending on
+ * direction only continuers or starters are strictly required - as the input
+ * is well-formed, this laxness will not cost us. */
+static const CharReach CHARREACH_WORD_UCP_PRE(CHARREACH_WORD
+ | CharReach(128, 255));
+
+/** \brief Prefiltering definition of \\W for UCP Mode.
+ *
+ * (non-word already includes high bytes) */
+static const CharReach CHARREACH_NONWORD_UCP_PRE(CHARREACH_NONWORD);
+
+/** \brief Find all the edges with assertion flags. */
+static
+vector<NFAEdge> getAsserts(const NGHolder &g) {
+ vector<NFAEdge> out;
+ for (const auto &e : edges_range(g)) {
+ if (g[e].assert_flags) {
+ out.push_back(e);
+ }
+ }
+ return out;
+}
+
+static
+void addToSplit(const NGHolder &g, NFAVertex v, map<u32, NFAVertex> *to_split) {
DEBUG_PRINTF("%zu needs splitting\n", g[v].index);
- to_split->emplace(g[v].index, v);
-}
-
-/** \brief Find vertices that need to be split due to an assertion edge.
- *
- * A vertex needs to be split if has an edge to/from it with an assert with a
- * restriction on the relevant end. */
-static
-void findSplitters(const NGHolder &g, const vector<NFAEdge> &asserts,
- map<u32, NFAVertex> *to_split,
- map<u32, NFAVertex> *to_split_ucp) {
- for (const auto &e : asserts) {
- NFAVertex u = source(e, g);
- NFAVertex v = target(e, g);
- u32 flags = g[e].assert_flags;
- assert(flags);
-
- const CharReach &u_cr = g[u].char_reach;
- const CharReach &v_cr = g[v].char_reach;
-
- bool ucp_assert = flags & UCP_ASSERT_FLAGS;
- bool normal_assert = flags & NON_UCP_ASSERT_FLAGS;
- /* In reality, an expression can only be entirely ucp or not ucp */
- assert(ucp_assert != normal_assert);
-
- if (normal_assert) {
- /* assume any flag results in us have to split if the vertex is not
- * a subset of word or completely disjoint from it. We could be more
- * nuanced if flags is a disjunction of multiple assertions. */
- if (!u_cr.isSubsetOf(CHARREACH_WORD)
- && !u_cr.isSubsetOf(CHARREACH_NONWORD)
- && u != g.start) { /* start is always considered a nonword */
- addToSplit(g, u, to_split);
- }
-
- if (!v_cr.isSubsetOf(CHARREACH_WORD)
- && !v_cr.isSubsetOf(CHARREACH_NONWORD)
- && v != g.accept /* accept require special handling, done on a
- * per edge basis in resolve asserts
- */
- && v != g.acceptEod) { /* eod is always considered a nonword */
- addToSplit(g, v, to_split);
- }
- }
-
- if (ucp_assert) {
- /* note: the ucp prefilter crs overlap - requires a bit more care */
- if (u == g.start) { /* start never needs to be split,
- * treat nonword */
- } else if (flags & POS_FLAG_ASSERT_WORD_TO_ANY_UCP) {
- if (!u_cr.isSubsetOf(CHARREACH_WORD_UCP_PRE)
- && !u_cr.isSubsetOf(~CHARREACH_WORD_UCP_PRE)) {
- addToSplit(g, u, to_split_ucp);
- }
- } else {
- assert(flags & POS_FLAG_ASSERT_NONWORD_TO_ANY_UCP);
- if (!u_cr.isSubsetOf(CHARREACH_NONWORD_UCP_PRE)
- && !u_cr.isSubsetOf(~CHARREACH_NONWORD_UCP_PRE)) {
- addToSplit(g, u, to_split_ucp);
- }
- }
-
- if (v == g.acceptEod /* eod is always considered a nonword */
- || v == g.accept) { /* accept require special handling, done on
- * a per edge basis in resolve asserts */
- } else if (flags & POS_FLAG_ASSERT_ANY_TO_WORD_UCP) {
- if (!v_cr.isSubsetOf(CHARREACH_WORD_UCP_PRE)
- && !v_cr.isSubsetOf(~CHARREACH_WORD_UCP_PRE)) {
- addToSplit(g, v, to_split_ucp);
- }
- } else {
- assert(flags & POS_FLAG_ASSERT_ANY_TO_NONWORD_UCP);
- if (!v_cr.isSubsetOf(CHARREACH_NONWORD_UCP_PRE)
- && !v_cr.isSubsetOf(~CHARREACH_NONWORD_UCP_PRE)) {
- addToSplit(g, v, to_split_ucp);
- }
- }
- }
- }
-}
-
-static
+ to_split->emplace(g[v].index, v);
+}
+
+/** \brief Find vertices that need to be split due to an assertion edge.
+ *
+ * A vertex needs to be split if has an edge to/from it with an assert with a
+ * restriction on the relevant end. */
+static
+void findSplitters(const NGHolder &g, const vector<NFAEdge> &asserts,
+ map<u32, NFAVertex> *to_split,
+ map<u32, NFAVertex> *to_split_ucp) {
+ for (const auto &e : asserts) {
+ NFAVertex u = source(e, g);
+ NFAVertex v = target(e, g);
+ u32 flags = g[e].assert_flags;
+ assert(flags);
+
+ const CharReach &u_cr = g[u].char_reach;
+ const CharReach &v_cr = g[v].char_reach;
+
+ bool ucp_assert = flags & UCP_ASSERT_FLAGS;
+ bool normal_assert = flags & NON_UCP_ASSERT_FLAGS;
+ /* In reality, an expression can only be entirely ucp or not ucp */
+ assert(ucp_assert != normal_assert);
+
+ if (normal_assert) {
+ /* assume any flag results in us have to split if the vertex is not
+ * a subset of word or completely disjoint from it. We could be more
+ * nuanced if flags is a disjunction of multiple assertions. */
+ if (!u_cr.isSubsetOf(CHARREACH_WORD)
+ && !u_cr.isSubsetOf(CHARREACH_NONWORD)
+ && u != g.start) { /* start is always considered a nonword */
+ addToSplit(g, u, to_split);
+ }
+
+ if (!v_cr.isSubsetOf(CHARREACH_WORD)
+ && !v_cr.isSubsetOf(CHARREACH_NONWORD)
+ && v != g.accept /* accept require special handling, done on a
+ * per edge basis in resolve asserts
+ */
+ && v != g.acceptEod) { /* eod is always considered a nonword */
+ addToSplit(g, v, to_split);
+ }
+ }
+
+ if (ucp_assert) {
+ /* note: the ucp prefilter crs overlap - requires a bit more care */
+ if (u == g.start) { /* start never needs to be split,
+ * treat nonword */
+ } else if (flags & POS_FLAG_ASSERT_WORD_TO_ANY_UCP) {
+ if (!u_cr.isSubsetOf(CHARREACH_WORD_UCP_PRE)
+ && !u_cr.isSubsetOf(~CHARREACH_WORD_UCP_PRE)) {
+ addToSplit(g, u, to_split_ucp);
+ }
+ } else {
+ assert(flags & POS_FLAG_ASSERT_NONWORD_TO_ANY_UCP);
+ if (!u_cr.isSubsetOf(CHARREACH_NONWORD_UCP_PRE)
+ && !u_cr.isSubsetOf(~CHARREACH_NONWORD_UCP_PRE)) {
+ addToSplit(g, u, to_split_ucp);
+ }
+ }
+
+ if (v == g.acceptEod /* eod is always considered a nonword */
+ || v == g.accept) { /* accept require special handling, done on
+ * a per edge basis in resolve asserts */
+ } else if (flags & POS_FLAG_ASSERT_ANY_TO_WORD_UCP) {
+ if (!v_cr.isSubsetOf(CHARREACH_WORD_UCP_PRE)
+ && !v_cr.isSubsetOf(~CHARREACH_WORD_UCP_PRE)) {
+ addToSplit(g, v, to_split_ucp);
+ }
+ } else {
+ assert(flags & POS_FLAG_ASSERT_ANY_TO_NONWORD_UCP);
+ if (!v_cr.isSubsetOf(CHARREACH_NONWORD_UCP_PRE)
+ && !v_cr.isSubsetOf(~CHARREACH_NONWORD_UCP_PRE)) {
+ addToSplit(g, v, to_split_ucp);
+ }
+ }
+ }
+ }
+}
+
+static
void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
NFAVertex v, s32 adj) {
- // Don't try and set the report ID of a special vertex.
- assert(!is_special(v, g));
-
- // If there's a report set already, we're replacing it.
- g[v].reports.clear();
-
+ // Don't try and set the report ID of a special vertex.
+ assert(!is_special(v, g));
+
+ // If there's a report set already, we're replacing it.
+ g[v].reports.clear();
+
Report ir = rm.getBasicInternalReport(expr, adj);
-
- g[v].reports.insert(rm.getInternalId(ir));
+
+ g[v].reports.insert(rm.getInternalId(ir));
DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj);
-}
-
-static
+}
+
+static
NFAVertex makeClone(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
NFAVertex v, const CharReach &cr_mask) {
- NFAVertex clone = clone_vertex(g, v);
- g[clone].char_reach &= cr_mask;
- clone_out_edges(g, v, clone);
- clone_in_edges(g, v, clone);
-
- if (v == g.startDs) {
+ NFAVertex clone = clone_vertex(g, v);
+ g[clone].char_reach &= cr_mask;
+ clone_out_edges(g, v, clone);
+ clone_in_edges(g, v, clone);
+
+ if (v == g.startDs) {
if (expr.utf8) {
- g[clone].char_reach &= ~UTF_START_CR;
- }
-
- DEBUG_PRINTF("marked as virt\n");
- g[clone].assert_flags = POS_FLAG_VIRTUAL_START;
-
+ g[clone].char_reach &= ~UTF_START_CR;
+ }
+
+ DEBUG_PRINTF("marked as virt\n");
+ g[clone].assert_flags = POS_FLAG_VIRTUAL_START;
+
setReportId(rm, g, expr, clone, 0);
- }
-
- return clone;
-}
-
-static
+ }
+
+ return clone;
+}
+
+static
void splitVertex(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
NFAVertex v, bool ucp) {
- assert(v != g.start);
- assert(v != g.accept);
- assert(v != g.acceptEod);
+ assert(v != g.start);
+ assert(v != g.accept);
+ assert(v != g.acceptEod);
DEBUG_PRINTF("partitioning vertex %zu ucp:%d\n", g[v].index, (int)ucp);
-
- CharReach cr_word = ucp ? CHARREACH_WORD_UCP_PRE : CHARREACH_WORD;
- CharReach cr_nonword = ucp ? CHARREACH_NONWORD_UCP_PRE : CHARREACH_NONWORD;
-
- auto has_no_assert = [&g](const NFAEdge &e) { return !g[e].assert_flags; };
-
- // Split v into word/nonword vertices with only asserting out-edges.
+
+ CharReach cr_word = ucp ? CHARREACH_WORD_UCP_PRE : CHARREACH_WORD;
+ CharReach cr_nonword = ucp ? CHARREACH_NONWORD_UCP_PRE : CHARREACH_NONWORD;
+
+ auto has_no_assert = [&g](const NFAEdge &e) { return !g[e].assert_flags; };
+
+ // Split v into word/nonword vertices with only asserting out-edges.
NFAVertex w_out = makeClone(rm, g, expr, v, cr_word);
NFAVertex nw_out = makeClone(rm, g, expr, v, cr_nonword);
- remove_out_edge_if(w_out, has_no_assert, g);
- remove_out_edge_if(nw_out, has_no_assert, g);
-
- // Split v into word/nonword vertices with only asserting in-edges.
+ remove_out_edge_if(w_out, has_no_assert, g);
+ remove_out_edge_if(nw_out, has_no_assert, g);
+
+ // Split v into word/nonword vertices with only asserting in-edges.
NFAVertex w_in = makeClone(rm, g, expr, v, cr_word);
NFAVertex nw_in = makeClone(rm, g, expr, v, cr_nonword);
- remove_in_edge_if(w_in, has_no_assert, g);
- remove_in_edge_if(nw_in, has_no_assert, g);
-
- // Prune edges with asserts from original v.
- auto has_assert = [&g](const NFAEdge &e) { return g[e].assert_flags; };
- remove_in_edge_if(v, has_assert, g);
- remove_out_edge_if(v, has_assert, g);
-}
-
-static
+ remove_in_edge_if(w_in, has_no_assert, g);
+ remove_in_edge_if(nw_in, has_no_assert, g);
+
+ // Prune edges with asserts from original v.
+ auto has_assert = [&g](const NFAEdge &e) { return g[e].assert_flags; };
+ remove_in_edge_if(v, has_assert, g);
+ remove_out_edge_if(v, has_assert, g);
+}
+
+static
void resolveEdges(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
set<NFAEdge> *dead) {
- for (const auto &e : edges_range(g)) {
- u32 flags = g[e].assert_flags;
- if (!flags) {
- continue;
- }
-
- NFAVertex u = source(e, g);
- NFAVertex v = target(e, g);
-
- assert(u != g.startDs);
-
- const CharReach &u_cr = g[u].char_reach;
- const CharReach &v_cr = g[v].char_reach;
-
- bool impassable = true;
- bool ucp = flags & UCP_ASSERT_FLAGS;
+ for (const auto &e : edges_range(g)) {
+ u32 flags = g[e].assert_flags;
+ if (!flags) {
+ continue;
+ }
+
+ NFAVertex u = source(e, g);
+ NFAVertex v = target(e, g);
+
+ assert(u != g.startDs);
+
+ const CharReach &u_cr = g[u].char_reach;
+ const CharReach &v_cr = g[v].char_reach;
+
+ bool impassable = true;
+ bool ucp = flags & UCP_ASSERT_FLAGS;
DEBUG_PRINTF("resolving edge %zu->%zu (flags=0x%x, ucp=%d)\n",
g[u].index, g[v].index, flags, (int)ucp);
- while (flags && impassable) {
- u32 flag = 1U << findAndClearLSB_32(&flags);
- switch (flag) {
- case POS_FLAG_ASSERT_NONWORD_TO_NONWORD:
- case POS_FLAG_ASSERT_NONWORD_TO_WORD:
- if ((u_cr & CHARREACH_NONWORD).none() && u != g.start) {
- continue;
- }
- break;
- case POS_FLAG_ASSERT_WORD_TO_NONWORD:
- case POS_FLAG_ASSERT_WORD_TO_WORD:
- if ((u_cr & CHARREACH_WORD).none() || u == g.start) {
- continue;
- }
- break;
- case POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP:
- case POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP:
- if ((u_cr & ~CHARREACH_NONWORD_UCP_PRE).any() && u != g.start) {
- continue;
- }
- break;
- case POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP:
- case POS_FLAG_ASSERT_WORD_TO_WORD_UCP:
- if ((u_cr & ~CHARREACH_WORD_UCP_PRE).any() || u == g.start) {
- continue;
- }
- break;
- default:
- assert(0);
- }
-
- if (v == g.accept) {
- /* accept special will need to be treated specially later */
- impassable = false;
- continue;
- }
-
- switch (flag) {
- case POS_FLAG_ASSERT_NONWORD_TO_NONWORD:
- case POS_FLAG_ASSERT_WORD_TO_NONWORD:
- if ((v_cr & CHARREACH_NONWORD).none() && v != g.acceptEod) {
- continue;
- }
- break;
- case POS_FLAG_ASSERT_WORD_TO_WORD:
- case POS_FLAG_ASSERT_NONWORD_TO_WORD:
- if ((v_cr & CHARREACH_WORD).none() || v == g.acceptEod) {
- continue;
- }
- break;
- case POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP:
- case POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP:
- if ((v_cr & ~CHARREACH_NONWORD_UCP_PRE).any()
- && v != g.acceptEod) {
- continue;
- }
- break;
- case POS_FLAG_ASSERT_WORD_TO_WORD_UCP:
- case POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP:
- if ((v_cr & ~CHARREACH_WORD_UCP_PRE).any()
- || v == g.acceptEod) {
- continue;
- }
- break;
- default:
- assert(0);
- }
- impassable = false;
- }
-
- if (impassable) {
- dead->insert(e);
- } else if (v == g.accept && !ucp) {
- bool u_w = (u_cr & CHARREACH_NONWORD).none() && u != g.start;
- UNUSED bool u_nw = (u_cr & CHARREACH_WORD).none() || u == g.start;
- assert(u_w != u_nw);
- bool v_w = false;
- bool v_nw = false;
-
- flags = g[e].assert_flags;
- if (u_w) {
- v_w = flags & POS_FLAG_ASSERT_WORD_TO_WORD;
- v_nw = flags & POS_FLAG_ASSERT_WORD_TO_NONWORD;
- } else {
- v_w = flags & POS_FLAG_ASSERT_NONWORD_TO_WORD;
- v_nw = flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD;
- }
- assert(v_w || v_nw);
- if (v_w && v_nw) {
- /* edge is effectively unconditional */
- g[e].assert_flags = 0;
- } else if (v_w) {
- /* need to add a word byte */
- NFAVertex vv = add_vertex(g);
+ while (flags && impassable) {
+ u32 flag = 1U << findAndClearLSB_32(&flags);
+ switch (flag) {
+ case POS_FLAG_ASSERT_NONWORD_TO_NONWORD:
+ case POS_FLAG_ASSERT_NONWORD_TO_WORD:
+ if ((u_cr & CHARREACH_NONWORD).none() && u != g.start) {
+ continue;
+ }
+ break;
+ case POS_FLAG_ASSERT_WORD_TO_NONWORD:
+ case POS_FLAG_ASSERT_WORD_TO_WORD:
+ if ((u_cr & CHARREACH_WORD).none() || u == g.start) {
+ continue;
+ }
+ break;
+ case POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP:
+ case POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP:
+ if ((u_cr & ~CHARREACH_NONWORD_UCP_PRE).any() && u != g.start) {
+ continue;
+ }
+ break;
+ case POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP:
+ case POS_FLAG_ASSERT_WORD_TO_WORD_UCP:
+ if ((u_cr & ~CHARREACH_WORD_UCP_PRE).any() || u == g.start) {
+ continue;
+ }
+ break;
+ default:
+ assert(0);
+ }
+
+ if (v == g.accept) {
+ /* accept special will need to be treated specially later */
+ impassable = false;
+ continue;
+ }
+
+ switch (flag) {
+ case POS_FLAG_ASSERT_NONWORD_TO_NONWORD:
+ case POS_FLAG_ASSERT_WORD_TO_NONWORD:
+ if ((v_cr & CHARREACH_NONWORD).none() && v != g.acceptEod) {
+ continue;
+ }
+ break;
+ case POS_FLAG_ASSERT_WORD_TO_WORD:
+ case POS_FLAG_ASSERT_NONWORD_TO_WORD:
+ if ((v_cr & CHARREACH_WORD).none() || v == g.acceptEod) {
+ continue;
+ }
+ break;
+ case POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP:
+ case POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP:
+ if ((v_cr & ~CHARREACH_NONWORD_UCP_PRE).any()
+ && v != g.acceptEod) {
+ continue;
+ }
+ break;
+ case POS_FLAG_ASSERT_WORD_TO_WORD_UCP:
+ case POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP:
+ if ((v_cr & ~CHARREACH_WORD_UCP_PRE).any()
+ || v == g.acceptEod) {
+ continue;
+ }
+ break;
+ default:
+ assert(0);
+ }
+ impassable = false;
+ }
+
+ if (impassable) {
+ dead->insert(e);
+ } else if (v == g.accept && !ucp) {
+ bool u_w = (u_cr & CHARREACH_NONWORD).none() && u != g.start;
+ UNUSED bool u_nw = (u_cr & CHARREACH_WORD).none() || u == g.start;
+ assert(u_w != u_nw);
+ bool v_w = false;
+ bool v_nw = false;
+
+ flags = g[e].assert_flags;
+ if (u_w) {
+ v_w = flags & POS_FLAG_ASSERT_WORD_TO_WORD;
+ v_nw = flags & POS_FLAG_ASSERT_WORD_TO_NONWORD;
+ } else {
+ v_w = flags & POS_FLAG_ASSERT_NONWORD_TO_WORD;
+ v_nw = flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD;
+ }
+ assert(v_w || v_nw);
+ if (v_w && v_nw) {
+ /* edge is effectively unconditional */
+ g[e].assert_flags = 0;
+ } else if (v_w) {
+ /* need to add a word byte */
+ NFAVertex vv = add_vertex(g);
setReportId(rm, g, expr, vv, -1);
- g[vv].char_reach = CHARREACH_WORD;
- add_edge(vv, g.accept, g);
- g[e].assert_flags = 0;
- add_edge(u, vv, g[e], g);
- dead->insert(e);
- } else {
- /* need to add a non word byte or see eod */
- NFAVertex vv = add_vertex(g);
+ g[vv].char_reach = CHARREACH_WORD;
+ add_edge(vv, g.accept, g);
+ g[e].assert_flags = 0;
+ add_edge(u, vv, g[e], g);
+ dead->insert(e);
+ } else {
+ /* need to add a non word byte or see eod */
+ NFAVertex vv = add_vertex(g);
setReportId(rm, g, expr, vv, -1);
- g[vv].char_reach = CHARREACH_NONWORD;
- add_edge(vv, g.accept, g);
- g[e].assert_flags = 0;
- add_edge(u, vv, g[e], g);
+ g[vv].char_reach = CHARREACH_NONWORD;
+ add_edge(vv, g.accept, g);
+ g[e].assert_flags = 0;
+ add_edge(u, vv, g[e], g);
/* there may already be a different edge from start to eod if so
* we need to make it unconditional and alive
*/
if (NFAEdge start_eod = edge(u, g.acceptEod, g)) {
- g[start_eod].assert_flags = 0;
- dead->erase(start_eod);
+ g[start_eod].assert_flags = 0;
+ dead->erase(start_eod);
} else {
add_edge(u, g.acceptEod, g[e], g);
- }
- dead->insert(e);
- }
- } else if (v == g.accept && ucp) {
- DEBUG_PRINTF("resolving ucp assert to accept\n");
- assert(u_cr.any());
- bool u_w = (u_cr & CHARREACH_WORD_UCP_PRE).any()
- && u != g.start;
- bool u_nw = (u_cr & CHARREACH_NONWORD_UCP_PRE).any()
- || u == g.start;
- assert(u_w || u_nw);
-
- bool v_w = false;
- bool v_nw = false;
-
- flags = g[e].assert_flags;
- if (u_w) {
- v_w |= flags & POS_FLAG_ASSERT_WORD_TO_WORD_UCP;
- v_nw |= flags & POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP;
- }
- if (u_nw) {
- v_w |= flags & POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP;
- v_nw |= flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP;
- }
- assert(v_w || v_nw);
- if (v_w && v_nw) {
- /* edge is effectively unconditional */
- g[e].assert_flags = 0;
- } else if (v_w) {
- /* need to add a word byte */
- NFAVertex vv = add_vertex(g);
+ }
+ dead->insert(e);
+ }
+ } else if (v == g.accept && ucp) {
+ DEBUG_PRINTF("resolving ucp assert to accept\n");
+ assert(u_cr.any());
+ bool u_w = (u_cr & CHARREACH_WORD_UCP_PRE).any()
+ && u != g.start;
+ bool u_nw = (u_cr & CHARREACH_NONWORD_UCP_PRE).any()
+ || u == g.start;
+ assert(u_w || u_nw);
+
+ bool v_w = false;
+ bool v_nw = false;
+
+ flags = g[e].assert_flags;
+ if (u_w) {
+ v_w |= flags & POS_FLAG_ASSERT_WORD_TO_WORD_UCP;
+ v_nw |= flags & POS_FLAG_ASSERT_WORD_TO_NONWORD_UCP;
+ }
+ if (u_nw) {
+ v_w |= flags & POS_FLAG_ASSERT_NONWORD_TO_WORD_UCP;
+ v_nw |= flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD_UCP;
+ }
+ assert(v_w || v_nw);
+ if (v_w && v_nw) {
+ /* edge is effectively unconditional */
+ g[e].assert_flags = 0;
+ } else if (v_w) {
+ /* need to add a word byte */
+ NFAVertex vv = add_vertex(g);
setReportId(rm, g, expr, vv, -1);
- g[vv].char_reach = CHARREACH_WORD_UCP_PRE;
- add_edge(vv, g.accept, g);
- g[e].assert_flags = 0;
- add_edge(u, vv, g[e], g);
- dead->insert(e);
- } else {
- /* need to add a non word byte or see eod */
- NFAVertex vv = add_vertex(g);
+ g[vv].char_reach = CHARREACH_WORD_UCP_PRE;
+ add_edge(vv, g.accept, g);
+ g[e].assert_flags = 0;
+ add_edge(u, vv, g[e], g);
+ dead->insert(e);
+ } else {
+ /* need to add a non word byte or see eod */
+ NFAVertex vv = add_vertex(g);
setReportId(rm, g, expr, vv, -1);
- g[vv].char_reach = CHARREACH_NONWORD_UCP_PRE;
- add_edge(vv, g.accept, g);
- g[e].assert_flags = 0;
- add_edge(u, vv, g[e], g);
+ g[vv].char_reach = CHARREACH_NONWORD_UCP_PRE;
+ add_edge(vv, g.accept, g);
+ g[e].assert_flags = 0;
+ add_edge(u, vv, g[e], g);
/* there may already be a different edge from start to eod if so
* we need to make it unconditional and alive
*/
if (NFAEdge start_eod = edge(u, g.acceptEod, g)) {
- g[start_eod].assert_flags = 0;
- dead->erase(start_eod);
+ g[start_eod].assert_flags = 0;
+ dead->erase(start_eod);
} else {
add_edge(u, g.acceptEod, g[e], g);
- }
- dead->insert(e);
- }
- } else {
- /* we can remove the asserts as we have partitioned the vertices
- * into w/nw around the assert edges
- */
- g[e].assert_flags = 0;
- }
- }
-}
-
+ }
+ dead->insert(e);
+ }
+ } else {
+ /* we can remove the asserts as we have partitioned the vertices
+ * into w/nw around the assert edges
+ */
+ g[e].assert_flags = 0;
+ }
+ }
+}
+
void resolveAsserts(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr) {
- vector<NFAEdge> asserts = getAsserts(g);
- if (asserts.empty()) {
- return;
- }
-
- map<u32, NFAVertex> to_split; /* by index, for determinism */
- map<u32, NFAVertex> to_split_ucp; /* by index, for determinism */
- findSplitters(g, asserts, &to_split, &to_split_ucp);
- if (to_split.size() + to_split_ucp.size() > MAX_CLONED_VERTICES) {
+ vector<NFAEdge> asserts = getAsserts(g);
+ if (asserts.empty()) {
+ return;
+ }
+
+ map<u32, NFAVertex> to_split; /* by index, for determinism */
+ map<u32, NFAVertex> to_split_ucp; /* by index, for determinism */
+ findSplitters(g, asserts, &to_split, &to_split_ucp);
+ if (to_split.size() + to_split_ucp.size() > MAX_CLONED_VERTICES) {
throw CompileError(expr.index, "Pattern is too large.");
- }
-
- for (const auto &m : to_split) {
- assert(!contains(to_split_ucp, m.first));
+ }
+
+ for (const auto &m : to_split) {
+ assert(!contains(to_split_ucp, m.first));
splitVertex(rm, g, expr, m.second, false);
- }
-
- for (const auto &m : to_split_ucp) {
+ }
+
+ for (const auto &m : to_split_ucp) {
splitVertex(rm, g, expr, m.second, true);
- }
-
- set<NFAEdge> dead;
+ }
+
+ set<NFAEdge> dead;
resolveEdges(rm, g, expr, &dead);
-
- remove_edges(dead, g);
+
+ remove_edges(dead, g);
renumber_vertices(g);
- pruneUseless(g);
- pruneEmptyVertices(g);
-
+ pruneUseless(g);
+ pruneEmptyVertices(g);
+
renumber_vertices(g);
renumber_edges(g);
- clearReports(g);
-}
-
+ clearReports(g);
+}
+
void ensureCodePointStart(ReportManager &rm, NGHolder &g,
const ExpressionInfo &expr) {
- /* In utf8 mode there is an implicit assertion that we start at codepoint
- * boundaries. Assert resolution handles the badness coming from asserts.
- * The only other source of trouble is startDs->accept connections.
- */
+ /* In utf8 mode there is an implicit assertion that we start at codepoint
+ * boundaries. Assert resolution handles the badness coming from asserts.
+ * The only other source of trouble is startDs->accept connections.
+ */
NFAEdge orig = edge(g.startDs, g.accept, g);
if (expr.utf8 && orig) {
DEBUG_PRINTF("rectifying %u\n", expr.report);
Report ir = rm.getBasicInternalReport(expr);
- ReportID rep = rm.getInternalId(ir);
-
- NFAVertex v_a = add_vertex(g);
- g[v_a].assert_flags = POS_FLAG_VIRTUAL_START;
- g[v_a].char_reach = UTF_ASCII_CR;
- add_edge(v_a, g.accept, g[orig], g);
-
- NFAVertex v_2 = add_vertex(g);
- g[v_2].assert_flags = POS_FLAG_VIRTUAL_START;
- g[v_2].char_reach = CharReach(UTF_TWO_BYTE_MIN, UTF_TWO_BYTE_MAX);
-
- NFAVertex v_3 = add_vertex(g);
- g[v_3].assert_flags = POS_FLAG_VIRTUAL_START;
- g[v_3].char_reach = CharReach(UTF_THREE_BYTE_MIN, UTF_THREE_BYTE_MAX);
-
- NFAVertex v_4 = add_vertex(g);
- g[v_4].assert_flags = POS_FLAG_VIRTUAL_START;
- g[v_4].char_reach = CharReach(UTF_FOUR_BYTE_MIN, UTF_FOUR_BYTE_MAX);
-
- NFAVertex v_c = add_vertex(g);
- g[v_c].assert_flags = POS_FLAG_VIRTUAL_START;
- g[v_c].char_reach = UTF_CONT_CR;
- add_edge(v_c, g.accept, g[orig], g);
-
- add_edge(v_2, v_c, g);
-
- NFAVertex v_3c = add_vertex(g);
- g[v_3c].assert_flags = POS_FLAG_VIRTUAL_START;
- g[v_3c].char_reach = UTF_CONT_CR;
- add_edge(v_3c, v_c, g);
- add_edge(v_3, v_3c, g);
-
- NFAVertex v_4c = add_vertex(g);
- g[v_4c].assert_flags = POS_FLAG_VIRTUAL_START;
- g[v_4c].char_reach = UTF_CONT_CR;
- add_edge(v_4c, v_3c, g);
- add_edge(v_4, v_4c, g);
-
- g[v_a].reports.insert(rep);
- g[v_c].reports.insert(rep);
-
- add_edge(g.start, v_a, g);
- add_edge(g.startDs, v_a, g);
- add_edge(g.start, v_2, g);
- add_edge(g.startDs, v_2, g);
- add_edge(g.start, v_3, g);
- add_edge(g.startDs, v_3, g);
- add_edge(g.start, v_4, g);
- add_edge(g.startDs, v_4, g);
- remove_edge(orig, g);
+ ReportID rep = rm.getInternalId(ir);
+
+ NFAVertex v_a = add_vertex(g);
+ g[v_a].assert_flags = POS_FLAG_VIRTUAL_START;
+ g[v_a].char_reach = UTF_ASCII_CR;
+ add_edge(v_a, g.accept, g[orig], g);
+
+ NFAVertex v_2 = add_vertex(g);
+ g[v_2].assert_flags = POS_FLAG_VIRTUAL_START;
+ g[v_2].char_reach = CharReach(UTF_TWO_BYTE_MIN, UTF_TWO_BYTE_MAX);
+
+ NFAVertex v_3 = add_vertex(g);
+ g[v_3].assert_flags = POS_FLAG_VIRTUAL_START;
+ g[v_3].char_reach = CharReach(UTF_THREE_BYTE_MIN, UTF_THREE_BYTE_MAX);
+
+ NFAVertex v_4 = add_vertex(g);
+ g[v_4].assert_flags = POS_FLAG_VIRTUAL_START;
+ g[v_4].char_reach = CharReach(UTF_FOUR_BYTE_MIN, UTF_FOUR_BYTE_MAX);
+
+ NFAVertex v_c = add_vertex(g);
+ g[v_c].assert_flags = POS_FLAG_VIRTUAL_START;
+ g[v_c].char_reach = UTF_CONT_CR;
+ add_edge(v_c, g.accept, g[orig], g);
+
+ add_edge(v_2, v_c, g);
+
+ NFAVertex v_3c = add_vertex(g);
+ g[v_3c].assert_flags = POS_FLAG_VIRTUAL_START;
+ g[v_3c].char_reach = UTF_CONT_CR;
+ add_edge(v_3c, v_c, g);
+ add_edge(v_3, v_3c, g);
+
+ NFAVertex v_4c = add_vertex(g);
+ g[v_4c].assert_flags = POS_FLAG_VIRTUAL_START;
+ g[v_4c].char_reach = UTF_CONT_CR;
+ add_edge(v_4c, v_3c, g);
+ add_edge(v_4, v_4c, g);
+
+ g[v_a].reports.insert(rep);
+ g[v_c].reports.insert(rep);
+
+ add_edge(g.start, v_a, g);
+ add_edge(g.startDs, v_a, g);
+ add_edge(g.start, v_2, g);
+ add_edge(g.startDs, v_2, g);
+ add_edge(g.start, v_3, g);
+ add_edge(g.startDs, v_3, g);
+ add_edge(g.start, v_4, g);
+ add_edge(g.startDs, v_4, g);
+ remove_edge(orig, g);
renumber_edges(g);
clearReports(g);
- }
-}
-
-} // namespace ue2
+ }
+}
+
+} // namespace ue2