diff options
author | Devtools Arcadia <arcadia-devtools@yandex-team.ru> | 2022-02-07 18:08:42 +0300 |
---|---|---|
committer | Devtools Arcadia <arcadia-devtools@mous.vla.yp-c.yandex.net> | 2022-02-07 18:08:42 +0300 |
commit | 1110808a9d39d4b808aef724c861a2e1a38d2a69 (patch) | |
tree | e26c9fed0de5d9873cce7e00bc214573dc2195b7 /contrib/libs/hyperscan/src/rose/rose_build_misc.cpp | |
download | ydb-1110808a9d39d4b808aef724c861a2e1a38d2a69.tar.gz |
intermediate changes
ref:cde9a383711a11544ce7e107a78147fb96cc4029
Diffstat (limited to 'contrib/libs/hyperscan/src/rose/rose_build_misc.cpp')
-rw-r--r-- | contrib/libs/hyperscan/src/rose/rose_build_misc.cpp | 1044 |
1 files changed, 1044 insertions, 0 deletions
diff --git a/contrib/libs/hyperscan/src/rose/rose_build_misc.cpp b/contrib/libs/hyperscan/src/rose/rose_build_misc.cpp new file mode 100644 index 0000000000..0b0e689c99 --- /dev/null +++ b/contrib/libs/hyperscan/src/rose/rose_build_misc.cpp @@ -0,0 +1,1044 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rose_build_misc.h" +#include "rose_build_impl.h" + +#include "rose_build_resources.h" +#include "hwlm/hwlm_literal.h" +#include "nfa/castlecompile.h" +#include "nfa/goughcompile.h" +#include "nfa/mcclellancompile_util.h" +#include "nfa/nfa_api.h" +#include "nfa/rdfa.h" +#include "nfa/tamaramacompile.h" +#include "nfagraph/ng_holder.h" +#include "nfagraph/ng_limex.h" +#include "nfagraph/ng_reports.h" +#include "nfagraph/ng_repeat.h" +#include "nfagraph/ng_util.h" +#include "nfagraph/ng_width.h" +#include "smallwrite/smallwrite_build.h" +#include "util/alloc.h" +#include "util/boundary_reports.h" +#include "util/compile_context.h" +#include "util/container.h" +#include "util/graph.h" +#include "util/graph_range.h" +#include "util/make_unique.h" +#include "util/order_check.h" +#include "util/report_manager.h" +#include "util/ue2string.h" +#include "util/verify_types.h" +#include "ue2common.h" +#include "grey.h" + +#include <boost/graph/breadth_first_search.hpp> + +using namespace std; + +namespace ue2 { + +// just to get it out of the header +RoseBuild::~RoseBuild() { } + +RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, + SomSlotManager &ssm_in, + SmallWriteBuild &smwr_in, + const CompileContext &cc_in, + const BoundaryReports &boundary_in) + : cc(cc_in), + root(add_vertex(g)), + anchored_root(add_vertex(g)), + hasSom(false), + group_end(0), + ematcher_region_size(0), + eod_event_literal_id(MO_INVALID_IDX), + max_rose_anchored_floating_overlap(0), + rm(rm_in), + ssm(ssm_in), + smwr(smwr_in), + boundary(boundary_in), + next_nfa_report(0) { + // add root vertices to graph + g[root].min_offset = 0; + g[root].max_offset = 0; + + g[anchored_root].min_offset = 0; + g[anchored_root].max_offset = 0; +} + +RoseBuildImpl::~RoseBuildImpl() { + // empty +} + +bool RoseVertexProps::isBoring(void) const { + return !suffix && !left; +} + +bool RoseVertexProps::fixedOffset(void) const { + assert(min_offset <= max_offset); /* ensure offsets calculated */ + return max_offset == min_offset && max_offset != ROSE_BOUND_INF; +} + +bool RoseBuildImpl::isRootSuccessor(const RoseVertex &v) const { + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (isAnyStart(u)) { + return true; + } + } + return false; +} + +bool RoseBuildImpl::isNonRootSuccessor(const RoseVertex &v) const { + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (!isAnyStart(u)) { + return true; + } + } + return false; +} + +bool hasAnchHistorySucc(const RoseGraph &g, RoseVertex v) { + for (const auto &e : out_edges_range(v, g)) { + if (g[e].history == ROSE_ROLE_HISTORY_ANCH) { + return true; + } + } + + return false; +} + +bool hasLastByteHistorySucc(const RoseGraph &g, RoseVertex v) { + for (const auto &e : out_edges_range(v, g)) { + if (g[e].history == ROSE_ROLE_HISTORY_LAST_BYTE) { + return true; + } + } + + return false; +} + +static +bool isInTable(const RoseBuildImpl &tbi, RoseVertex v, + rose_literal_table table) { + const auto &lit_ids = tbi.g[v].literals; + if (lit_ids.empty()) { + return false; // special role with no literals + } + + // All literals for a given vertex will be in the same table, so we need + // only inspect the first one. + const auto lit_table = tbi.literals.at(*lit_ids.begin()).table; + + // Verify that all literals for this vertex are in the same table. + assert(all_of_in(lit_ids, [&](u32 lit_id) { + return tbi.literals.at(lit_id).table == lit_table; + })); + + return lit_table == table; +} + +bool RoseBuildImpl::isAnchored(RoseVertex v) const { + return isInTable(*this, v, ROSE_ANCHORED); +} + +bool RoseBuildImpl::isFloating(RoseVertex v) const { + return isInTable(*this, v, ROSE_FLOATING); +} + +bool RoseBuildImpl::isInETable(RoseVertex v) const { + return isInTable(*this, v, ROSE_EOD_ANCHORED); +} + +bool RoseBuildImpl::hasLiteralInTable(RoseVertex v, + enum rose_literal_table t) const { + return isInTable(*this, v, t); +} + +/* Indicates that the floating table (if it exists) will be only run + conditionally based on matches from the anchored table. */ +bool RoseBuildImpl::hasNoFloatingRoots() const { + for (auto v : adjacent_vertices_range(root, g)) { + if (isFloating(v)) { + DEBUG_PRINTF("direct floating root %zu\n", g[v].index); + return false; + } + } + + /* need to check if the anchored_root has any literals which are too deep */ + for (auto v : adjacent_vertices_range(anchored_root, g)) { + if (isFloating(v)) { + DEBUG_PRINTF("indirect floating root %zu\n", g[v].index); + return false; + } + } + + return true; +} + +size_t RoseBuildImpl::maxLiteralLen(RoseVertex v) const { + const auto &lit_ids = g[v].literals; + assert(!lit_ids.empty()); + + size_t maxlen = 0; + + for (const auto &lit_id : lit_ids) { + maxlen = max(maxlen, literals.at(lit_id).elength()); + } + + return maxlen; +} + +size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const { + const auto &lit_ids = g[v].literals; + assert(!lit_ids.empty()); + + size_t minlen = ROSE_BOUND_INF; + + for (const auto &lit_id : lit_ids) { + minlen = min(minlen, literals.at(lit_id).elength()); + } + + return minlen; +} + +// RoseBuild factory +unique_ptr<RoseBuild> makeRoseBuilder(ReportManager &rm, + SomSlotManager &ssm, + SmallWriteBuild &smwr, + const CompileContext &cc, + const BoundaryReports &boundary) { + return ue2::make_unique<RoseBuildImpl>(rm, ssm, smwr, cc, boundary); +} + +bool roseIsPureLiteral(const RoseEngine *t) { + return t->runtimeImpl == ROSE_RUNTIME_PURE_LITERAL; +} + +// Returns non-zero max overlap len if a suffix of the literal 'a' overlaps +// with a prefix of the literal 'b' or 'a' can be contained in 'b'. +size_t maxOverlap(const ue2_literal &a, const ue2_literal &b, u32 b_delay) { + /* overly conservative if only part of the string is nocase */ + bool nocase = a.any_nocase() || b.any_nocase(); + DEBUG_PRINTF("max overlap %s %s+%u %d\n", dumpString(a).c_str(), + dumpString(b).c_str(), b_delay, (int)nocase); + size_t a_len = a.length(); + size_t b_len = b.length(); + const char *a_end = a.c_str() + a_len; + const char *b_end = b.c_str() + b_len; + if (b_delay >= a_len) { + return b_len + b_delay; + } else if (b_delay) { + /* a can be a substring of b which overlaps some of the end dots + * OR b can be a substring near the end of a */ + /* ignore overlap due to the final trailing dot as delayed literals + * are delivered before undelayed */ + for (u32 j = b_delay - 1; j > 0; j--) { + if (b_len + j >= a_len) { + if (!cmp(a.c_str(), b_end + j - a_len, a_len - j, nocase)) { + return b_len + j; + } + } else { + if (!cmp(a_end - j - b_len, b.c_str(), b_len, nocase)) { + return b_len + j; + } + } + } + } + + return maxStringOverlap(a.get_string(), b.get_string(), nocase); +} + +// Returns non-zero max overlap len if a suffix of the literal ID 'a' overlaps +// with a prefix of the literal ID 'b' or 'a' can be contained in 'b'. +size_t maxOverlap(const rose_literal_id &a, const rose_literal_id &b) { + assert(!a.delay); + return maxOverlap(a.s, b.s, b.delay); +} + +static +const rose_literal_id &getOverlapLiteral(const RoseBuildImpl &tbi, + u32 literal_id) { + auto it = tbi.anchoredLitSuffix.find(literal_id); + if (it != tbi.anchoredLitSuffix.end()) { + return it->second; + } + return tbi.literals.at(literal_id); +} + +ue2_literal findNonOverlappingTail(const set<ue2_literal> &lits, + const ue2_literal &s) { + size_t max_overlap = 0; + + for (const auto &lit : lits) { + size_t overlap = lit != s ? maxStringOverlap(lit, s) + : maxStringSelfOverlap(s); + max_overlap = max(max_overlap, overlap); + } + + /* find the tail that doesn't overlap */ + ue2_literal tail = s.substr(max_overlap); + DEBUG_PRINTF("%zu overlap, tail: '%s'\n", max_overlap, + dumpString(tail).c_str()); + return tail; +} + +size_t RoseBuildImpl::maxLiteralOverlap(RoseVertex u, RoseVertex v) const { + size_t overlap = 0; + for (auto u_lit_id : g[u].literals) { + const rose_literal_id &ul = getOverlapLiteral(*this, u_lit_id); + for (auto v_lit_id : g[v].literals) { + const rose_literal_id &vl = getOverlapLiteral(*this, v_lit_id); + overlap = max(overlap, maxOverlap(ul, vl)); + } + } + return overlap; +} + +void RoseBuildImpl::removeVertices(const vector<RoseVertex> &dead) { + for (auto v : dead) { + assert(!isAnyStart(v)); + DEBUG_PRINTF("removing vertex %zu\n", g[v].index); + for (auto lit_id : g[v].literals) { + literal_info[lit_id].vertices.erase(v); + } + clear_vertex(v, g); + remove_vertex(v, g); + } + renumber_vertices(g); +} + +// Find the maximum bound on the edges to this vertex's successors ignoring +// those via infixes. +u32 RoseBuildImpl::calcSuccMaxBound(RoseVertex u) const { + u32 maxBound = 0; + for (const auto &e : out_edges_range(u, g)) { + RoseVertex v = target(e, g); + + if (g[v].left) { + continue; + } + + u32 thisBound = g[e].maxBound; + + if (thisBound == ROSE_BOUND_INF) { + return ROSE_BOUND_INF; + } + + if (!g[v].eod_accept) { + // Add the length of the longest of our literals. + thisBound += maxLiteralLen(v); + } + + maxBound = max(maxBound, thisBound); + } + + assert(maxBound <= ROSE_BOUND_INF); + return maxBound; +} + +u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, u32 delay, + rose_literal_table table) { + DEBUG_PRINTF("getting id for %s in table %d\n", dumpString(s).c_str(), + table); + assert(table != ROSE_ANCHORED); + rose_literal_id key(s, table, delay); + + auto m = literals.insert(key); + u32 id = m.first; + bool inserted = m.second; + + if (inserted) { + literal_info.push_back(rose_literal_info()); + assert(literal_info.size() == id + 1); + + if (delay) { + u32 undelayed_id = getLiteralId(s, 0, table); + literal_info[id].undelayed_id = undelayed_id; + literal_info[undelayed_id].delayed_ids.insert(id); + } else { + literal_info[id].undelayed_id = id; + } + } + return id; +} + +// Function that operates on a msk/cmp pair and a literal, as used in +// hwlmLiteral, and zeroes msk elements that don't add any power to the +// literal. +void normaliseLiteralMask(const ue2_literal &s_in, vector<u8> &msk, + vector<u8> &cmp) { + assert(msk.size() == cmp.size()); + assert(msk.size() <= HWLM_MASKLEN); + + if (msk.empty()) { + return; + } + + // Work over a caseless copy if the string contains nocase chars. This will + // ensure that we treat masks designed to handle mixed-sensitivity literals + // correctly: these will be matched by the literal matcher in caseless + // mode, with the mask used to narrow the matches. + ue2_literal s(s_in); + if (s.any_nocase()) { + make_nocase(&s); + } + + ue2_literal::const_reverse_iterator it = s.rbegin(), ite = s.rend(); + size_t i = msk.size(); + while (i-- != 0 && it != ite) { + const CharReach &cr = *it; + for (size_t c = cr.find_first(); c != CharReach::npos; + c = cr.find_next(c)) { + if (((u8)c & msk[i]) != cmp[i]) { + goto skip; + } + } + + // If we didn't jump out of the loop to skip, then this mask position + // doesn't further narrow the set of acceptable literals from those + // accepted by s. So we can zero this element. + msk[i] = 0; + cmp[i] = 0; + skip: + ++it; + } + + // Wipe out prefix zeroes. + while (!msk.empty() && msk[0] == 0) { + msk.erase(msk.begin()); + cmp.erase(cmp.begin()); + } +} + +rose_literal_id::rose_literal_id(const ue2_literal &s_in, + const vector<u8> &msk_in, const vector<u8> &cmp_in, + rose_literal_table table_in, u32 delay_in) + : s(s_in), msk(msk_in), cmp(cmp_in), table(table_in), + delay(delay_in), distinctiveness(0) { + assert(msk.size() == cmp.size()); + assert(msk.size() <= HWLM_MASKLEN); + assert(delay <= MAX_DELAY); + + normaliseLiteralMask(s, msk, cmp); +} + +u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector<u8> &msk, + const vector<u8> &cmp, u32 delay, + rose_literal_table table) { + DEBUG_PRINTF("getting id for %s in table %d\n", dumpString(s).c_str(), + table); + assert(table != ROSE_ANCHORED); + rose_literal_id key(s, msk, cmp, table, delay); + + /* ue2_literals are always uppercased if nocase and must have an + * alpha char */ + + auto m = literals.insert(key); + u32 id = m.first; + bool inserted = m.second; + + if (inserted) { + literal_info.push_back(rose_literal_info()); + assert(literal_info.size() == id + 1); + + if (delay) { + u32 undelayed_id = getLiteralId(s, msk, cmp, 0, table); + literal_info[id].undelayed_id = undelayed_id; + literal_info[undelayed_id].delayed_ids.insert(id); + } else { + literal_info[id].undelayed_id = id; + } + } + return id; +} + +u32 RoseBuildImpl::getNewLiteralId() { + rose_literal_id key(ue2_literal(), ROSE_ANCHORED, 0); + u32 numLiterals = verify_u32(literals.size()); + key.distinctiveness = numLiterals; + + auto m = literals.insert(key); + assert(m.second); + u32 id = m.first; + + literal_info.push_back(rose_literal_info()); + assert(literal_info.size() == id + 1); + + literal_info[id].undelayed_id = id; + + return id; +} + +bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) { + ORDER_CHECK(minBound); + ORDER_CHECK(maxBound); + ORDER_CHECK(history); + return false; +} + +#ifndef NDEBUG +bool roseHasTops(const RoseBuildImpl &build, RoseVertex v) { + const RoseGraph &g = build.g; + assert(g[v].left); + + set<u32> graph_tops; + if (!build.isRootSuccessor(v)) { + for (const auto &e : in_edges_range(v, g)) { + graph_tops.insert(g[e].rose_top); + } + } + + return is_subset_of(graph_tops, all_tops(g[v].left)); +} +#endif + +u32 OutfixInfo::get_queue(QueueIndexFactory &qif) { + if (queue == ~0U) { + queue = qif.get_queue(); + } + + return queue; +} + +namespace { +class OutfixAllReports : public boost::static_visitor<set<ReportID>> { +public: + set<ReportID> operator()(const boost::blank &) const { + return set<ReportID>(); + } + + template<class T> + set<ReportID> operator()(const unique_ptr<T> &x) const { + return all_reports(*x); + } + + set<ReportID> operator()(const MpvProto &mpv) const { + set<ReportID> reports; + for (const auto &puff : mpv.puffettes) { + reports.insert(puff.report); + } + for (const auto &puff : mpv.triggered_puffettes) { + reports.insert(puff.report); + } + return reports; + } +}; +} + +set<ReportID> all_reports(const OutfixInfo &outfix) { + auto reports = boost::apply_visitor(OutfixAllReports(), outfix.proto); + assert(!reports.empty()); + return reports; +} + +bool RoseSuffixInfo::operator==(const RoseSuffixInfo &b) const { + return top == b.top && graph == b.graph && castle == b.castle && + rdfa == b.rdfa && haig == b.haig && tamarama == b.tamarama; +} + +bool RoseSuffixInfo::operator<(const RoseSuffixInfo &b) const { + const RoseSuffixInfo &a = *this; + ORDER_CHECK(top); + ORDER_CHECK(graph); + ORDER_CHECK(castle); + ORDER_CHECK(haig); + ORDER_CHECK(rdfa); + ORDER_CHECK(tamarama); + assert(a.dfa_min_width == b.dfa_min_width); + assert(a.dfa_max_width == b.dfa_max_width); + return false; +} + +size_t RoseSuffixInfo::hash() const { + return hash_all(top, graph, castle, rdfa, haig, tamarama); +} + +void RoseSuffixInfo::reset(void) { + top = 0; + graph.reset(); + castle.reset(); + rdfa.reset(); + haig.reset(); + tamarama.reset(); + dfa_min_width = depth(0); + dfa_max_width = depth::infinity(); +} + +std::set<ReportID> all_reports(const suffix_id &s) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + if (s.tamarama()) { + return all_reports(*s.tamarama()); + } else if (s.graph()) { + return all_reports(*s.graph()); + } else if (s.castle()) { + return all_reports(*s.castle()); + } else if (s.dfa()) { + return all_reports(*s.dfa()); + } else { + return all_reports(*s.haig()); + } +} + +depth findMinWidth(const suffix_id &s) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + if (s.graph()) { + return findMinWidth(*s.graph()); + } else if (s.castle()) { + return findMinWidth(*s.castle()); + } else { + return s.dfa_min_width; + } +} + +depth findMinWidth(const suffix_id &s, u32 top) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + if (s.graph()) { + return findMinWidth(*s.graph(), top); + } else if (s.castle()) { + return findMinWidth(*s.castle(), top); + } else { + return s.dfa_min_width; + } +} + +depth findMaxWidth(const suffix_id &s) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + if (s.graph()) { + return findMaxWidth(*s.graph()); + } else if (s.castle()) { + return findMaxWidth(*s.castle()); + } else { + return s.dfa_max_width; + } +} + +depth findMaxWidth(const suffix_id &s, u32 top) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + if (s.graph()) { + return findMaxWidth(*s.graph(), top); + } else if (s.castle()) { + return findMaxWidth(*s.castle(), top); + } else { + return s.dfa_max_width; + } +} + +bool has_eod_accepts(const suffix_id &s) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + if (s.graph()) { + /* ignore accept -> eod edge */ + return in_degree(s.graph()->acceptEod, *s.graph()) > 1; + } else if (s.castle()) { + return false; + } else if (s.dfa()) { + return has_eod_accepts(*s.dfa()); + } else { + return has_eod_accepts(*s.haig()); + } +} + +bool has_non_eod_accepts(const suffix_id &s) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + if (s.graph()) { + return in_degree(s.graph()->accept, *s.graph()); + } else if (s.castle()) { + return true; + } else if (s.dfa()) { + return has_non_eod_accepts(*s.dfa()); + } else { + return has_non_eod_accepts(*s.haig()); + } +} + +set<u32> all_tops(const suffix_id &s) { + assert(s.graph() || s.castle() || s.haig() || s.dfa()); + if (s.graph()) { + flat_set<u32> tops = getTops(*s.graph()); + assert(!tops.empty()); + return {tops.begin(), tops.end()}; + } + + if (s.castle()) { + return assoc_keys(s.castle()->repeats); + } + + // Other types of suffix are not multi-top. + return {0}; +} + +size_t suffix_id::hash() const { + return hash_all(g, c, d, h, t); +} + +bool isAnchored(const left_id &r) { + assert(r.graph() || r.castle() || r.haig() || r.dfa()); + if (r.graph()) { + return isAnchored(*r.graph()); + } + if (r.dfa()) { + return r.dfa()->start_anchored == DEAD_STATE; + } + if (r.haig()) { + return r.haig()->start_anchored == DEAD_STATE; + } + + // All other types are explicitly anchored. + return true; +} + +depth findMinWidth(const left_id &r) { + assert(r.graph() || r.castle() || r.haig() || r.dfa()); + if (r.graph()) { + return findMinWidth(*r.graph()); + } else if (r.castle()) { + return findMinWidth(*r.castle()); + } else { + return r.dfa_min_width; + } +} + +depth findMaxWidth(const left_id &r) { + assert(r.graph() || r.castle() || r.haig() || r.dfa()); + if (r.graph()) { + return findMaxWidth(*r.graph()); + } else if (r.castle()) { + return findMaxWidth(*r.castle()); + } else { + return r.dfa_max_width; + } +} + +set<u32> all_tops(const left_id &r) { + assert(r.graph() || r.castle() || r.haig() || r.dfa()); + if (r.graph()) { + flat_set<u32> tops = getTops(*r.graph()); + return {tops.begin(), tops.end()}; + } + + if (r.castle()) { + return assoc_keys(r.castle()->repeats); + } + + // Other types of rose are not multi-top. + return {0}; +} + +set<u32> all_reports(const left_id &left) { + assert(left.graph() || left.castle() || left.haig() || left.dfa()); + if (left.graph()) { + return all_reports(*left.graph()); + } else if (left.castle()) { + return all_reports(*left.castle()); + } else if (left.dfa()) { + return all_reports(*left.dfa()); + } else { + return all_reports(*left.haig()); + } +} + +u32 num_tops(const left_id &r) { + return all_tops(r).size(); +} + +size_t left_id::hash() const { + return hash_all(g, c, d, h); +} + +u64a findMaxOffset(const set<ReportID> &reports, const ReportManager &rm) { + assert(!reports.empty()); + u64a maxOffset = 0; + for (const auto &report_id : reports) { + const Report &ir = rm.getReport(report_id); + if (ir.hasBounds()) { + maxOffset = max(maxOffset, ir.maxOffset); + } else { + return MAX_OFFSET; + } + } + return maxOffset; +} + +size_t LeftEngInfo::hash() const { + return hash_all(graph, castle, dfa, haig, tamarama, lag, leftfix_report); +} + +void LeftEngInfo::reset(void) { + graph.reset(); + castle.reset(); + dfa.reset(); + haig.reset(); + tamarama.reset(); + lag = 0; + leftfix_report = MO_INVALID_IDX; + dfa_min_width = depth(0); + dfa_max_width = depth::infinity(); +} + +LeftEngInfo::operator bool() const { + assert((int)!!castle + (int)!!dfa + (int)!!haig <= 1); + assert(!castle || !graph); + assert(!dfa || graph); /* dfas always have the graph as well */ + assert(!haig || graph); + return graph || castle || dfa || haig; +} + +u32 roseQuality(const RoseResources &res, const RoseEngine *t) { + /* Rose is low quality if the atable is a Mcclellan 16 or has multiple DFAs + */ + if (res.has_anchored) { + if (res.has_anchored_multiple) { + DEBUG_PRINTF("multiple atable engines\n"); + return 0; + } + + if (res.has_anchored_large) { + DEBUG_PRINTF("m16 atable engine\n"); + return 0; + } + } + + /* if we always run multiple engines then we are slow */ + u32 always_run = 0; + + if (res.has_anchored) { + always_run++; + } + + if (t->eagerIterOffset) { + /* eager prefixes are always run */ + always_run++; + } + + if (res.has_floating) { + /* TODO: ignore conditional ftables, or ftables beyond smwr region */ + always_run++; + } + + if (t->ematcherOffset) { + always_run++; + } + + /* ignore mpv outfixes as they are v good, mpv outfixes are before begin */ + if (t->outfixBeginQueue != t->outfixEndQueue) { + /* TODO: ignore outfixes > smwr region */ + always_run++; + } + + bool eod_prefix = false; + + const LeftNfaInfo *left = getLeftTable(t); + for (u32 i = 0; i < t->activeLeftCount; i++) { + if (left->eod_check) { + eod_prefix = true; + break; + } + } + + if (eod_prefix) { + always_run++; + DEBUG_PRINTF("eod prefixes are slow"); + return 0; + } + + if (always_run > 1) { + DEBUG_PRINTF("we always run %u engines\n", always_run); + return 0; + } + + return 1; +} + +u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) { + const auto &lit_vertices = build.literal_info.at(lit_id).vertices; + assert(!lit_vertices.empty()); + + u32 min_offset = UINT32_MAX; + for (const auto &v : lit_vertices) { + min_offset = min(min_offset, build.g[v].min_offset); + } + + return min_offset; +} + +u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) { + const auto &lit_vertices = build.literal_info.at(lit_id).vertices; + assert(!lit_vertices.empty()); + + u32 max_offset = 0; + for (const auto &v : lit_vertices) { + max_offset = max(max_offset, build.g[v].max_offset); + } + + return max_offset; +} + +bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e) { + const auto &g = build.g; + const auto v = target(e, g); + + if (!build.g[v].eod_accept) { + return false; + } + + // If there's a graph between us and EOD, we shouldn't be eager. + if (build.g[v].left) { + return false; + } + + // Must be exactly at EOD. + if (g[e].minBound != 0 || g[e].maxBound != 0) { + return false; + } + + // In streaming mode, we can only eagerly report EOD for literals in the + // EOD-anchored table, as that's the only time we actually know where EOD + // is. In block mode, we always have this information. + const auto u = source(e, g); + if (build.cc.streaming && !build.isInETable(u)) { + return false; + } + + return true; +} + +#ifndef NDEBUG +/** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose + * graph are implementable. */ +bool canImplementGraphs(const RoseBuildImpl &tbi) { + const RoseGraph &g = tbi.g; + + // First, check the Rose leftfixes. + + for (auto v : vertices_range(g)) { + DEBUG_PRINTF("leftfix: check vertex %zu\n", g[v].index); + + if (g[v].left.castle) { + DEBUG_PRINTF("castle ok\n"); + continue; + } + if (g[v].left.dfa) { + DEBUG_PRINTF("dfa ok\n"); + continue; + } + if (g[v].left.haig) { + DEBUG_PRINTF("haig ok\n"); + continue; + } + if (g[v].left.graph) { + assert(g[v].left.graph->kind + == (tbi.isRootSuccessor(v) ? NFA_PREFIX : NFA_INFIX)); + if (!isImplementableNFA(*g[v].left.graph, nullptr, tbi.cc)) { + DEBUG_PRINTF("nfa prefix %zu failed (%zu vertices)\n", + g[v].index, num_vertices(*g[v].left.graph)); + return false; + } + } + } + + // Suffix graphs. + + for (auto v : vertices_range(g)) { + DEBUG_PRINTF("suffix: check vertex %zu\n", g[v].index); + + const RoseSuffixInfo &suffix = g[v].suffix; + if (suffix.castle) { + DEBUG_PRINTF("castle suffix ok\n"); + continue; + } + if (suffix.rdfa) { + DEBUG_PRINTF("dfa suffix ok\n"); + continue; + } + if (suffix.haig) { + DEBUG_PRINTF("haig suffix ok\n"); + continue; + } + if (suffix.graph) { + assert(suffix.graph->kind == NFA_SUFFIX); + if (!isImplementableNFA(*suffix.graph, &tbi.rm, tbi.cc)) { + DEBUG_PRINTF("nfa suffix %zu failed (%zu vertices)\n", + g[v].index, num_vertices(*suffix.graph)); + return false; + } + } + } + + return true; +} + +/** + * \brief True if there is an engine with a top that is not triggered by a + * vertex in the Rose graph. This is a consistency check used in assertions. + */ +bool hasOrphanedTops(const RoseBuildImpl &build) { + const RoseGraph &g = build.g; + + unordered_map<left_id, set<u32>> leftfixes; + unordered_map<suffix_id, set<u32>> suffixes; + + for (auto v : vertices_range(g)) { + if (g[v].left) { + set<u32> &tops = leftfixes[g[v].left]; + if (!build.isRootSuccessor(v)) { + // Tops for infixes come from the in-edges. + for (const auto &e : in_edges_range(v, g)) { + tops.insert(g[e].rose_top); + } + } + } + if (g[v].suffix) { + suffixes[g[v].suffix].insert(g[v].suffix.top); + } + } + + for (const auto &e : leftfixes) { + if (all_tops(e.first) != e.second) { + DEBUG_PRINTF("rose tops (%s) don't match rose graph (%s)\n", + as_string_list(all_tops(e.first)).c_str(), + as_string_list(e.second).c_str()); + return true; + } + } + + for (const auto &e : suffixes) { + if (all_tops(e.first) != e.second) { + DEBUG_PRINTF("suffix tops (%s) don't match rose graph (%s)\n", + as_string_list(all_tops(e.first)).c_str(), + as_string_list(e.second).c_str()); + return true; + } + } + + return false; +} + +#endif // NDEBUG + +} // namespace ue2 |