diff options
author | Ivan Blinkov <ivan@blinkov.ru> | 2022-02-10 16:47:11 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:11 +0300 |
commit | 5b283123c882433dafbaf6b338adeea16c1a0ea0 (patch) | |
tree | 339adc63bce23800021202ae4a8328a843dc447a /contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp | |
parent | 1aeb9a455974457866f78722ad98114bafc84e8a (diff) | |
download | ydb-5b283123c882433dafbaf6b338adeea16c1a0ea0.tar.gz |
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp')
-rw-r--r-- | contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp | 588 |
1 files changed, 294 insertions, 294 deletions
diff --git a/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp b/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp index 11f9fa2aa2..d0540d79b0 100644 --- a/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp +++ b/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp @@ -40,12 +40,12 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" -#include "util/flat_containers.h" +#include "util/flat_containers.h" #include "util/verify_types.h" #include <cstdlib> #include <queue> -#include <sstream> +#include <sstream> using namespace std; @@ -63,23 +63,23 @@ static const u32 MAX_LOOKAROUND_ENTRIES = 32; /** \brief We would rather have lookarounds with smaller reach than this. */ static const u32 LOOKAROUND_WIDE_REACH = 200; -#if defined(DEBUG) || defined(DUMP_SUPPORT) -static UNUSED -string dump(const map<s32, CharReach> &look) { - ostringstream oss; - for (auto it = look.begin(), ite = look.end(); it != ite; ++it) { - if (it != look.begin()) { - oss << ", "; - } - oss << "{" << it->first << ": " << describeClass(it->second) << "}"; - } - return oss.str(); -} -#endif - +#if defined(DEBUG) || defined(DUMP_SUPPORT) +static UNUSED +string dump(const map<s32, CharReach> &look) { + ostringstream oss; + for (auto it = look.begin(), ite = look.end(); it != ite; ++it) { + if (it != look.begin()) { + oss << ", "; + } + oss << "{" << it->first << ": " << describeClass(it->second) << "}"; + } + return oss.str(); +} +#endif + static void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) { - flat_set<NFAVertex> curr, next; + flat_set<NFAVertex> curr, next; // Consider only successors of start with the required top. for (const auto &e : out_edges_range(g.start, g)) { @@ -87,7 +87,7 @@ void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) { if (v == g.startDs) { continue; } - if (contains(g[e].tops, top)) { + if (contains(g[e].tops, top)) { curr.insert(v); } } @@ -116,7 +116,7 @@ void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) { static void getBackwardReach(const NGHolder &g, ReportID report, u32 lag, map<s32, CharReach> &look) { - flat_set<NFAVertex> curr, next; + flat_set<NFAVertex> curr, next; for (auto v : inv_adjacent_vertices_range(g.accept, g)) { if (contains(g[v].reports, report)) { @@ -187,7 +187,7 @@ void getForwardReach(const raw_dfa &rdfa, map<s32, CharReach> &look) { return; } - flat_set<dstate_id_t> curr, next; + flat_set<dstate_id_t> curr, next; curr.insert(rdfa.start_anchored); for (u32 i = 0; i < MAX_FWD_LEN && !curr.empty(); i++) { @@ -276,7 +276,7 @@ void findForwardReach(const RoseGraph &g, const RoseVertex v, for (const auto &e : out_edges_range(v, g)) { RoseVertex t = target(e, g); if (!g[t].left) { - DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].index); + DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].index); return; } rose_look.push_back(map<s32, CharReach>()); @@ -447,7 +447,7 @@ static void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v, set<CharReach> &flood_reach) { for (u32 lit_id : tbi.g[v].literals) { - const ue2_literal &s = tbi.literals.at(lit_id).s; + const ue2_literal &s = tbi.literals.at(lit_id).s; if (s.empty()) { continue; } @@ -460,69 +460,69 @@ void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v, } } - -namespace { -struct LookProto { - LookProto(s32 offset_in, CharReach reach_in) - : offset(offset_in), reach(move(reach_in)) {} - s32 offset; - CharReach reach; -}; -} - + +namespace { +struct LookProto { + LookProto(s32 offset_in, CharReach reach_in) + : offset(offset_in), reach(move(reach_in)) {} + s32 offset; + CharReach reach; +}; +} + +static +vector<LookProto> findLiteralReach(const rose_literal_id &lit) { + vector<LookProto> look; + look.reserve(lit.s.length()); + + s32 i = 0 - lit.s.length() - lit.delay; + for (const auto &c : lit.s) { + look.emplace_back(i, c); + i++; + } + + return look; +} + static -vector<LookProto> findLiteralReach(const rose_literal_id &lit) { - vector<LookProto> look; - look.reserve(lit.s.length()); - - s32 i = 0 - lit.s.length() - lit.delay; - for (const auto &c : lit.s) { - look.emplace_back(i, c); - i++; - } - - return look; -} - -static -vector<LookProto> findLiteralReach(const RoseBuildImpl &build, - const RoseVertex v) { - bool first = true; - vector<LookProto> look; - +vector<LookProto> findLiteralReach(const RoseBuildImpl &build, + const RoseVertex v) { + bool first = true; + vector<LookProto> look; + for (u32 lit_id : build.g[v].literals) { - const rose_literal_id &lit = build.literals.at(lit_id); - auto lit_look = findLiteralReach(lit); - - if (first) { - look = std::move(lit_look); - first = false; - continue; - } - - // Erase elements from look with keys not in lit_look. Where a key is - // in both maps, union its reach with the lookaround. - auto jt = begin(lit_look); - for (auto it = begin(look); it != end(look);) { - if (jt == end(lit_look)) { - // No further lit_look entries, erase remaining elements from - // look. - look.erase(it, end(look)); - break; - } - if (it->offset < jt->offset) { - // Offset is present in look but not in lit_look, erase. - it = look.erase(it); - } else if (it->offset > jt->offset) { - // Offset is preset in lit_look but not in look, ignore. - ++jt; - } else { - // Offset is present in both, union its reach with look. - it->reach |= jt->reach; - ++it; - ++jt; - } - } + const rose_literal_id &lit = build.literals.at(lit_id); + auto lit_look = findLiteralReach(lit); + + if (first) { + look = std::move(lit_look); + first = false; + continue; + } + + // Erase elements from look with keys not in lit_look. Where a key is + // in both maps, union its reach with the lookaround. + auto jt = begin(lit_look); + for (auto it = begin(look); it != end(look);) { + if (jt == end(lit_look)) { + // No further lit_look entries, erase remaining elements from + // look. + look.erase(it, end(look)); + break; + } + if (it->offset < jt->offset) { + // Offset is present in look but not in lit_look, erase. + it = look.erase(it); + } else if (it->offset > jt->offset) { + // Offset is preset in lit_look but not in look, ignore. + ++jt; + } else { + // Offset is present in both, union its reach with look. + it->reach |= jt->reach; + ++it; + ++jt; + } + } } return look; @@ -538,11 +538,11 @@ void trimLiterals(const RoseBuildImpl &build, const RoseVertex v, DEBUG_PRINTF("pre-trim lookaround: %s\n", dump(look).c_str()); for (const auto &m : findLiteralReach(build, v)) { - auto it = look.find(m.offset); + auto it = look.find(m.offset); if (it == end(look)) { continue; } - if (m.reach.isSubsetOf(it->second)) { + if (m.reach.isSubsetOf(it->second)) { DEBUG_PRINTF("can trim entry at %d\n", it->first); look.erase(it); } @@ -551,76 +551,76 @@ void trimLiterals(const RoseBuildImpl &build, const RoseVertex v, DEBUG_PRINTF("post-trim lookaround: %s\n", dump(look).c_str()); } -static -void normaliseLeftfix(map<s32, CharReach> &look) { - // We can erase entries where the reach is "all characters", except for the - // very first one -- this might be required to establish a minimum bound on - // the literal's match offset. - - // TODO: It would be cleaner to use a literal program instruction to check - // the minimum bound explicitly. - - if (look.empty()) { - return; - } - - const auto earliest = begin(look)->first; - - vector<s32> dead; - for (const auto &m : look) { - if (m.second.all() && m.first != earliest) { - dead.push_back(m.first); - } - } - erase_all(&look, dead); -} - -static -bool trimMultipathLeftfix(const RoseBuildImpl &build, const RoseVertex v, - vector<map<s32, CharReach>> &looks) { - size_t path_count = 0; - for (auto &look : looks) { - ++path_count; - DEBUG_PRINTF("Path #%ld\n", path_count); - - assert(!look.empty()); - trimLiterals(build, v, look); - - if (look.empty()) { - return false; - } - - // Could be optimized here, just keep the empty byte of the longest path - normaliseLeftfix(look); - - if (look.size() > MAX_LOOKAROUND_ENTRIES) { - DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size()); - return false; - } - } - return true; -} - -static -void transToLookaround(const vector<map<s32, CharReach>> &looks, - vector<vector<LookEntry>> &lookarounds) { - for (const auto &look : looks) { - vector<LookEntry> lookaround; - DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str()); - lookaround.reserve(look.size()); - for (const auto &m : look) { - if (m.first < -128 || m.first > 127) { - DEBUG_PRINTF("range too big\n"); - lookarounds.clear(); - return; - } - s8 offset = verify_s8(m.first); - lookaround.emplace_back(offset, m.second); - } - lookarounds.push_back(lookaround); - } -} - +static +void normaliseLeftfix(map<s32, CharReach> &look) { + // We can erase entries where the reach is "all characters", except for the + // very first one -- this might be required to establish a minimum bound on + // the literal's match offset. + + // TODO: It would be cleaner to use a literal program instruction to check + // the minimum bound explicitly. + + if (look.empty()) { + return; + } + + const auto earliest = begin(look)->first; + + vector<s32> dead; + for (const auto &m : look) { + if (m.second.all() && m.first != earliest) { + dead.push_back(m.first); + } + } + erase_all(&look, dead); +} + +static +bool trimMultipathLeftfix(const RoseBuildImpl &build, const RoseVertex v, + vector<map<s32, CharReach>> &looks) { + size_t path_count = 0; + for (auto &look : looks) { + ++path_count; + DEBUG_PRINTF("Path #%ld\n", path_count); + + assert(!look.empty()); + trimLiterals(build, v, look); + + if (look.empty()) { + return false; + } + + // Could be optimized here, just keep the empty byte of the longest path + normaliseLeftfix(look); + + if (look.size() > MAX_LOOKAROUND_ENTRIES) { + DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size()); + return false; + } + } + return true; +} + +static +void transToLookaround(const vector<map<s32, CharReach>> &looks, + vector<vector<LookEntry>> &lookarounds) { + for (const auto &look : looks) { + vector<LookEntry> lookaround; + DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str()); + lookaround.reserve(look.size()); + for (const auto &m : look) { + if (m.first < -128 || m.first > 127) { + DEBUG_PRINTF("range too big\n"); + lookarounds.clear(); + return; + } + s8 offset = verify_s8(m.first); + lookaround.emplace_back(offset, m.second); + } + lookarounds.push_back(lookaround); + } +} + void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v, vector<LookEntry> &lookaround) { lookaround.clear(); @@ -659,155 +659,155 @@ void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v, } static -bool checkShuftiBuckets(const vector<map<s32, CharReach>> &looks, - u32 bucket_size) { - set<u32> bucket; - for (const auto &look : looks) { - for (const auto &l : look) { - CharReach cr = l.second; - if (cr.count() > 128) { - cr.flip(); - } - map <u16, u16> lo2hi; - - for (size_t i = cr.find_first(); i != CharReach::npos;) { - u8 it_hi = i >> 4; - u16 low_encode = 0; - while (i != CharReach::npos && (i >> 4) == it_hi) { - low_encode |= 1 << (i &0xf); - i = cr.find_next(i); - } - lo2hi[low_encode] |= 1 << it_hi; - } - - for (const auto &it : lo2hi) { - u32 hi_lo = (it.second << 16) | it.first; - bucket.insert(hi_lo); - } - } - } - DEBUG_PRINTF("shufti has %lu bucket(s)\n", bucket.size()); - return bucket.size() <= bucket_size; -} - -static -bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag, - vector<map<s32, CharReach>> &looks) { - if (!isAcyclic(g)) { - DEBUG_PRINTF("contains back-edge\n"); +bool checkShuftiBuckets(const vector<map<s32, CharReach>> &looks, + u32 bucket_size) { + set<u32> bucket; + for (const auto &look : looks) { + for (const auto &l : look) { + CharReach cr = l.second; + if (cr.count() > 128) { + cr.flip(); + } + map <u16, u16> lo2hi; + + for (size_t i = cr.find_first(); i != CharReach::npos;) { + u8 it_hi = i >> 4; + u16 low_encode = 0; + while (i != CharReach::npos && (i >> 4) == it_hi) { + low_encode |= 1 << (i &0xf); + i = cr.find_next(i); + } + lo2hi[low_encode] |= 1 << it_hi; + } + + for (const auto &it : lo2hi) { + u32 hi_lo = (it.second << 16) | it.first; + bucket.insert(hi_lo); + } + } + } + DEBUG_PRINTF("shufti has %lu bucket(s)\n", bucket.size()); + return bucket.size() <= bucket_size; +} + +static +bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag, + vector<map<s32, CharReach>> &looks) { + if (!isAcyclic(g)) { + DEBUG_PRINTF("contains back-edge\n"); return false; } - // Must be floating chains wired to startDs. - if (!isFloating(g)) { - DEBUG_PRINTF("not a floating start\n"); + // Must be floating chains wired to startDs. + if (!isFloating(g)) { + DEBUG_PRINTF("not a floating start\n"); return false; } - vector<NFAVertex> curr; - for (auto v : inv_adjacent_vertices_range(g.accept, g)) { - if (v == g.start || v == g.startDs) { - DEBUG_PRINTF("empty graph\n"); - return true; - } - if (contains(g[v].reports, report)) { - curr.push_back(v); - } - } - - assert(!curr.empty()); - - u32 total_len = curr.size(); - - for (const auto &v : curr) { - looks.emplace_back(map<s32, CharReach>()); - looks.back()[0 - (lag + 1)] = g[v].char_reach; - } - - bool curr_active = false; - - /* For each offset -i, we backwardly trace the path by vertices in curr. - * Once there are more than 8 paths and more than 64 bits total_len, - * which means that neither MULTIPATH_LOOKAROUND nor MULTIPATH_SHUFTI - * could be successfully built, we will give up the path finding. - * Otherwise, the loop will halt when all vertices in curr are startDs. - */ - for (u32 i = lag + 2; i < (lag + 2) + MAX_BACK_LEN; i++) { - curr_active = false; - size_t curr_size = curr.size(); - if (curr.size() > 1 && i > lag + MULTIPATH_MAX_LEN) { - DEBUG_PRINTF("range is larger than 16 in multi-path\n"); + vector<NFAVertex> curr; + for (auto v : inv_adjacent_vertices_range(g.accept, g)) { + if (v == g.start || v == g.startDs) { + DEBUG_PRINTF("empty graph\n"); + return true; + } + if (contains(g[v].reports, report)) { + curr.push_back(v); + } + } + + assert(!curr.empty()); + + u32 total_len = curr.size(); + + for (const auto &v : curr) { + looks.emplace_back(map<s32, CharReach>()); + looks.back()[0 - (lag + 1)] = g[v].char_reach; + } + + bool curr_active = false; + + /* For each offset -i, we backwardly trace the path by vertices in curr. + * Once there are more than 8 paths and more than 64 bits total_len, + * which means that neither MULTIPATH_LOOKAROUND nor MULTIPATH_SHUFTI + * could be successfully built, we will give up the path finding. + * Otherwise, the loop will halt when all vertices in curr are startDs. + */ + for (u32 i = lag + 2; i < (lag + 2) + MAX_BACK_LEN; i++) { + curr_active = false; + size_t curr_size = curr.size(); + if (curr.size() > 1 && i > lag + MULTIPATH_MAX_LEN) { + DEBUG_PRINTF("range is larger than 16 in multi-path\n"); return false; } - for (size_t idx = 0; idx < curr_size; idx++) { - NFAVertex v = curr[idx]; - if (v == g.startDs) { - continue; - } - assert(!is_special(v, g)); - - for (auto u : inv_adjacent_vertices_range(v, g)) { - if (u == g.start || u == g.startDs) { - curr[idx] = g.startDs; - break; - } - } - - if (is_special(curr[idx], g)) { - continue; - } - - for (auto u : inv_adjacent_vertices_range(v, g)) { - curr_active = true; - if (curr[idx] == v) { - curr[idx] = u; - looks[idx][0 - i] = g[u].char_reach; - total_len++; - } else { - curr.push_back(u); - looks.push_back(looks[idx]); - (looks.back())[0 - i] = g[u].char_reach; - total_len += looks.back().size(); - } - - if (curr.size() > MAX_LOOKAROUND_PATHS && total_len > 64) { - DEBUG_PRINTF("too many branches\n"); - return false; - } - } - } - if (!curr_active) { - break; - } - } - - if (curr_active) { - DEBUG_PRINTF("single path too long\n"); - return false; - } - - // More than 8 paths, check multi-path shufti. - if (curr.size() > MAX_LOOKAROUND_PATHS) { - u32 bucket_size = total_len > 32 ? 8 : 16; - if (!checkShuftiBuckets(looks, bucket_size)) { - DEBUG_PRINTF("shufti has too many buckets\n"); + for (size_t idx = 0; idx < curr_size; idx++) { + NFAVertex v = curr[idx]; + if (v == g.startDs) { + continue; + } + assert(!is_special(v, g)); + + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == g.start || u == g.startDs) { + curr[idx] = g.startDs; + break; + } + } + + if (is_special(curr[idx], g)) { + continue; + } + + for (auto u : inv_adjacent_vertices_range(v, g)) { + curr_active = true; + if (curr[idx] == v) { + curr[idx] = u; + looks[idx][0 - i] = g[u].char_reach; + total_len++; + } else { + curr.push_back(u); + looks.push_back(looks[idx]); + (looks.back())[0 - i] = g[u].char_reach; + total_len += looks.back().size(); + } + + if (curr.size() > MAX_LOOKAROUND_PATHS && total_len > 64) { + DEBUG_PRINTF("too many branches\n"); + return false; + } + } + } + if (!curr_active) { + break; + } + } + + if (curr_active) { + DEBUG_PRINTF("single path too long\n"); + return false; + } + + // More than 8 paths, check multi-path shufti. + if (curr.size() > MAX_LOOKAROUND_PATHS) { + u32 bucket_size = total_len > 32 ? 8 : 16; + if (!checkShuftiBuckets(looks, bucket_size)) { + DEBUG_PRINTF("shufti has too many buckets\n"); return false; } - } + } - assert(!looks.empty()); - if (looks.size() == 1) { - DEBUG_PRINTF("single lookaround\n"); - } else { - DEBUG_PRINTF("multi-path lookaround\n"); + assert(!looks.empty()); + if (looks.size() == 1) { + DEBUG_PRINTF("single lookaround\n"); + } else { + DEBUG_PRINTF("multi-path lookaround\n"); } DEBUG_PRINTF("done\n"); return true; } bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v, - vector<vector<LookEntry>> &lookaround) { + vector<vector<LookEntry>> &lookaround) { lookaround.clear(); const RoseGraph &g = build.g; @@ -823,19 +823,19 @@ bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v, return false; } - vector<map<s32, CharReach>> looks; - if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.leftfix_report, - g[v].left.lag, looks)) { - DEBUG_PRINTF("graph has loop or too large\n"); + vector<map<s32, CharReach>> looks; + if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.leftfix_report, + g[v].left.lag, looks)) { + DEBUG_PRINTF("graph has loop or too large\n"); return false; } - if (!trimMultipathLeftfix(build, v, looks)) { + if (!trimMultipathLeftfix(build, v, looks)) { return false; } - transToLookaround(looks, lookaround); + transToLookaround(looks, lookaround); - return !lookaround.empty(); + return !lookaround.empty(); } void mergeLookaround(vector<LookEntry> &lookaround, @@ -846,7 +846,7 @@ void mergeLookaround(vector<LookEntry> &lookaround, } // Don't merge lookarounds at offsets we already have entries for. - flat_set<s8> offsets; + flat_set<s8> offsets; for (const auto &e : lookaround) { offsets.insert(e.offset); } |