aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp
diff options
context:
space:
mode:
authorIvan Blinkov <ivan@blinkov.ru>2022-02-10 16:47:11 +0300
committerDaniil Cherednik <dcherednik@yandex-team.ru>2022-02-10 16:47:11 +0300
commit5b283123c882433dafbaf6b338adeea16c1a0ea0 (patch)
tree339adc63bce23800021202ae4a8328a843dc447a /contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp
parent1aeb9a455974457866f78722ad98114bafc84e8a (diff)
downloadydb-5b283123c882433dafbaf6b338adeea16c1a0ea0.tar.gz
Restoring authorship annotation for Ivan Blinkov <ivan@blinkov.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp')
-rw-r--r--contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp588
1 files changed, 294 insertions, 294 deletions
diff --git a/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp b/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp
index 11f9fa2aa2..d0540d79b0 100644
--- a/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp
+++ b/contrib/libs/hyperscan/src/rose/rose_build_lookaround.cpp
@@ -40,12 +40,12 @@
#include "util/container.h"
#include "util/dump_charclass.h"
#include "util/graph_range.h"
-#include "util/flat_containers.h"
+#include "util/flat_containers.h"
#include "util/verify_types.h"
#include <cstdlib>
#include <queue>
-#include <sstream>
+#include <sstream>
using namespace std;
@@ -63,23 +63,23 @@ static const u32 MAX_LOOKAROUND_ENTRIES = 32;
/** \brief We would rather have lookarounds with smaller reach than this. */
static const u32 LOOKAROUND_WIDE_REACH = 200;
-#if defined(DEBUG) || defined(DUMP_SUPPORT)
-static UNUSED
-string dump(const map<s32, CharReach> &look) {
- ostringstream oss;
- for (auto it = look.begin(), ite = look.end(); it != ite; ++it) {
- if (it != look.begin()) {
- oss << ", ";
- }
- oss << "{" << it->first << ": " << describeClass(it->second) << "}";
- }
- return oss.str();
-}
-#endif
-
+#if defined(DEBUG) || defined(DUMP_SUPPORT)
+static UNUSED
+string dump(const map<s32, CharReach> &look) {
+ ostringstream oss;
+ for (auto it = look.begin(), ite = look.end(); it != ite; ++it) {
+ if (it != look.begin()) {
+ oss << ", ";
+ }
+ oss << "{" << it->first << ": " << describeClass(it->second) << "}";
+ }
+ return oss.str();
+}
+#endif
+
static
void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) {
- flat_set<NFAVertex> curr, next;
+ flat_set<NFAVertex> curr, next;
// Consider only successors of start with the required top.
for (const auto &e : out_edges_range(g.start, g)) {
@@ -87,7 +87,7 @@ void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) {
if (v == g.startDs) {
continue;
}
- if (contains(g[e].tops, top)) {
+ if (contains(g[e].tops, top)) {
curr.insert(v);
}
}
@@ -116,7 +116,7 @@ void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) {
static
void getBackwardReach(const NGHolder &g, ReportID report, u32 lag,
map<s32, CharReach> &look) {
- flat_set<NFAVertex> curr, next;
+ flat_set<NFAVertex> curr, next;
for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
if (contains(g[v].reports, report)) {
@@ -187,7 +187,7 @@ void getForwardReach(const raw_dfa &rdfa, map<s32, CharReach> &look) {
return;
}
- flat_set<dstate_id_t> curr, next;
+ flat_set<dstate_id_t> curr, next;
curr.insert(rdfa.start_anchored);
for (u32 i = 0; i < MAX_FWD_LEN && !curr.empty(); i++) {
@@ -276,7 +276,7 @@ void findForwardReach(const RoseGraph &g, const RoseVertex v,
for (const auto &e : out_edges_range(v, g)) {
RoseVertex t = target(e, g);
if (!g[t].left) {
- DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].index);
+ DEBUG_PRINTF("successor %zu has no leftfix\n", g[t].index);
return;
}
rose_look.push_back(map<s32, CharReach>());
@@ -447,7 +447,7 @@ static
void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v,
set<CharReach> &flood_reach) {
for (u32 lit_id : tbi.g[v].literals) {
- const ue2_literal &s = tbi.literals.at(lit_id).s;
+ const ue2_literal &s = tbi.literals.at(lit_id).s;
if (s.empty()) {
continue;
}
@@ -460,69 +460,69 @@ void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v,
}
}
-
-namespace {
-struct LookProto {
- LookProto(s32 offset_in, CharReach reach_in)
- : offset(offset_in), reach(move(reach_in)) {}
- s32 offset;
- CharReach reach;
-};
-}
-
+
+namespace {
+struct LookProto {
+ LookProto(s32 offset_in, CharReach reach_in)
+ : offset(offset_in), reach(move(reach_in)) {}
+ s32 offset;
+ CharReach reach;
+};
+}
+
+static
+vector<LookProto> findLiteralReach(const rose_literal_id &lit) {
+ vector<LookProto> look;
+ look.reserve(lit.s.length());
+
+ s32 i = 0 - lit.s.length() - lit.delay;
+ for (const auto &c : lit.s) {
+ look.emplace_back(i, c);
+ i++;
+ }
+
+ return look;
+}
+
static
-vector<LookProto> findLiteralReach(const rose_literal_id &lit) {
- vector<LookProto> look;
- look.reserve(lit.s.length());
-
- s32 i = 0 - lit.s.length() - lit.delay;
- for (const auto &c : lit.s) {
- look.emplace_back(i, c);
- i++;
- }
-
- return look;
-}
-
-static
-vector<LookProto> findLiteralReach(const RoseBuildImpl &build,
- const RoseVertex v) {
- bool first = true;
- vector<LookProto> look;
-
+vector<LookProto> findLiteralReach(const RoseBuildImpl &build,
+ const RoseVertex v) {
+ bool first = true;
+ vector<LookProto> look;
+
for (u32 lit_id : build.g[v].literals) {
- const rose_literal_id &lit = build.literals.at(lit_id);
- auto lit_look = findLiteralReach(lit);
-
- if (first) {
- look = std::move(lit_look);
- first = false;
- continue;
- }
-
- // Erase elements from look with keys not in lit_look. Where a key is
- // in both maps, union its reach with the lookaround.
- auto jt = begin(lit_look);
- for (auto it = begin(look); it != end(look);) {
- if (jt == end(lit_look)) {
- // No further lit_look entries, erase remaining elements from
- // look.
- look.erase(it, end(look));
- break;
- }
- if (it->offset < jt->offset) {
- // Offset is present in look but not in lit_look, erase.
- it = look.erase(it);
- } else if (it->offset > jt->offset) {
- // Offset is preset in lit_look but not in look, ignore.
- ++jt;
- } else {
- // Offset is present in both, union its reach with look.
- it->reach |= jt->reach;
- ++it;
- ++jt;
- }
- }
+ const rose_literal_id &lit = build.literals.at(lit_id);
+ auto lit_look = findLiteralReach(lit);
+
+ if (first) {
+ look = std::move(lit_look);
+ first = false;
+ continue;
+ }
+
+ // Erase elements from look with keys not in lit_look. Where a key is
+ // in both maps, union its reach with the lookaround.
+ auto jt = begin(lit_look);
+ for (auto it = begin(look); it != end(look);) {
+ if (jt == end(lit_look)) {
+ // No further lit_look entries, erase remaining elements from
+ // look.
+ look.erase(it, end(look));
+ break;
+ }
+ if (it->offset < jt->offset) {
+ // Offset is present in look but not in lit_look, erase.
+ it = look.erase(it);
+ } else if (it->offset > jt->offset) {
+ // Offset is preset in lit_look but not in look, ignore.
+ ++jt;
+ } else {
+ // Offset is present in both, union its reach with look.
+ it->reach |= jt->reach;
+ ++it;
+ ++jt;
+ }
+ }
}
return look;
@@ -538,11 +538,11 @@ void trimLiterals(const RoseBuildImpl &build, const RoseVertex v,
DEBUG_PRINTF("pre-trim lookaround: %s\n", dump(look).c_str());
for (const auto &m : findLiteralReach(build, v)) {
- auto it = look.find(m.offset);
+ auto it = look.find(m.offset);
if (it == end(look)) {
continue;
}
- if (m.reach.isSubsetOf(it->second)) {
+ if (m.reach.isSubsetOf(it->second)) {
DEBUG_PRINTF("can trim entry at %d\n", it->first);
look.erase(it);
}
@@ -551,76 +551,76 @@ void trimLiterals(const RoseBuildImpl &build, const RoseVertex v,
DEBUG_PRINTF("post-trim lookaround: %s\n", dump(look).c_str());
}
-static
-void normaliseLeftfix(map<s32, CharReach> &look) {
- // We can erase entries where the reach is "all characters", except for the
- // very first one -- this might be required to establish a minimum bound on
- // the literal's match offset.
-
- // TODO: It would be cleaner to use a literal program instruction to check
- // the minimum bound explicitly.
-
- if (look.empty()) {
- return;
- }
-
- const auto earliest = begin(look)->first;
-
- vector<s32> dead;
- for (const auto &m : look) {
- if (m.second.all() && m.first != earliest) {
- dead.push_back(m.first);
- }
- }
- erase_all(&look, dead);
-}
-
-static
-bool trimMultipathLeftfix(const RoseBuildImpl &build, const RoseVertex v,
- vector<map<s32, CharReach>> &looks) {
- size_t path_count = 0;
- for (auto &look : looks) {
- ++path_count;
- DEBUG_PRINTF("Path #%ld\n", path_count);
-
- assert(!look.empty());
- trimLiterals(build, v, look);
-
- if (look.empty()) {
- return false;
- }
-
- // Could be optimized here, just keep the empty byte of the longest path
- normaliseLeftfix(look);
-
- if (look.size() > MAX_LOOKAROUND_ENTRIES) {
- DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size());
- return false;
- }
- }
- return true;
-}
-
-static
-void transToLookaround(const vector<map<s32, CharReach>> &looks,
- vector<vector<LookEntry>> &lookarounds) {
- for (const auto &look : looks) {
- vector<LookEntry> lookaround;
- DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str());
- lookaround.reserve(look.size());
- for (const auto &m : look) {
- if (m.first < -128 || m.first > 127) {
- DEBUG_PRINTF("range too big\n");
- lookarounds.clear();
- return;
- }
- s8 offset = verify_s8(m.first);
- lookaround.emplace_back(offset, m.second);
- }
- lookarounds.push_back(lookaround);
- }
-}
-
+static
+void normaliseLeftfix(map<s32, CharReach> &look) {
+ // We can erase entries where the reach is "all characters", except for the
+ // very first one -- this might be required to establish a minimum bound on
+ // the literal's match offset.
+
+ // TODO: It would be cleaner to use a literal program instruction to check
+ // the minimum bound explicitly.
+
+ if (look.empty()) {
+ return;
+ }
+
+ const auto earliest = begin(look)->first;
+
+ vector<s32> dead;
+ for (const auto &m : look) {
+ if (m.second.all() && m.first != earliest) {
+ dead.push_back(m.first);
+ }
+ }
+ erase_all(&look, dead);
+}
+
+static
+bool trimMultipathLeftfix(const RoseBuildImpl &build, const RoseVertex v,
+ vector<map<s32, CharReach>> &looks) {
+ size_t path_count = 0;
+ for (auto &look : looks) {
+ ++path_count;
+ DEBUG_PRINTF("Path #%ld\n", path_count);
+
+ assert(!look.empty());
+ trimLiterals(build, v, look);
+
+ if (look.empty()) {
+ return false;
+ }
+
+ // Could be optimized here, just keep the empty byte of the longest path
+ normaliseLeftfix(look);
+
+ if (look.size() > MAX_LOOKAROUND_ENTRIES) {
+ DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size());
+ return false;
+ }
+ }
+ return true;
+}
+
+static
+void transToLookaround(const vector<map<s32, CharReach>> &looks,
+ vector<vector<LookEntry>> &lookarounds) {
+ for (const auto &look : looks) {
+ vector<LookEntry> lookaround;
+ DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str());
+ lookaround.reserve(look.size());
+ for (const auto &m : look) {
+ if (m.first < -128 || m.first > 127) {
+ DEBUG_PRINTF("range too big\n");
+ lookarounds.clear();
+ return;
+ }
+ s8 offset = verify_s8(m.first);
+ lookaround.emplace_back(offset, m.second);
+ }
+ lookarounds.push_back(lookaround);
+ }
+}
+
void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
vector<LookEntry> &lookaround) {
lookaround.clear();
@@ -659,155 +659,155 @@ void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
}
static
-bool checkShuftiBuckets(const vector<map<s32, CharReach>> &looks,
- u32 bucket_size) {
- set<u32> bucket;
- for (const auto &look : looks) {
- for (const auto &l : look) {
- CharReach cr = l.second;
- if (cr.count() > 128) {
- cr.flip();
- }
- map <u16, u16> lo2hi;
-
- for (size_t i = cr.find_first(); i != CharReach::npos;) {
- u8 it_hi = i >> 4;
- u16 low_encode = 0;
- while (i != CharReach::npos && (i >> 4) == it_hi) {
- low_encode |= 1 << (i &0xf);
- i = cr.find_next(i);
- }
- lo2hi[low_encode] |= 1 << it_hi;
- }
-
- for (const auto &it : lo2hi) {
- u32 hi_lo = (it.second << 16) | it.first;
- bucket.insert(hi_lo);
- }
- }
- }
- DEBUG_PRINTF("shufti has %lu bucket(s)\n", bucket.size());
- return bucket.size() <= bucket_size;
-}
-
-static
-bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag,
- vector<map<s32, CharReach>> &looks) {
- if (!isAcyclic(g)) {
- DEBUG_PRINTF("contains back-edge\n");
+bool checkShuftiBuckets(const vector<map<s32, CharReach>> &looks,
+ u32 bucket_size) {
+ set<u32> bucket;
+ for (const auto &look : looks) {
+ for (const auto &l : look) {
+ CharReach cr = l.second;
+ if (cr.count() > 128) {
+ cr.flip();
+ }
+ map <u16, u16> lo2hi;
+
+ for (size_t i = cr.find_first(); i != CharReach::npos;) {
+ u8 it_hi = i >> 4;
+ u16 low_encode = 0;
+ while (i != CharReach::npos && (i >> 4) == it_hi) {
+ low_encode |= 1 << (i &0xf);
+ i = cr.find_next(i);
+ }
+ lo2hi[low_encode] |= 1 << it_hi;
+ }
+
+ for (const auto &it : lo2hi) {
+ u32 hi_lo = (it.second << 16) | it.first;
+ bucket.insert(hi_lo);
+ }
+ }
+ }
+ DEBUG_PRINTF("shufti has %lu bucket(s)\n", bucket.size());
+ return bucket.size() <= bucket_size;
+}
+
+static
+bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag,
+ vector<map<s32, CharReach>> &looks) {
+ if (!isAcyclic(g)) {
+ DEBUG_PRINTF("contains back-edge\n");
return false;
}
- // Must be floating chains wired to startDs.
- if (!isFloating(g)) {
- DEBUG_PRINTF("not a floating start\n");
+ // Must be floating chains wired to startDs.
+ if (!isFloating(g)) {
+ DEBUG_PRINTF("not a floating start\n");
return false;
}
- vector<NFAVertex> curr;
- for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
- if (v == g.start || v == g.startDs) {
- DEBUG_PRINTF("empty graph\n");
- return true;
- }
- if (contains(g[v].reports, report)) {
- curr.push_back(v);
- }
- }
-
- assert(!curr.empty());
-
- u32 total_len = curr.size();
-
- for (const auto &v : curr) {
- looks.emplace_back(map<s32, CharReach>());
- looks.back()[0 - (lag + 1)] = g[v].char_reach;
- }
-
- bool curr_active = false;
-
- /* For each offset -i, we backwardly trace the path by vertices in curr.
- * Once there are more than 8 paths and more than 64 bits total_len,
- * which means that neither MULTIPATH_LOOKAROUND nor MULTIPATH_SHUFTI
- * could be successfully built, we will give up the path finding.
- * Otherwise, the loop will halt when all vertices in curr are startDs.
- */
- for (u32 i = lag + 2; i < (lag + 2) + MAX_BACK_LEN; i++) {
- curr_active = false;
- size_t curr_size = curr.size();
- if (curr.size() > 1 && i > lag + MULTIPATH_MAX_LEN) {
- DEBUG_PRINTF("range is larger than 16 in multi-path\n");
+ vector<NFAVertex> curr;
+ for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
+ if (v == g.start || v == g.startDs) {
+ DEBUG_PRINTF("empty graph\n");
+ return true;
+ }
+ if (contains(g[v].reports, report)) {
+ curr.push_back(v);
+ }
+ }
+
+ assert(!curr.empty());
+
+ u32 total_len = curr.size();
+
+ for (const auto &v : curr) {
+ looks.emplace_back(map<s32, CharReach>());
+ looks.back()[0 - (lag + 1)] = g[v].char_reach;
+ }
+
+ bool curr_active = false;
+
+ /* For each offset -i, we backwardly trace the path by vertices in curr.
+ * Once there are more than 8 paths and more than 64 bits total_len,
+ * which means that neither MULTIPATH_LOOKAROUND nor MULTIPATH_SHUFTI
+ * could be successfully built, we will give up the path finding.
+ * Otherwise, the loop will halt when all vertices in curr are startDs.
+ */
+ for (u32 i = lag + 2; i < (lag + 2) + MAX_BACK_LEN; i++) {
+ curr_active = false;
+ size_t curr_size = curr.size();
+ if (curr.size() > 1 && i > lag + MULTIPATH_MAX_LEN) {
+ DEBUG_PRINTF("range is larger than 16 in multi-path\n");
return false;
}
- for (size_t idx = 0; idx < curr_size; idx++) {
- NFAVertex v = curr[idx];
- if (v == g.startDs) {
- continue;
- }
- assert(!is_special(v, g));
-
- for (auto u : inv_adjacent_vertices_range(v, g)) {
- if (u == g.start || u == g.startDs) {
- curr[idx] = g.startDs;
- break;
- }
- }
-
- if (is_special(curr[idx], g)) {
- continue;
- }
-
- for (auto u : inv_adjacent_vertices_range(v, g)) {
- curr_active = true;
- if (curr[idx] == v) {
- curr[idx] = u;
- looks[idx][0 - i] = g[u].char_reach;
- total_len++;
- } else {
- curr.push_back(u);
- looks.push_back(looks[idx]);
- (looks.back())[0 - i] = g[u].char_reach;
- total_len += looks.back().size();
- }
-
- if (curr.size() > MAX_LOOKAROUND_PATHS && total_len > 64) {
- DEBUG_PRINTF("too many branches\n");
- return false;
- }
- }
- }
- if (!curr_active) {
- break;
- }
- }
-
- if (curr_active) {
- DEBUG_PRINTF("single path too long\n");
- return false;
- }
-
- // More than 8 paths, check multi-path shufti.
- if (curr.size() > MAX_LOOKAROUND_PATHS) {
- u32 bucket_size = total_len > 32 ? 8 : 16;
- if (!checkShuftiBuckets(looks, bucket_size)) {
- DEBUG_PRINTF("shufti has too many buckets\n");
+ for (size_t idx = 0; idx < curr_size; idx++) {
+ NFAVertex v = curr[idx];
+ if (v == g.startDs) {
+ continue;
+ }
+ assert(!is_special(v, g));
+
+ for (auto u : inv_adjacent_vertices_range(v, g)) {
+ if (u == g.start || u == g.startDs) {
+ curr[idx] = g.startDs;
+ break;
+ }
+ }
+
+ if (is_special(curr[idx], g)) {
+ continue;
+ }
+
+ for (auto u : inv_adjacent_vertices_range(v, g)) {
+ curr_active = true;
+ if (curr[idx] == v) {
+ curr[idx] = u;
+ looks[idx][0 - i] = g[u].char_reach;
+ total_len++;
+ } else {
+ curr.push_back(u);
+ looks.push_back(looks[idx]);
+ (looks.back())[0 - i] = g[u].char_reach;
+ total_len += looks.back().size();
+ }
+
+ if (curr.size() > MAX_LOOKAROUND_PATHS && total_len > 64) {
+ DEBUG_PRINTF("too many branches\n");
+ return false;
+ }
+ }
+ }
+ if (!curr_active) {
+ break;
+ }
+ }
+
+ if (curr_active) {
+ DEBUG_PRINTF("single path too long\n");
+ return false;
+ }
+
+ // More than 8 paths, check multi-path shufti.
+ if (curr.size() > MAX_LOOKAROUND_PATHS) {
+ u32 bucket_size = total_len > 32 ? 8 : 16;
+ if (!checkShuftiBuckets(looks, bucket_size)) {
+ DEBUG_PRINTF("shufti has too many buckets\n");
return false;
}
- }
+ }
- assert(!looks.empty());
- if (looks.size() == 1) {
- DEBUG_PRINTF("single lookaround\n");
- } else {
- DEBUG_PRINTF("multi-path lookaround\n");
+ assert(!looks.empty());
+ if (looks.size() == 1) {
+ DEBUG_PRINTF("single lookaround\n");
+ } else {
+ DEBUG_PRINTF("multi-path lookaround\n");
}
DEBUG_PRINTF("done\n");
return true;
}
bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v,
- vector<vector<LookEntry>> &lookaround) {
+ vector<vector<LookEntry>> &lookaround) {
lookaround.clear();
const RoseGraph &g = build.g;
@@ -823,19 +823,19 @@ bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v,
return false;
}
- vector<map<s32, CharReach>> looks;
- if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.leftfix_report,
- g[v].left.lag, looks)) {
- DEBUG_PRINTF("graph has loop or too large\n");
+ vector<map<s32, CharReach>> looks;
+ if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.leftfix_report,
+ g[v].left.lag, looks)) {
+ DEBUG_PRINTF("graph has loop or too large\n");
return false;
}
- if (!trimMultipathLeftfix(build, v, looks)) {
+ if (!trimMultipathLeftfix(build, v, looks)) {
return false;
}
- transToLookaround(looks, lookaround);
+ transToLookaround(looks, lookaround);
- return !lookaround.empty();
+ return !lookaround.empty();
}
void mergeLookaround(vector<LookEntry> &lookaround,
@@ -846,7 +846,7 @@ void mergeLookaround(vector<LookEntry> &lookaround,
}
// Don't merge lookarounds at offsets we already have entries for.
- flat_set<s8> offsets;
+ flat_set<s8> offsets;
for (const auto &e : lookaround) {
offsets.insert(e.offset);
}