aboutsummaryrefslogtreecommitdiffstats
path: root/contrib
diff options
context:
space:
mode:
authorrobot-contrib <robot-contrib@yandex-team.ru>2022-06-02 04:26:10 +0300
committerrobot-contrib <robot-contrib@yandex-team.ru>2022-06-02 04:26:10 +0300
commitc86da49a855599859d815b0ec18778380f2d0294 (patch)
treeb8573adc7bcdedfd8c45e1b5df05474cc1ba0815 /contrib
parent847b2e265f5e9cebc04021278fa068110fabe934 (diff)
downloadydb-c86da49a855599859d815b0ec18778380f2d0294.tar.gz
Update contrib/libs/re2 to 2022-06-01
ref:07a800b447ebeefbdea1234f6a3ba3e52d6bb126
Diffstat (limited to 'contrib')
-rw-r--r--contrib/libs/re2/re2/prefilter_tree.cc77
1 files changed, 47 insertions, 30 deletions
diff --git a/contrib/libs/re2/re2/prefilter_tree.cc b/contrib/libs/re2/re2/prefilter_tree.cc
index 1e4eaffcae..409794eb1b 100644
--- a/contrib/libs/re2/re2/prefilter_tree.cc
+++ b/contrib/libs/re2/re2/prefilter_tree.cc
@@ -6,6 +6,7 @@
#include <stddef.h>
#include <algorithm>
+#include <cmath>
#include <map>
#include <memory>
#include <string>
@@ -65,33 +66,6 @@ void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
NodeMap nodes;
AssignUniqueIds(&nodes, atom_vec);
-
- // Identify nodes that are too common among prefilters and are
- // triggering too many parents. Then get rid of them if possible.
- // Note that getting rid of a prefilter node simply means they are
- // no longer necessary for their parent to trigger; that is, we do
- // not miss out on any regexps triggering by getting rid of a
- // prefilter node.
- for (size_t i = 0; i < entries_.size(); i++) {
- std::vector<int>& parents = entries_[i].parents;
- if (parents.size() > 8) {
- // This one triggers too many things. If all the parents are AND
- // nodes and have other things guarding them, then get rid of
- // this trigger. TODO(vsri): Adjust the threshold appropriately,
- // make it a function of total number of nodes?
- bool have_other_guard = true;
- for (int parent : parents) {
- have_other_guard = have_other_guard &&
- (entries_[parent].propagate_up_at_count > 1);
- }
- if (have_other_guard) {
- for (int parent : parents)
- entries_[parent].propagate_up_at_count -= 1;
- parents.clear(); // Forget the parents
- }
- }
- }
-
if (ExtraDebug)
PrintDebugInfo(&nodes);
}
@@ -215,12 +189,9 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
-
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
-
int id = prefilter->unique_id();
-
switch (prefilter->op()) {
default:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
@@ -261,6 +232,52 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
Entry* entry = &entries_[id];
entry->regexps.push_back(static_cast<int>(i));
}
+
+ // Lastly, using probability-based heuristics, we identify nodes
+ // that trigger too many parents and then we try to prune edges.
+ // We use logarithms below to avoid the likelihood of underflow.
+ double log_num_regexps = std::log(prefilter_vec_.size() - unfiltered_.size());
+ // Hoisted this above the loop so that we don't thrash the heap.
+ std::vector<std::pair<size_t, int>> entries_by_num_edges;
+ for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
+ Prefilter* prefilter = v[i];
+ // Pruning applies only to AND nodes because it "just" reduces
+ // precision; applied to OR nodes, it would break correctness.
+ if (prefilter == NULL || prefilter->op() != Prefilter::AND)
+ continue;
+ if (CanonicalNode(nodes, prefilter) != prefilter)
+ continue;
+ int id = prefilter->unique_id();
+
+ // Sort the current node's children by the numbers of parents.
+ entries_by_num_edges.clear();
+ for (size_t j = 0; j < prefilter->subs()->size(); j++) {
+ int child_id = (*prefilter->subs())[j]->unique_id();
+ const std::vector<int>& parents = entries_[child_id].parents;
+ entries_by_num_edges.emplace_back(parents.size(), child_id);
+ }
+ std::stable_sort(entries_by_num_edges.begin(), entries_by_num_edges.end());
+
+ // A running estimate of how many regexps will be triggered by
+ // pruning the remaining children's edges to the current node.
+ // Our nominal target is one, so the threshold is log(1) == 0;
+ // pruning occurs iff the child has more than nine edges left.
+ double log_num_triggered = log_num_regexps;
+ for (const auto& pair : entries_by_num_edges) {
+ int child_id = pair.second;
+ std::vector<int>& parents = entries_[child_id].parents;
+ if (log_num_triggered > 0.) {
+ log_num_triggered += std::log(parents.size());
+ log_num_triggered -= log_num_regexps;
+ } else if (parents.size() > 9) {
+ auto it = std::find(parents.begin(), parents.end(), id);
+ if (it != parents.end()) {
+ parents.erase(it);
+ entries_[id].propagate_up_at_count--;
+ }
+ }
+ }
+ }
}
// Functions for triggering during search.