aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrobot-contrib <robot-contrib@yandex-team.com>2023-08-02 10:19:42 +0300
committerrobot-contrib <robot-contrib@yandex-team.com>2023-08-02 10:19:42 +0300
commit3d55f8dfc54006f3d6a8b943c034ead23214800e (patch)
tree0f2486183e9ddd2ca915a04fb75c402721baecfa
parentf3f107ba8959babdcd096d7d599efca438fd1f97 (diff)
downloadydb-3d55f8dfc54006f3d6a8b943c034ead23214800e.tar.gz
Update contrib/libs/re2 to 2023-08-01
-rw-r--r--contrib/libs/re2/re2/prefilter.h38
-rw-r--r--contrib/libs/re2/re2/prefilter_tree.cc47
-rw-r--r--contrib/libs/re2/re2/prefilter_tree.h41
-rw-r--r--contrib/libs/re2/re2/regexp.cc5
-rw-r--r--contrib/libs/re2/re2/simplify.cc21
-rw-r--r--contrib/libs/re2/re2/testing/simplify_test.cc16
-rw-r--r--contrib/libs/re2/ya.make4
7 files changed, 122 insertions, 50 deletions
diff --git a/contrib/libs/re2/re2/prefilter.h b/contrib/libs/re2/re2/prefilter.h
index b2545e1ea5..018691dcd6 100644
--- a/contrib/libs/re2/re2/prefilter.h
+++ b/contrib/libs/re2/re2/prefilter.h
@@ -59,6 +59,44 @@ class Prefilter {
std::string DebugString() const;
private:
+ template <typename H>
+ friend H AbslHashValue(H h, const Prefilter& a) {
+ h = H::combine(std::move(h), a.op_);
+ if (a.op_ == ATOM) {
+ h = H::combine(std::move(h), a.atom_);
+ } else if (a.op_ == AND || a.op_ == OR) {
+ h = H::combine(std::move(h), a.subs_->size());
+ for (size_t i = 0; i < a.subs_->size(); ++i) {
+ h = H::combine(std::move(h), (*a.subs_)[i]->unique_id_);
+ }
+ }
+ return h;
+ }
+
+ friend bool operator==(const Prefilter& a, const Prefilter& b) {
+ if (&a == &b) {
+ return true;
+ }
+ if (a.op_ != b.op_) {
+ return false;
+ }
+ if (a.op_ == ATOM) {
+ if (a.atom_ != b.atom_) {
+ return false;
+ }
+ } else if (a.op_ == AND || a.op_ == OR) {
+ if (a.subs_->size() != b.subs_->size()) {
+ return false;
+ }
+ for (size_t i = 0; i < a.subs_->size(); ++i) {
+ if ((*a.subs_)[i]->unique_id_ != (*b.subs_)[i]->unique_id_) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
// A comparator used to store exact strings. We compare by length,
// then lexicographically. This ordering makes it easier to reduce the
// set of strings in SimplifyStringSet.
diff --git a/contrib/libs/re2/re2/prefilter_tree.cc b/contrib/libs/re2/re2/prefilter_tree.cc
index 41f65a3561..3afb241c90 100644
--- a/contrib/libs/re2/re2/prefilter_tree.cc
+++ b/contrib/libs/re2/re2/prefilter_tree.cc
@@ -7,7 +7,6 @@
#include <stddef.h>
#include <algorithm>
#include <cmath>
-#include <map>
#include <memory>
#include <string>
#include <utility>
@@ -63,33 +62,18 @@ void PrefilterTree::Compile(std::vector<std::string>* atom_vec) {
compiled_ = true;
- NodeMap nodes;
+ NodeSet nodes;
AssignUniqueIds(&nodes, atom_vec);
if (ExtraDebug)
PrintDebugInfo(&nodes);
}
-Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
- std::string node_string = NodeString(node);
- NodeMap::iterator iter = nodes->find(node_string);
- if (iter == nodes->end())
- return NULL;
- return (*iter).second;
-}
-
-std::string PrefilterTree::NodeString(Prefilter* node) const {
- // Adding the operation disambiguates AND/OR/atom nodes.
- std::string s = absl::StrFormat("%d", node->op()) + ":";
- if (node->op() == Prefilter::ATOM) {
- s += node->atom();
- } else {
- for (size_t i = 0; i < node->subs()->size(); i++) {
- if (i > 0)
- s += ',';
- s += absl::StrFormat("%d", (*node->subs())[i]->unique_id());
- }
+Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) {
+ NodeSet::const_iterator iter = nodes->find(node);
+ if (iter != nodes->end()) {
+ return *iter;
}
- return s;
+ return NULL;
}
bool PrefilterTree::KeepNode(Prefilter* node) const {
@@ -129,7 +113,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const {
}
}
-void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
+void PrefilterTree::AssignUniqueIds(NodeSet* nodes,
std::vector<std::string>* atom_vec) {
atom_vec->clear();
@@ -169,9 +153,9 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(nodes, node);
if (canonical == NULL) {
- // Any further nodes that have the same node string
+ // Any further nodes that have the same atom/subs
// will find this node as the canonical node.
- nodes->emplace(NodeString(node), node);
+ nodes->emplace(node);
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
@@ -300,7 +284,7 @@ void PrefilterTree::RegexpsGivenStrings(
for (size_t j = 0; j < matched_atoms.size(); j++)
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
PropagateMatch(matched_atom_ids, &regexps_map);
- for (IntMap::iterator it = regexps_map.begin();
+ for (IntMap::const_iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
@@ -316,7 +300,7 @@ void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
IntMap work(static_cast<int>(entries_.size()));
for (size_t i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
- for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
+ for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
// Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++)
@@ -348,7 +332,7 @@ void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
}
-void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
+void PrefilterTree::PrintDebugInfo(NodeSet* nodes) {
LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
LOG(ERROR) << "#Unique Nodes: " << entries_.size();
@@ -360,11 +344,10 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
for (int parent : parents)
LOG(ERROR) << parent;
}
- LOG(ERROR) << "Map:";
- for (NodeMap::const_iterator iter = nodes->begin();
+ LOG(ERROR) << "Set:";
+ for (NodeSet::const_iterator iter = nodes->begin();
iter != nodes->end(); ++iter)
- LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
- << " Str: " << (*iter).first;
+ LOG(ERROR) << "NodeId: " << (*iter)->unique_id();
}
std::string PrefilterTree::DebugNodeString(Prefilter* node) const {
diff --git a/contrib/libs/re2/re2/prefilter_tree.h b/contrib/libs/re2/re2/prefilter_tree.h
index 3eb8056db0..71e7a294f9 100644
--- a/contrib/libs/re2/re2/prefilter_tree.h
+++ b/contrib/libs/re2/re2/prefilter_tree.h
@@ -16,12 +16,13 @@
// atoms) that the user of this class should use to do the string
// matching.
-#include <map>
#include <string>
#include <vector>
+#include "absl/container/flat_hash_set.h"
#include "re2/prefilter.h"
#include "re2/sparse_array.h"
+#include "util/logging.h"
namespace re2 {
@@ -57,10 +58,25 @@ class PrefilterTree {
void PrintPrefilter(int regexpid);
private:
- typedef SparseArray<int> IntMap;
- // TODO(junyer): Use absl::flat_hash_set<Prefilter*> instead?
- // It should be trivial to get rid of the stringification...
- typedef std::map<std::string, Prefilter*> NodeMap;
+ using IntMap = SparseArray<int>;
+
+ struct PrefilterHash {
+ size_t operator()(const Prefilter* a) const {
+ DCHECK(a != NULL);
+ return absl::Hash<Prefilter>()(*a);
+ }
+ };
+
+ struct PrefilterEqual {
+ bool operator()(const Prefilter* a, const Prefilter* b) const {
+ DCHECK(a != NULL);
+ DCHECK(b != NULL);
+ return *a == *b;
+ }
+ };
+
+ using NodeSet =
+ absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>;
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
@@ -90,25 +106,22 @@ class PrefilterTree {
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
- void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec);
+ void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const;
- // Returns the prefilter node that has the same NodeString as this
- // node. For the canonical node, returns node.
- Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
-
- // A string that uniquely identifies the node. Assumes that the
- // children of node has already been assigned unique ids.
- std::string NodeString(Prefilter* node) const;
+ // Returns the prefilter node that has the same atom/subs as this
+ // node. For the canonical node, returns node. Assumes that the
+ // children of node have already been assigned unique ids.
+ Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node);
// Recursively constructs a readable prefilter string.
std::string DebugNodeString(Prefilter* node) const;
// Used for debugging.
- void PrintDebugInfo(NodeMap* nodes);
+ void PrintDebugInfo(NodeSet* nodes);
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
diff --git a/contrib/libs/re2/re2/regexp.cc b/contrib/libs/re2/re2/regexp.cc
index 3cfb5ae149..1614bb0fed 100644
--- a/contrib/libs/re2/re2/regexp.cc
+++ b/contrib/libs/re2/re2/regexp.cc
@@ -17,6 +17,7 @@
#include "absl/base/call_once.h"
#include "absl/base/macros.h"
+#include "absl/container/flat_hash_map.h"
#include "absl/synchronization/mutex.h"
#include "util/logging.h"
#include "util/utf.h"
@@ -76,7 +77,7 @@ bool Regexp::QuickDestroy() {
// Similar to EmptyStorage in re2.cc.
struct RefStorage {
absl::Mutex ref_mutex;
- std::map<Regexp*, int> ref_map;
+ absl::flat_hash_map<Regexp*, int> ref_map;
};
alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
@@ -84,7 +85,7 @@ static inline absl::Mutex* ref_mutex() {
return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex;
}
-static inline std::map<Regexp*, int>* ref_map() {
+static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map;
}
diff --git a/contrib/libs/re2/re2/simplify.cc b/contrib/libs/re2/re2/simplify.cc
index 8cd10cfbb0..cea100b08a 100644
--- a/contrib/libs/re2/re2/simplify.cc
+++ b/contrib/libs/re2/re2/simplify.cc
@@ -6,6 +6,7 @@
// to use simple extended regular expression features.
// Also sort and simplify character classes.
+#include <algorithm>
#include <string>
#include "util/logging.h"
@@ -579,6 +580,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
return re;
}
+// Returns true if re is an empty-width op.
+static bool IsEmptyOp(Regexp* re) {
+ return (re->op() == kRegexpBeginLine ||
+ re->op() == kRegexpEndLine ||
+ re->op() == kRegexpWordBoundary ||
+ re->op() == kRegexpNoWordBoundary ||
+ re->op() == kRegexpBeginText ||
+ re->op() == kRegexpEndText);
+}
+
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
@@ -587,6 +598,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
// but in the Regexp* representation, both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags f) {
+ // For an empty-width op OR a concatenation or alternation of empty-width
+ // ops, cap the repetition count at 1.
+ if (IsEmptyOp(re) ||
+ ((re->op() == kRegexpConcat ||
+ re->op() == kRegexpAlternate) &&
+ std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) {
+ min = std::min(min, 1);
+ max = std::min(max, 1);
+ }
+
// x{n,} means at least n matches of x.
if (max == -1) {
// Special case: x{0,} is x*
diff --git a/contrib/libs/re2/re2/testing/simplify_test.cc b/contrib/libs/re2/re2/testing/simplify_test.cc
index d2c136a3cc..5b683f580c 100644
--- a/contrib/libs/re2/re2/testing/simplify_test.cc
+++ b/contrib/libs/re2/re2/testing/simplify_test.cc
@@ -140,6 +140,22 @@ static Test tests[] = {
{ "(){1,}", "()+" },
{ "(){0,2}", "(?:()()?)?" },
+ // For an empty-width op OR a concatenation or alternation of empty-width
+ // ops, test that the repetition count is capped at 1.
+ { "(?:^){0,}", "^*" }, // x{0,} -> x*
+ { "(?:$){28,}", "$+" }, // x{N,} -> x{1,} -> x+
+ { "(?-m:^){0,30}", "(?-m:^)?" }, // x{0,N} -> x{0,1} -> x?
+ { "(?-m:$){28,30}", "(?-m:$)" }, // x{N,M} -> x{1,1} -> x
+ { "\\b(?:\\b\\B){999}\\B", "\\b\\b\\B\\B" },
+ { "\\b(?:\\b|\\B){999}\\B", "\\b(?:\\b|\\B)\\B" },
+ // NonGreedy should also be handled.
+ { "(?:^){0,}?", "^*?" },
+ { "(?:$){28,}?", "$+?" },
+ { "(?-m:^){0,30}?", "(?-m:^)??" },
+ { "(?-m:$){28,30}?", "(?-m:$)" },
+ { "\\b(?:\\b\\B){999}?\\B", "\\b\\b\\B\\B" },
+ { "\\b(?:\\b|\\B){999}?\\B", "\\b(?:\\b|\\B)\\B" },
+
// Test that coalescing occurs and that the resulting repeats are simplified.
// Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
{ "a*a*", "a*" },
diff --git a/contrib/libs/re2/ya.make b/contrib/libs/re2/ya.make
index 1ad631b2e6..e01531f4d5 100644
--- a/contrib/libs/re2/ya.make
+++ b/contrib/libs/re2/ya.make
@@ -9,9 +9,9 @@ LICENSE(
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-VERSION(2023-07-01)
+VERSION(2023-08-01)
-ORIGINAL_SOURCE(https://github.com/google/re2/archive/2023-07-01.tar.gz)
+ORIGINAL_SOURCE(https://github.com/google/re2/archive/2023-08-01.tar.gz)
PEERDIR(
contrib/restricted/abseil-cpp/absl/base