diff options
author | robot-contrib <robot-contrib@yandex-team.com> | 2023-08-02 10:19:42 +0300 |
---|---|---|
committer | robot-contrib <robot-contrib@yandex-team.com> | 2023-08-02 10:19:42 +0300 |
commit | 3d55f8dfc54006f3d6a8b943c034ead23214800e (patch) | |
tree | 0f2486183e9ddd2ca915a04fb75c402721baecfa | |
parent | f3f107ba8959babdcd096d7d599efca438fd1f97 (diff) | |
download | ydb-3d55f8dfc54006f3d6a8b943c034ead23214800e.tar.gz |
Update contrib/libs/re2 to 2023-08-01
-rw-r--r-- | contrib/libs/re2/re2/prefilter.h | 38 | ||||
-rw-r--r-- | contrib/libs/re2/re2/prefilter_tree.cc | 47 | ||||
-rw-r--r-- | contrib/libs/re2/re2/prefilter_tree.h | 41 | ||||
-rw-r--r-- | contrib/libs/re2/re2/regexp.cc | 5 | ||||
-rw-r--r-- | contrib/libs/re2/re2/simplify.cc | 21 | ||||
-rw-r--r-- | contrib/libs/re2/re2/testing/simplify_test.cc | 16 | ||||
-rw-r--r-- | contrib/libs/re2/ya.make | 4 |
7 files changed, 122 insertions, 50 deletions
diff --git a/contrib/libs/re2/re2/prefilter.h b/contrib/libs/re2/re2/prefilter.h index b2545e1ea5..018691dcd6 100644 --- a/contrib/libs/re2/re2/prefilter.h +++ b/contrib/libs/re2/re2/prefilter.h @@ -59,6 +59,44 @@ class Prefilter { std::string DebugString() const; private: + template <typename H> + friend H AbslHashValue(H h, const Prefilter& a) { + h = H::combine(std::move(h), a.op_); + if (a.op_ == ATOM) { + h = H::combine(std::move(h), a.atom_); + } else if (a.op_ == AND || a.op_ == OR) { + h = H::combine(std::move(h), a.subs_->size()); + for (size_t i = 0; i < a.subs_->size(); ++i) { + h = H::combine(std::move(h), (*a.subs_)[i]->unique_id_); + } + } + return h; + } + + friend bool operator==(const Prefilter& a, const Prefilter& b) { + if (&a == &b) { + return true; + } + if (a.op_ != b.op_) { + return false; + } + if (a.op_ == ATOM) { + if (a.atom_ != b.atom_) { + return false; + } + } else if (a.op_ == AND || a.op_ == OR) { + if (a.subs_->size() != b.subs_->size()) { + return false; + } + for (size_t i = 0; i < a.subs_->size(); ++i) { + if ((*a.subs_)[i]->unique_id_ != (*b.subs_)[i]->unique_id_) { + return false; + } + } + } + return true; + } + // A comparator used to store exact strings. We compare by length, // then lexicographically. This ordering makes it easier to reduce the // set of strings in SimplifyStringSet. diff --git a/contrib/libs/re2/re2/prefilter_tree.cc b/contrib/libs/re2/re2/prefilter_tree.cc index 41f65a3561..3afb241c90 100644 --- a/contrib/libs/re2/re2/prefilter_tree.cc +++ b/contrib/libs/re2/re2/prefilter_tree.cc @@ -7,7 +7,6 @@ #include <stddef.h> #include <algorithm> #include <cmath> -#include <map> #include <memory> #include <string> #include <utility> @@ -63,33 +62,18 @@ void PrefilterTree::Compile(std::vector<std::string>* atom_vec) { compiled_ = true; - NodeMap nodes; + NodeSet nodes; AssignUniqueIds(&nodes, atom_vec); if (ExtraDebug) PrintDebugInfo(&nodes); } -Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { - std::string node_string = NodeString(node); - NodeMap::iterator iter = nodes->find(node_string); - if (iter == nodes->end()) - return NULL; - return (*iter).second; -} - -std::string PrefilterTree::NodeString(Prefilter* node) const { - // Adding the operation disambiguates AND/OR/atom nodes. - std::string s = absl::StrFormat("%d", node->op()) + ":"; - if (node->op() == Prefilter::ATOM) { - s += node->atom(); - } else { - for (size_t i = 0; i < node->subs()->size(); i++) { - if (i > 0) - s += ','; - s += absl::StrFormat("%d", (*node->subs())[i]->unique_id()); - } +Prefilter* PrefilterTree::CanonicalNode(NodeSet* nodes, Prefilter* node) { + NodeSet::const_iterator iter = nodes->find(node); + if (iter != nodes->end()) { + return *iter; } - return s; + return NULL; } bool PrefilterTree::KeepNode(Prefilter* node) const { @@ -129,7 +113,7 @@ bool PrefilterTree::KeepNode(Prefilter* node) const { } } -void PrefilterTree::AssignUniqueIds(NodeMap* nodes, +void PrefilterTree::AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec) { atom_vec->clear(); @@ -169,9 +153,9 @@ void PrefilterTree::AssignUniqueIds(NodeMap* nodes, node->set_unique_id(-1); Prefilter* canonical = CanonicalNode(nodes, node); if (canonical == NULL) { - // Any further nodes that have the same node string + // Any further nodes that have the same atom/subs // will find this node as the canonical node. - nodes->emplace(NodeString(node), node); + nodes->emplace(node); if (node->op() == Prefilter::ATOM) { atom_vec->push_back(node->atom()); atom_index_to_id_.push_back(unique_id); @@ -300,7 +284,7 @@ void PrefilterTree::RegexpsGivenStrings( for (size_t j = 0; j < matched_atoms.size(); j++) matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); PropagateMatch(matched_atom_ids, ®exps_map); - for (IntMap::iterator it = regexps_map.begin(); + for (IntMap::const_iterator it = regexps_map.begin(); it != regexps_map.end(); ++it) regexps->push_back(it->index()); @@ -316,7 +300,7 @@ void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids, IntMap work(static_cast<int>(entries_.size())); for (size_t i = 0; i < atom_ids.size(); i++) work.set(atom_ids[i], 1); - for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { + for (IntMap::const_iterator it = work.begin(); it != work.end(); ++it) { const Entry& entry = entries_[it->index()]; // Record regexps triggered. for (size_t i = 0; i < entry.regexps.size(); i++) @@ -348,7 +332,7 @@ void PrefilterTree::PrintPrefilter(int regexpid) { LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]); } -void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { +void PrefilterTree::PrintDebugInfo(NodeSet* nodes) { LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size(); LOG(ERROR) << "#Unique Nodes: " << entries_.size(); @@ -360,11 +344,10 @@ void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { for (int parent : parents) LOG(ERROR) << parent; } - LOG(ERROR) << "Map:"; - for (NodeMap::const_iterator iter = nodes->begin(); + LOG(ERROR) << "Set:"; + for (NodeSet::const_iterator iter = nodes->begin(); iter != nodes->end(); ++iter) - LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() - << " Str: " << (*iter).first; + LOG(ERROR) << "NodeId: " << (*iter)->unique_id(); } std::string PrefilterTree::DebugNodeString(Prefilter* node) const { diff --git a/contrib/libs/re2/re2/prefilter_tree.h b/contrib/libs/re2/re2/prefilter_tree.h index 3eb8056db0..71e7a294f9 100644 --- a/contrib/libs/re2/re2/prefilter_tree.h +++ b/contrib/libs/re2/re2/prefilter_tree.h @@ -16,12 +16,13 @@ // atoms) that the user of this class should use to do the string // matching. -#include <map> #include <string> #include <vector> +#include "absl/container/flat_hash_set.h" #include "re2/prefilter.h" #include "re2/sparse_array.h" +#include "util/logging.h" namespace re2 { @@ -57,10 +58,25 @@ class PrefilterTree { void PrintPrefilter(int regexpid); private: - typedef SparseArray<int> IntMap; - // TODO(junyer): Use absl::flat_hash_set<Prefilter*> instead? - // It should be trivial to get rid of the stringification... - typedef std::map<std::string, Prefilter*> NodeMap; + using IntMap = SparseArray<int>; + + struct PrefilterHash { + size_t operator()(const Prefilter* a) const { + DCHECK(a != NULL); + return absl::Hash<Prefilter>()(*a); + } + }; + + struct PrefilterEqual { + bool operator()(const Prefilter* a, const Prefilter* b) const { + DCHECK(a != NULL); + DCHECK(b != NULL); + return *a == *b; + } + }; + + using NodeSet = + absl::flat_hash_set<Prefilter*, PrefilterHash, PrefilterEqual>; // Each unique node has a corresponding Entry that helps in // passing the matching trigger information along the tree. @@ -90,25 +106,22 @@ class PrefilterTree { // This function assigns unique ids to various parts of the // prefilter, by looking at if these nodes are already in the // PrefilterTree. - void AssignUniqueIds(NodeMap* nodes, std::vector<std::string>* atom_vec); + void AssignUniqueIds(NodeSet* nodes, std::vector<std::string>* atom_vec); // Given the matching atoms, find the regexps to be triggered. void PropagateMatch(const std::vector<int>& atom_ids, IntMap* regexps) const; - // Returns the prefilter node that has the same NodeString as this - // node. For the canonical node, returns node. - Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); - - // A string that uniquely identifies the node. Assumes that the - // children of node has already been assigned unique ids. - std::string NodeString(Prefilter* node) const; + // Returns the prefilter node that has the same atom/subs as this + // node. For the canonical node, returns node. Assumes that the + // children of node have already been assigned unique ids. + Prefilter* CanonicalNode(NodeSet* nodes, Prefilter* node); // Recursively constructs a readable prefilter string. std::string DebugNodeString(Prefilter* node) const; // Used for debugging. - void PrintDebugInfo(NodeMap* nodes); + void PrintDebugInfo(NodeSet* nodes); // These are all the nodes formed by Compile. Essentially, there is // one node for each unique atom and each unique AND/OR node. diff --git a/contrib/libs/re2/re2/regexp.cc b/contrib/libs/re2/re2/regexp.cc index 3cfb5ae149..1614bb0fed 100644 --- a/contrib/libs/re2/re2/regexp.cc +++ b/contrib/libs/re2/re2/regexp.cc @@ -17,6 +17,7 @@ #include "absl/base/call_once.h" #include "absl/base/macros.h" +#include "absl/container/flat_hash_map.h" #include "absl/synchronization/mutex.h" #include "util/logging.h" #include "util/utf.h" @@ -76,7 +77,7 @@ bool Regexp::QuickDestroy() { // Similar to EmptyStorage in re2.cc. struct RefStorage { absl::Mutex ref_mutex; - std::map<Regexp*, int> ref_map; + absl::flat_hash_map<Regexp*, int> ref_map; }; alignas(RefStorage) static char ref_storage[sizeof(RefStorage)]; @@ -84,7 +85,7 @@ static inline absl::Mutex* ref_mutex() { return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex; } -static inline std::map<Regexp*, int>* ref_map() { +static inline absl::flat_hash_map<Regexp*, int>* ref_map() { return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map; } diff --git a/contrib/libs/re2/re2/simplify.cc b/contrib/libs/re2/re2/simplify.cc index 8cd10cfbb0..cea100b08a 100644 --- a/contrib/libs/re2/re2/simplify.cc +++ b/contrib/libs/re2/re2/simplify.cc @@ -6,6 +6,7 @@ // to use simple extended regular expression features. // Also sort and simplify character classes. +#include <algorithm> #include <string> #include "util/logging.h" @@ -579,6 +580,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, return re; } +// Returns true if re is an empty-width op. +static bool IsEmptyOp(Regexp* re) { + return (re->op() == kRegexpBeginLine || + re->op() == kRegexpEndLine || + re->op() == kRegexpWordBoundary || + re->op() == kRegexpNoWordBoundary || + re->op() == kRegexpBeginText || + re->op() == kRegexpEndText); +} + // Simplifies the expression re{min,max} in terms of *, +, and ?. // Returns a new regexp. Does not edit re. Does not consume reference to re. // Caller must Decref return value when done with it. @@ -587,6 +598,16 @@ Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, // but in the Regexp* representation, both (x) are marked as $1. Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, Regexp::ParseFlags f) { + // For an empty-width op OR a concatenation or alternation of empty-width + // ops, cap the repetition count at 1. + if (IsEmptyOp(re) || + ((re->op() == kRegexpConcat || + re->op() == kRegexpAlternate) && + std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) { + min = std::min(min, 1); + max = std::min(max, 1); + } + // x{n,} means at least n matches of x. if (max == -1) { // Special case: x{0,} is x* diff --git a/contrib/libs/re2/re2/testing/simplify_test.cc b/contrib/libs/re2/re2/testing/simplify_test.cc index d2c136a3cc..5b683f580c 100644 --- a/contrib/libs/re2/re2/testing/simplify_test.cc +++ b/contrib/libs/re2/re2/testing/simplify_test.cc @@ -140,6 +140,22 @@ static Test tests[] = { { "(){1,}", "()+" }, { "(){0,2}", "(?:()()?)?" }, + // For an empty-width op OR a concatenation or alternation of empty-width + // ops, test that the repetition count is capped at 1. + { "(?:^){0,}", "^*" }, // x{0,} -> x* + { "(?:$){28,}", "$+" }, // x{N,} -> x{1,} -> x+ + { "(?-m:^){0,30}", "(?-m:^)?" }, // x{0,N} -> x{0,1} -> x? + { "(?-m:$){28,30}", "(?-m:$)" }, // x{N,M} -> x{1,1} -> x + { "\\b(?:\\b\\B){999}\\B", "\\b\\b\\B\\B" }, + { "\\b(?:\\b|\\B){999}\\B", "\\b(?:\\b|\\B)\\B" }, + // NonGreedy should also be handled. + { "(?:^){0,}?", "^*?" }, + { "(?:$){28,}?", "$+?" }, + { "(?-m:^){0,30}?", "(?-m:^)??" }, + { "(?-m:$){28,30}?", "(?-m:$)" }, + { "\\b(?:\\b\\B){999}?\\B", "\\b\\b\\B\\B" }, + { "\\b(?:\\b|\\B){999}?\\B", "\\b(?:\\b|\\B)\\B" }, + // Test that coalescing occurs and that the resulting repeats are simplified. // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal: { "a*a*", "a*" }, diff --git a/contrib/libs/re2/ya.make b/contrib/libs/re2/ya.make index 1ad631b2e6..e01531f4d5 100644 --- a/contrib/libs/re2/ya.make +++ b/contrib/libs/re2/ya.make @@ -9,9 +9,9 @@ LICENSE( LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -VERSION(2023-07-01) +VERSION(2023-08-01) -ORIGINAL_SOURCE(https://github.com/google/re2/archive/2023-07-01.tar.gz) +ORIGINAL_SOURCE(https://github.com/google/re2/archive/2023-08-01.tar.gz) PEERDIR( contrib/restricted/abseil-cpp/absl/base |