diff options
author | robot-contrib <robot-contrib@yandex-team.com> | 2022-12-16 12:29:02 +0300 |
---|---|---|
committer | robot-contrib <robot-contrib@yandex-team.com> | 2022-12-16 12:29:02 +0300 |
commit | a8bebdbb76e6a6e629f5ca6c84425f7f51649892 (patch) | |
tree | a6a70f6bfe80ea55889e1731678a1e7550399e4c /contrib | |
parent | cb6356ca0d52f1d0437996c4f5d662c5201f682b (diff) | |
download | ydb-a8bebdbb76e6a6e629f5ca6c84425f7f51649892.tar.gz |
Update contrib/libs/re2 to 2022-12-01
Diffstat (limited to 'contrib')
-rw-r--r-- | contrib/libs/re2/re2/compile.cc | 6 | ||||
-rw-r--r-- | contrib/libs/re2/re2/dfa.cc | 2 | ||||
-rw-r--r-- | contrib/libs/re2/re2/parse.cc | 4 | ||||
-rw-r--r-- | contrib/libs/re2/re2/prefilter.cc | 51 | ||||
-rw-r--r-- | contrib/libs/re2/re2/prefilter.h | 24 | ||||
-rw-r--r-- | contrib/libs/re2/re2/prog.cc | 2 | ||||
-rw-r--r-- | contrib/libs/re2/re2/re2.cc | 124 | ||||
-rw-r--r-- | contrib/libs/re2/re2/re2.h | 82 | ||||
-rw-r--r-- | contrib/libs/re2/re2/regexp.cc | 38 | ||||
-rw-r--r-- | contrib/libs/re2/re2/set.cc | 4 | ||||
-rw-r--r-- | contrib/libs/re2/re2/simplify.cc | 4 | ||||
-rw-r--r-- | contrib/libs/re2/re2/testing/filtered_re2_test.cc | 3 | ||||
-rw-r--r-- | contrib/libs/re2/re2/unicode_groups.cc | 225 | ||||
-rw-r--r-- | contrib/libs/re2/util/rune.cc | 8 |
14 files changed, 363 insertions, 214 deletions
diff --git a/contrib/libs/re2/re2/compile.cc b/contrib/libs/re2/re2/compile.cc index 61d801a630..03bffab421 100644 --- a/contrib/libs/re2/re2/compile.cc +++ b/contrib/libs/re2/re2/compile.cc @@ -789,8 +789,8 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { // Should not be called. Frag Compiler::Copy(Frag arg) { // We're using WalkExponential; there should be no copying. - LOG(DFATAL) << "Compiler::Copy called!"; failed_ = true; + LOG(DFATAL) << "Compiler::Copy called!"; return NoMatch(); } @@ -916,8 +916,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, CharClass* cc = re->cc(); if (cc->empty()) { // This can't happen. - LOG(DFATAL) << "No ranges in char class"; failed_ = true; + LOG(DFATAL) << "No ranges in char class"; return NoMatch(); } @@ -974,8 +974,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, case kRegexpNoWordBoundary: return EmptyWidth(kEmptyNonWordBoundary); } - LOG(DFATAL) << "Missing case in Compiler: " << re->op(); failed_ = true; + LOG(DFATAL) << "Missing case in Compiler: " << re->op(); return NoMatch(); } diff --git a/contrib/libs/re2/re2/dfa.cc b/contrib/libs/re2/re2/dfa.cc index d47c7d50a7..55def2b1be 100644 --- a/contrib/libs/re2/re2/dfa.cc +++ b/contrib/libs/re2/re2/dfa.cc @@ -1675,8 +1675,8 @@ bool DFA::AnalyzeSearch(SearchParams* params) { if (!AnalyzeSearchHelper(params, info, flags)) { ResetCache(params->cache_lock); if (!AnalyzeSearchHelper(params, info, flags)) { - LOG(DFATAL) << "Failed to analyze start state."; params->failed = true; + LOG(DFATAL) << "Failed to analyze start state."; return false; } } diff --git a/contrib/libs/re2/re2/parse.cc b/contrib/libs/re2/re2/parse.cc index 85f16f060b..d7a9fe5084 100644 --- a/contrib/libs/re2/re2/parse.cc +++ b/contrib/libs/re2/re2/parse.cc @@ -1589,8 +1589,6 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // return true; } - LOG(DFATAL) << "Not reached in ParseEscape."; - BadEscape: // Unrecognized escape sequence. status->set_code(kRegexpBadEscape); @@ -2059,8 +2057,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { // Caller is supposed to check this. if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { - LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; status_->set_code(kRegexpInternalError); + LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; return false; } diff --git a/contrib/libs/re2/re2/prefilter.cc b/contrib/libs/re2/re2/prefilter.cc index a47b3120fb..37b0cf8a73 100644 --- a/contrib/libs/re2/re2/prefilter.cc +++ b/contrib/libs/re2/re2/prefilter.cc @@ -7,6 +7,7 @@ #include <stddef.h> #include <stdint.h> #include <string> +#include <utility> #include <vector> #include "util/util.h" @@ -21,9 +22,6 @@ namespace re2 { static const bool ExtraDebug = false; -typedef std::set<std::string>::iterator SSIter; -typedef std::set<std::string>::const_iterator ConstSSIter; - // Initializes a Prefilter, allocating subs_ as necessary. Prefilter::Prefilter(Op op) { op_ = op; @@ -140,7 +138,7 @@ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { return AndOr(OR, a, b); } -static void SimplifyStringSet(std::set<std::string>* ss) { +void Prefilter::SimplifyStringSet(SSet* ss) { // Now make sure that the strings aren't redundant. For example, if // we know "ab" is a required string, then it doesn't help at all to // know that "abc" is also a required string, so delete "abc". This @@ -149,13 +147,19 @@ static void SimplifyStringSet(std::set<std::string>* ss) { // candidate for match, so further matching "abc" is redundant. // Note that we must ignore "" because find() would find it at the // start of everything and thus we would end up erasing everything. - for (SSIter i = ss->begin(); i != ss->end(); ++i) { - if (i->empty()) - continue; + // + // The SSet sorts strings by length, then lexicographically. Note that + // smaller strings appear first and all strings must be unique. These + // observations let us skip string comparisons when possible. + SSIter i = ss->begin(); + if (i != ss->end() && i->empty()) { + ++i; + } + for (; i != ss->end(); ++i) { SSIter j = i; ++j; while (j != ss->end()) { - if (j->find(*i) != std::string::npos) { + if (j->size() > i->size() && j->find(*i) != std::string::npos) { j = ss->erase(j); continue; } @@ -164,7 +168,7 @@ static void SimplifyStringSet(std::set<std::string>* ss) { } } -Prefilter* Prefilter::OrStrings(std::set<std::string>* ss) { +Prefilter* Prefilter::OrStrings(SSet* ss) { Prefilter* or_prefilter = new Prefilter(NONE); SimplifyStringSet(ss); for (SSIter i = ss->begin(); i != ss->end(); ++i) @@ -226,14 +230,14 @@ class Prefilter::Info { // Caller takes ownership of the Prefilter. Prefilter* TakeMatch(); - std::set<std::string>& exact() { return exact_; } + SSet& exact() { return exact_; } bool is_exact() const { return is_exact_; } class Walker; private: - std::set<std::string> exact_; + SSet exact_; // When is_exact_ is true, the strings that match // are placed in exact_. When it is no longer an exact @@ -286,18 +290,7 @@ std::string Prefilter::Info::ToString() { return ""; } -// Add the strings from src to dst. -static void CopyIn(const std::set<std::string>& src, - std::set<std::string>* dst) { - for (ConstSSIter i = src.begin(); i != src.end(); ++i) - dst->insert(*i); -} - -// Add the cross-product of a and b to dst. -// (For each string i in a and j in b, add i+j.) -static void CrossProduct(const std::set<std::string>& a, - const std::set<std::string>& b, - std::set<std::string>* dst) { +void Prefilter::CrossProduct(const SSet& a, const SSet& b, SSet* dst) { for (ConstSSIter i = a.begin(); i != a.end(); ++i) for (ConstSSIter j = b.begin(); j != b.end(); ++j) dst->insert(*i + *j); @@ -343,8 +336,14 @@ Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) { Info *ab = new Info(); if (a->is_exact_ && b->is_exact_) { - CopyIn(a->exact_, &ab->exact_); - CopyIn(b->exact_, &ab->exact_); + // Avoid string copies by moving the larger exact_ set into + // ab directly, then merge in the smaller set. + if (a->exact_.size() < b->exact_.size()) { + using std::swap; + swap(a, b); + } + ab->exact_ = std::move(a->exact_); + ab->exact_.insert(b->exact_.begin(), b->exact_.end()); ab->is_exact_ = true; } else { // Either a or b has is_exact_ = false. If the other @@ -532,8 +531,8 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( switch (re->op()) { default: case kRegexpRepeat: - LOG(DFATAL) << "Bad regexp op " << re->op(); info = EmptyString(); + LOG(DFATAL) << "Bad regexp op " << re->op(); break; case kRegexpNoMatch: diff --git a/contrib/libs/re2/re2/prefilter.h b/contrib/libs/re2/re2/prefilter.h index 4fedeb4a7c..e149e59a86 100644 --- a/contrib/libs/re2/re2/prefilter.h +++ b/contrib/libs/re2/re2/prefilter.h @@ -60,8 +60,21 @@ class Prefilter { std::string DebugString() const; private: + // A comparator used to store exact strings. We compare by length, + // then lexicographically. This ordering makes it easier to reduce the + // set of strings in SimplifyStringSet. + struct LengthThenLex { + bool operator()(const std::string& a, const std::string& b) const { + return (a.size() < b.size()) || (a.size() == b.size() && a < b); + } + }; + class Info; + using SSet = std::set<std::string, LengthThenLex>; + using SSIter = SSet::iterator; + using ConstSSIter = SSet::const_iterator; + // Combines two prefilters together to create an AND. The passed // Prefilters will be part of the returned Prefilter or deleted. static Prefilter* And(Prefilter* a, Prefilter* b); @@ -77,12 +90,21 @@ class Prefilter { static Prefilter* FromString(const std::string& str); - static Prefilter* OrStrings(std::set<std::string>* ss); + static Prefilter* OrStrings(SSet* ss); static Info* BuildInfo(Regexp* re); Prefilter* Simplify(); + // Removes redundant strings from the set. A string is redundant if + // any of the other strings appear as a substring. The empty string + // is a special case, which is ignored. + static void SimplifyStringSet(SSet* ss); + + // Adds the cross-product of a and b to dst. + // (For each string i in a and j in b, add i+j.) + static void CrossProduct(const SSet& a, const SSet& b, SSet* dst); + // Kind of Prefilter. Op op_; diff --git a/contrib/libs/re2/re2/prog.cc b/contrib/libs/re2/re2/prog.cc index a700d35de3..3b9596acab 100644 --- a/contrib/libs/re2/re2/prog.cc +++ b/contrib/libs/re2/re2/prog.cc @@ -511,7 +511,7 @@ void Prog::ComputeByteMap() { builder.Build(bytemap_, &bytemap_range_); - if (0) { // For debugging, use trivial bytemap. + if ((0)) { // For debugging, use trivial bytemap. LOG(ERROR) << "Using trivial bytemap."; for (int i = 0; i < 256; i++) bytemap_[i] = static_cast<uint8_t>(i); diff --git a/contrib/libs/re2/re2/re2.cc b/contrib/libs/re2/re2/re2.cc index ad126d00bd..1c2645bf07 100644 --- a/contrib/libs/re2/re2/re2.cc +++ b/contrib/libs/re2/re2/re2.cc @@ -36,6 +36,13 @@ namespace re2 { +// Controls the maximum count permitted by GlobalReplace(); -1 is unlimited. +static int maximum_global_replace_count = -1; + +void RE2::FUZZING_ONLY_set_maximum_global_replace_count(int i) { + maximum_global_replace_count = i; +} + // Maximum number of args we can set static const int kMaxArgs = 16; static const int kVecSize = 1+kMaxArgs; @@ -43,11 +50,11 @@ static const int kVecSize = 1+kMaxArgs; const int RE2::Options::kDefaultMaxMem; // initialized in re2.h RE2::Options::Options(RE2::CannedOptions opt) - : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), + : max_mem_(kDefaultMaxMem), + encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), posix_syntax_(opt == RE2::POSIX), longest_match_(opt == RE2::POSIX), log_errors_(opt != RE2::Quiet), - max_mem_(kDefaultMaxMem), literal_(false), never_nl_(false), dot_nl_(false), @@ -58,11 +65,30 @@ RE2::Options::Options(RE2::CannedOptions opt) one_line_(false) { } -// static empty objects for use as const references. -// To avoid global constructors, allocated in RE2::Init(). -static const std::string* empty_string; -static const std::map<std::string, int>* empty_named_groups; -static const std::map<int, std::string>* empty_group_names; +// Empty objects for use as const references. +// Statically allocating the storage and then +// lazily constructing the objects (in a once +// in RE2::Init()) avoids global constructors +// and the false positives (thanks, Valgrind) +// about memory leaks at program termination. +struct EmptyStorage { + std::string empty_string; + std::map<std::string, int> empty_named_groups; + std::map<int, std::string> empty_group_names; +}; +alignas(EmptyStorage) static char empty_storage[sizeof(EmptyStorage)]; + +static inline std::string* empty_string() { + return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_string; +} + +static inline std::map<std::string, int>* empty_named_groups() { + return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_named_groups; +} + +static inline std::map<int, std::string>* empty_group_names() { + return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_group_names; +} // Converts from Regexp error code to RE2 error code. // Maybe some day they will diverge. In any event, this @@ -173,23 +199,23 @@ int RE2::Options::ParseFlags() const { void RE2::Init(const StringPiece& pattern, const Options& options) { static std::once_flag empty_once; std::call_once(empty_once, []() { - empty_string = new std::string; - empty_named_groups = new std::map<std::string, int>; - empty_group_names = new std::map<int, std::string>; + (void) new (empty_storage) EmptyStorage; }); - pattern_.assign(pattern.data(), pattern.size()); + pattern_ = new std::string(pattern); options_.Copy(options); entire_regexp_ = NULL; - error_ = empty_string; - error_code_ = NoError; - error_arg_.clear(); - prefix_.clear(); - prefix_foldcase_ = false; suffix_regexp_ = NULL; - prog_ = NULL; + error_ = empty_string(); + error_arg_ = empty_string(); + num_captures_ = -1; + error_code_ = NoError; + longest_match_ = options_.longest_match(); is_one_pass_ = false; + prefix_foldcase_ = false; + prefix_.clear(); + prog_ = NULL; rprog_ = NULL; named_groups_ = NULL; @@ -197,25 +223,29 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { RegexpStatus status; entire_regexp_ = Regexp::Parse( - pattern_, + *pattern_, static_cast<Regexp::ParseFlags>(options_.ParseFlags()), &status); if (entire_regexp_ == NULL) { if (options_.log_errors()) { - LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " + LOG(ERROR) << "Error parsing '" << trunc(*pattern_) << "': " << status.Text(); } error_ = new std::string(status.Text()); error_code_ = RegexpErrorToRE2(status.code()); - error_arg_ = std::string(status.error_arg()); + error_arg_ = new std::string(status.error_arg()); return; } + bool foldcase; re2::Regexp* suffix; - if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) + if (entire_regexp_->RequiredPrefix(&prefix_, &foldcase, &suffix)) { + prefix_foldcase_ = foldcase; suffix_regexp_ = suffix; - else + } + else { suffix_regexp_ = entire_regexp_->Incref(); + } // Two thirds of the memory goes to the forward Prog, // one third to the reverse prog, because the forward @@ -223,7 +253,7 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); if (prog_ == NULL) { if (options_.log_errors()) - LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; + LOG(ERROR) << "Error compiling '" << trunc(*pattern_) << "'"; error_ = new std::string("pattern too large - compile failed"); error_code_ = RE2::ErrorPatternTooLarge; return; @@ -249,7 +279,8 @@ re2::Prog* RE2::ReverseProg() const { re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3); if (re->rprog_ == NULL) { if (re->options_.log_errors()) - LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'"; + LOG(ERROR) << "Error reverse compiling '" << trunc(*re->pattern_) + << "'"; // We no longer touch error_ and error_code_ because failing to compile // the reverse Prog is not a showstopper: falling back to NFA execution // is fine. More importantly, an RE2 object is supposed to be logically @@ -261,18 +292,21 @@ re2::Prog* RE2::ReverseProg() const { } RE2::~RE2() { + if (group_names_ != empty_group_names()) + delete group_names_; + if (named_groups_ != empty_named_groups()) + delete named_groups_; + delete rprog_; + delete prog_; + if (error_arg_ != empty_string()) + delete error_arg_; + if (error_ != empty_string()) + delete error_; if (suffix_regexp_) suffix_regexp_->Decref(); if (entire_regexp_) entire_regexp_->Decref(); - delete prog_; - delete rprog_; - if (error_ != empty_string) - delete error_; - if (named_groups_ != NULL && named_groups_ != empty_named_groups) - delete named_groups_; - if (group_names_ != NULL && group_names_ != empty_group_names) - delete group_names_; + delete pattern_; } int RE2::ProgramSize() const { @@ -352,7 +386,7 @@ const std::map<std::string, int>& RE2::NamedCapturingGroups() const { if (re->suffix_regexp_ != NULL) re->named_groups_ = re->suffix_regexp_->NamedCaptures(); if (re->named_groups_ == NULL) - re->named_groups_ = empty_named_groups; + re->named_groups_ = empty_named_groups(); }, this); return *named_groups_; } @@ -363,7 +397,7 @@ const std::map<int, std::string>& RE2::CapturingGroupNames() const { if (re->suffix_regexp_ != NULL) re->group_names_ = re->suffix_regexp_->CaptureNames(); if (re->group_names_ == NULL) - re->group_names_ = empty_group_names; + re->group_names_ = empty_group_names(); }, this); return *group_names_; } @@ -439,13 +473,10 @@ int RE2::GlobalReplace(std::string* str, const char* lastend = NULL; std::string out; int count = 0; -#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - // Iterate just once when fuzzing. Otherwise, we easily get bogged down - // and coverage is unlikely to improve despite significant expense. - while (p == str->data()) { -#else while (p <= ep) { -#endif + if (maximum_global_replace_count != -1 && + count >= maximum_global_replace_count) + break; if (!re.Match(*str, static_cast<size_t>(p - str->data()), str->size(), UNANCHORED, vec, nvec)) break; @@ -686,9 +717,8 @@ bool RE2::Match(const StringPiece& text, } Prog::Anchor anchor = Prog::kUnanchored; - Prog::MatchKind kind = Prog::kFirstMatch; - if (options_.longest_match()) - kind = Prog::kLongestMatch; + Prog::MatchKind kind = + longest_match_ ? Prog::kLongestMatch : Prog::kFirstMatch; bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; bool can_bit_state = prog_->CanBitState(); @@ -720,7 +750,7 @@ bool RE2::Match(const StringPiece& text, if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " + << "pattern length " << pattern_->size() << ", " << "program size " << prog->size() << ", " << "list count " << prog->list_count() << ", " << "bytemap range " << prog->bytemap_range(); @@ -740,7 +770,7 @@ bool RE2::Match(const StringPiece& text, if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " + << "pattern length " << pattern_->size() << ", " << "program size " << prog_->size() << ", " << "list count " << prog_->list_count() << ", " << "bytemap range " << prog_->bytemap_range(); @@ -766,7 +796,7 @@ bool RE2::Match(const StringPiece& text, if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " + << "pattern length " << pattern_->size() << ", " << "program size " << prog->size() << ", " << "list count " << prog->list_count() << ", " << "bytemap range " << prog->bytemap_range(); @@ -809,7 +839,7 @@ bool RE2::Match(const StringPiece& text, if (dfa_failed) { if (options_.log_errors()) LOG(ERROR) << "DFA out of memory: " - << "pattern length " << pattern_.size() << ", " + << "pattern length " << pattern_->size() << ", " << "program size " << prog_->size() << ", " << "list count " << prog_->list_count() << ", " << "bytemap range " << prog_->bytemap_range(); diff --git a/contrib/libs/re2/re2/re2.h b/contrib/libs/re2/re2/re2.h index c9694d35ac..09eb287136 100644 --- a/contrib/libs/re2/re2/re2.h +++ b/contrib/libs/re2/re2/re2.h @@ -66,17 +66,17 @@ // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1))); // // ----------------------------------------------------------------------- -// MATCHING WITH SUBSTRING EXTRACTION: +// SUBMATCH EXTRACTION: // -// You can supply extra pointer arguments to extract matched substrings. +// You can supply extra pointer arguments to extract submatches. // On match failure, none of the pointees will have been modified. -// On match success, the substrings will be converted (as necessary) and +// On match success, the submatches will be converted (as necessary) and // their values will be assigned to their pointees until all conversions // have succeeded or one conversion has failed. // On conversion failure, the pointees will be in an indeterminate state // because the caller has no way of knowing which conversion failed. // However, conversion cannot fail for types like string and StringPiece -// that do not inspect the substring contents. Hence, in the common case +// that do not inspect the submatch contents. Hence, in the common case // where all of the pointees are of such types, failure is always due to // match failure and thus none of the pointees will have been modified. // @@ -100,10 +100,10 @@ // Example: integer overflow causes failure // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i)); // -// NOTE(rsc): Asking for substrings slows successful matches quite a bit. +// NOTE(rsc): Asking for submatches slows successful matches quite a bit. // This may get a little faster in the future, but right now is slower // than PCRE. On the other hand, failed matches run *very* fast (faster -// than PCRE), as do matches without substring extraction. +// than PCRE), as do matches without submatch extraction. // // ----------------------------------------------------------------------- // PARTIAL MATCHES @@ -275,23 +275,35 @@ class RE2 { // Need to have the const char* and const std::string& forms for implicit // conversions when passing string literals to FullMatch and PartialMatch. // Otherwise the StringPiece form would be sufficient. -#ifndef SWIG RE2(const char* pattern); RE2(const std::string& pattern); -#endif RE2(const StringPiece& pattern); RE2(const StringPiece& pattern, const Options& options); // ambiguity resolution. RE2(const TString& pattern) : RE2(StringPiece(pattern)) {} ~RE2(); + // Not copyable. + // RE2 objects are expensive. You should probably use std::shared_ptr<RE2> + // instead. If you really must copy, RE2(first.pattern(), first.options()) + // effectively does so: it produces a second object that mimics the first. + RE2(const RE2&) = delete; + RE2& operator=(const RE2&) = delete; + // Not movable. + // RE2 objects are thread-safe and logically immutable. You should probably + // use std::unique_ptr<RE2> instead. Otherwise, consider std::deque<RE2> if + // direct emplacement into a container is desired. If you really must move, + // be prepared to submit a design document along with your feature request. + RE2(RE2&&) = delete; + RE2& operator=(RE2&&) = delete; + // Returns whether RE2 was created properly. bool ok() const { return error_code() == NoError; } // The string specification for this RE2. E.g. // RE2 re("ab*c?d+"); // re.pattern(); // "ab*c?d+" - const std::string& pattern() const { return pattern_; } + const std::string& pattern() const { return *pattern_; } // If RE2 could not be created properly, returns an error string. // Else returns the empty string. @@ -303,7 +315,7 @@ class RE2 { // If RE2 could not be created properly, returns the offending // portion of the regexp. - const std::string& error_arg() const { return error_arg_; } + const std::string& error_arg() const { return *error_arg_; } // Returns the program size, a very approximate measure of a regexp's "cost". // Larger numbers are more expensive than smaller numbers. @@ -336,7 +348,6 @@ class RE2 { static bool FindAndConsumeN(StringPiece* input, const RE2& re, const Arg* const args[], int n); -#ifndef SWIG private: template <typename F, typename SP> static inline bool Apply(F f, SP sp, const RE2& re) { @@ -442,7 +453,6 @@ class RE2 { static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...); } -#endif // Replace the first match of "re" in "str" with "rewrite". // Within "rewrite", backslash-escaped digits (\1 to \9) can be @@ -698,11 +708,11 @@ class RE2 { }; Options() : + max_mem_(kDefaultMaxMem), encoding_(EncodingUTF8), posix_syntax_(false), longest_match_(false), log_errors_(true), - max_mem_(kDefaultMaxMem), literal_(false), never_nl_(false), dot_nl_(false), @@ -715,6 +725,9 @@ class RE2 { /*implicit*/ Options(CannedOptions); + int64_t max_mem() const { return max_mem_; } + void set_max_mem(int64_t m) { max_mem_ = m; } + Encoding encoding() const { return encoding_; } void set_encoding(Encoding encoding) { encoding_ = encoding; } @@ -727,9 +740,6 @@ class RE2 { bool log_errors() const { return log_errors_; } void set_log_errors(bool b) { log_errors_ = b; } - int64_t max_mem() const { return max_mem_; } - void set_max_mem(int64_t m) { max_mem_ = m; } - bool literal() const { return literal_; } void set_literal(bool b) { literal_ = b; } @@ -761,11 +771,11 @@ class RE2 { int ParseFlags() const; private: + int64_t max_mem_; Encoding encoding_; bool posix_syntax_; bool longest_match_; bool log_errors_; - int64_t max_mem_; bool literal_; bool never_nl_; bool dot_nl_; @@ -787,6 +797,10 @@ class RE2 { template <typename T> static Arg Octal(T* ptr); + // Controls the maximum count permitted by GlobalReplace(); -1 is unlimited. + // FOR FUZZING ONLY. + static void FUZZING_ONLY_set_maximum_global_replace_count(int i); + private: void Init(const StringPiece& pattern, const Options& options); @@ -798,18 +812,23 @@ class RE2 { re2::Prog* ReverseProg() const; - std::string pattern_; // string regular expression - Options options_; // option flags - re2::Regexp* entire_regexp_; // parsed regular expression - const std::string* error_; // error indicator (or points to empty string) - ErrorCode error_code_; // error code - std::string error_arg_; // fragment of regexp showing error - std::string prefix_; // required prefix (before suffix_regexp_) - bool prefix_foldcase_; // prefix_ is ASCII case-insensitive - re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed - re2::Prog* prog_; // compiled program for regexp - int num_captures_; // number of capturing groups - bool is_one_pass_; // can use prog_->SearchOnePass? + // First cache line is relatively cold fields. + const std::string* pattern_; // string regular expression + Options options_; // option flags + re2::Regexp* entire_regexp_; // parsed regular expression + re2::Regexp* suffix_regexp_; // parsed regular expression, prefix_ removed + const std::string* error_; // error indicator (or points to empty string) + const std::string* error_arg_; // fragment of regexp showing error (or ditto) + + // Second cache line is relatively hot fields. + // These are ordered oddly to pack everything. + int num_captures_; // number of capturing groups + ErrorCode error_code_ : 29; // error code (29 bits is more than enough) + bool longest_match_ : 1; // cached copy of options_.longest_match() + bool is_one_pass_ : 1; // can use prog_->SearchOnePass? + bool prefix_foldcase_ : 1; // prefix_ is ASCII case-insensitive + std::string prefix_; // required prefix (before suffix_regexp_) + re2::Prog* prog_; // compiled program for regexp // Reverse Prog for DFA execution only mutable re2::Prog* rprog_; @@ -821,9 +840,6 @@ class RE2 { mutable std::once_flag rprog_once_; mutable std::once_flag named_groups_once_; mutable std::once_flag group_names_once_; - - RE2(const RE2&) = delete; - RE2& operator=(const RE2&) = delete; }; /***** Implementation details *****/ @@ -954,7 +970,6 @@ inline RE2::Arg RE2::Octal(T* ptr) { }); } -#ifndef SWIG // Silence warnings about missing initializers for members of LazyRE2. #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmissing-field-initializers" @@ -1005,7 +1020,6 @@ class LazyRE2 { void operator=(const LazyRE2&); // disallowed }; -#endif namespace hooks { diff --git a/contrib/libs/re2/re2/regexp.cc b/contrib/libs/re2/re2/regexp.cc index ca1318b43d..74ecb31969 100644 --- a/contrib/libs/re2/re2/regexp.cc +++ b/contrib/libs/re2/re2/regexp.cc @@ -74,16 +74,27 @@ bool Regexp::QuickDestroy() { return false; } -// Lazily allocated. -static Mutex* ref_mutex; -static std::map<Regexp*, int>* ref_map; +// Similar to EmptyStorage in re2.cc. +struct RefStorage { + Mutex ref_mutex; + std::map<Regexp*, int> ref_map; +}; +alignas(RefStorage) static char ref_storage[sizeof(RefStorage)]; + +static inline Mutex* ref_mutex() { + return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex; +} + +static inline std::map<Regexp*, int>* ref_map() { + return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map; +} int Regexp::Ref() { if (ref_ < kMaxRef) return ref_; - MutexLock l(ref_mutex); - return (*ref_map)[this]; + MutexLock l(ref_mutex()); + return (*ref_map())[this]; } // Increments reference count, returns object as convenience. @@ -91,18 +102,17 @@ Regexp* Regexp::Incref() { if (ref_ >= kMaxRef-1) { static std::once_flag ref_once; std::call_once(ref_once, []() { - ref_mutex = new Mutex; - ref_map = new std::map<Regexp*, int>; + (void) new (ref_storage) RefStorage; }); // Store ref count in overflow map. - MutexLock l(ref_mutex); + MutexLock l(ref_mutex()); if (ref_ == kMaxRef) { // already overflowed - (*ref_map)[this]++; + (*ref_map())[this]++; } else { // overflowing now - (*ref_map)[this] = kMaxRef; + (*ref_map())[this] = kMaxRef; ref_ = kMaxRef; } return this; @@ -116,13 +126,13 @@ Regexp* Regexp::Incref() { void Regexp::Decref() { if (ref_ == kMaxRef) { // Ref count is stored in overflow map. - MutexLock l(ref_mutex); - int r = (*ref_map)[this] - 1; + MutexLock l(ref_mutex()); + int r = (*ref_map())[this] - 1; if (r < kMaxRef) { ref_ = static_cast<uint16_t>(r); - ref_map->erase(this); + ref_map()->erase(this); } else { - (*ref_map)[this] = r; + (*ref_map())[this] = r; } return; } diff --git a/contrib/libs/re2/re2/set.cc b/contrib/libs/re2/re2/set.cc index 18705663a5..fe0ea055fe 100644 --- a/contrib/libs/re2/re2/set.cc +++ b/contrib/libs/re2/re2/set.cc @@ -128,9 +128,9 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const { bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, ErrorInfo* error_info) const { if (!compiled_) { - LOG(DFATAL) << "RE2::Set::Match() called before compiling"; if (error_info != NULL) error_info->kind = kNotCompiled; + LOG(DFATAL) << "RE2::Set::Match() called before compiling"; return false; } #ifdef RE2_HAVE_THREAD_LOCAL @@ -161,9 +161,9 @@ bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v, } if (v != NULL) { if (matches->empty()) { - LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; if (error_info != NULL) error_info->kind = kInconsistent; + LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; return false; } v->assign(matches->begin(), matches->end()); diff --git a/contrib/libs/re2/re2/simplify.cc b/contrib/libs/re2/re2/simplify.cc index 663d5fcd45..0df9051587 100644 --- a/contrib/libs/re2/re2/simplify.cc +++ b/contrib/libs/re2/re2/simplify.cc @@ -371,8 +371,8 @@ void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { break; default: - LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); nre->Decref(); + LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); return; } @@ -432,8 +432,8 @@ void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { } default: - LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); nre->Decref(); + LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); return; } diff --git a/contrib/libs/re2/re2/testing/filtered_re2_test.cc b/contrib/libs/re2/re2/testing/filtered_re2_test.cc index 073a70a745..79fd874078 100644 --- a/contrib/libs/re2/re2/testing/filtered_re2_test.cc +++ b/contrib/libs/re2/re2/testing/filtered_re2_test.cc @@ -106,12 +106,13 @@ AtomTest atom_tests[] = { // substring in an OR are removed; that is, only the shortest // substring is kept. "SubstrAtomRemovesSuperStrInOr", { - "(abc123|abc|ghi789|abc1234).*[x-z]+", + "(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+", "abcd..yyy..yyyzzz", "mnmnpp[a-z]+PPP" }, { "abc", "ghi789", + "xyz", "abcd", "yyy", "yyyzzz", diff --git a/contrib/libs/re2/re2/unicode_groups.cc b/contrib/libs/re2/re2/unicode_groups.cc index 2a8d7dae1f..3b58be4cb8 100644 --- a/contrib/libs/re2/re2/unicode_groups.cc +++ b/contrib/libs/re2/re2/unicode_groups.cc @@ -29,7 +29,7 @@ static const URange16 C_range16[] = { static const URange32 C_range32[] = { { 69821, 69821 }, { 69837, 69837 }, - { 78896, 78904 }, + { 78896, 78911 }, { 113824, 113827 }, { 119155, 119162 }, { 917505, 917505 }, @@ -60,7 +60,7 @@ static const URange16 Cf_range16[] = { static const URange32 Cf_range32[] = { { 69821, 69821 }, { 69837, 69837 }, - { 78896, 78904 }, + { 78896, 78911 }, { 113824, 113827 }, { 119155, 119162 }, { 917505, 917505 }, @@ -548,6 +548,7 @@ static const URange32 L_range32[] = { { 70108, 70108 }, { 70144, 70161 }, { 70163, 70187 }, + { 70207, 70208 }, { 70272, 70278 }, { 70280, 70280 }, { 70282, 70285 }, @@ -610,11 +611,15 @@ static const URange32 L_range32[] = { { 73066, 73097 }, { 73112, 73112 }, { 73440, 73458 }, + { 73474, 73474 }, + { 73476, 73488 }, + { 73490, 73523 }, { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, { 77712, 77808 }, - { 77824, 78894 }, + { 77824, 78895 }, + { 78913, 78918 }, { 82944, 83526 }, { 92160, 92728 }, { 92736, 92766 }, @@ -637,7 +642,9 @@ static const URange32 L_range32[] = { { 110581, 110587 }, { 110589, 110590 }, { 110592, 110882 }, + { 110898, 110898 }, { 110928, 110930 }, + { 110933, 110933 }, { 110948, 110951 }, { 110960, 111355 }, { 113664, 113770 }, @@ -675,11 +682,14 @@ static const URange32 L_range32[] = { { 120746, 120770 }, { 120772, 120779 }, { 122624, 122654 }, + { 122661, 122666 }, + { 122928, 122989 }, { 123136, 123180 }, { 123191, 123197 }, { 123214, 123214 }, { 123536, 123565 }, { 123584, 123627 }, + { 124112, 124139 }, { 124896, 124902 }, { 124904, 124907 }, { 124909, 124910 }, @@ -721,12 +731,13 @@ static const URange32 L_range32[] = { { 126629, 126633 }, { 126635, 126651 }, { 131072, 173791 }, - { 173824, 177976 }, + { 173824, 177977 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, { 196608, 201546 }, + { 201552, 205743 }, }; static const URange16 Ll_range16[] = { { 97, 122 }, @@ -1387,6 +1398,7 @@ static const URange32 Ll_range32[] = { { 120779, 120779 }, { 122624, 122633 }, { 122635, 122654 }, + { 122661, 122666 }, { 125218, 125251 }, }; static const URange16 Lm_range16[] = { @@ -1459,7 +1471,9 @@ static const URange32 Lm_range32[] = { { 110576, 110579 }, { 110581, 110587 }, { 110589, 110590 }, + { 122928, 122989 }, { 123191, 123197 }, + { 124139, 124139 }, { 125259, 125259 }, }; static const URange16 Lo_range16[] = { @@ -1829,6 +1843,7 @@ static const URange32 Lo_range32[] = { { 70108, 70108 }, { 70144, 70161 }, { 70163, 70187 }, + { 70207, 70208 }, { 70272, 70278 }, { 70280, 70280 }, { 70282, 70285 }, @@ -1890,11 +1905,15 @@ static const URange32 Lo_range32[] = { { 73066, 73097 }, { 73112, 73112 }, { 73440, 73458 }, + { 73474, 73474 }, + { 73476, 73488 }, + { 73490, 73523 }, { 73648, 73648 }, { 73728, 74649 }, { 74880, 75075 }, { 77712, 77808 }, - { 77824, 78894 }, + { 77824, 78895 }, + { 78913, 78918 }, { 82944, 83526 }, { 92160, 92728 }, { 92736, 92766 }, @@ -1909,7 +1928,9 @@ static const URange32 Lo_range32[] = { { 100352, 101589 }, { 101632, 101640 }, { 110592, 110882 }, + { 110898, 110898 }, { 110928, 110930 }, + { 110933, 110933 }, { 110948, 110951 }, { 110960, 111355 }, { 113664, 113770 }, @@ -1921,6 +1942,7 @@ static const URange32 Lo_range32[] = { { 123214, 123214 }, { 123536, 123565 }, { 123584, 123627 }, + { 124112, 124138 }, { 124896, 124902 }, { 124904, 124907 }, { 124909, 124910 }, @@ -1960,12 +1982,13 @@ static const URange32 Lo_range32[] = { { 126629, 126633 }, { 126635, 126651 }, { 131072, 173791 }, - { 173824, 177976 }, + { 173824, 177977 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, { 196608, 201546 }, + { 201552, 205743 }, }; static const URange16 Lt_range16[] = { { 453, 453 }, @@ -2710,6 +2733,7 @@ static const URange16 M_range16[] = { { 3274, 3277 }, { 3285, 3286 }, { 3298, 3299 }, + { 3315, 3315 }, { 3328, 3331 }, { 3387, 3388 }, { 3390, 3396 }, @@ -2728,7 +2752,7 @@ static const URange16 M_range16[] = { { 3655, 3662 }, { 3761, 3761 }, { 3764, 3772 }, - { 3784, 3789 }, + { 3784, 3790 }, { 3864, 3865 }, { 3893, 3893 }, { 3895, 3895 }, @@ -2832,6 +2856,7 @@ static const URange32 M_range32[] = { { 68325, 68326 }, { 68900, 68903 }, { 69291, 69292 }, + { 69373, 69375 }, { 69446, 69456 }, { 69506, 69509 }, { 69632, 69634 }, @@ -2851,6 +2876,7 @@ static const URange32 M_range32[] = { { 70094, 70095 }, { 70188, 70199 }, { 70206, 70206 }, + { 70209, 70209 }, { 70367, 70378 }, { 70400, 70403 }, { 70459, 70460 }, @@ -2898,6 +2924,12 @@ static const URange32 M_range32[] = { { 73104, 73105 }, { 73107, 73111 }, { 73459, 73462 }, + { 73472, 73473 }, + { 73475, 73475 }, + { 73524, 73530 }, + { 73534, 73538 }, + { 78912, 78912 }, + { 78919, 78933 }, { 92912, 92916 }, { 92976, 92982 }, { 94031, 94031 }, @@ -2925,9 +2957,11 @@ static const URange32 M_range32[] = { { 122907, 122913 }, { 122915, 122916 }, { 122918, 122922 }, + { 123023, 123023 }, { 123184, 123190 }, { 123566, 123566 }, { 123628, 123631 }, + { 124140, 124143 }, { 125136, 125142 }, { 125252, 125258 }, { 917760, 917999 }, @@ -2968,6 +3002,7 @@ static const URange16 Mc_range16[] = { { 3271, 3272 }, { 3274, 3275 }, { 3285, 3286 }, + { 3315, 3315 }, { 3330, 3331 }, { 3390, 3392 }, { 3398, 3400 }, @@ -3108,6 +3143,10 @@ static const URange32 Mc_range32[] = { { 73107, 73108 }, { 73110, 73110 }, { 73461, 73462 }, + { 73475, 73475 }, + { 73524, 73525 }, + { 73534, 73535 }, + { 73537, 73537 }, { 94033, 94087 }, { 94192, 94193 }, { 119141, 119142 }, @@ -3213,7 +3252,7 @@ static const URange16 Mn_range16[] = { { 3655, 3662 }, { 3761, 3761 }, { 3764, 3772 }, - { 3784, 3789 }, + { 3784, 3790 }, { 3864, 3865 }, { 3893, 3893 }, { 3895, 3895 }, @@ -3346,6 +3385,7 @@ static const URange32 Mn_range32[] = { { 68325, 68326 }, { 68900, 68903 }, { 69291, 69292 }, + { 69373, 69375 }, { 69446, 69456 }, { 69506, 69509 }, { 69633, 69633 }, @@ -3368,6 +3408,7 @@ static const URange32 Mn_range32[] = { { 70196, 70196 }, { 70198, 70199 }, { 70206, 70206 }, + { 70209, 70209 }, { 70367, 70367 }, { 70371, 70378 }, { 70400, 70401 }, @@ -3429,6 +3470,12 @@ static const URange32 Mn_range32[] = { { 73109, 73109 }, { 73111, 73111 }, { 73459, 73460 }, + { 73472, 73473 }, + { 73526, 73530 }, + { 73536, 73536 }, + { 73538, 73538 }, + { 78912, 78912 }, + { 78919, 78933 }, { 92912, 92916 }, { 92976, 92982 }, { 94031, 94031 }, @@ -3453,9 +3500,11 @@ static const URange32 Mn_range32[] = { { 122907, 122913 }, { 122915, 122916 }, { 122918, 122922 }, + { 123023, 123023 }, { 123184, 123190 }, { 123566, 123566 }, { 123628, 123631 }, + { 124140, 124143 }, { 125136, 125142 }, { 125252, 125258 }, { 917760, 917999 }, @@ -3576,6 +3625,7 @@ static const URange32 N_range32[] = { { 72784, 72812 }, { 73040, 73049 }, { 73120, 73129 }, + { 73552, 73561 }, { 73664, 73684 }, { 74752, 74862 }, { 92768, 92777 }, @@ -3583,11 +3633,13 @@ static const URange32 N_range32[] = { { 93008, 93017 }, { 93019, 93025 }, { 93824, 93846 }, + { 119488, 119507 }, { 119520, 119539 }, { 119648, 119672 }, { 120782, 120831 }, { 123200, 123209 }, { 123632, 123641 }, + { 124144, 124153 }, { 125127, 125135 }, { 125264, 125273 }, { 126065, 126123 }, @@ -3655,12 +3707,14 @@ static const URange32 Nd_range32[] = { { 72784, 72793 }, { 73040, 73049 }, { 73120, 73129 }, + { 73552, 73561 }, { 92768, 92777 }, { 92864, 92873 }, { 93008, 93017 }, { 120782, 120831 }, { 123200, 123209 }, { 123632, 123641 }, + { 124144, 124153 }, { 125264, 125273 }, { 130032, 130041 }, }; @@ -3745,6 +3799,7 @@ static const URange32 No_range32[] = { { 73664, 73684 }, { 93019, 93025 }, { 93824, 93846 }, + { 119488, 119507 }, { 119520, 119539 }, { 119648, 119672 }, { 125127, 125135 }, @@ -3932,9 +3987,11 @@ static const URange32 P_range32[] = { { 72255, 72262 }, { 72346, 72348 }, { 72350, 72354 }, + { 72448, 72457 }, { 72769, 72773 }, { 72816, 72817 }, { 73463, 73464 }, + { 73539, 73551 }, { 73727, 73727 }, { 74864, 74868 }, { 77809, 77810 }, @@ -4255,9 +4312,11 @@ static const URange32 Po_range32[] = { { 72255, 72262 }, { 72346, 72348 }, { 72350, 72354 }, + { 72448, 72457 }, { 72769, 72773 }, { 72816, 72817 }, { 73463, 73464 }, + { 73539, 73551 }, { 73727, 73727 }, { 74864, 74868 }, { 77809, 77810 }, @@ -4564,10 +4623,10 @@ static const URange32 S_range32[] = { { 127568, 127569 }, { 127584, 127589 }, { 127744, 128727 }, - { 128733, 128748 }, + { 128732, 128748 }, { 128752, 128764 }, - { 128768, 128883 }, - { 128896, 128984 }, + { 128768, 128886 }, + { 128891, 128985 }, { 128992, 129003 }, { 129008, 129008 }, { 129024, 129035 }, @@ -4578,15 +4637,13 @@ static const URange32 S_range32[] = { { 129200, 129201 }, { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129652 }, - { 129656, 129660 }, - { 129664, 129670 }, - { 129680, 129708 }, - { 129712, 129722 }, - { 129728, 129733 }, - { 129744, 129753 }, - { 129760, 129767 }, - { 129776, 129782 }, + { 129648, 129660 }, + { 129664, 129672 }, + { 129680, 129725 }, + { 129727, 129733 }, + { 129742, 129755 }, + { 129760, 129768 }, + { 129776, 129784 }, { 129792, 129938 }, { 129940, 129994 }, }; @@ -4882,10 +4939,10 @@ static const URange32 So_range32[] = { { 127584, 127589 }, { 127744, 127994 }, { 128000, 128727 }, - { 128733, 128748 }, + { 128732, 128748 }, { 128752, 128764 }, - { 128768, 128883 }, - { 128896, 128984 }, + { 128768, 128886 }, + { 128891, 128985 }, { 128992, 129003 }, { 129008, 129008 }, { 129024, 129035 }, @@ -4896,15 +4953,13 @@ static const URange32 So_range32[] = { { 129200, 129201 }, { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129652 }, - { 129656, 129660 }, - { 129664, 129670 }, - { 129680, 129708 }, - { 129712, 129722 }, - { 129728, 129733 }, - { 129744, 129753 }, - { 129760, 129767 }, - { 129776, 129782 }, + { 129648, 129660 }, + { 129664, 129672 }, + { 129680, 129725 }, + { 129727, 129733 }, + { 129742, 129755 }, + { 129760, 129768 }, + { 129776, 129784 }, { 129792, 129938 }, { 129940, 129994 }, }; @@ -4972,6 +5027,7 @@ static const URange16 Arabic_range16[] = { }; static const URange32 Arabic_range32[] = { { 69216, 69246 }, + { 69373, 69375 }, { 126464, 126467 }, { 126469, 126495 }, { 126497, 126498 }, @@ -5218,6 +5274,7 @@ static const URange32 Common_range32[] = { { 119171, 119172 }, { 119180, 119209 }, { 119214, 119274 }, + { 119488, 119507 }, { 119520, 119539 }, { 119552, 119638 }, { 119648, 119672 }, @@ -5258,10 +5315,10 @@ static const URange32 Common_range32[] = { { 127568, 127569 }, { 127584, 127589 }, { 127744, 128727 }, - { 128733, 128748 }, + { 128732, 128748 }, { 128752, 128764 }, - { 128768, 128883 }, - { 128896, 128984 }, + { 128768, 128886 }, + { 128891, 128985 }, { 128992, 129003 }, { 129008, 129008 }, { 129024, 129035 }, @@ -5272,15 +5329,13 @@ static const URange32 Common_range32[] = { { 129200, 129201 }, { 129280, 129619 }, { 129632, 129645 }, - { 129648, 129652 }, - { 129656, 129660 }, - { 129664, 129670 }, - { 129680, 129708 }, - { 129712, 129722 }, - { 129728, 129733 }, - { 129744, 129753 }, - { 129760, 129767 }, - { 129776, 129782 }, + { 129648, 129660 }, + { 129664, 129672 }, + { 129680, 129725 }, + { 129727, 129733 }, + { 129742, 129755 }, + { 129760, 129768 }, + { 129776, 129784 }, { 129792, 129938 }, { 129940, 129994 }, { 130032, 130041 }, @@ -5319,6 +5374,10 @@ static const URange16 Cyrillic_range16[] = { { 42560, 42655 }, { 65070, 65071 }, }; +static const URange32 Cyrillic_range32[] = { + { 122928, 122989 }, + { 123023, 123023 }, +}; static const URange32 Deseret_range32[] = { { 66560, 66639 }, }; @@ -5328,6 +5387,9 @@ static const URange16 Devanagari_range16[] = { { 2406, 2431 }, { 43232, 43263 }, }; +static const URange32 Devanagari_range32[] = { + { 72448, 72457 }, +}; static const URange32 Dives_Akuru_range32[] = { { 71936, 71942 }, { 71945, 71945 }, @@ -5349,8 +5411,7 @@ static const URange32 Duployan_range32[] = { { 113820, 113823 }, }; static const URange32 Egyptian_Hieroglyphs_range32[] = { - { 77824, 78894 }, - { 78896, 78904 }, + { 77824, 78933 }, }; static const URange32 Elbasan_range32[] = { { 66816, 66855 }, @@ -5539,12 +5600,13 @@ static const URange32 Han_range32[] = { { 94178, 94179 }, { 94192, 94193 }, { 131072, 173791 }, - { 173824, 177976 }, + { 173824, 177977 }, { 177984, 178205 }, { 178208, 183969 }, { 183984, 191456 }, { 194560, 195101 }, { 196608, 201546 }, + { 201552, 205743 }, }; static const URange16 Hangul_range16[] = { { 4352, 4607 }, @@ -5591,6 +5653,7 @@ static const URange16 Hiragana_range16[] = { }; static const URange32 Hiragana_range32[] = { { 110593, 110879 }, + { 110898, 110898 }, { 110928, 110930 }, { 127488, 127488 }, }; @@ -5661,7 +5724,7 @@ static const URange16 Kannada_range16[] = { { 3293, 3294 }, { 3296, 3299 }, { 3302, 3311 }, - { 3313, 3314 }, + { 3313, 3315 }, }; static const URange16 Katakana_range16[] = { { 12449, 12538 }, @@ -5678,8 +5741,14 @@ static const URange32 Katakana_range32[] = { { 110589, 110590 }, { 110592, 110592 }, { 110880, 110882 }, + { 110933, 110933 }, { 110948, 110951 }, }; +static const URange32 Kawi_range32[] = { + { 73472, 73488 }, + { 73490, 73530 }, + { 73534, 73561 }, +}; static const URange16 Kayah_Li_range16[] = { { 43264, 43309 }, { 43311, 43311 }, @@ -5706,7 +5775,7 @@ static const URange16 Khmer_range16[] = { }; static const URange32 Khojki_range32[] = { { 70144, 70161 }, - { 70163, 70206 }, + { 70163, 70209 }, }; static const URange32 Khudawadi_range32[] = { { 70320, 70378 }, @@ -5721,7 +5790,7 @@ static const URange16 Lao_range16[] = { { 3751, 3773 }, { 3776, 3780 }, { 3782, 3782 }, - { 3784, 3789 }, + { 3784, 3790 }, { 3792, 3801 }, { 3804, 3807 }, }; @@ -5766,6 +5835,7 @@ static const URange32 Latin_range32[] = { { 67463, 67504 }, { 67506, 67514 }, { 122624, 122654 }, + { 122661, 122666 }, }; static const URange16 Lepcha_range16[] = { { 7168, 7223 }, @@ -5903,6 +5973,9 @@ static const URange32 Nabataean_range32[] = { { 67712, 67742 }, { 67751, 67759 }, }; +static const URange32 Nag_Mundari_range32[] = { + { 124112, 124153 }, +}; static const URange32 Nandinagari_range32[] = { { 72096, 72103 }, { 72106, 72151 }, @@ -6229,12 +6302,12 @@ static const URange16 Yi_range16[] = { static const URange32 Zanabazar_Square_range32[] = { { 72192, 72263 }, }; -// 4038 16-bit ranges, 1712 32-bit ranges +// 4040 16-bit ranges, 1775 32-bit ranges const UGroup unicode_groups[] = { { "Adlam", +1, 0, 0, Adlam_range32, 3 }, { "Ahom", +1, 0, 0, Ahom_range32, 3 }, { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 }, - { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, + { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 36 }, { "Armenian", +1, Armenian_range16, 4, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, @@ -6259,19 +6332,19 @@ const UGroup unicode_groups[] = { { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, { "Chorasmian", +1, 0, 0, Chorasmian_range32, 1 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 91, Common_range32, 83 }, + { "Common", +1, Common_range16, 91, Common_range32, 82 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, { "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, { "Cypro_Minoan", +1, 0, 0, Cypro_Minoan_range32, 1 }, - { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, + { "Cyrillic", +1, Cyrillic_range16, 8, Cyrillic_range32, 2 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, - { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, + { "Devanagari", +1, Devanagari_range16, 4, Devanagari_range32, 1 }, { "Dives_Akuru", +1, 0, 0, Dives_Akuru_range32, 8 }, { "Dogra", +1, 0, 0, Dogra_range32, 1 }, { "Duployan", +1, 0, 0, Duployan_range32, 5 }, - { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 2 }, + { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 1 }, { "Elbasan", +1, 0, 0, Elbasan_range32, 1 }, { "Elymaic", +1, 0, 0, Elymaic_range32, 1 }, { "Ethiopic", +1, Ethiopic_range16, 32, Ethiopic_range32, 4 }, @@ -6283,13 +6356,13 @@ const UGroup unicode_groups[] = { { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, { "Gunjala_Gondi", +1, 0, 0, Gunjala_Gondi_range32, 6 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 11, Han_range32, 9 }, + { "Han", +1, Han_range16, 11, Han_range32, 10 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, { "Hanifi_Rohingya", +1, 0, 0, Hanifi_Rohingya_range32, 2 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, { "Hatran", +1, 0, 0, Hatran_range32, 3 }, { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, - { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 3 }, + { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 4 }, { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 }, { "Inherited", +1, Inherited_range16, 19, Inherited_range32, 10 }, { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 }, @@ -6297,29 +6370,30 @@ const UGroup unicode_groups[] = { { "Javanese", +1, Javanese_range16, 3, 0, 0 }, { "Kaithi", +1, 0, 0, Kaithi_range32, 2 }, { "Kannada", +1, Kannada_range16, 13, 0, 0 }, - { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 6 }, + { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 7 }, + { "Kawi", +1, 0, 0, Kawi_range32, 3 }, { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, { "Khitan_Small_Script", +1, 0, 0, Khitan_Small_Script_range32, 2 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, { "Khojki", +1, 0, 0, Khojki_range32, 2 }, { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, - { "L", +1, L_range16, 380, L_range32, 268 }, + { "L", +1, L_range16, 380, L_range32, 279 }, { "Lao", +1, Lao_range16, 11, 0, 0 }, - { "Latin", +1, Latin_range16, 34, Latin_range32, 4 }, + { "Latin", +1, Latin_range16, 34, Latin_range32, 5 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, { "Lisu", +1, Lisu_range16, 1, Lisu_range32, 1 }, - { "Ll", +1, Ll_range16, 617, Ll_range32, 40 }, - { "Lm", +1, Lm_range16, 57, Lm_range32, 12 }, - { "Lo", +1, Lo_range16, 290, Lo_range32, 211 }, + { "Ll", +1, Ll_range16, 617, Ll_range32, 41 }, + { "Lm", +1, Lm_range16, 57, Lm_range32, 14 }, + { "Lo", +1, Lo_range16, 290, Lo_range32, 220 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, { "Lu", +1, Lu_range16, 605, Lu_range32, 41 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 189, M_range32, 110 }, + { "M", +1, M_range16, 190, M_range32, 120 }, { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, { "Makasar", +1, 0, 0, Makasar_range32, 1 }, { "Malayalam", +1, Malayalam_range16, 7, 0, 0 }, @@ -6327,7 +6401,7 @@ const UGroup unicode_groups[] = { { "Manichaean", +1, 0, 0, Manichaean_range32, 2 }, { "Marchen", +1, 0, 0, Marchen_range32, 3 }, { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, - { "Mc", +1, Mc_range16, 111, Mc_range32, 66 }, + { "Mc", +1, Mc_range16, 112, Mc_range32, 70 }, { "Me", +1, Me_range16, 5, 0, 0 }, { "Medefaidrin", +1, 0, 0, Medefaidrin_range32, 1 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, @@ -6335,21 +6409,22 @@ const UGroup unicode_groups[] = { { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, { "Miao", +1, 0, 0, Miao_range32, 3 }, - { "Mn", +1, Mn_range16, 212, Mn_range32, 124 }, + { "Mn", +1, Mn_range16, 212, Mn_range32, 134 }, { "Modi", +1, 0, 0, Modi_range32, 2 }, { "Mongolian", +1, Mongolian_range16, 5, Mongolian_range32, 1 }, { "Mro", +1, 0, 0, Mro_range32, 3 }, { "Multani", +1, 0, 0, Multani_range32, 5 }, { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, - { "N", +1, N_range16, 67, N_range32, 67 }, + { "N", +1, N_range16, 67, N_range32, 70 }, { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, + { "Nag_Mundari", +1, 0, 0, Nag_Mundari_range32, 1 }, { "Nandinagari", +1, 0, 0, Nandinagari_range32, 3 }, - { "Nd", +1, Nd_range16, 37, Nd_range32, 25 }, + { "Nd", +1, Nd_range16, 37, Nd_range32, 27 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, { "Newa", +1, 0, 0, Newa_range32, 2 }, { "Nko", +1, Nko_range16, 2, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 29, No_range32, 42 }, + { "No", +1, No_range16, 29, No_range32, 43 }, { "Nushu", +1, 0, 0, Nushu_range32, 2 }, { "Nyiakeng_Puachue_Hmong", +1, 0, 0, Nyiakeng_Puachue_Hmong_range32, 4 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, @@ -6366,7 +6441,7 @@ const UGroup unicode_groups[] = { { "Oriya", +1, Oriya_range16, 14, 0, 0 }, { "Osage", +1, 0, 0, Osage_range32, 2 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 133, P_range32, 56 }, + { "P", +1, P_range16, 133, P_range32, 58 }, { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 }, { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 }, @@ -6377,12 +6452,12 @@ const UGroup unicode_groups[] = { { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 130, Po_range32, 55 }, + { "Po", +1, Po_range16, 130, Po_range32, 57 }, { "Ps", +1, Ps_range16, 79, 0, 0 }, { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 151, S_range32, 83 }, + { "S", +1, S_range16, 151, S_range32, 81 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, { "Sc", +1, Sc_range16, 18, Sc_range32, 3 }, @@ -6393,7 +6468,7 @@ const UGroup unicode_groups[] = { { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 }, { "Sk", +1, Sk_range16, 30, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, - { "So", +1, So_range16, 114, So_range32, 72 }, + { "So", +1, So_range16, 114, So_range32, 70 }, { "Sogdian", +1, 0, 0, Sogdian_range32, 1 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, { "Soyombo", +1, 0, 0, Soyombo_range32, 1 }, @@ -6429,7 +6504,7 @@ const UGroup unicode_groups[] = { { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, }; -const int num_unicode_groups = 197; +const int num_unicode_groups = 199; } // namespace re2 diff --git a/contrib/libs/re2/util/rune.cc b/contrib/libs/re2/util/rune.cc index 4f625ea380..a40e756c4e 100644 --- a/contrib/libs/re2/util/rune.cc +++ b/contrib/libs/re2/util/rune.cc @@ -51,7 +51,7 @@ int chartorune(Rune *rune, const char *str) { int c, c1, c2, c3; - long l; + Rune l; /* * one character sequence @@ -127,7 +127,7 @@ int runetochar(char *str, const Rune *rune) { /* Runes are signed, so convert to unsigned for range check. */ - unsigned long c; + unsigned int c; /* * one character sequence @@ -212,7 +212,7 @@ int utflen(const char *s) { int c; - long n; + int n; Rune rune; n = 0; @@ -232,7 +232,7 @@ utflen(const char *s) char* utfrune(const char *s, Rune c) { - long c1; + int c1; Rune r; int n; |